diff options
author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:39 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:39 +0300 |
commit | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch) | |
tree | 64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp | |
parent | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff) | |
download | ydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp')
-rw-r--r-- | contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp | 322 |
1 files changed, 161 insertions, 161 deletions
diff --git a/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp b/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp index 94e71f1d60..cccac55952 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp @@ -22,13 +22,13 @@ /// The HardwareLoops pass inserts intrinsics identifying loops that the /// backend will attempt to convert into a low-overhead loop. The vectorizer is /// responsible for generating a vectorized loop in which the lanes are -/// predicated upon an get.active.lane.mask intrinsic. This pass looks at these -/// get.active.lane.mask intrinsic and attempts to convert them to VCTP -/// instructions. This will be picked up by the ARM Low-overhead loop pass later -/// in the backend, which performs the final transformation to a DLSTP or WLSTP -/// tail-predicated loop. -// -//===----------------------------------------------------------------------===// +/// predicated upon an get.active.lane.mask intrinsic. This pass looks at these +/// get.active.lane.mask intrinsic and attempts to convert them to VCTP +/// instructions. This will be picked up by the ARM Low-overhead loop pass later +/// in the backend, which performs the final transformation to a DLSTP or WLSTP +/// tail-predicated loop. +// +//===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMSubtarget.h" @@ -47,7 +47,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -57,8 +57,8 @@ using namespace llvm; #define DESC "Transform predicated vector loops to use MVE tail predication" cl::opt<TailPredication::Mode> EnableTailPredication( - "tail-predication", cl::desc("MVE tail-predication pass options"), - cl::init(TailPredication::Enabled), + "tail-predication", cl::desc("MVE tail-predication pass options"), + cl::init(TailPredication::Enabled), cl::values(clEnumValN(TailPredication::Disabled, "disabled", "Don't tail-predicate loops"), clEnumValN(TailPredication::EnabledNoReductions, @@ -103,18 +103,18 @@ public: bool runOnLoop(Loop *L, LPPassManager&) override; private: - /// Perform the relevant checks on the loop and convert active lane masks if - /// possible. - bool TryConvertActiveLaneMask(Value *TripCount); + /// Perform the relevant checks on the loop and convert active lane masks if + /// possible. + bool TryConvertActiveLaneMask(Value *TripCount); - /// Perform several checks on the arguments of @llvm.get.active.lane.mask - /// intrinsic. E.g., check that the loop induction variable and the element - /// count are of the form we expect, and also perform overflow checks for - /// the new expressions that are created. - bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); + /// Perform several checks on the arguments of @llvm.get.active.lane.mask + /// intrinsic. E.g., check that the loop induction variable and the element + /// count are of the form we expect, and also perform overflow checks for + /// the new expressions that are created. + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Insert the intrinsic to represent the effect of tail predication. - void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside @@ -155,7 +155,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { continue; Intrinsic::ID ID = Call->getIntrinsicID(); - if (ID == Intrinsic::start_loop_iterations || + if (ID == Intrinsic::start_loop_iterations || ID == Intrinsic::test_set_loop_iterations) return cast<IntrinsicInst>(&I); } @@ -174,23 +174,23 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { return false; } - LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); - bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); + bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); - return Changed; + return Changed; } // The active lane intrinsic has this form: // -// @llvm.get.active.lane.mask(IV, TC) +// @llvm.get.active.lane.mask(IV, TC) // // Here we perform checks that this intrinsic behaves as expected, // which means: // -// 1) Check that the TripCount (TC) belongs to this loop (originally). -// 2) The element count (TC) needs to be sufficiently large that the decrement -// of element counter doesn't overflow, which means that we need to prove: +// 1) Check that the TripCount (TC) belongs to this loop (originally). +// 2) The element count (TC) needs to be sufficiently large that the decrement +// of element counter doesn't overflow, which means that we need to prove: // ceil(ElementCount / VectorWidth) >= TripCount // by rounding up ElementCount up: // ((ElementCount + (VectorWidth - 1)) / VectorWidth @@ -199,122 +199,122 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { // 3) The IV must be an induction phi with an increment equal to the // vector width. bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, - Value *TripCount) { + Value *TripCount) { bool ForceTailPredication = EnableTailPredication == TailPredication::ForceEnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabled; - Value *ElemCount = ActiveLaneMask->getOperand(1); - bool Changed = false; - if (!L->makeLoopInvariant(ElemCount, Changed)) - return false; - - auto *EC= SE->getSCEV(ElemCount); - auto *TC = SE->getSCEV(TripCount); - int VectorWidth = - cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); - if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) - return false; - ConstantInt *ConstElemCount = nullptr; - - // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to - // this loop. The scalar tripcount corresponds the number of elements - // processed by the loop, so we will refer to that from this point on. - if (!SE->isLoopInvariant(EC, L)) { - LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); + Value *ElemCount = ActiveLaneMask->getOperand(1); + bool Changed = false; + if (!L->makeLoopInvariant(ElemCount, Changed)) + return false; + + auto *EC= SE->getSCEV(ElemCount); + auto *TC = SE->getSCEV(TripCount); + int VectorWidth = + cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); + if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) + return false; + ConstantInt *ConstElemCount = nullptr; + + // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to + // this loop. The scalar tripcount corresponds the number of elements + // processed by the loop, so we will refer to that from this point on. + if (!SE->isLoopInvariant(EC, L)) { + LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); return false; } - if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) { - ConstantInt *TC = dyn_cast<ConstantInt>(TripCount); - if (!TC) { - LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " - "set.loop.iterations\n"); - return false; - } - - // Calculate 2 tripcount values and check that they are consistent with - // each other. The TripCount for a predicated vector loop body is - // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we - // work it out here. - uint64_t TC1 = TC->getZExtValue(); - uint64_t TC2 = - (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; - - // If the tripcount values are inconsistent, we can't insert the VCTP and - // trigger tail-predication; keep the intrinsic as a get.active.lane.mask - // and legalize this. - if (TC1 != TC2) { - LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " - << TC1 << " from set.loop.iterations, and " - << TC2 << " from get.active.lane.mask\n"); - return false; - } - } else if (!ForceTailPredication) { - // 2) We need to prove that the sub expression that we create in the - // tail-predicated loop body, which calculates the remaining elements to be - // processed, is non-negative, i.e. it doesn't overflow: - // - // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0 - // - // This is true if: - // - // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth - // - // which what we will be using here. - // - auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); - // ElementCount + (VW-1): - auto *ECPlusVWMinus1 = SE->getAddExpr(EC, - SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); - - // Ceil = ElementCount + (VW-1) / VW - auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW); - - // Prevent unused variable warnings with TC - (void)TC; - LLVM_DEBUG( - dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; - dbgs() << "ARM TP: - TripCount = "; TC->dump(); - dbgs() << "ARM TP: - ElemCount = "; EC->dump(); - dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; - dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump(); - ); - - // As an example, almost all the tripcount expressions (produced by the - // vectoriser) look like this: - // - // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4) - // - // and "ElementCount + (VW-1) / VW": - // - // Ceil = ((3 + %N) /u 4) - // - // Check for equality of TC and Ceil by calculating SCEV expression - // TC - Ceil and test it for zero. - // - bool Zero = SE->getMinusSCEV( - SE->getBackedgeTakenCount(L), - SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW), - SE->getNegativeSCEV(VW)), - VW)) - ->isZero(); - - if (!Zero) { - LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); - return false; - } + if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) { + ConstantInt *TC = dyn_cast<ConstantInt>(TripCount); + if (!TC) { + LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " + "set.loop.iterations\n"); + return false; + } + + // Calculate 2 tripcount values and check that they are consistent with + // each other. The TripCount for a predicated vector loop body is + // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we + // work it out here. + uint64_t TC1 = TC->getZExtValue(); + uint64_t TC2 = + (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; + + // If the tripcount values are inconsistent, we can't insert the VCTP and + // trigger tail-predication; keep the intrinsic as a get.active.lane.mask + // and legalize this. + if (TC1 != TC2) { + LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " + << TC1 << " from set.loop.iterations, and " + << TC2 << " from get.active.lane.mask\n"); + return false; + } + } else if (!ForceTailPredication) { + // 2) We need to prove that the sub expression that we create in the + // tail-predicated loop body, which calculates the remaining elements to be + // processed, is non-negative, i.e. it doesn't overflow: + // + // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0 + // + // This is true if: + // + // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth + // + // which what we will be using here. + // + auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); + // ElementCount + (VW-1): + auto *ECPlusVWMinus1 = SE->getAddExpr(EC, + SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); + + // Ceil = ElementCount + (VW-1) / VW + auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW); + + // Prevent unused variable warnings with TC + (void)TC; + LLVM_DEBUG( + dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; + dbgs() << "ARM TP: - TripCount = "; TC->dump(); + dbgs() << "ARM TP: - ElemCount = "; EC->dump(); + dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; + dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump(); + ); + + // As an example, almost all the tripcount expressions (produced by the + // vectoriser) look like this: + // + // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4) + // + // and "ElementCount + (VW-1) / VW": + // + // Ceil = ((3 + %N) /u 4) + // + // Check for equality of TC and Ceil by calculating SCEV expression + // TC - Ceil and test it for zero. + // + bool Zero = SE->getMinusSCEV( + SE->getBackedgeTakenCount(L), + SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW), + SE->getNegativeSCEV(VW)), + VW)) + ->isZero(); + + if (!Zero) { + LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); + return false; + } } - // 3) Find out if IV is an induction phi. Note that we can't use Loop + // 3) Find out if IV is an induction phi. Note that we can't use Loop // helpers here to get the induction variable, because the hardware loop is - // no longer in loopsimplify form, and also the hwloop intrinsic uses a - // different counter. Using SCEV, we check that the induction is of the + // no longer in loopsimplify form, and also the hwloop intrinsic uses a + // different counter. Using SCEV, we check that the induction is of the // form i = i + 4, where the increment must be equal to the VectorWidth. auto *IV = ActiveLaneMask->getOperand(0); auto *IVExpr = SE->getSCEV(IV); auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr); - + if (!AddExpr) { LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump()); return false; @@ -324,11 +324,11 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n"); return false; } - auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0)); - if (!Base || !Base->isZero()) { - LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n"); - return false; - } + auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0)); + if (!Base || !Base->isZero()) { + LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n"); + return false; + } auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1)); if (!Step) { LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: "; @@ -339,29 +339,29 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, if (VectorWidth == StepValue) return true; - LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue - << " doesn't match vector width " << VectorWidth << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue + << " doesn't match vector width " << VectorWidth << "\n"); return false; } void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, - Value *TripCount) { + Value *TripCount) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); - unsigned VectorWidth = - cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); + unsigned VectorWidth = + cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); // Insert a phi to count the number of elements processed by the loop. - Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); + Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); + Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); - // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and - // thus represent the effect of tail predication. + // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and + // thus represent the effect of tail predication. Builder.SetInsertPoint(ActiveLaneMask); - ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); + ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; switch (VectorWidth) { @@ -390,36 +390,36 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } -bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { - SmallVector<IntrinsicInst *, 4> ActiveLaneMasks; - for (auto *BB : L->getBlocks()) - for (auto &I : *BB) - if (auto *Int = dyn_cast<IntrinsicInst>(&I)) - if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) - ActiveLaneMasks.push_back(Int); - - if (ActiveLaneMasks.empty()) +bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { + SmallVector<IntrinsicInst *, 4> ActiveLaneMasks; + for (auto *BB : L->getBlocks()) + for (auto &I : *BB) + if (auto *Int = dyn_cast<IntrinsicInst>(&I)) + if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) + ActiveLaneMasks.push_back(Int); + + if (ActiveLaneMasks.empty()) return false; LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - for (auto *ActiveLaneMask : ActiveLaneMasks) { + for (auto *ActiveLaneMask : ActiveLaneMasks) { LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { + if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); - InsertVCTPIntrinsic(ActiveLaneMask, TripCount); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount); } - // Remove dead instructions and now dead phis. - for (auto *II : ActiveLaneMasks) - RecursivelyDeleteTriviallyDeadInstructions(II); - for (auto I : L->blocks()) - DeleteDeadPHIs(I); + // Remove dead instructions and now dead phis. + for (auto *II : ActiveLaneMasks) + RecursivelyDeleteTriviallyDeadInstructions(II); + for (auto I : L->blocks()) + DeleteDeadPHIs(I); return true; } |