| author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:39 +0300 |
|---|---|---|
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:39 +0300 |
| commit | e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch) | |
| tree | 64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp | |
| parent | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff) | |
| download | ydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz | |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp')
| -rw-r--r-- | contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp | 360 |
|---|---|---|

1 file changed, 180 insertions(+), 180 deletions(-)
diff --git a/contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp b/contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp
index 44419a8726..8d51bb2610 100644
--- a/contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp
+++ b/contrib/libs/llvm12/lib/CodeGen/MachineScheduler.cpp
@@ -18,7 +18,7 @@
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveInterval.h"
@@ -74,8 +74,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "machine-scheduler"
 
-STATISTIC(NumClustered, "Number of load/store pairs clustered");
-
+STATISTIC(NumClustered, "Number of load/store pairs clustered");
+
 namespace llvm {
 
 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
@@ -129,15 +129,15 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
-static cl::opt<bool>
-    ForceFastCluster("force-fast-cluster", cl::Hidden,
-                     cl::desc("Switch to fast cluster algorithm with the lost "
-                              "of some fusion opportunities"),
-                     cl::init(false));
-static cl::opt<unsigned>
-    FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
-                         cl::desc("The threshold for fast cluster"),
-                         cl::init(1000));
+static cl::opt<bool>
+    ForceFastCluster("force-fast-cluster", cl::Hidden,
+                     cl::desc("Switch to fast cluster algorithm with the lost "
+                              "of some fusion opportunities"),
+                     cl::init(false));
+static cl::opt<unsigned>
+    FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
+                         cl::desc("The threshold for fast cluster"),
+                         cl::init(1000));
 
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;
@@ -240,13 +240,13 @@ char PostMachineScheduler::ID = 0;
 
 char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
 
-INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
-                      "PostRA Machine Instruction Scheduler", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
-                    "PostRA Machine Instruction Scheduler", false, false)
+INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
+                      "PostRA Machine Instruction Scheduler", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
+                    "PostRA Machine Instruction Scheduler", false, false)
 
 PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
   initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
@@ -1115,7 +1115,7 @@ updateScheduledPressure(const SUnit *SU,
 void ScheduleDAGMILive::updatePressureDiffs(
     ArrayRef<RegisterMaskPair> LiveUses) {
   for (const RegisterMaskPair &P : LiveUses) {
-    Register Reg = P.RegUnit;
+    Register Reg = P.RegUnit;
     /// FIXME: Currently assuming single-use physregs.
     if (!Register::isVirtualRegister(Reg))
       continue;
@@ -1315,7 +1315,7 @@ void ScheduleDAGMILive::computeDFSResult() {
 /// The cyclic path estimation identifies a def-use pair that crosses the back
 /// edge and considers the depth and height of the nodes. For example, consider
 /// the following instruction sequence where each instruction has unit latency
-/// and defines an eponymous virtual register:
+/// and defines an eponymous virtual register:
 ///
 /// a->b(a,c)->c(b)->d(c)->exit
 ///
@@ -1340,7 +1340,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
   unsigned MaxCyclicLatency = 0;
   // Visit each live out vreg def to find def/use pairs that cross iterations.
   for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
-    Register Reg = P.RegUnit;
+    Register Reg = P.RegUnit;
     if (!Register::isVirtualRegister(Reg))
       continue;
     const LiveInterval &LI = LIS->getInterval(Reg);
@@ -1544,12 +1544,12 @@ public:
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
 protected:
-  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
-                                ScheduleDAGInstrs *DAG);
-  void collectMemOpRecords(std::vector<SUnit> &SUnits,
-                           SmallVectorImpl<MemOpInfo> &MemOpRecords);
-  bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
-                   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
+  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
+                                ScheduleDAGInstrs *DAG);
+  void collectMemOpRecords(std::vector<SUnit> &SUnits,
+                           SmallVectorImpl<MemOpInfo> &MemOpRecords);
+  bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+                   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
 };
 
 class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1585,179 +1585,179 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
 
 } // end namespace llvm
 
-// Sorting all the loads/stores first, then for each load/store, checking the
-// following load/store one by one, until reach the first non-dependent one and
-// call target hook to see if they can cluster.
-// If FastCluster is enabled, we assume that, all the loads/stores have been
-// preprocessed and now, they didn't have dependencies on each other.
+// Sorting all the loads/stores first, then for each load/store, checking the
+// following load/store one by one, until reach the first non-dependent one and
+// call target hook to see if they can cluster.
+// If FastCluster is enabled, we assume that, all the loads/stores have been
+// preprocessed and now, they didn't have dependencies on each other.
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
-    ScheduleDAGInstrs *DAG) {
-  // Keep track of the current cluster length and bytes for each SUnit.
-  DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
+    ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
+    ScheduleDAGInstrs *DAG) {
+  // Keep track of the current cluster length and bytes for each SUnit.
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
 
   // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
   // cluster mem ops collected within `MemOpRecords` array.
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     // Decision to cluster mem ops is taken based on target dependent logic
     auto MemOpa = MemOpRecords[Idx];
-
-    // Seek for the next load/store to do the cluster.
-    unsigned NextIdx = Idx + 1;
-    for (; NextIdx < End; ++NextIdx)
-      // Skip if MemOpb has been clustered already or has dependency with
-      // MemOpa.
-      if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
-          (FastCluster ||
-           (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
-            !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
-        break;
-    if (NextIdx == End)
+
+    // Seek for the next load/store to do the cluster.
+    unsigned NextIdx = Idx + 1;
+    for (; NextIdx < End; ++NextIdx)
+      // Skip if MemOpb has been clustered already or has dependency with
+      // MemOpa.
+      if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+          (FastCluster ||
+           (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+            !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
+        break;
+    if (NextIdx == End)
       continue;
-
-    auto MemOpb = MemOpRecords[NextIdx];
-    unsigned ClusterLength = 2;
-    unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
-    if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
-      ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
-      CurrentClusterBytes =
-          SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
+
+    auto MemOpb = MemOpRecords[NextIdx];
+    unsigned ClusterLength = 2;
+    unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+    if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+      ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+      CurrentClusterBytes =
+          SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
     }
 
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes))
-      continue;
-
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes))
+      continue;
+
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
     if (SUa->NodeNum > SUb->NodeNum)
       std::swap(SUa, SUb);
 
     // FIXME: Is this check really required?
-    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
       continue;
 
     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                       << SUb->NodeNum << ")\n");
-    ++NumClustered;
-
-    if (IsLoad) {
-      // Copy successor edges from SUa to SUb. Interleaving computation
-      // dependent on SUa can prevent load combining due to register reuse.
-      // Predecessor edges do not need to be copied from SUb to SUa since
-      // nearby loads should have effectively the same inputs.
-      for (const SDep &Succ : SUa->Succs) {
-        if (Succ.getSUnit() == SUb)
-          continue;
-        LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
-                          << ")\n");
-        DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
-      }
-    } else {
-      // Copy predecessor edges from SUb to SUa to avoid the SUnits that
-      // SUb dependent on scheduled in-between SUb and SUa. Successor edges
-      // do not need to be copied from SUa to SUb since no one will depend
-      // on stores.
-      // Notice that, we don't need to care about the memory dependency as
-      // we won't try to cluster them if they have any memory dependency.
-      for (const SDep &Pred : SUb->Preds) {
-        if (Pred.getSUnit() == SUa)
-          continue;
-        LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
-                          << ")\n");
-        DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
-      }
+    ++NumClustered;
+
+    if (IsLoad) {
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since
+      // nearby loads should have effectively the same inputs.
+      for (const SDep &Succ : SUa->Succs) {
+        if (Succ.getSUnit() == SUb)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+    } else {
+      // Copy predecessor edges from SUb to SUa to avoid the SUnits that
+      // SUb dependent on scheduled in-between SUb and SUa. Successor edges
+      // do not need to be copied from SUa to SUb since no one will depend
+      // on stores.
+      // Notice that, we don't need to care about the memory dependency as
+      // we won't try to cluster them if they have any memory dependency.
+      for (const SDep &Pred : SUb->Preds) {
+        if (Pred.getSUnit() == SUa)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
+      }
     }
 
-    SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
-                                             CurrentClusterBytes};
-
+    SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+                                             CurrentClusterBytes};
+
     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
                       << ", Curr cluster bytes: " << CurrentClusterBytes
                       << "\n");
   }
 }
 
-void BaseMemOpClusterMutation::collectMemOpRecords(
-    std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
-  for (auto &SU : SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+    std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+  for (auto &SU : SUnits) {
     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
         (!IsLoad && !SU.getInstr()->mayStore()))
       continue;
 
-    const MachineInstr &MI = *SU.getInstr();
-    SmallVector<const MachineOperand *, 4> BaseOps;
-    int64_t Offset;
-    bool OffsetIsScalable;
-    unsigned Width;
-    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
-                                           OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
-
-      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
-                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
-                        << ", Width: " << Width << "\n");
-    }
-#ifndef NDEBUG
-    for (auto *Op : BaseOps)
-      assert(Op);
-#endif
-  }
-}
-
-bool BaseMemOpClusterMutation::groupMemOps(
-    ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
-    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
-  bool FastCluster =
-      ForceFastCluster ||
-      MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
-
-  for (const auto &MemOp : MemOps) {
+    const MachineInstr &MI = *SU.getInstr();
+    SmallVector<const MachineOperand *, 4> BaseOps;
+    int64_t Offset;
+    bool OffsetIsScalable;
+    unsigned Width;
+    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+                                           OffsetIsScalable, Width, TRI)) {
+      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+                        << ", Width: " << Width << "\n");
    }
+#ifndef NDEBUG
+    for (auto *Op : BaseOps)
+      assert(Op);
+#endif
+  }
+}
+
+bool BaseMemOpClusterMutation::groupMemOps(
+    ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
+  bool FastCluster =
+      ForceFastCluster ||
+      MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
+
+  for (const auto &MemOp : MemOps) {
     unsigned ChainPredID = DAG->SUnits.size();
-    if (FastCluster) {
-      for (const SDep &Pred : MemOp.SU->Preds) {
-        // We only want to cluster the mem ops that have the same ctrl(non-data)
-        // pred so that they didn't have ctrl dependency for each other. But for
-        // store instrs, we can still cluster them if the pred is load instr.
-        if ((Pred.isCtrl() &&
-             (IsLoad ||
-              (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
-            !Pred.isArtificial()) {
-          ChainPredID = Pred.getSUnit()->NodeNum;
-          break;
-        }
+    if (FastCluster) {
+      for (const SDep &Pred : MemOp.SU->Preds) {
+        // We only want to cluster the mem ops that have the same ctrl(non-data)
+        // pred so that they didn't have ctrl dependency for each other. But for
+        // store instrs, we can still cluster them if the pred is load instr.
+        if ((Pred.isCtrl() &&
+             (IsLoad ||
+              (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
+            !Pred.isArtificial()) {
+          ChainPredID = Pred.getSUnit()->NodeNum;
+          break;
+        }
       }
-    } else
-      ChainPredID = 0;
-
-    Groups[ChainPredID].push_back(MemOp);
-  }
-  return FastCluster;
-}
-
-/// Callback from DAG postProcessing to create cluster edges for loads/stores.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
-  // Collect all the clusterable loads/stores
-  SmallVector<MemOpInfo, 32> MemOpRecords;
-  collectMemOpRecords(DAG->SUnits, MemOpRecords);
-
-  if (MemOpRecords.size() < 2)
-    return;
-
-  // Put the loads/stores without dependency into the same group with some
-  // heuristic if the DAG is too complex to avoid compiling time blow up.
-  // Notice that, some fusion pair could be lost with this.
-  DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
-  bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
-
-  for (auto &Group : Groups) {
-    // Sorting the loads/stores, so that, we can stop the cluster as early as
-    // possible.
-    llvm::sort(Group.second);
-
-    // Trying to cluster all the neighboring loads/stores.
-    clusterNeighboringMemOps(Group.second, FastCluster, DAG);
-  }
+    } else
+      ChainPredID = 0;
+
+    Groups[ChainPredID].push_back(MemOp);
+  }
+  return FastCluster;
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+  // Collect all the clusterable loads/stores
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+  if (MemOpRecords.size() < 2)
+    return;
+
+  // Put the loads/stores without dependency into the same group with some
+  // heuristic if the DAG is too complex to avoid compiling time blow up.
+  // Notice that, some fusion pair could be lost with this.
+  DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
+  bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
+
+  for (auto &Group : Groups) {
+    // Sorting the loads/stores, so that, we can stop the cluster as early as
+    // possible.
+    llvm::sort(Group.second);
+
+    // Trying to cluster all the neighboring loads/stores.
+    clusterNeighboringMemOps(Group.second, FastCluster, DAG);
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -2816,11 +2816,11 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
                 GenericSchedulerBase::SchedCandidate &Cand,
                 SchedBoundary &Zone) {
   if (Zone.isTop()) {
-    // Prefer the candidate with the lesser depth, but only if one of them has
-    // depth greater than the total latency scheduled so far, otherwise either
-    // of them could be scheduled now with no stall.
-    if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
-        Zone.getScheduledLatency()) {
+    // Prefer the candidate with the lesser depth, but only if one of them has
+    // depth greater than the total latency scheduled so far, otherwise either
+    // of them could be scheduled now with no stall.
+    if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
+        Zone.getScheduledLatency()) {
       if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
                   TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
         return true;
@@ -2829,11 +2829,11 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
                 TryCand, Cand, GenericSchedulerBase::TopPathReduce))
       return true;
   } else {
-    // Prefer the candidate with the lesser height, but only if one of them has
-    // height greater than the total latency scheduled so far, otherwise either
-    // of them could be scheduled now with no stall.
-    if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
-        Zone.getScheduledLatency()) {
+    // Prefer the candidate with the lesser height, but only if one of them has
+    // height greater than the total latency scheduled so far, otherwise either
+    // of them could be scheduled now with no stall.
+    if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
+        Zone.getScheduledLatency()) {
       if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
                   TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
         return true;
@@ -3456,13 +3456,13 @@ ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
   return DAG;
 }
 
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
+static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
   return createGenericSchedLive(C);
 }
 
 static MachineSchedRegistry
 GenericSchedRegistry("converge", "Standard converging scheduler.",
-                     createConvergingSched);
+                     createConvergingSched);
 
 //===----------------------------------------------------------------------===//
 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
@@ -3836,7 +3836,7 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
     return true;
   }
 
-  static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+  static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
     if (ViewMISchedCutoff == 0)
       return false;
     return (Node->Preds.size() > ViewMISchedCutoff |
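The `groupMemOps` code touched in the diff above implements the fast-cluster heuristic controlled by the `force-fast-cluster` and `fast-cluster-threshold` options: when `MemOps.size() * SUnits.size() / 1000` exceeds the threshold, the mutation skips per-pair DAG reachability checks and instead buckets mem ops by their control (non-data) predecessor. The following is a minimal standalone sketch of that grouping decision, not the LLVM code itself; `MemOp`, `CtrlPredNum`, and the sample numbers are simplified stand-ins chosen for illustration.

```cpp
// Standalone sketch of the fast-cluster grouping heuristic (simplified
// stand-in types, not the LLVM API). Build with: g++ -std=c++17 group_memops.cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

struct MemOp {
  unsigned NodeNum;      // index of the scheduling node
  unsigned CtrlPredNum;  // node number of its control (non-data) predecessor
};

// Mirrors the idea of BaseMemOpClusterMutation::groupMemOps: above the
// threshold, bucket ops by control predecessor instead of querying DAG
// reachability for every candidate pair later on.
bool groupMemOps(const std::vector<MemOp> &MemOps, std::size_t NumDAGNodes,
                 std::uint64_t FastClusterThreshold,
                 std::map<unsigned, std::vector<MemOp>> &Groups) {
  bool FastCluster =
      (std::uint64_t)MemOps.size() * NumDAGNodes / 1000 > FastClusterThreshold;
  for (const MemOp &Op : MemOps) {
    // Fast mode: each control-dependence chain gets its own bucket.
    // Precise mode: one bucket; dependencies are checked pair by pair later.
    unsigned Key = FastCluster ? Op.CtrlPredNum : 0;
    Groups[Key].push_back(Op);
  }
  return FastCluster;
}

int main() {
  std::vector<MemOp> Ops = {{4, 1}, {5, 1}, {9, 7}, {10, 7}};
  std::map<unsigned, std::vector<MemOp>> Groups;
  // 4 mem ops in a 2'000'000-node DAG: 4 * 2'000'000 / 1000 = 8000 > 1000,
  // so the fast path is taken and two groups (pred 1 and pred 7) are formed.
  bool Fast = groupMemOps(Ops, 2'000'000, 1000, Groups);
  std::cout << "fast=" << Fast << " groups=" << Groups.size() << "\n";
}
```

The trade-off named in the option's own description applies: grouping by predecessor can miss fusion pairs that the precise reachability check would have found, in exchange for avoiding a quadratic blow-up on very large DAGs.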
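Within each group, `clusterNeighboringMemOps` (also re-annotated above) sorts the records, and for each one scans forward to the first record that is not already clustered and, outside fast mode, not reachable from it in the DAG; a target hook then decides whether the pair may join the running cluster given its current length and byte count. Below is a compact standalone sketch of that scan under the same simplifying assumptions; `shouldCluster` is a made-up stand-in for the target's `shouldClusterMemOps` decision, and the edge-copying and DAG bookkeeping of the real code are omitted.

```cpp
// Standalone sketch of the neighbouring-mem-op scan (simplified stand-ins for
// the LLVM types; the real code also consults DAG reachability and adds
// cluster/artificial edges).
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>

struct MemOpInfo {
  unsigned NodeNum;
  long Offset;
  unsigned Width;
};

// Hypothetical stand-in for TII->shouldClusterMemOps: cluster at most 4 ops /
// 32 bytes, and only ops whose offsets are adjacent.
static bool shouldCluster(const MemOpInfo &A, const MemOpInfo &B,
                          unsigned Length, unsigned Bytes) {
  return Length <= 4 && Bytes <= 32 && B.Offset == A.Offset + A.Width;
}

static void clusterNeighboringMemOps(std::vector<MemOpInfo> Records) {
  // Sort by offset so a cluster can be closed as early as possible.
  std::sort(Records.begin(), Records.end(),
            [](const MemOpInfo &L, const MemOpInfo &R) {
              return L.Offset < R.Offset;
            });
  // NodeNum -> (cluster length, cluster bytes), like SUnit2ClusterInfo.
  std::unordered_map<unsigned, std::pair<unsigned, unsigned>> ClusterInfo;
  for (std::size_t Idx = 0; Idx + 1 < Records.size(); ++Idx) {
    const MemOpInfo &A = Records[Idx];
    // Seek the next record that has not been clustered yet.
    std::size_t Next = Idx + 1;
    while (Next < Records.size() && ClusterInfo.count(Records[Next].NodeNum))
      ++Next;
    if (Next == Records.size())
      continue;
    const MemOpInfo &B = Records[Next];
    unsigned Length = 2, Bytes = A.Width + B.Width;
    if (auto It = ClusterInfo.find(A.NodeNum); It != ClusterInfo.end()) {
      Length = It->second.first + 1;   // grow the cluster A already belongs to
      Bytes = It->second.second + B.Width;
    }
    if (!shouldCluster(A, B, Length, Bytes))
      continue;
    ClusterInfo[B.NodeNum] = {Length, Bytes};  // B now carries the cluster info
    std::cout << "cluster SU(" << A.NodeNum << ") - SU(" << B.NodeNum
              << ") len=" << Length << " bytes=" << Bytes << "\n";
  }
}

int main() {
  clusterNeighboringMemOps({{1, 0, 8}, {2, 8, 8}, {3, 16, 8}, {4, 100, 8}});
}
```

Running the example clusters SU(1)-SU(2) and then grows the cluster with SU(3), while the op at offset 100 is rejected by the stand-in hook, mirroring how the real mutation stops a cluster at the first unsuitable neighbour.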
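The re-annotated `tryLatency` comments describe the latency heuristic: when scheduling top-down, prefer the candidate with the smaller depth, but only if at least one candidate's depth exceeds the latency already scheduled in the zone (otherwise both could issue without a stall); the bottom-up zone applies the same rule to height. A tiny worked sketch of that comparison, with plain stand-in values rather than real SUnits:

```cpp
// Worked sketch of the tryLatency depth check for the top-down zone
// (stand-in values; the real code compares SUnit depths/heights and also
// falls back to path reduction).
#include <algorithm>
#include <iostream>

// Returns true if TryDepth should win over CandDepth for a top boundary that
// has already scheduled `ScheduledLatency` cycles.
bool preferLesserDepth(unsigned TryDepth, unsigned CandDepth,
                       unsigned ScheduledLatency) {
  // Only break the tie on depth if one candidate could actually stall, i.e.
  // its depth reaches past the latency covered so far.
  if (std::max(TryDepth, CandDepth) > ScheduledLatency)
    return TryDepth < CandDepth;
  return false;  // both fit under the scheduled latency: no preference
}

int main() {
  // Scheduled latency 5: a depth-4 vs depth-7 pair is decided on depth...
  std::cout << preferLesserDepth(4, 7, 5) << "\n";  // 1 (prefer TryCand)
  // ...but a depth-3 vs depth-4 pair is not, since neither would stall.
  std::cout << preferLesserDepth(3, 4, 5) << "\n";  // 0
}
```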