1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
|
//===- PPCMachineScheduler.cpp - MI Scheduler for PowerPC -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "PPCMachineScheduler.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
using namespace llvm;
static cl::opt<bool>
DisableAddiLoadHeuristic("disable-ppc-sched-addi-load",
cl::desc("Disable scheduling addi instruction before"
"load for ppc"), cl::Hidden);
static cl::opt<bool>
EnableAddiHeuristic("ppc-postra-bias-addi",
cl::desc("Enable scheduling addi instruction as early"
"as possible post ra"),
cl::Hidden, cl::init(true));
static bool isADDIInstr(const GenericScheduler::SchedCandidate &Cand) {
return Cand.SU->getInstr()->getOpcode() == PPC::ADDI ||
Cand.SU->getInstr()->getOpcode() == PPC::ADDI8;
}
bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary &Zone) const {
if (DisableAddiLoadHeuristic)
return false;
SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand;
SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand;
if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) {
TryCand.Reason = Stall;
return true;
}
if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) {
TryCand.Reason = NoCand;
return true;
}
return false;
}
bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// From GenericScheduler::tryCandidate
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable, in many
// other instances we should only override the other boundary if something
// is a clear good pick on one boundary. Skip heuristics that are more
// "tie-breaking" in nature.
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within an single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
}
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
TryCand.Reason = NodeOrder;
}
}
// GenericScheduler::tryCandidate end
// Add powerpc specific heuristic only when TryCand isn't selected or
// selected as node order.
if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
return true;
// There are some benefits to schedule the ADDI before the load to hide the
// latency, as RA may create a true dependency between the load and addi.
if (SameBoundary) {
if (biasAddiLoadCandidate(Cand, TryCand, *Zone))
return TryCand.Reason != NoCand;
}
return TryCand.Reason != NoCand;
}
bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand) const {
if (!EnableAddiHeuristic)
return false;
if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) {
TryCand.Reason = Stall;
return true;
}
return false;
}
bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand) {
// From PostGenericScheduler::tryCandidate
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
// Keep clustered nodes together.
if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster))
return TryCand.Reason != NoCand;
// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
return TryCand.Reason != NoCand;
}
// Fall through to original instruction order.
if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
TryCand.Reason = NodeOrder;
// PostGenericScheduler::tryCandidate end
// Add powerpc post ra specific heuristic only when TryCand isn't selected or
// selected as node order.
if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
return true;
// There are some benefits to schedule the ADDI as early as possible post ra
// to avoid stalled by vector instructions which take up all the hw units.
// And ADDI is usually used to post inc the loop indvar, which matters the
// performance.
if (biasAddiCandidate(Cand, TryCand))
return TryCand.Reason != NoCand;
return TryCand.Reason != NoCand;
}
void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
// Custom PPC PostRA specific behavior here.
PostGenericScheduler::enterMBB(MBB);
}
void PPCPostRASchedStrategy::leaveMBB() {
// Custom PPC PostRA specific behavior here.
PostGenericScheduler::leaveMBB();
}
void PPCPostRASchedStrategy::initialize(ScheduleDAGMI *Dag) {
// Custom PPC PostRA specific initialization here.
PostGenericScheduler::initialize(Dag);
}
SUnit *PPCPostRASchedStrategy::pickNode(bool &IsTopNode) {
// Custom PPC PostRA specific scheduling here.
return PostGenericScheduler::pickNode(IsTopNode);
}
|