1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
|
//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Performs general IR level optimizations on SVE intrinsics.
//
// This pass performs the following optimizations:
//
// - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
// %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
// %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
// ; (%1 can be replaced with a reinterpret of %2)
//
// - optimizes ptest intrinsics where the operands are being needlessly
// converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//
#include "AArch64.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
namespace {
struct SVEIntrinsicOpts : public ModulePass {
static char ID; // Pass identification, replacement for typeid
SVEIntrinsicOpts() : ModulePass(ID) {
initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
SmallSetVector<IntrinsicInst *, 4> &PTrues);
bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
bool optimizePredicateStore(Instruction *I);
bool optimizePredicateLoad(Instruction *I);
bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);
/// Operates at the function-scope. I.e., optimizations are applied local to
/// the functions themselves.
bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
};
} // end anonymous namespace
void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
char SVEIntrinsicOpts::ID = 0;
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
ModulePass *llvm::createSVEIntrinsicOptsPass() {
return new SVEIntrinsicOpts();
}
/// Checks if a ptrue intrinsic call is promoted. The act of promoting a
/// ptrue will introduce zeroing. For example:
///
/// %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
/// %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
/// %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
///
/// %1 is promoted, because it is converted:
///
/// <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
///
/// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
static bool isPTruePromoted(IntrinsicInst *PTrue) {
// Find all users of this intrinsic that are calls to convert-to-svbool
// reinterpret intrinsics.
SmallVector<IntrinsicInst *, 4> ConvertToUses;
for (User *User : PTrue->users()) {
if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
ConvertToUses.push_back(cast<IntrinsicInst>(User));
}
}
// If no such calls were found, this is ptrue is not promoted.
if (ConvertToUses.empty())
return false;
// Otherwise, try to find users of the convert-to-svbool intrinsics that are
// calls to the convert-from-svbool intrinsic, and would result in some lanes
// being zeroed.
const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
for (IntrinsicInst *ConvertToUse : ConvertToUses) {
for (User *User : ConvertToUse->users()) {
auto *IntrUser = dyn_cast<IntrinsicInst>(User);
if (IntrUser && IntrUser->getIntrinsicID() ==
Intrinsic::aarch64_sve_convert_from_svbool) {
const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
// Would some lanes become zeroed by the conversion?
if (IntrUserVTy->getElementCount().getKnownMinValue() >
PTrueVTy->getElementCount().getKnownMinValue())
// This is a promoted ptrue.
return true;
}
}
}
// If no matching calls were found, this is not a promoted ptrue.
return false;
}
/// Attempts to coalesce ptrues in a basic block.
bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
if (PTrues.size() <= 1)
return false;
// Find the ptrue with the most lanes.
auto *MostEncompassingPTrue = *std::max_element(
PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
return PTrue1VTy->getElementCount().getKnownMinValue() <
PTrue2VTy->getElementCount().getKnownMinValue();
});
// Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
// behind only the ptrues to be coalesced.
PTrues.remove(MostEncompassingPTrue);
PTrues.remove_if(isPTruePromoted);
// Hoist MostEncompassingPTrue to the start of the basic block. It is always
// safe to do this, since ptrue intrinsic calls are guaranteed to have no
// predecessors.
MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
LLVMContext &Ctx = BB.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
auto *MostEncompassingPTrueVTy =
cast<VectorType>(MostEncompassingPTrue->getType());
auto *ConvertToSVBool = Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
{MostEncompassingPTrue});
bool ConvertFromCreated = false;
for (auto *PTrue : PTrues) {
auto *PTrueVTy = cast<VectorType>(PTrue->getType());
// Only create the converts if the types are not already the same, otherwise
// just use the most encompassing ptrue.
if (MostEncompassingPTrueVTy != PTrueVTy) {
ConvertFromCreated = true;
Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
auto *ConvertFromSVBool =
Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
{PTrueVTy}, {ConvertToSVBool});
PTrue->replaceAllUsesWith(ConvertFromSVBool);
} else
PTrue->replaceAllUsesWith(MostEncompassingPTrue);
PTrue->eraseFromParent();
}
// We never used the ConvertTo so remove it
if (!ConvertFromCreated)
ConvertToSVBool->eraseFromParent();
return true;
}
/// The goal of this function is to remove redundant calls to the SVE ptrue
/// intrinsic in each basic block within the given functions.
///
/// SVE ptrues have two representations in LLVM IR:
/// - a logical representation -- an arbitrary-width scalable vector of i1s,
/// i.e. <vscale x N x i1>.
/// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
/// scalable vector of i1s, i.e. <vscale x 16 x i1>.
///
/// The SVE ptrue intrinsic is used to create a logical representation of an SVE
/// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
/// P1 creates a logical SVE predicate that is at least as wide as the logical
/// SVE predicate created by P2, then all of the bits that are true in the
/// physical representation of P2 are necessarily also true in the physical
/// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
/// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
/// convert.{to,from}.svbool.
///
/// Currently, this pass only coalesces calls to SVE ptrue intrinsics
/// if they match the following conditions:
///
/// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
/// SV_ALL indicates that all bits of the predicate vector are to be set to
/// true. SV_POW2 indicates that all bits of the predicate vector up to the
/// largest power-of-two are to be set to true.
/// - the result of the call to the intrinsic is not promoted to a wider
/// predicate. In this case, keeping the extra ptrue leads to better codegen
/// -- coalescing here would create an irreducible chain of SVE reinterprets
/// via convert.{to,from}.svbool.
///
/// EXAMPLE:
///
/// %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
/// ; Logical: <1, 1, 1, 1, 1, 1, 1, 1>
/// ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
/// ...
///
/// %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
/// ; Logical: <1, 1, 1, 1>
/// ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
/// ...
///
/// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
///
/// %1 = <vscale x 8 x i1> ptrue(i32 i31)
/// %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
/// %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
///
bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
for (auto *F : Functions) {
for (auto &BB : *F) {
SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
// For each basic block, collect the used ptrues and try to coalesce them.
for (Instruction &I : BB) {
if (I.use_empty())
continue;
auto *IntrI = dyn_cast<IntrinsicInst>(&I);
if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
continue;
const auto PTruePattern =
cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
if (PTruePattern == AArch64SVEPredPattern::all)
SVAllPTrues.insert(IntrI);
if (PTruePattern == AArch64SVEPredPattern::pow2)
SVPow2PTrues.insert(IntrI);
}
Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
}
}
return Changed;
}
// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
// scalable stores as late as possible
bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
auto *F = I->getFunction();
auto Attr = F->getFnAttribute(Attribute::VScaleRange);
if (!Attr.isValid())
return false;
unsigned MinVScale = Attr.getVScaleRangeMin();
Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
// The transform needs to know the exact runtime length of scalable vectors
if (!MaxVScale || MinVScale != MaxVScale)
return false;
auto *PredType =
ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
auto *FixedPredType =
FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
// If we have a store..
auto *Store = dyn_cast<StoreInst>(I);
if (!Store || !Store->isSimple())
return false;
// ..that is storing a predicate vector sized worth of bits..
if (Store->getOperand(0)->getType() != FixedPredType)
return false;
// ..where the value stored comes from a vector extract..
auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
if (!IntrI ||
IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
return false;
// ..that is extracting from index 0..
if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
return false;
// ..where the value being extract from comes from a bitcast
auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
if (!BitCast)
return false;
// ..and the bitcast is casting from predicate type
if (BitCast->getOperand(0)->getType() != PredType)
return false;
IRBuilder<> Builder(I->getContext());
Builder.SetInsertPoint(I);
auto *PtrBitCast = Builder.CreateBitCast(
Store->getPointerOperand(),
PredType->getPointerTo(Store->getPointerAddressSpace()));
Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);
Store->eraseFromParent();
if (IntrI->getNumUses() == 0)
IntrI->eraseFromParent();
if (BitCast->getNumUses() == 0)
BitCast->eraseFromParent();
return true;
}
// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
// scalable loads as late as possible
bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
auto *F = I->getFunction();
auto Attr = F->getFnAttribute(Attribute::VScaleRange);
if (!Attr.isValid())
return false;
unsigned MinVScale = Attr.getVScaleRangeMin();
Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
// The transform needs to know the exact runtime length of scalable vectors
if (!MaxVScale || MinVScale != MaxVScale)
return false;
auto *PredType =
ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
auto *FixedPredType =
FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
// If we have a bitcast..
auto *BitCast = dyn_cast<BitCastInst>(I);
if (!BitCast || BitCast->getType() != PredType)
return false;
// ..whose operand is a vector_insert..
auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
if (!IntrI ||
IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
return false;
// ..that is inserting into index zero of an undef vector..
if (!isa<UndefValue>(IntrI->getOperand(0)) ||
!cast<ConstantInt>(IntrI->getOperand(2))->isZero())
return false;
// ..where the value inserted comes from a load..
auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
if (!Load || !Load->isSimple())
return false;
// ..that is loading a predicate vector sized worth of bits..
if (Load->getType() != FixedPredType)
return false;
IRBuilder<> Builder(I->getContext());
Builder.SetInsertPoint(Load);
auto *PtrBitCast = Builder.CreateBitCast(
Load->getPointerOperand(),
PredType->getPointerTo(Load->getPointerAddressSpace()));
auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
BitCast->replaceAllUsesWith(LoadPred);
BitCast->eraseFromParent();
if (IntrI->getNumUses() == 0)
IntrI->eraseFromParent();
if (Load->getNumUses() == 0)
Load->eraseFromParent();
return true;
}
bool SVEIntrinsicOpts::optimizeInstructions(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
for (auto *F : Functions) {
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
// Traverse the DT with an rpo walk so we see defs before uses, allowing
// simplification to be done incrementally.
BasicBlock *Root = DT->getRoot();
ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
for (auto *BB : RPOT) {
for (Instruction &I : make_early_inc_range(*BB)) {
switch (I.getOpcode()) {
case Instruction::Store:
Changed |= optimizePredicateStore(&I);
break;
case Instruction::BitCast:
Changed |= optimizePredicateLoad(&I);
break;
}
}
}
}
return Changed;
}
bool SVEIntrinsicOpts::optimizeFunctions(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
Changed |= optimizePTrueIntrinsicCalls(Functions);
Changed |= optimizeInstructions(Functions);
return Changed;
}
bool SVEIntrinsicOpts::runOnModule(Module &M) {
bool Changed = false;
SmallSetVector<Function *, 4> Functions;
// Check for SVE intrinsic declarations first so that we only iterate over
// relevant functions. Where an appropriate declaration is found, store the
// function(s) where it is used so we can target these only.
for (auto &F : M.getFunctionList()) {
if (!F.isDeclaration())
continue;
switch (F.getIntrinsicID()) {
case Intrinsic::experimental_vector_extract:
case Intrinsic::experimental_vector_insert:
case Intrinsic::aarch64_sve_ptrue:
for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());
break;
default:
break;
}
}
if (!Functions.empty())
Changed |= optimizeFunctions(Functions);
return Changed;
}
|