author     shadchin <shadchin@yandex-team.ru>            2022-02-10 16:44:30 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:44:30 +0300
commit     2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree       012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/X86
parent     6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/X86')
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp  1938
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp  8
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make  16
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp  22
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp  46
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h  34
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp  18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp  38
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp  240
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp  14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h  6
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp  14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make  8
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86.h  14
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86.td  988
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp  10
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h  6
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp  4
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp  22
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp  68
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h  8
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp  6
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp  44
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp  52
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp  132
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp  134
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp  4
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp  24
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp  22
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp  6
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp  140
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp  1026
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp  6494
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h  70
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp  8
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp  4034
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td  84
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td  304
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td  18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td  236
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td  22
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td  8
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp  28
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td  12
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td  44
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp  374
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h  40
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td  232
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td  172
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td  4
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td  94
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td  248
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td  38
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td  26
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td  78
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp  52
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp  18
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h  40
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp  30
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp  176
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp  58
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp  702
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp  82
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp  6
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp  530
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp  260
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h  22
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td  20
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp  16
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp  16
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp  64
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h  82
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp  144
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h  4
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp  604
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h  44
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp  496
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp  2
-rw-r--r--  contrib/libs/llvm12/lib/Target/X86/ya.make  42
94 files changed, 10714 insertions, 10714 deletions
diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 9d9a20183f..f063bdbf6a 100644
--- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -32,7 +32,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -57,53 +57,53 @@ static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
namespace {
static const char OpPrecedence[] = {
- 0, // IC_OR
- 1, // IC_XOR
- 2, // IC_AND
- 4, // IC_LSHIFT
- 4, // IC_RSHIFT
- 5, // IC_PLUS
- 5, // IC_MINUS
- 6, // IC_MULTIPLY
- 6, // IC_DIVIDE
- 6, // IC_MOD
- 7, // IC_NOT
- 8, // IC_NEG
- 9, // IC_RPAREN
- 10, // IC_LPAREN
- 0, // IC_IMM
- 0, // IC_REGISTER
- 3, // IC_EQ
- 3, // IC_NE
- 3, // IC_LT
- 3, // IC_LE
- 3, // IC_GT
- 3 // IC_GE
+ 0, // IC_OR
+ 1, // IC_XOR
+ 2, // IC_AND
+ 4, // IC_LSHIFT
+ 4, // IC_RSHIFT
+ 5, // IC_PLUS
+ 5, // IC_MINUS
+ 6, // IC_MULTIPLY
+ 6, // IC_DIVIDE
+ 6, // IC_MOD
+ 7, // IC_NOT
+ 8, // IC_NEG
+ 9, // IC_RPAREN
+ 10, // IC_LPAREN
+ 0, // IC_IMM
+ 0, // IC_REGISTER
+ 3, // IC_EQ
+ 3, // IC_NE
+ 3, // IC_LT
+ 3, // IC_LE
+ 3, // IC_GT
+ 3 // IC_GE
};
class X86AsmParser : public MCTargetAsmParser {
ParseInstructionInfo *InstInfo;
bool Code16GCC;
- unsigned ForcedDataPrefix = 0;
+ unsigned ForcedDataPrefix = 0;
enum VEXEncoding {
VEXEncoding_Default,
VEXEncoding_VEX,
- VEXEncoding_VEX2,
+ VEXEncoding_VEX2,
VEXEncoding_VEX3,
VEXEncoding_EVEX,
};
VEXEncoding ForcedVEXEncoding = VEXEncoding_Default;
- enum DispEncoding {
- DispEncoding_Default,
- DispEncoding_Disp8,
- DispEncoding_Disp32,
- };
-
- DispEncoding ForcedDispEncoding = DispEncoding_Default;
-
+ enum DispEncoding {
+ DispEncoding_Default,
+ DispEncoding_Disp8,
+ DispEncoding_Disp32,
+ };
+
+ DispEncoding ForcedDispEncoding = DispEncoding_Default;
+
private:
SMLoc consumeToken() {
MCAsmParser &Parser = getParser();
@@ -149,13 +149,13 @@ private:
IC_RPAREN,
IC_LPAREN,
IC_IMM,
- IC_REGISTER,
- IC_EQ,
- IC_NE,
- IC_LT,
- IC_LE,
- IC_GT,
- IC_GE
+ IC_REGISTER,
+ IC_EQ,
+ IC_NE,
+ IC_LT,
+ IC_LE,
+ IC_GT,
+ IC_GE
};
enum IntelOperatorKind {
@@ -165,19 +165,19 @@ private:
IOK_TYPE,
};
- enum MasmOperatorKind {
- MOK_INVALID = 0,
- MOK_LENGTHOF,
- MOK_SIZEOF,
- MOK_TYPE,
- };
-
+ enum MasmOperatorKind {
+ MOK_INVALID = 0,
+ MOK_LENGTHOF,
+ MOK_SIZEOF,
+ MOK_TYPE,
+ };
+
class InfixCalculator {
typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
SmallVector<ICToken, 4> PostfixStack;
- bool isUnaryOperator(InfixCalculatorTok Op) const {
+ bool isUnaryOperator(InfixCalculatorTok Op) const {
return Op == IC_NEG || Op == IC_NOT;
}
@@ -344,44 +344,44 @@ private:
Val = Op1.second >> Op2.second;
OperandStack.push_back(std::make_pair(IC_IMM, Val));
break;
- case IC_EQ:
- assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
- "Equals operation with an immediate and a register!");
- Val = (Op1.second == Op2.second) ? -1 : 0;
- OperandStack.push_back(std::make_pair(IC_IMM, Val));
- break;
- case IC_NE:
- assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
- "Not-equals operation with an immediate and a register!");
- Val = (Op1.second != Op2.second) ? -1 : 0;
- OperandStack.push_back(std::make_pair(IC_IMM, Val));
- break;
- case IC_LT:
- assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
- "Less-than operation with an immediate and a register!");
- Val = (Op1.second < Op2.second) ? -1 : 0;
- OperandStack.push_back(std::make_pair(IC_IMM, Val));
- break;
- case IC_LE:
- assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
- "Less-than-or-equal operation with an immediate and a "
- "register!");
- Val = (Op1.second <= Op2.second) ? -1 : 0;
- OperandStack.push_back(std::make_pair(IC_IMM, Val));
- break;
- case IC_GT:
- assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
- "Greater-than operation with an immediate and a register!");
- Val = (Op1.second > Op2.second) ? -1 : 0;
- OperandStack.push_back(std::make_pair(IC_IMM, Val));
- break;
- case IC_GE:
- assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
- "Greater-than-or-equal operation with an immediate and a "
- "register!");
- Val = (Op1.second >= Op2.second) ? -1 : 0;
- OperandStack.push_back(std::make_pair(IC_IMM, Val));
- break;
+ case IC_EQ:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Equals operation with an immediate and a register!");
+ Val = (Op1.second == Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_NE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Not-equals operation with an immediate and a register!");
+ Val = (Op1.second != Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LT:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Less-than operation with an immediate and a register!");
+ Val = (Op1.second < Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_LE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Less-than-or-equal operation with an immediate and a "
+ "register!");
+ Val = (Op1.second <= Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_GT:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Greater-than operation with an immediate and a register!");
+ Val = (Op1.second > Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
+ case IC_GE:
+ assert(Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Greater-than-or-equal operation with an immediate and a "
+ "register!");
+ Val = (Op1.second >= Op2.second) ? -1 : 0;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
}
}
}
@@ -395,12 +395,12 @@ private:
IES_OR,
IES_XOR,
IES_AND,
- IES_EQ,
- IES_NE,
- IES_LT,
- IES_LE,
- IES_GT,
- IES_GE,
+ IES_EQ,
+ IES_NE,
+ IES_LT,
+ IES_LE,
+ IES_GT,
+ IES_GE,
IES_LSHIFT,
IES_RSHIFT,
IES_PLUS,
@@ -433,7 +433,7 @@ private:
bool MemExpr;
bool OffsetOperator;
SMLoc OffsetOperatorLoc;
- AsmTypeInfo CurType;
+ AsmTypeInfo CurType;
bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) {
if (Sym) {
@@ -452,25 +452,25 @@ private:
MemExpr(false), OffsetOperator(false) {}
void addImm(int64_t imm) { Imm += imm; }
- short getBracCount() const { return BracCount; }
- bool isMemExpr() const { return MemExpr; }
- bool isOffsetOperator() const { return OffsetOperator; }
- SMLoc getOffsetLoc() const { return OffsetOperatorLoc; }
- unsigned getBaseReg() const { return BaseReg; }
- unsigned getIndexReg() const { return IndexReg; }
- unsigned getScale() const { return Scale; }
- const MCExpr *getSym() const { return Sym; }
- StringRef getSymName() const { return SymName; }
- StringRef getType() const { return CurType.Name; }
- unsigned getSize() const { return CurType.Size; }
- unsigned getElementSize() const { return CurType.ElementSize; }
- unsigned getLength() const { return CurType.Length; }
+ short getBracCount() const { return BracCount; }
+ bool isMemExpr() const { return MemExpr; }
+ bool isOffsetOperator() const { return OffsetOperator; }
+ SMLoc getOffsetLoc() const { return OffsetOperatorLoc; }
+ unsigned getBaseReg() const { return BaseReg; }
+ unsigned getIndexReg() const { return IndexReg; }
+ unsigned getScale() const { return Scale; }
+ const MCExpr *getSym() const { return Sym; }
+ StringRef getSymName() const { return SymName; }
+ StringRef getType() const { return CurType.Name; }
+ unsigned getSize() const { return CurType.Size; }
+ unsigned getElementSize() const { return CurType.ElementSize; }
+ unsigned getLength() const { return CurType.Length; }
int64_t getImm() { return Imm + IC.execute(); }
- bool isValidEndState() const {
+ bool isValidEndState() const {
return State == IES_RBRAC || State == IES_INTEGER;
}
- bool hadError() const { return State == IES_ERROR; }
- const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; }
+ bool hadError() const { return State == IES_ERROR; }
+ const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; }
void onOr() {
IntelExprState CurrState = State;
@@ -517,96 +517,96 @@ private:
}
PrevState = CurrState;
}
- void onEq() {
- IntelExprState CurrState = State;
- switch (State) {
- default:
- State = IES_ERROR;
- break;
- case IES_INTEGER:
- case IES_RPAREN:
- case IES_REGISTER:
- State = IES_EQ;
- IC.pushOperator(IC_EQ);
- break;
- }
- PrevState = CurrState;
- }
- void onNE() {
- IntelExprState CurrState = State;
- switch (State) {
- default:
- State = IES_ERROR;
- break;
- case IES_INTEGER:
- case IES_RPAREN:
- case IES_REGISTER:
- State = IES_NE;
- IC.pushOperator(IC_NE);
- break;
- }
- PrevState = CurrState;
- }
- void onLT() {
- IntelExprState CurrState = State;
- switch (State) {
- default:
- State = IES_ERROR;
- break;
- case IES_INTEGER:
- case IES_RPAREN:
- case IES_REGISTER:
- State = IES_LT;
- IC.pushOperator(IC_LT);
- break;
- }
- PrevState = CurrState;
- }
- void onLE() {
- IntelExprState CurrState = State;
- switch (State) {
- default:
- State = IES_ERROR;
- break;
- case IES_INTEGER:
- case IES_RPAREN:
- case IES_REGISTER:
- State = IES_LE;
- IC.pushOperator(IC_LE);
- break;
- }
- PrevState = CurrState;
- }
- void onGT() {
- IntelExprState CurrState = State;
- switch (State) {
- default:
- State = IES_ERROR;
- break;
- case IES_INTEGER:
- case IES_RPAREN:
- case IES_REGISTER:
- State = IES_GT;
- IC.pushOperator(IC_GT);
- break;
- }
- PrevState = CurrState;
- }
- void onGE() {
- IntelExprState CurrState = State;
- switch (State) {
- default:
- State = IES_ERROR;
- break;
- case IES_INTEGER:
- case IES_RPAREN:
- case IES_REGISTER:
- State = IES_GE;
- IC.pushOperator(IC_GE);
- break;
- }
- PrevState = CurrState;
- }
+ void onEq() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_EQ;
+ IC.pushOperator(IC_EQ);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onNE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_NE;
+ IC.pushOperator(IC_NE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLT() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LT;
+ IC.pushOperator(IC_LT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onLE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_LE;
+ IC.pushOperator(IC_LE);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onGT() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_GT;
+ IC.pushOperator(IC_GT);
+ break;
+ }
+ PrevState = CurrState;
+ }
+ void onGE() {
+ IntelExprState CurrState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ case IES_REGISTER:
+ State = IES_GE;
+ IC.pushOperator(IC_GE);
+ break;
+ }
+ PrevState = CurrState;
+ }
void onLShift() {
IntelExprState CurrState = State;
switch (State) {
@@ -677,12 +677,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
- case IES_EQ:
- case IES_NE:
- case IES_LT:
- case IES_LE:
- case IES_GT:
- case IES_GE:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_PLUS:
@@ -738,12 +738,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
- case IES_EQ:
- case IES_NE:
- case IES_LT:
- case IES_LE:
- case IES_GT:
- case IES_GE:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_PLUS:
@@ -799,8 +799,8 @@ private:
}
bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
const InlineAsmIdentifierInfo &IDInfo,
- const AsmTypeInfo &Type, bool ParsingMSInlineAsm,
- StringRef &ErrMsg) {
+ const AsmTypeInfo &Type, bool ParsingMSInlineAsm,
+ StringRef &ErrMsg) {
// InlineAsm: Treat an enum value as an integer
if (ParsingMSInlineAsm)
if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
@@ -819,7 +819,7 @@ private:
case IES_NOT:
case IES_INIT:
case IES_LBRAC:
- case IES_LPAREN:
+ case IES_LPAREN:
if (setSymRef(SymRef, SymRefName, ErrMsg))
return true;
MemExpr = true;
@@ -827,7 +827,7 @@ private:
IC.pushOperand(IC_IMM);
if (ParsingMSInlineAsm)
Info = IDInfo;
- setTypeInfo(Type);
+ setTypeInfo(Type);
break;
}
return false;
@@ -844,12 +844,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
- case IES_EQ:
- case IES_NE:
- case IES_LT:
- case IES_LE:
- case IES_GT:
- case IES_GE:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_DIVIDE:
@@ -932,8 +932,8 @@ private:
case IES_RPAREN:
State = IES_PLUS;
IC.pushOperator(IC_PLUS);
- CurType.Length = 1;
- CurType.Size = CurType.ElementSize;
+ CurType.Length = 1;
+ CurType.Size = CurType.ElementSize;
break;
case IES_INIT:
case IES_CAST:
@@ -986,12 +986,12 @@ private:
case IES_OR:
case IES_XOR:
case IES_AND:
- case IES_EQ:
- case IES_NE:
- case IES_LT:
- case IES_LE:
- case IES_GT:
- case IES_GE:
+ case IES_EQ:
+ case IES_NE:
+ case IES_LT:
+ case IES_LE:
+ case IES_GT:
+ case IES_GE:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_MULTIPLY:
@@ -1023,8 +1023,8 @@ private:
}
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
- const InlineAsmIdentifierInfo &IDInfo,
- bool ParsingMSInlineAsm, StringRef &ErrMsg) {
+ const InlineAsmIdentifierInfo &IDInfo,
+ bool ParsingMSInlineAsm, StringRef &ErrMsg) {
PrevState = State;
switch (State) {
default:
@@ -1048,19 +1048,19 @@ private:
}
return false;
}
- void onCast(AsmTypeInfo Info) {
+ void onCast(AsmTypeInfo Info) {
PrevState = State;
switch (State) {
default:
State = IES_ERROR;
break;
case IES_LPAREN:
- setTypeInfo(Info);
+ setTypeInfo(Info);
State = IES_CAST;
break;
}
}
- void setTypeInfo(AsmTypeInfo Type) { CurType = Type; }
+ void setTypeInfo(AsmTypeInfo Type) { CurType = Type; }
};
bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
@@ -1089,21 +1089,21 @@ private:
std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst);
bool VerifyAndAdjustOperands(OperandVector &OrigOperands,
OperandVector &FinalOperands);
- bool ParseOperand(OperandVector &Operands);
- bool ParseATTOperand(OperandVector &Operands);
- bool ParseIntelOperand(OperandVector &Operands);
+ bool ParseOperand(OperandVector &Operands);
+ bool ParseATTOperand(OperandVector &Operands);
+ bool ParseIntelOperand(OperandVector &Operands);
bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
InlineAsmIdentifierInfo &Info, SMLoc &End);
bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
- unsigned IdentifyMasmOperator(StringRef Name);
- bool ParseMasmOperator(unsigned OpKind, int64_t &Val);
- bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands);
+ unsigned IdentifyMasmOperator(StringRef Name);
+ bool ParseMasmOperator(unsigned OpKind, int64_t &Val);
+ bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands);
bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM,
bool &ParseError, SMLoc &End);
- bool ParseMasmNamedOperator(StringRef Name, IntelExprStateMachine &SM,
- bool &ParseError, SMLoc &End);
+ bool ParseMasmNamedOperator(StringRef Name, IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End);
void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
SMLoc End);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
@@ -1112,21 +1112,21 @@ private:
bool IsUnevaluatedOperand, SMLoc &End,
bool IsParsingOffsetOperator = false);
- bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc,
- SMLoc EndLoc, OperandVector &Operands);
+ bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc,
+ SMLoc EndLoc, OperandVector &Operands);
X86::CondCode ParseConditionCode(StringRef CCode);
bool ParseIntelMemoryOperandSize(unsigned &Size);
- bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp,
- unsigned BaseReg, unsigned IndexReg,
- unsigned Scale, SMLoc Start, SMLoc End,
- unsigned Size, StringRef Identifier,
- const InlineAsmIdentifierInfo &Info,
- OperandVector &Operands);
-
- bool parseDirectiveArch();
- bool parseDirectiveNops(SMLoc L);
+ bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc Start, SMLoc End,
+ unsigned Size, StringRef Identifier,
+ const InlineAsmIdentifierInfo &Info,
+ OperandVector &Operands);
+
+ bool parseDirectiveArch();
+ bool parseDirectiveNops(SMLoc L);
bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
@@ -1187,7 +1187,7 @@ private:
/// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z})
/// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required.
/// return false if no parsing errors occurred, true otherwise.
- bool HandleAVX512Operand(OperandVector &Operands);
+ bool HandleAVX512Operand(OperandVector &Operands);
bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
@@ -1716,17 +1716,17 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands,
return false;
}
-bool X86AsmParser::ParseOperand(OperandVector &Operands) {
+bool X86AsmParser::ParseOperand(OperandVector &Operands) {
if (isParsingIntelSyntax())
- return ParseIntelOperand(Operands);
-
- return ParseATTOperand(Operands);
+ return ParseIntelOperand(Operands);
+
+ return ParseATTOperand(Operands);
}
-bool X86AsmParser::CreateMemForMSInlineAsm(
+bool X86AsmParser::CreateMemForMSInlineAsm(
unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
- const InlineAsmIdentifierInfo &Info, OperandVector &Operands) {
+ const InlineAsmIdentifierInfo &Info, OperandVector &Operands) {
// If we found a decl other than a VarDecl, then assume it is a FuncDecl or
// some other label reference.
if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) {
@@ -1738,10 +1738,10 @@ bool X86AsmParser::CreateMemForMSInlineAsm(
}
// Create an absolute memory reference in order to match against
// instructions taking a PC relative operand.
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start,
- End, Size, Identifier,
- Info.Label.Decl));
- return false;
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start,
+ End, Size, Identifier,
+ Info.Label.Decl));
+ return false;
}
// We either have a direct symbol reference, or an offset from a symbol. The
// parser always puts the symbol on the LHS, so look there for size
@@ -1758,19 +1758,19 @@ bool X86AsmParser::CreateMemForMSInlineAsm(
// It is widely common for MS InlineAsm to use a global variable and one/two
// registers in a mmory expression, and though unaccessible via rip/eip.
if (IsGlobalLV && (BaseReg || IndexReg)) {
- Operands.push_back(
- X86Operand::CreateMem(getPointerWidth(), Disp, Start, End));
- return false;
- }
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, Start, End));
+ return false;
+ }
// Otherwise, we set the base register to a non-zero value
// if we don't know the actual value at this time. This is necessary to
// get the matching correct in some cases.
- BaseReg = BaseReg ? BaseReg : 1;
- Operands.push_back(X86Operand::CreateMem(
- getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End,
- Size,
- /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize));
- return false;
+ BaseReg = BaseReg ? BaseReg : 1;
+ Operands.push_back(X86Operand::CreateMem(
+ getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End,
+ Size,
+ /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize));
+ return false;
}
// Some binary bitwise operators have a named synonymous
@@ -1779,10 +1779,10 @@ bool X86AsmParser::CreateMemForMSInlineAsm(
bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
IntelExprStateMachine &SM,
bool &ParseError, SMLoc &End) {
- // A named operator should be either lower or upper case, but not a mix...
- // except in MASM, which uses full case-insensitivity.
- if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
- !getParser().isParsingMasm())
+ // A named operator should be either lower or upper case, but not a mix...
+ // except in MASM, which uses full case-insensitivity.
+ if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
+ !getParser().isParsingMasm())
return false;
if (Name.equals_lower("not")) {
SM.onNot();
@@ -1818,27 +1818,27 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
End = consumeToken();
return true;
}
-bool X86AsmParser::ParseMasmNamedOperator(StringRef Name,
- IntelExprStateMachine &SM,
- bool &ParseError, SMLoc &End) {
- if (Name.equals_lower("eq")) {
- SM.onEq();
- } else if (Name.equals_lower("ne")) {
- SM.onNE();
- } else if (Name.equals_lower("lt")) {
- SM.onLT();
- } else if (Name.equals_lower("le")) {
- SM.onLE();
- } else if (Name.equals_lower("gt")) {
- SM.onGT();
- } else if (Name.equals_lower("ge")) {
- SM.onGE();
- } else {
- return false;
- }
- End = consumeToken();
- return true;
-}
+bool X86AsmParser::ParseMasmNamedOperator(StringRef Name,
+ IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End) {
+ if (Name.equals_lower("eq")) {
+ SM.onEq();
+ } else if (Name.equals_lower("ne")) {
+ SM.onNE();
+ } else if (Name.equals_lower("lt")) {
+ SM.onLT();
+ } else if (Name.equals_lower("le")) {
+ SM.onLE();
+ } else if (Name.equals_lower("gt")) {
+ SM.onGT();
+ } else if (Name.equals_lower("ge")) {
+ SM.onGE();
+ } else {
+ return false;
+ }
+ End = consumeToken();
+ return true;
+}
bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
@@ -1847,10 +1847,10 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
AsmToken::TokenKind PrevTK = AsmToken::Error;
bool Done = false;
while (!Done) {
- // Get a fresh reference on each loop iteration in case the previous
- // iteration moved the token storage during UnLex().
- const AsmToken &Tok = Parser.getTok();
-
+ // Get a fresh reference on each loop iteration in case the previous
+ // iteration moved the token storage during UnLex().
+ const AsmToken &Tok = Parser.getTok();
+
bool UpdateLocLex = true;
AsmToken::TokenKind TK = getLexer().getKind();
@@ -1859,9 +1859,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if ((Done = SM.isValidEndState()))
break;
return Error(Tok.getLoc(), "unknown token in expression");
- case AsmToken::Error:
- return Error(getLexer().getErrLoc(), getLexer().getErr());
- break;
+ case AsmToken::Error:
+ return Error(getLexer().getErrLoc(), getLexer().getErr());
+ break;
case AsmToken::EndOfStatement:
Done = true;
break;
@@ -1871,73 +1871,73 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (ParseIntelDotOperator(SM, End))
return true;
break;
- case AsmToken::Dot:
- if (!Parser.isParsingMasm()) {
- if ((Done = SM.isValidEndState()))
- break;
- return Error(Tok.getLoc(), "unknown token in expression");
- }
- // MASM allows spaces around the dot operator (e.g., "var . x")
- Lex();
- UpdateLocLex = false;
- if (ParseIntelDotOperator(SM, End))
- return true;
- break;
- case AsmToken::Dollar:
- if (!Parser.isParsingMasm()) {
- if ((Done = SM.isValidEndState()))
- break;
- return Error(Tok.getLoc(), "unknown token in expression");
- }
- LLVM_FALLTHROUGH;
- case AsmToken::String: {
- if (Parser.isParsingMasm()) {
- // MASM parsers handle strings in expressions as constants.
- SMLoc ValueLoc = Tok.getLoc();
- int64_t Res;
- const MCExpr *Val;
- if (Parser.parsePrimaryExpr(Val, End, nullptr))
- return true;
- UpdateLocLex = false;
- if (!Val->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
- return Error(ValueLoc, "expected absolute value");
- if (SM.onInteger(Res, ErrMsg))
- return Error(ValueLoc, ErrMsg);
- break;
- }
- LLVM_FALLTHROUGH;
- }
+ case AsmToken::Dot:
+ if (!Parser.isParsingMasm()) {
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ // MASM allows spaces around the dot operator (e.g., "var . x")
+ Lex();
+ UpdateLocLex = false;
+ if (ParseIntelDotOperator(SM, End))
+ return true;
+ break;
+ case AsmToken::Dollar:
+ if (!Parser.isParsingMasm()) {
+ if ((Done = SM.isValidEndState()))
+ break;
+ return Error(Tok.getLoc(), "unknown token in expression");
+ }
+ LLVM_FALLTHROUGH;
+ case AsmToken::String: {
+ if (Parser.isParsingMasm()) {
+ // MASM parsers handle strings in expressions as constants.
+ SMLoc ValueLoc = Tok.getLoc();
+ int64_t Res;
+ const MCExpr *Val;
+ if (Parser.parsePrimaryExpr(Val, End, nullptr))
+ return true;
+ UpdateLocLex = false;
+ if (!Val->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
+ return Error(ValueLoc, "expected absolute value");
+ if (SM.onInteger(Res, ErrMsg))
+ return Error(ValueLoc, ErrMsg);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ }
case AsmToken::At:
case AsmToken::Identifier: {
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
UpdateLocLex = false;
- if (Parser.isParsingMasm()) {
- size_t DotOffset = Identifier.find_first_of('.');
- if (DotOffset != StringRef::npos) {
- consumeToken();
- StringRef LHS = Identifier.slice(0, DotOffset);
- StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1);
- StringRef RHS = Identifier.slice(DotOffset + 1, StringRef::npos);
- if (!RHS.empty()) {
- getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS));
- }
- getLexer().UnLex(AsmToken(AsmToken::Dot, Dot));
- if (!LHS.empty()) {
- getLexer().UnLex(AsmToken(AsmToken::Identifier, LHS));
- }
- break;
- }
- }
+ if (Parser.isParsingMasm()) {
+ size_t DotOffset = Identifier.find_first_of('.');
+ if (DotOffset != StringRef::npos) {
+ consumeToken();
+ StringRef LHS = Identifier.slice(0, DotOffset);
+ StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1);
+ StringRef RHS = Identifier.slice(DotOffset + 1, StringRef::npos);
+ if (!RHS.empty()) {
+ getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS));
+ }
+ getLexer().UnLex(AsmToken(AsmToken::Dot, Dot));
+ if (!LHS.empty()) {
+ getLexer().UnLex(AsmToken(AsmToken::Identifier, LHS));
+ }
+ break;
+ }
+ }
// (MASM only) <TYPE> PTR operator
if (Parser.isParsingMasm()) {
const AsmToken &NextTok = getLexer().peekTok();
if (NextTok.is(AsmToken::Identifier) &&
NextTok.getIdentifier().equals_lower("ptr")) {
- AsmTypeInfo Info;
- if (Parser.lookUpType(Identifier, Info))
- return Error(Tok.getLoc(), "unknown type");
- SM.onCast(Info);
+ AsmTypeInfo Info;
+ if (Parser.lookUpType(Identifier, Info))
+ return Error(Tok.getLoc(), "unknown type");
+ SM.onCast(Info);
// Eat type and PTR.
consumeToken();
End = consumeToken();
@@ -1962,15 +1962,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (SM.onRegister(Reg, ErrMsg))
return Error(IdentLoc, ErrMsg);
- AsmFieldInfo Info;
+ AsmFieldInfo Info;
SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data());
- if (Parser.lookUpField(Field, Info))
+ if (Parser.lookUpField(Field, Info))
return Error(FieldStartLoc, "unknown offset");
else if (SM.onPlus(ErrMsg))
return Error(getTok().getLoc(), ErrMsg);
- else if (SM.onInteger(Info.Offset, ErrMsg))
+ else if (SM.onInteger(Info.Offset, ErrMsg))
return Error(IdentLoc, ErrMsg);
- SM.setTypeInfo(Info.Type);
+ SM.setTypeInfo(Info.Type);
End = consumeToken();
break;
@@ -1984,15 +1984,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return true;
break;
}
- if (Parser.isParsingMasm() &&
- ParseMasmNamedOperator(Identifier, SM, ParseError, End)) {
- if (ParseError)
- return true;
- break;
- }
+ if (Parser.isParsingMasm() &&
+ ParseMasmNamedOperator(Identifier, SM, ParseError, End)) {
+ if (ParseError)
+ return true;
+ break;
+ }
// Symbol reference, when parsing assembly content
InlineAsmIdentifierInfo Info;
- AsmFieldInfo FieldInfo;
+ AsmFieldInfo FieldInfo;
const MCExpr *Val;
if (isParsingMSInlineAsm() || Parser.isParsingMasm()) {
// MS Dot Operator expression
@@ -2009,9 +2009,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
if (SM.onInteger(Val, ErrMsg))
return Error(IdentLoc, ErrMsg);
- } else {
+ } else {
return true;
- }
+ }
break;
}
// MS InlineAsm identifier
@@ -2020,49 +2020,49 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(IdentLoc, "expected identifier");
if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
return true;
- else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
- true, ErrMsg))
+ else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
+ true, ErrMsg))
return Error(IdentLoc, ErrMsg);
break;
}
- if (Parser.isParsingMasm()) {
- if (unsigned OpKind = IdentifyMasmOperator(Identifier)) {
- int64_t Val;
- if (ParseMasmOperator(OpKind, Val))
- return true;
- if (SM.onInteger(Val, ErrMsg))
- return Error(IdentLoc, ErrMsg);
- break;
- }
- if (!getParser().lookUpType(Identifier, FieldInfo.Type)) {
- // Field offset immediate; <TYPE>.<field specification>
- Lex(); // eat type
- bool EndDot = parseOptionalToken(AsmToken::Dot);
- while (EndDot || (getTok().is(AsmToken::Identifier) &&
- getTok().getString().startswith("."))) {
- getParser().parseIdentifier(Identifier);
- if (!EndDot)
- Identifier.consume_front(".");
- EndDot = Identifier.consume_back(".");
- if (getParser().lookUpField(FieldInfo.Type.Name, Identifier,
- FieldInfo)) {
- SMLoc IDEnd =
- SMLoc::getFromPointer(Identifier.data() + Identifier.size());
- return Error(IdentLoc, "Unable to lookup field reference!",
- SMRange(IdentLoc, IDEnd));
- }
- if (!EndDot)
- EndDot = parseOptionalToken(AsmToken::Dot);
- }
- if (SM.onInteger(FieldInfo.Offset, ErrMsg))
- return Error(IdentLoc, ErrMsg);
- break;
- }
- }
- if (getParser().parsePrimaryExpr(Val, End, &FieldInfo.Type)) {
+ if (Parser.isParsingMasm()) {
+ if (unsigned OpKind = IdentifyMasmOperator(Identifier)) {
+ int64_t Val;
+ if (ParseMasmOperator(OpKind, Val))
+ return true;
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (!getParser().lookUpType(Identifier, FieldInfo.Type)) {
+ // Field offset immediate; <TYPE>.<field specification>
+ Lex(); // eat type
+ bool EndDot = parseOptionalToken(AsmToken::Dot);
+ while (EndDot || (getTok().is(AsmToken::Identifier) &&
+ getTok().getString().startswith("."))) {
+ getParser().parseIdentifier(Identifier);
+ if (!EndDot)
+ Identifier.consume_front(".");
+ EndDot = Identifier.consume_back(".");
+ if (getParser().lookUpField(FieldInfo.Type.Name, Identifier,
+ FieldInfo)) {
+ SMLoc IDEnd =
+ SMLoc::getFromPointer(Identifier.data() + Identifier.size());
+ return Error(IdentLoc, "Unable to lookup field reference!",
+ SMRange(IdentLoc, IDEnd));
+ }
+ if (!EndDot)
+ EndDot = parseOptionalToken(AsmToken::Dot);
+ }
+ if (SM.onInteger(FieldInfo.Offset, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ }
+ if (getParser().parsePrimaryExpr(Val, End, &FieldInfo.Type)) {
return Error(Tok.getLoc(), "Unexpected identifier!");
- } else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
- false, ErrMsg)) {
+ } else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type,
+ false, ErrMsg)) {
return Error(IdentLoc, ErrMsg);
}
break;
@@ -2085,9 +2085,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(Loc, "invalid reference to undefined symbol");
StringRef Identifier = Sym->getName();
InlineAsmIdentifierInfo Info;
- AsmTypeInfo Type;
- if (SM.onIdentifierExpr(Val, Identifier, Info, Type,
- isParsingMSInlineAsm(), ErrMsg))
+ AsmTypeInfo Type;
+ if (SM.onIdentifierExpr(Val, Identifier, Info, Type,
+ isParsingMSInlineAsm(), ErrMsg))
return Error(Loc, ErrMsg);
End = consumeToken();
} else {
@@ -2229,13 +2229,13 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(
}
//ParseRoundingModeOp - Parse AVX-512 rounding mode operand
-bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) {
+bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
// Eat "{" and mark the current place.
const SMLoc consumedToken = consumeToken();
if (Tok.isNot(AsmToken::Identifier))
- return Error(Tok.getLoc(), "Expected an identifier after {");
+ return Error(Tok.getLoc(), "Expected an identifier after {");
if (Tok.getIdentifier().startswith("r")){
int rndMode = StringSwitch<int>(Tok.getIdentifier())
.Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT)
@@ -2244,76 +2244,76 @@ bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) {
.Case("rz", X86::STATIC_ROUNDING::TO_ZERO)
.Default(-1);
if (-1 == rndMode)
- return Error(Tok.getLoc(), "Invalid rounding mode.");
+ return Error(Tok.getLoc(), "Invalid rounding mode.");
Parser.Lex(); // Eat "r*" of r*-sae
if (!getLexer().is(AsmToken::Minus))
- return Error(Tok.getLoc(), "Expected - at this point");
+ return Error(Tok.getLoc(), "Expected - at this point");
Parser.Lex(); // Eat "-"
Parser.Lex(); // Eat the sae
if (!getLexer().is(AsmToken::RCurly))
- return Error(Tok.getLoc(), "Expected } at this point");
+ return Error(Tok.getLoc(), "Expected } at this point");
SMLoc End = Tok.getEndLoc();
Parser.Lex(); // Eat "}"
const MCExpr *RndModeOp =
MCConstantExpr::create(rndMode, Parser.getContext());
- Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End));
- return false;
+ Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End));
+ return false;
}
if(Tok.getIdentifier().equals("sae")){
Parser.Lex(); // Eat the sae
if (!getLexer().is(AsmToken::RCurly))
- return Error(Tok.getLoc(), "Expected } at this point");
+ return Error(Tok.getLoc(), "Expected } at this point");
Parser.Lex(); // Eat "}"
- Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken));
- return false;
+ Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken));
+ return false;
}
- return Error(Tok.getLoc(), "unknown token in expression");
+ return Error(Tok.getLoc(), "unknown token in expression");
}
/// Parse the '.' operator.
bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM,
SMLoc &End) {
const AsmToken &Tok = getTok();
- AsmFieldInfo Info;
+ AsmFieldInfo Info;
// Drop the optional '.'.
StringRef DotDispStr = Tok.getString();
if (DotDispStr.startswith("."))
DotDispStr = DotDispStr.drop_front(1);
- StringRef TrailingDot;
+ StringRef TrailingDot;
// .Imm gets lexed as a real.
if (Tok.is(AsmToken::Real)) {
APInt DotDisp;
DotDispStr.getAsInteger(10, DotDisp);
- Info.Offset = DotDisp.getZExtValue();
+ Info.Offset = DotDisp.getZExtValue();
} else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) &&
Tok.is(AsmToken::Identifier)) {
- if (DotDispStr.endswith(".")) {
- TrailingDot = DotDispStr.substr(DotDispStr.size() - 1);
- DotDispStr = DotDispStr.drop_back(1);
- }
+ if (DotDispStr.endswith(".")) {
+ TrailingDot = DotDispStr.substr(DotDispStr.size() - 1);
+ DotDispStr = DotDispStr.drop_back(1);
+ }
const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
const StringRef Base = BaseMember.first, Member = BaseMember.second;
- if (getParser().lookUpField(SM.getType(), DotDispStr, Info) &&
- getParser().lookUpField(SM.getSymName(), DotDispStr, Info) &&
- getParser().lookUpField(DotDispStr, Info) &&
+ if (getParser().lookUpField(SM.getType(), DotDispStr, Info) &&
+ getParser().lookUpField(SM.getSymName(), DotDispStr, Info) &&
+ getParser().lookUpField(DotDispStr, Info) &&
(!SemaCallback ||
- SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset)))
+ SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset)))
return Error(Tok.getLoc(), "Unable to lookup field reference!");
- } else {
+ } else {
return Error(Tok.getLoc(), "Unexpected token type!");
- }
+ }
// Eat the DotExpression and update End
End = SMLoc::getFromPointer(DotDispStr.data());
const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size();
while (Tok.getLoc().getPointer() < DotExprEndLoc)
Lex();
- if (!TrailingDot.empty())
- getLexer().UnLex(AsmToken(AsmToken::Dot, TrailingDot));
- SM.addImm(Info.Offset);
- SM.setTypeInfo(Info.Type);
+ if (!TrailingDot.empty())
+ getLexer().UnLex(AsmToken(AsmToken::Dot, TrailingDot));
+ SM.addImm(Info.Offset);
+ SM.setTypeInfo(Info.Type);
return false;
}
@@ -2328,7 +2328,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
if (!isParsingMSInlineAsm()) {
if ((getTok().isNot(AsmToken::Identifier) &&
getTok().isNot(AsmToken::String)) ||
- getParser().parsePrimaryExpr(Val, End, nullptr))
+ getParser().parsePrimaryExpr(Val, End, nullptr))
return Error(Start, "unexpected token!");
} else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) {
return Error(Start, "unable to lookup expression");
@@ -2364,7 +2364,7 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
- /*IsUnevaluatedOperand=*/true, End))
+ /*IsUnevaluatedOperand=*/true, End))
return 0;
if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
@@ -2383,73 +2383,73 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
return CVal;
}
-// Query a candidate string for being an Intel assembly operator
-// Report back its kind, or IOK_INVALID if does not evaluated as a known one
-unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
- return StringSwitch<unsigned>(Name.lower())
- .Case("type", MOK_TYPE)
- .Cases("size", "sizeof", MOK_SIZEOF)
- .Cases("length", "lengthof", MOK_LENGTHOF)
- .Default(MOK_INVALID);
-}
-
-/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator
-/// returns the number of elements in an array. It returns the value 1 for
-/// non-array variables. The SIZEOF operator returns the size of a type or
-/// variable in bytes. A variable's size is the product of its LENGTH and TYPE.
-/// The TYPE operator returns the size of a variable. If the variable is an
-/// array, TYPE returns the size of a single element.
-bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
- MCAsmParser &Parser = getParser();
- SMLoc OpLoc = Parser.getTok().getLoc();
- Parser.Lex(); // Eat operator.
-
- Val = 0;
- if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) {
- // Check for SIZEOF(<type>) and TYPE(<type>).
- bool InParens = Parser.getTok().is(AsmToken::LParen);
- const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok();
- AsmTypeInfo Type;
- if (IDTok.is(AsmToken::Identifier) &&
- !Parser.lookUpType(IDTok.getIdentifier(), Type)) {
- Val = Type.Size;
-
- // Eat tokens.
- if (InParens)
- parseToken(AsmToken::LParen);
- parseToken(AsmToken::Identifier);
- if (InParens)
- parseToken(AsmToken::RParen);
- }
- }
-
- if (!Val) {
- IntelExprStateMachine SM;
- SMLoc End, Start = Parser.getTok().getLoc();
- if (ParseIntelExpression(SM, End))
- return true;
-
- switch (OpKind) {
- default:
- llvm_unreachable("Unexpected operand kind!");
- case MOK_SIZEOF:
- Val = SM.getSize();
- break;
- case MOK_LENGTHOF:
- Val = SM.getLength();
- break;
- case MOK_TYPE:
- Val = SM.getElementSize();
- break;
- }
-
- if (!Val)
- return Error(OpLoc, "expression has unknown type", SMRange(Start, End));
- }
-
- return false;
-}
-
+// Query a candidate string for being an Intel assembly operator
+// Report back its kind, or IOK_INVALID if does not evaluated as a known one
+unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) {
+ return StringSwitch<unsigned>(Name.lower())
+ .Case("type", MOK_TYPE)
+ .Cases("size", "sizeof", MOK_SIZEOF)
+ .Cases("length", "lengthof", MOK_LENGTHOF)
+ .Default(MOK_INVALID);
+}
+
+/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZEOF operator returns the size of a type or
+/// variable in bytes. A variable's size is the product of its LENGTH and TYPE.
+/// The TYPE operator returns the size of a variable. If the variable is an
+/// array, TYPE returns the size of a single element.
+bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
+ MCAsmParser &Parser = getParser();
+ SMLoc OpLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat operator.
+
+ Val = 0;
+ if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) {
+ // Check for SIZEOF(<type>) and TYPE(<type>).
+ bool InParens = Parser.getTok().is(AsmToken::LParen);
+ const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok();
+ AsmTypeInfo Type;
+ if (IDTok.is(AsmToken::Identifier) &&
+ !Parser.lookUpType(IDTok.getIdentifier(), Type)) {
+ Val = Type.Size;
+
+ // Eat tokens.
+ if (InParens)
+ parseToken(AsmToken::LParen);
+ parseToken(AsmToken::Identifier);
+ if (InParens)
+ parseToken(AsmToken::RParen);
+ }
+ }
+
+ if (!Val) {
+ IntelExprStateMachine SM;
+ SMLoc End, Start = Parser.getTok().getLoc();
+ if (ParseIntelExpression(SM, End))
+ return true;
+
+ switch (OpKind) {
+ default:
+ llvm_unreachable("Unexpected operand kind!");
+ case MOK_SIZEOF:
+ Val = SM.getSize();
+ break;
+ case MOK_LENGTHOF:
+ Val = SM.getLength();
+ break;
+ case MOK_TYPE:
+ Val = SM.getElementSize();
+ break;
+ }
+
+ if (!Val)
+ return Error(OpLoc, "expression has unknown type", SMRange(Start, End));
+ }
+
+ return false;
+}
+
bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
Size = StringSwitch<unsigned>(getTok().getString())
.Cases("BYTE", "byte", 8)
@@ -2476,7 +2476,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
return false;
}
-bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
+bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
@@ -2484,31 +2484,31 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
// Parse optional Size directive.
unsigned Size;
if (ParseIntelMemoryOperandSize(Size))
- return true;
+ return true;
bool PtrInOperand = bool(Size);
Start = Tok.getLoc();
// Rounding mode operand.
if (getLexer().is(AsmToken::LCurly))
- return ParseRoundingModeOp(Start, Operands);
+ return ParseRoundingModeOp(Start, Operands);
// Register operand.
unsigned RegNo = 0;
if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) {
if (RegNo == X86::RIP)
- return Error(Start, "rip can only be used as a base register");
+ return Error(Start, "rip can only be used as a base register");
// A Register followed by ':' is considered a segment override
- if (Tok.isNot(AsmToken::Colon)) {
- if (PtrInOperand)
- return Error(Start, "expected memory operand after 'ptr', "
+ if (Tok.isNot(AsmToken::Colon)) {
+ if (PtrInOperand)
+ return Error(Start, "expected memory operand after 'ptr', "
"found register operand instead");
- Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
- return false;
- }
+ Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
+ return false;
+ }
// An alleged segment override. check if we have a valid segment register
if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
- return Error(Start, "invalid segment register");
+ return Error(Start, "invalid segment register");
// Eat ':' and update Start location
Start = Lex().getLoc();
}
@@ -2516,7 +2516,7 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
// Immediates and Memory
IntelExprStateMachine SM;
if (ParseIntelExpression(SM, End))
- return true;
+ return true;
if (isParsingMSInlineAsm())
RewriteIntelExpression(SM, Start, Tok.getLoc());
@@ -2533,27 +2533,27 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
// and we are parsing a segment override
if (!SM.isMemExpr() && !RegNo) {
if (isParsingMSInlineAsm() && SM.isOffsetOperator()) {
- const InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ const InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
// Disp includes the address of a variable; make sure this is recorded
// for later handling.
- Operands.push_back(X86Operand::CreateImm(Disp, Start, End,
- SM.getSymName(), Info.Var.Decl,
- Info.Var.IsGlobalLV));
- return false;
+ Operands.push_back(X86Operand::CreateImm(Disp, Start, End,
+ SM.getSymName(), Info.Var.Decl,
+ Info.Var.IsGlobalLV));
+ return false;
}
}
- Operands.push_back(X86Operand::CreateImm(Disp, Start, End));
- return false;
+ Operands.push_back(X86Operand::CreateImm(Disp, Start, End));
+ return false;
}
StringRef ErrMsg;
unsigned BaseReg = SM.getBaseReg();
unsigned IndexReg = SM.getIndexReg();
unsigned Scale = SM.getScale();
- if (!PtrInOperand)
- Size = SM.getElementSize() << 3;
+ if (!PtrInOperand)
+ Size = SM.getElementSize() << 3;
if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP &&
(IndexReg == X86::ESP || IndexReg == X86::RSP))
@@ -2572,7 +2572,7 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
if (Scale != 0 &&
X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))
- return Error(Start, "16-bit addresses cannot have a scale");
+ return Error(Start, "16-bit addresses cannot have a scale");
// If there was no explicit scale specified, change it to 1.
if (Scale == 0)
@@ -2588,33 +2588,33 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) {
if ((BaseReg || IndexReg) &&
CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
ErrMsg))
- return Error(Start, ErrMsg);
+ return Error(Start, ErrMsg);
if (isParsingMSInlineAsm())
return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start,
End, Size, SM.getSymName(),
- SM.getIdentifierInfo(), Operands);
+ SM.getIdentifierInfo(), Operands);
// When parsing x64 MS-style assembly, all memory operands default to
// RIP-relative when interpreted as non-absolute references.
- if (Parser.isParsingMasm() && is64BitMode()) {
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
- BaseReg, IndexReg, Scale, Start,
- End, Size,
- /*DefaultBaseReg=*/X86::RIP));
- return false;
- }
-
- if ((BaseReg || IndexReg || RegNo))
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
- BaseReg, IndexReg, Scale, Start,
- End, Size));
- else
- Operands.push_back(
- X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size));
- return false;
+ if (Parser.isParsingMasm() && is64BitMode()) {
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start,
+ End, Size,
+ /*DefaultBaseReg=*/X86::RIP));
+ return false;
+ }
+
+ if ((BaseReg || IndexReg || RegNo))
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start,
+ End, Size));
+ else
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size));
+ return false;
}
-bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
+bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
switch (getLexer().getKind()) {
case AsmToken::Dollar: {
@@ -2629,13 +2629,13 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
"expected immediate expression") ||
getParser().parseExpression(Val, End) ||
check(isa<X86MCExpr>(Val), L, "expected immediate expression"))
- return true;
- Operands.push_back(X86Operand::CreateImm(Val, Start, End));
- return false;
+ return true;
+ Operands.push_back(X86Operand::CreateImm(Val, Start, End));
+ return false;
}
case AsmToken::LCurly: {
SMLoc Start = Parser.getTok().getLoc();
- return ParseRoundingModeOp(Start, Operands);
+ return ParseRoundingModeOp(Start, Operands);
}
default: {
// This a memory operand or a register. We have some parsing complications
@@ -2649,7 +2649,7 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
if (getLexer().isNot(AsmToken::LParen)) {
// No '(' so this is either a displacement expression or a register.
if (Parser.parseExpression(Expr, EndLoc))
- return true;
+ return true;
if (auto *RE = dyn_cast<X86MCExpr>(Expr)) {
// Segment Register. Reset Expr and copy value to register.
Expr = nullptr;
@@ -2657,27 +2657,27 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) {
// Sanity check register.
if (Reg == X86::EIZ || Reg == X86::RIZ)
- return Error(
+ return Error(
Loc, "%eiz and %riz can only be used as index registers",
SMRange(Loc, EndLoc));
if (Reg == X86::RIP)
- return Error(Loc, "%rip can only be used as a base register",
- SMRange(Loc, EndLoc));
+ return Error(Loc, "%rip can only be used as a base register",
+ SMRange(Loc, EndLoc));
// Return register that are not segment prefixes immediately.
- if (!Parser.parseOptionalToken(AsmToken::Colon)) {
- Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc));
- return false;
- }
+ if (!Parser.parseOptionalToken(AsmToken::Colon)) {
+ Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc));
+ return false;
+ }
if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(Reg))
- return Error(Loc, "invalid segment register");
- // Accept a '*' absolute memory reference after the segment. Place it
- // before the full memory operand.
- if (getLexer().is(AsmToken::Star))
- Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
+ return Error(Loc, "invalid segment register");
+ // Accept a '*' absolute memory reference after the segment. Place it
+ // before the full memory operand.
+ if (getLexer().is(AsmToken::Star))
+ Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
}
}
// This is a Memory operand.
- return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands);
+ return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands);
}
}
}
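
As an aside for readers skimming this hunk: ParseATTOperand keys the operand kind off the first token, where '$' starts an immediate, '{' a rounding-mode operand, and anything else is parsed as a register or memory reference. A minimal sketch of that dispatch, using toy enums rather than the real lexer and operand types:

// Toy token and operand kinds; the real code switches on AsmToken kinds.
enum class TokKind { Dollar, LCurly, Other };
enum class OperandKind { Immediate, RoundingMode, RegOrMem };

OperandKind classifyATTOperand(TokKind First) {
  switch (First) {
  case TokKind::Dollar: return OperandKind::Immediate;    // $imm
  case TokKind::LCurly: return OperandKind::RoundingMode; // {rn-sae} and friends
  default:              return OperandKind::RegOrMem;     // %reg or memory
  }
}
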
@@ -2727,7 +2727,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z,
}
// true on failure, false otherwise
-bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
+bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
MCAsmParser &Parser = getParser();
if (getLexer().is(AsmToken::LCurly)) {
// Eat "{" and mark the current place.
@@ -2737,26 +2737,26 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
// Parse memory broadcasting ({1to<NUM>}).
if (getLexer().getTok().getIntVal() != 1)
return TokError("Expected 1to<NUM> at this point");
- StringRef Prefix = getLexer().getTok().getString();
- Parser.Lex(); // Eat first token of 1to8
- if (!getLexer().is(AsmToken::Identifier))
+ StringRef Prefix = getLexer().getTok().getString();
+ Parser.Lex(); // Eat first token of 1to8
+ if (!getLexer().is(AsmToken::Identifier))
return TokError("Expected 1to<NUM> at this point");
// Recognize only reasonable suffixes.
- SmallVector<char, 5> BroadcastVector;
- StringRef BroadcastString = (Prefix + getLexer().getTok().getIdentifier())
- .toStringRef(BroadcastVector);
- if (!BroadcastString.startswith("1to"))
- return TokError("Expected 1to<NUM> at this point");
+ SmallVector<char, 5> BroadcastVector;
+ StringRef BroadcastString = (Prefix + getLexer().getTok().getIdentifier())
+ .toStringRef(BroadcastVector);
+ if (!BroadcastString.startswith("1to"))
+ return TokError("Expected 1to<NUM> at this point");
const char *BroadcastPrimitive =
- StringSwitch<const char *>(BroadcastString)
- .Case("1to2", "{1to2}")
- .Case("1to4", "{1to4}")
- .Case("1to8", "{1to8}")
- .Case("1to16", "{1to16}")
- .Default(nullptr);
+ StringSwitch<const char *>(BroadcastString)
+ .Case("1to2", "{1to2}")
+ .Case("1to4", "{1to4}")
+ .Case("1to8", "{1to8}")
+ .Case("1to16", "{1to16}")
+ .Default(nullptr);
if (!BroadcastPrimitive)
return TokError("Invalid memory broadcast primitive.");
- Parser.Lex(); // Eat trailing token of 1toN
+ Parser.Lex(); // Eat trailing token of 1toN
if (!getLexer().is(AsmToken::RCurly))
return TokError("Expected } at this point");
Parser.Lex(); // Eat "}"
@@ -2816,9 +2816,9 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) {
/// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix
/// has already been parsed if present. disp may be provided as well.
-bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
- SMLoc StartLoc, SMLoc EndLoc,
- OperandVector &Operands) {
+bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
+ SMLoc StartLoc, SMLoc EndLoc,
+ OperandVector &Operands) {
MCAsmParser &Parser = getParser();
SMLoc Loc;
// Based on the initial passed values, we may be in any of these cases, we are
@@ -2880,7 +2880,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
// Parse immediate if we're not at a mem operand yet.
if (!isAtMemOperand()) {
if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(Disp, EndLoc))
- return true;
+ return true;
assert(!isa<X86MCExpr>(Disp) && "Expected non-register here.");
} else {
// Disp is implicitly zero if we haven't parsed it yet.
@@ -2893,12 +2893,12 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
if (!parseOptionalToken(AsmToken::LParen)) {
if (SegReg == 0)
- Operands.push_back(
- X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
- else
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
- 0, 0, 1, StartLoc, EndLoc));
- return false;
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
+ else
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ 0, 0, 1, StartLoc, EndLoc));
+ return false;
}
// If we reached here, then eat the '(' and Process
@@ -2912,13 +2912,13 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
if (getLexer().isNot(AsmToken::Comma) && getLexer().isNot(AsmToken::RParen)) {
if (Parser.parseExpression(E, EndLoc) ||
check(!isa<X86MCExpr>(E), BaseLoc, "expected register here"))
- return true;
+ return true;
// Sanity check register.
BaseReg = cast<X86MCExpr>(E)->getRegNo();
if (BaseReg == X86::EIZ || BaseReg == X86::RIZ)
- return Error(BaseLoc, "eiz and riz can only be used as index registers",
- SMRange(BaseLoc, EndLoc));
+ return Error(BaseLoc, "eiz and riz can only be used as index registers",
+ SMRange(BaseLoc, EndLoc));
}
if (parseOptionalToken(AsmToken::Comma)) {
@@ -2930,14 +2930,14 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
// "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
if (getLexer().isNot(AsmToken::RParen)) {
if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(E, EndLoc))
- return true;
+ return true;
if (!isa<X86MCExpr>(E)) {
// We've parsed an unexpected Scale Value instead of an index
// register. Interpret it as an absolute.
int64_t ScaleVal;
if (!E->evaluateAsAbsolute(ScaleVal, getStreamer().getAssemblerPtr()))
- return Error(Loc, "expected absolute expression");
+ return Error(Loc, "expected absolute expression");
if (ScaleVal != 1)
Warning(Loc, "scale factor without index register is ignored");
Scale = 1;
@@ -2945,10 +2945,10 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
IndexReg = cast<X86MCExpr>(E)->getRegNo();
if (BaseReg == X86::RIP)
- return Error(Loc,
- "%rip as base register can not have an index register");
+ return Error(Loc,
+ "%rip as base register can not have an index register");
if (IndexReg == X86::RIP)
- return Error(Loc, "%rip is not allowed as an index register");
+ return Error(Loc, "%rip is not allowed as an index register");
if (parseOptionalToken(AsmToken::Comma)) {
// Parse the scale amount:
@@ -2959,14 +2959,14 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
int64_t ScaleVal;
if (Parser.parseTokenLoc(Loc) ||
Parser.parseAbsoluteExpression(ScaleVal))
- return Error(Loc, "expected scale expression");
+ return Error(Loc, "expected scale expression");
Scale = (unsigned)ScaleVal;
// Validate the scale amount.
if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
Scale != 1)
- return Error(Loc, "scale factor in 16-bit address must be 1");
+ return Error(Loc, "scale factor in 16-bit address must be 1");
if (checkScale(Scale, ErrMsg))
- return Error(Loc, ErrMsg);
+ return Error(Loc, ErrMsg);
}
}
}
@@ -2975,30 +2975,30 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp,
// Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
if (parseToken(AsmToken::RParen, "unexpected token in memory operand"))
- return true;
+ return true;
// This is to support otherwise illegal operand (%dx) found in various
// unofficial manual examples (e.g. "out[s]?[bwl]? %al, (%dx)") and must now
// be supported. Mark such DX variants separately; fix them only in special cases.
if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 && SegReg == 0 &&
- isa<MCConstantExpr>(Disp) &&
- cast<MCConstantExpr>(Disp)->getValue() == 0) {
- Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc));
- return false;
- }
+ isa<MCConstantExpr>(Disp) &&
+ cast<MCConstantExpr>(Disp)->getValue() == 0) {
+ Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc));
+ return false;
+ }
if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
ErrMsg))
- return Error(BaseLoc, ErrMsg);
+ return Error(BaseLoc, ErrMsg);
if (SegReg || BaseReg || IndexReg)
- Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
- BaseReg, IndexReg, Scale, StartLoc,
- EndLoc));
- else
- Operands.push_back(
- X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
- return false;
+ Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
+ BaseReg, IndexReg, Scale, StartLoc,
+ EndLoc));
+ else
+ Operands.push_back(
+ X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc));
+ return false;
}
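
ParseMemOperand above accepts 'seg:disp(base,index,scale)' and then validates the pieces; in particular, a 16-bit base register only admits a scale of 1, and x86 addressing only encodes scales of 1, 2, 4 or 8. A hedged sketch of just that validation step (the second error string is illustrative, not necessarily the exact diagnostic checkScale produces):

#include <string>

// Validate an AT&T memory-operand scale the way the hunk above does.
bool validateScale(unsigned Scale, bool BaseIs16Bit, std::string &Err) {
  if (BaseIs16Bit && Scale != 1) {
    Err = "scale factor in 16-bit address must be 1";
    return false;
  }
  if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
    Err = "scale factor must be 1, 2, 4 or 8";
    return false;
  }
  return true;
}
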
// Parse either a standard primary expression or a register.
@@ -3015,7 +3015,7 @@ bool X86AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Res = X86MCExpr::create(RegNo, Parser.getContext());
return false;
}
- return Parser.parsePrimaryExpr(Res, EndLoc, nullptr);
+ return Parser.parsePrimaryExpr(Res, EndLoc, nullptr);
}
bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -3025,7 +3025,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Reset the forced VEX encoding.
ForcedVEXEncoding = VEXEncoding_Default;
- ForcedDispEncoding = DispEncoding_Default;
+ ForcedDispEncoding = DispEncoding_Default;
// Parse pseudo prefixes.
while (1) {
@@ -3038,18 +3038,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(Parser.getTok().getLoc(), "Expected '}'");
Parser.Lex(); // Eat curly.
- if (Prefix == "vex")
+ if (Prefix == "vex")
ForcedVEXEncoding = VEXEncoding_VEX;
- else if (Prefix == "vex2")
- ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX2;
else if (Prefix == "vex3")
ForcedVEXEncoding = VEXEncoding_VEX3;
else if (Prefix == "evex")
ForcedVEXEncoding = VEXEncoding_EVEX;
- else if (Prefix == "disp8")
- ForcedDispEncoding = DispEncoding_Disp8;
- else if (Prefix == "disp32")
- ForcedDispEncoding = DispEncoding_Disp32;
+ else if (Prefix == "disp8")
+ ForcedDispEncoding = DispEncoding_Disp8;
+ else if (Prefix == "disp32")
+ ForcedDispEncoding = DispEncoding_Disp32;
else
return Error(NameLoc, "unknown prefix");
@@ -3066,36 +3066,36 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
continue;
}
- // Parse MASM style pseudo prefixes.
- if (isParsingMSInlineAsm()) {
- if (Name.equals_lower("vex"))
- ForcedVEXEncoding = VEXEncoding_VEX;
- else if (Name.equals_lower("vex2"))
- ForcedVEXEncoding = VEXEncoding_VEX2;
- else if (Name.equals_lower("vex3"))
- ForcedVEXEncoding = VEXEncoding_VEX3;
- else if (Name.equals_lower("evex"))
- ForcedVEXEncoding = VEXEncoding_EVEX;
-
- if (ForcedVEXEncoding != VEXEncoding_Default) {
- if (getLexer().isNot(AsmToken::Identifier))
- return Error(Parser.getTok().getLoc(), "Expected identifier");
- // FIXME: The mnemonic won't match correctly if it's not in lower case.
- Name = Parser.getTok().getString();
- NameLoc = Parser.getTok().getLoc();
- Parser.Lex();
- }
- }
+ // Parse MASM style pseudo prefixes.
+ if (isParsingMSInlineAsm()) {
+ if (Name.equals_lower("vex"))
+ ForcedVEXEncoding = VEXEncoding_VEX;
+ else if (Name.equals_lower("vex2"))
+ ForcedVEXEncoding = VEXEncoding_VEX2;
+ else if (Name.equals_lower("vex3"))
+ ForcedVEXEncoding = VEXEncoding_VEX3;
+ else if (Name.equals_lower("evex"))
+ ForcedVEXEncoding = VEXEncoding_EVEX;
+
+ if (ForcedVEXEncoding != VEXEncoding_Default) {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return Error(Parser.getTok().getLoc(), "Expected identifier");
+ // FIXME: The mnemonic won't match correctly if it's not in lower case.
+ Name = Parser.getTok().getString();
+ NameLoc = Parser.getTok().getLoc();
+ Parser.Lex();
+ }
+ }
break;
}
- // Support the suffix syntax for overriding displacement size as well.
- if (Name.consume_back(".d32")) {
- ForcedDispEncoding = DispEncoding_Disp32;
- } else if (Name.consume_back(".d8")) {
- ForcedDispEncoding = DispEncoding_Disp8;
- }
-
+ // Support the suffix syntax for overriding displacement size as well.
+ if (Name.consume_back(".d32")) {
+ ForcedDispEncoding = DispEncoding_Disp32;
+ } else if (Name.consume_back(".d8")) {
+ ForcedDispEncoding = DispEncoding_Disp8;
+ }
+
StringRef PatchedName = Name;
// Hack to skip "short" following Jcc.
@@ -3263,13 +3263,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// repz repnz <insn> ; GAS errors for the use of two similar prefixes
// lock addq %rax, %rbx ; Destination operand must be of memory type
// xacquire <insn> ; xacquire must be accompanied by 'lock'
- bool IsPrefix =
- StringSwitch<bool>(Name)
- .Cases("cs", "ds", "es", "fs", "gs", "ss", true)
- .Cases("rex64", "data32", "data16", "addr32", "addr16", true)
- .Cases("xacquire", "xrelease", true)
- .Cases("acquire", "release", isParsingIntelSyntax())
- .Default(false);
+ bool IsPrefix =
+ StringSwitch<bool>(Name)
+ .Cases("cs", "ds", "es", "fs", "gs", "ss", true)
+ .Cases("rex64", "data32", "data16", "addr32", "addr16", true)
+ .Cases("xacquire", "xrelease", true)
+ .Cases("acquire", "release", isParsingIntelSyntax())
+ .Default(false);
auto isLockRepeatNtPrefix = [](StringRef N) {
return StringSwitch<bool>(N)
@@ -3324,22 +3324,22 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(NameLoc, "'data32' is not supported in 64-bit mode");
// Hack to 'data16' for the table lookup.
PatchedName = "data16";
-
- if (getLexer().isNot(AsmToken::EndOfStatement)) {
- StringRef Next = Parser.getTok().getString();
- getLexer().Lex();
- // data32 effectively changes the instruction suffix.
- // TODO Generalize.
- if (Next == "callw")
- Next = "calll";
- if (Next == "ljmpw")
- Next = "ljmpl";
-
- Name = Next;
- PatchedName = Name;
- ForcedDataPrefix = X86::Mode32Bit;
- IsPrefix = false;
- }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ StringRef Next = Parser.getTok().getString();
+ getLexer().Lex();
+ // data32 effectively changes the instruction suffix.
+ // TODO Generalize.
+ if (Next == "callw")
+ Next = "calll";
+ if (Next == "ljmpw")
+ Next = "ljmpl";
+
+ Name = Next;
+ PatchedName = Name;
+ ForcedDataPrefix = X86::Mode32Bit;
+ IsPrefix = false;
+ }
}
Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
@@ -3355,18 +3355,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
// the next one.
- if (getLexer().isNot(AsmToken::EndOfStatement) && !IsPrefix) {
+ if (getLexer().isNot(AsmToken::EndOfStatement) && !IsPrefix) {
// Parse '*' modifier.
if (getLexer().is(AsmToken::Star))
Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
// Read the operands.
while(1) {
- if (ParseOperand(Operands))
- return true;
- if (HandleAVX512Operand(Operands))
- return true;
-
+ if (ParseOperand(Operands))
+ return true;
+ if (HandleAVX512Operand(Operands))
+ return true;
+
// check for comma and eat it
if (getLexer().is(AsmToken::Comma))
Parser.Lex();
@@ -3392,7 +3392,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// Consume the EndOfStatement or the prefix separator Slash
if (getLexer().is(AsmToken::EndOfStatement) ||
- (IsPrefix && getLexer().is(AsmToken::Slash)))
+ (IsPrefix && getLexer().is(AsmToken::Slash)))
Parser.Lex();
else if (CurlyAsEndOfStatement)
// Add an actual EndOfStatement before the curly brace
@@ -3567,26 +3567,26 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
switch (Inst.getOpcode()) {
default: return false;
- case X86::JMP_1:
- // {disp32} forces a larger displacement as if the instruction was relaxed.
- // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
- // This matches GNU assembler.
- if (ForcedDispEncoding == DispEncoding_Disp32) {
- Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4);
- return true;
- }
-
- return false;
- case X86::JCC_1:
- // {disp32} forces a larger displacement as if the instruction was relaxed.
- // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
- // This matches GNU assembler.
- if (ForcedDispEncoding == DispEncoding_Disp32) {
- Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4);
- return true;
- }
-
- return false;
+ case X86::JMP_1:
+ // {disp32} forces a larger displacement as if the instruction was relaxed.
+ // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+ // This matches GNU assembler.
+ if (ForcedDispEncoding == DispEncoding_Disp32) {
+ Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4);
+ return true;
+ }
+
+ return false;
+ case X86::JCC_1:
+ // {disp32} forces a larger displacement as if the instruction was relaxed.
+ // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}.
+ // This matches GNU assembler.
+ if (ForcedDispEncoding == DispEncoding_Disp32) {
+ Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4);
+ return true;
+ }
+
+ return false;
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVAPDrr:
case X86::VMOVAPDYrr:
@@ -3645,123 +3645,123 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
Inst.setOpcode(NewOpc);
return true;
}
- case X86::RCR8ri: case X86::RCR16ri: case X86::RCR32ri: case X86::RCR64ri:
- case X86::RCL8ri: case X86::RCL16ri: case X86::RCL32ri: case X86::RCL64ri:
- case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
- case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
- case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
- case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
- case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: {
- // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
- // FIXME: It would be great if we could just do this with an InstAlias.
- if (!Inst.getOperand(2).isImm() || Inst.getOperand(2).getImm() != 1)
- return false;
-
- unsigned NewOpc;
- switch (Inst.getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::RCR8ri: NewOpc = X86::RCR8r1; break;
- case X86::RCR16ri: NewOpc = X86::RCR16r1; break;
- case X86::RCR32ri: NewOpc = X86::RCR32r1; break;
- case X86::RCR64ri: NewOpc = X86::RCR64r1; break;
- case X86::RCL8ri: NewOpc = X86::RCL8r1; break;
- case X86::RCL16ri: NewOpc = X86::RCL16r1; break;
- case X86::RCL32ri: NewOpc = X86::RCL32r1; break;
- case X86::RCL64ri: NewOpc = X86::RCL64r1; break;
- case X86::ROR8ri: NewOpc = X86::ROR8r1; break;
- case X86::ROR16ri: NewOpc = X86::ROR16r1; break;
- case X86::ROR32ri: NewOpc = X86::ROR32r1; break;
- case X86::ROR64ri: NewOpc = X86::ROR64r1; break;
- case X86::ROL8ri: NewOpc = X86::ROL8r1; break;
- case X86::ROL16ri: NewOpc = X86::ROL16r1; break;
- case X86::ROL32ri: NewOpc = X86::ROL32r1; break;
- case X86::ROL64ri: NewOpc = X86::ROL64r1; break;
- case X86::SAR8ri: NewOpc = X86::SAR8r1; break;
- case X86::SAR16ri: NewOpc = X86::SAR16r1; break;
- case X86::SAR32ri: NewOpc = X86::SAR32r1; break;
- case X86::SAR64ri: NewOpc = X86::SAR64r1; break;
- case X86::SHR8ri: NewOpc = X86::SHR8r1; break;
- case X86::SHR16ri: NewOpc = X86::SHR16r1; break;
- case X86::SHR32ri: NewOpc = X86::SHR32r1; break;
- case X86::SHR64ri: NewOpc = X86::SHR64r1; break;
- case X86::SHL8ri: NewOpc = X86::SHL8r1; break;
- case X86::SHL16ri: NewOpc = X86::SHL16r1; break;
- case X86::SHL32ri: NewOpc = X86::SHL32r1; break;
- case X86::SHL64ri: NewOpc = X86::SHL64r1; break;
- }
-
- MCInst TmpInst;
- TmpInst.setOpcode(NewOpc);
- TmpInst.addOperand(Inst.getOperand(0));
- TmpInst.addOperand(Inst.getOperand(1));
- Inst = TmpInst;
- return true;
- }
- case X86::RCR8mi: case X86::RCR16mi: case X86::RCR32mi: case X86::RCR64mi:
- case X86::RCL8mi: case X86::RCL16mi: case X86::RCL32mi: case X86::RCL64mi:
- case X86::ROR8mi: case X86::ROR16mi: case X86::ROR32mi: case X86::ROR64mi:
- case X86::ROL8mi: case X86::ROL16mi: case X86::ROL32mi: case X86::ROL64mi:
- case X86::SAR8mi: case X86::SAR16mi: case X86::SAR32mi: case X86::SAR64mi:
- case X86::SHR8mi: case X86::SHR16mi: case X86::SHR32mi: case X86::SHR64mi:
- case X86::SHL8mi: case X86::SHL16mi: case X86::SHL32mi: case X86::SHL64mi: {
- // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
- // FIXME: It would be great if we could just do this with an InstAlias.
- if (!Inst.getOperand(X86::AddrNumOperands).isImm() ||
- Inst.getOperand(X86::AddrNumOperands).getImm() != 1)
- return false;
-
- unsigned NewOpc;
- switch (Inst.getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::RCR8mi: NewOpc = X86::RCR8m1; break;
- case X86::RCR16mi: NewOpc = X86::RCR16m1; break;
- case X86::RCR32mi: NewOpc = X86::RCR32m1; break;
- case X86::RCR64mi: NewOpc = X86::RCR64m1; break;
- case X86::RCL8mi: NewOpc = X86::RCL8m1; break;
- case X86::RCL16mi: NewOpc = X86::RCL16m1; break;
- case X86::RCL32mi: NewOpc = X86::RCL32m1; break;
- case X86::RCL64mi: NewOpc = X86::RCL64m1; break;
- case X86::ROR8mi: NewOpc = X86::ROR8m1; break;
- case X86::ROR16mi: NewOpc = X86::ROR16m1; break;
- case X86::ROR32mi: NewOpc = X86::ROR32m1; break;
- case X86::ROR64mi: NewOpc = X86::ROR64m1; break;
- case X86::ROL8mi: NewOpc = X86::ROL8m1; break;
- case X86::ROL16mi: NewOpc = X86::ROL16m1; break;
- case X86::ROL32mi: NewOpc = X86::ROL32m1; break;
- case X86::ROL64mi: NewOpc = X86::ROL64m1; break;
- case X86::SAR8mi: NewOpc = X86::SAR8m1; break;
- case X86::SAR16mi: NewOpc = X86::SAR16m1; break;
- case X86::SAR32mi: NewOpc = X86::SAR32m1; break;
- case X86::SAR64mi: NewOpc = X86::SAR64m1; break;
- case X86::SHR8mi: NewOpc = X86::SHR8m1; break;
- case X86::SHR16mi: NewOpc = X86::SHR16m1; break;
- case X86::SHR32mi: NewOpc = X86::SHR32m1; break;
- case X86::SHR64mi: NewOpc = X86::SHR64m1; break;
- case X86::SHL8mi: NewOpc = X86::SHL8m1; break;
- case X86::SHL16mi: NewOpc = X86::SHL16m1; break;
- case X86::SHL32mi: NewOpc = X86::SHL32m1; break;
- case X86::SHL64mi: NewOpc = X86::SHL64m1; break;
- }
-
- MCInst TmpInst;
- TmpInst.setOpcode(NewOpc);
- for (int i = 0; i != X86::AddrNumOperands; ++i)
- TmpInst.addOperand(Inst.getOperand(i));
- Inst = TmpInst;
- return true;
- }
- case X86::INT: {
- // Transforms "int $3" into "int3" as a size optimization. We can't write an
- // instalias with an immediate operand yet.
- if (!Inst.getOperand(0).isImm() || Inst.getOperand(0).getImm() != 3)
- return false;
-
- MCInst TmpInst;
- TmpInst.setOpcode(X86::INT3);
- Inst = TmpInst;
- return true;
- }
- }
+ case X86::RCR8ri: case X86::RCR16ri: case X86::RCR32ri: case X86::RCR64ri:
+ case X86::RCL8ri: case X86::RCL16ri: case X86::RCL32ri: case X86::RCL64ri:
+ case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
+ case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
+ case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
+ case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: {
+ // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
+ // FIXME: It would be great if we could just do this with an InstAlias.
+ if (!Inst.getOperand(2).isImm() || Inst.getOperand(2).getImm() != 1)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::RCR8ri: NewOpc = X86::RCR8r1; break;
+ case X86::RCR16ri: NewOpc = X86::RCR16r1; break;
+ case X86::RCR32ri: NewOpc = X86::RCR32r1; break;
+ case X86::RCR64ri: NewOpc = X86::RCR64r1; break;
+ case X86::RCL8ri: NewOpc = X86::RCL8r1; break;
+ case X86::RCL16ri: NewOpc = X86::RCL16r1; break;
+ case X86::RCL32ri: NewOpc = X86::RCL32r1; break;
+ case X86::RCL64ri: NewOpc = X86::RCL64r1; break;
+ case X86::ROR8ri: NewOpc = X86::ROR8r1; break;
+ case X86::ROR16ri: NewOpc = X86::ROR16r1; break;
+ case X86::ROR32ri: NewOpc = X86::ROR32r1; break;
+ case X86::ROR64ri: NewOpc = X86::ROR64r1; break;
+ case X86::ROL8ri: NewOpc = X86::ROL8r1; break;
+ case X86::ROL16ri: NewOpc = X86::ROL16r1; break;
+ case X86::ROL32ri: NewOpc = X86::ROL32r1; break;
+ case X86::ROL64ri: NewOpc = X86::ROL64r1; break;
+ case X86::SAR8ri: NewOpc = X86::SAR8r1; break;
+ case X86::SAR16ri: NewOpc = X86::SAR16r1; break;
+ case X86::SAR32ri: NewOpc = X86::SAR32r1; break;
+ case X86::SAR64ri: NewOpc = X86::SAR64r1; break;
+ case X86::SHR8ri: NewOpc = X86::SHR8r1; break;
+ case X86::SHR16ri: NewOpc = X86::SHR16r1; break;
+ case X86::SHR32ri: NewOpc = X86::SHR32r1; break;
+ case X86::SHR64ri: NewOpc = X86::SHR64r1; break;
+ case X86::SHL8ri: NewOpc = X86::SHL8r1; break;
+ case X86::SHL16ri: NewOpc = X86::SHL16r1; break;
+ case X86::SHL32ri: NewOpc = X86::SHL32r1; break;
+ case X86::SHL64ri: NewOpc = X86::SHL64r1; break;
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::RCR8mi: case X86::RCR16mi: case X86::RCR32mi: case X86::RCR64mi:
+ case X86::RCL8mi: case X86::RCL16mi: case X86::RCL32mi: case X86::RCL64mi:
+ case X86::ROR8mi: case X86::ROR16mi: case X86::ROR32mi: case X86::ROR64mi:
+ case X86::ROL8mi: case X86::ROL16mi: case X86::ROL32mi: case X86::ROL64mi:
+ case X86::SAR8mi: case X86::SAR16mi: case X86::SAR32mi: case X86::SAR64mi:
+ case X86::SHR8mi: case X86::SHR16mi: case X86::SHR32mi: case X86::SHR64mi:
+ case X86::SHL8mi: case X86::SHL16mi: case X86::SHL32mi: case X86::SHL64mi: {
+ // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate.
+ // FIXME: It would be great if we could just do this with an InstAlias.
+ if (!Inst.getOperand(X86::AddrNumOperands).isImm() ||
+ Inst.getOperand(X86::AddrNumOperands).getImm() != 1)
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::RCR8mi: NewOpc = X86::RCR8m1; break;
+ case X86::RCR16mi: NewOpc = X86::RCR16m1; break;
+ case X86::RCR32mi: NewOpc = X86::RCR32m1; break;
+ case X86::RCR64mi: NewOpc = X86::RCR64m1; break;
+ case X86::RCL8mi: NewOpc = X86::RCL8m1; break;
+ case X86::RCL16mi: NewOpc = X86::RCL16m1; break;
+ case X86::RCL32mi: NewOpc = X86::RCL32m1; break;
+ case X86::RCL64mi: NewOpc = X86::RCL64m1; break;
+ case X86::ROR8mi: NewOpc = X86::ROR8m1; break;
+ case X86::ROR16mi: NewOpc = X86::ROR16m1; break;
+ case X86::ROR32mi: NewOpc = X86::ROR32m1; break;
+ case X86::ROR64mi: NewOpc = X86::ROR64m1; break;
+ case X86::ROL8mi: NewOpc = X86::ROL8m1; break;
+ case X86::ROL16mi: NewOpc = X86::ROL16m1; break;
+ case X86::ROL32mi: NewOpc = X86::ROL32m1; break;
+ case X86::ROL64mi: NewOpc = X86::ROL64m1; break;
+ case X86::SAR8mi: NewOpc = X86::SAR8m1; break;
+ case X86::SAR16mi: NewOpc = X86::SAR16m1; break;
+ case X86::SAR32mi: NewOpc = X86::SAR32m1; break;
+ case X86::SAR64mi: NewOpc = X86::SAR64m1; break;
+ case X86::SHR8mi: NewOpc = X86::SHR8m1; break;
+ case X86::SHR16mi: NewOpc = X86::SHR16m1; break;
+ case X86::SHR32mi: NewOpc = X86::SHR32m1; break;
+ case X86::SHR64mi: NewOpc = X86::SHR64m1; break;
+ case X86::SHL8mi: NewOpc = X86::SHL8m1; break;
+ case X86::SHL16mi: NewOpc = X86::SHL16m1; break;
+ case X86::SHL32mi: NewOpc = X86::SHL32m1; break;
+ case X86::SHL64mi: NewOpc = X86::SHL64m1; break;
+ }
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ for (int i = 0; i != X86::AddrNumOperands; ++i)
+ TmpInst.addOperand(Inst.getOperand(i));
+ Inst = TmpInst;
+ return true;
+ }
+ case X86::INT: {
+ // Transforms "int $3" into "int3" as a size optimization. We can't write an
+ // instalias with an immediate operand yet.
+ if (!Inst.getOperand(0).isImm() || Inst.getOperand(0).getImm() != 3)
+ return false;
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::INT3);
+ Inst = TmpInst;
+ return true;
+ }
+ }
}
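
One of the transforms above is worth spelling out: with a {disp32} pseudo prefix, JMP_1/JCC_1 are rewritten to their relaxed forms up front, and in 16-bit mode "disp32" still means the 16-bit-displacement variant, matching GNU as. A toy version of that opcode choice, with stand-in enumerators instead of the generated X86:: opcode values:

// Stand-in jump opcodes; the real values come from the generated opcode enum.
enum class JmpOpc { JMP_1, JMP_2, JMP_4 };

JmpOpc relaxJmpForDisp32(bool ForcedDisp32, bool Is16BitMode) {
  if (!ForcedDisp32)
    return JmpOpc::JMP_1;             // keep the short form
  return Is16BitMode ? JmpOpc::JMP_2  // 16-bit displacement
                     : JmpOpc::JMP_4; // 32-bit displacement
}
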
bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
@@ -3860,33 +3860,33 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
}
}
- const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
- // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to
- // check this with the legacy encoding, VEX/EVEX/XOP don't use REX.
- if ((MCID.TSFlags & X86II::EncodingMask) == 0) {
- MCPhysReg HReg = X86::NoRegister;
- bool UsesRex = MCID.TSFlags & X86II::REX_W;
- unsigned NumOps = Inst.getNumOperands();
- for (unsigned i = 0; i != NumOps; ++i) {
- const MCOperand &MO = Inst.getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
- HReg = Reg;
- if (X86II::isX86_64NonExtLowByteReg(Reg) ||
- X86II::isX86_64ExtendedReg(Reg))
- UsesRex = true;
- }
-
- if (UsesRex && HReg != X86::NoRegister) {
- StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg);
- return Error(Ops[0]->getStartLoc(),
- "can't encode '" + RegName + "' in an instruction requiring "
- "REX prefix");
- }
- }
-
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+ // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to
+ // check this with the legacy encoding, VEX/EVEX/XOP don't use REX.
+ if ((MCID.TSFlags & X86II::EncodingMask) == 0) {
+ MCPhysReg HReg = X86::NoRegister;
+ bool UsesRex = MCID.TSFlags & X86II::REX_W;
+ unsigned NumOps = Inst.getNumOperands();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ const MCOperand &MO = Inst.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ HReg = Reg;
+ if (X86II::isX86_64NonExtLowByteReg(Reg) ||
+ X86II::isX86_64ExtendedReg(Reg))
+ UsesRex = true;
+ }
+
+ if (UsesRex && HReg != X86::NoRegister) {
+ StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg);
+ return Error(Ops[0]->getStartLoc(),
+ "can't encode '" + RegName + "' in an instruction requiring "
+ "REX prefix");
+ }
+ }
+
return false;
}
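
The new check above enforces an encoding rule: a legacy-encoded instruction cannot combine AH/BH/CH/DH with anything that requires a REX prefix (REX.W, SPL/BPL/SIL/DIL, or an extended register). The same rule in isolation, over toy per-register flags instead of MCOperands and the X86II helpers:

#include <vector>

// Toy per-operand register properties.
struct RegUse {
  bool IsHighByte; // AH, BH, CH or DH
  bool ForcesRex;  // SPL/BPL/SIL/DIL or an extended register such as R8B
};

// Returns true when the operands are encodable together.
bool rexAndHighByteCompatible(const std::vector<RegUse> &Regs, bool HasRexW) {
  bool UsesRex = HasRexW;
  bool HasHighByte = false;
  for (const RegUse &R : Regs) {
    UsesRex |= R.ForcesRex;
    HasHighByte |= R.IsHighByte;
  }
  return !(UsesRex && HasHighByte);
}
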
@@ -4080,18 +4080,18 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Unsupported;
if ((ForcedVEXEncoding == VEXEncoding_VEX ||
- ForcedVEXEncoding == VEXEncoding_VEX2 ||
+ ForcedVEXEncoding == VEXEncoding_VEX2 ||
ForcedVEXEncoding == VEXEncoding_VEX3) &&
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
- // These instructions are only available with {vex}, {vex2} or {vex3} prefix
- if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
- (ForcedVEXEncoding != VEXEncoding_VEX &&
- ForcedVEXEncoding != VEXEncoding_VEX2 &&
- ForcedVEXEncoding != VEXEncoding_VEX3))
- return Match_Unsupported;
-
+ // These instructions are only available with {vex}, {vex2} or {vex3} prefix
+ if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
+ (ForcedVEXEncoding != VEXEncoding_VEX &&
+ ForcedVEXEncoding != VEXEncoding_VEX2 &&
+ ForcedVEXEncoding != VEXEncoding_VEX3))
+ return Match_Unsupported;
+
// These instructions match ambiguously with their VEX encoded counterparts
// and appear first in the matching table. Reject them unless we're forcing
// EVEX encoding.
@@ -4130,39 +4130,39 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
- // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
- // encoder and printer.
- if (ForcedVEXEncoding == VEXEncoding_VEX)
- Prefixes |= X86::IP_USE_VEX;
- else if (ForcedVEXEncoding == VEXEncoding_VEX2)
- Prefixes |= X86::IP_USE_VEX2;
- else if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
+ // encoder and printer.
+ if (ForcedVEXEncoding == VEXEncoding_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX3)
Prefixes |= X86::IP_USE_VEX3;
- else if (ForcedVEXEncoding == VEXEncoding_EVEX)
- Prefixes |= X86::IP_USE_EVEX;
-
- // Set encoded flags for {disp8} and {disp32}.
- if (ForcedDispEncoding == DispEncoding_Disp8)
- Prefixes |= X86::IP_USE_DISP8;
- else if (ForcedDispEncoding == DispEncoding_Disp32)
- Prefixes |= X86::IP_USE_DISP32;
-
+ else if (ForcedVEXEncoding == VEXEncoding_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
+
if (Prefixes)
Inst.setFlags(Prefixes);
- // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
- // when matching the instruction.
- if (ForcedDataPrefix == X86::Mode32Bit)
- SwitchMode(X86::Mode32Bit);
+ // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode
+ // when matching the instruction.
+ if (ForcedDataPrefix == X86::Mode32Bit)
+ SwitchMode(X86::Mode32Bit);
// First, try a direct match.
FeatureBitset MissingFeatures;
unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo,
MissingFeatures, MatchingInlineAsm,
isParsingIntelSyntax());
- if (ForcedDataPrefix == X86::Mode32Bit) {
- SwitchMode(X86::Mode16Bit);
- ForcedDataPrefix = 0;
- }
+ if (ForcedDataPrefix == X86::Mode32Bit) {
+ SwitchMode(X86::Mode16Bit);
+ ForcedDataPrefix = 0;
+ }
switch (OriginalError) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
@@ -4271,15 +4271,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
unsigned NumSuccessfulMatches =
std::count(std::begin(Match), std::end(Match), Match_Success);
if (NumSuccessfulMatches == 1) {
- if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
- return true;
- // Some instructions need post-processing to, for example, tweak which
- // encoding is selected. Loop on it while changes happen so the
- // individual transformations can chain off each other.
- if (!MatchingInlineAsm)
- while (processInstruction(Inst, Operands))
- ;
-
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other.
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
+
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
emitInstruction(Inst, Operands, Out);
@@ -4393,23 +4393,23 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
MCInst Inst;
- // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
- // encoder and printer.
- if (ForcedVEXEncoding == VEXEncoding_VEX)
- Prefixes |= X86::IP_USE_VEX;
- else if (ForcedVEXEncoding == VEXEncoding_VEX2)
- Prefixes |= X86::IP_USE_VEX2;
- else if (ForcedVEXEncoding == VEXEncoding_VEX3)
+ // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the
+ // encoder and printer.
+ if (ForcedVEXEncoding == VEXEncoding_VEX)
+ Prefixes |= X86::IP_USE_VEX;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX2)
+ Prefixes |= X86::IP_USE_VEX2;
+ else if (ForcedVEXEncoding == VEXEncoding_VEX3)
Prefixes |= X86::IP_USE_VEX3;
- else if (ForcedVEXEncoding == VEXEncoding_EVEX)
- Prefixes |= X86::IP_USE_EVEX;
-
- // Set encoded flags for {disp8} and {disp32}.
- if (ForcedDispEncoding == DispEncoding_Disp8)
- Prefixes |= X86::IP_USE_DISP8;
- else if (ForcedDispEncoding == DispEncoding_Disp32)
- Prefixes |= X86::IP_USE_DISP32;
-
+ else if (ForcedVEXEncoding == VEXEncoding_EVEX)
+ Prefixes |= X86::IP_USE_EVEX;
+
+ // Set encoded flags for {disp8} and {disp32}.
+ if (ForcedDispEncoding == DispEncoding_Disp8)
+ Prefixes |= X86::IP_USE_DISP8;
+ else if (ForcedDispEncoding == DispEncoding_Disp32)
+ Prefixes |= X86::IP_USE_DISP32;
+
if (Prefixes)
Inst.setFlags(Prefixes);
@@ -4603,8 +4603,8 @@ bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getIdentifier();
- if (IDVal.startswith(".arch"))
- return parseDirectiveArch();
+ if (IDVal.startswith(".arch"))
+ return parseDirectiveArch();
if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
else if (IDVal.startswith(".att_syntax")) {
@@ -4629,9 +4629,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
"a '%' prefix in .intel_syntax");
}
return false;
- } else if (IDVal == ".nops")
- return parseDirectiveNops(DirectiveID.getLoc());
- else if (IDVal == ".even")
+ } else if (IDVal == ".nops")
+ return parseDirectiveNops(DirectiveID.getLoc());
+ else if (IDVal == ".even")
return parseDirectiveEven(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_proc")
return parseDirectiveFPOProc(DirectiveID.getLoc());
@@ -4647,67 +4647,67 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveFPOEndPrologue(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_endproc")
return parseDirectiveFPOEndProc(DirectiveID.getLoc());
- else if (IDVal == ".seh_pushreg" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg")))
+ else if (IDVal == ".seh_pushreg" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg")))
return parseDirectiveSEHPushReg(DirectiveID.getLoc());
- else if (IDVal == ".seh_setframe" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".setframe")))
+ else if (IDVal == ".seh_setframe" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".setframe")))
return parseDirectiveSEHSetFrame(DirectiveID.getLoc());
- else if (IDVal == ".seh_savereg" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".savereg")))
+ else if (IDVal == ".seh_savereg" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".savereg")))
return parseDirectiveSEHSaveReg(DirectiveID.getLoc());
- else if (IDVal == ".seh_savexmm" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128")))
+ else if (IDVal == ".seh_savexmm" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128")))
return parseDirectiveSEHSaveXMM(DirectiveID.getLoc());
- else if (IDVal == ".seh_pushframe" ||
- (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe")))
+ else if (IDVal == ".seh_pushframe" ||
+ (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe")))
return parseDirectiveSEHPushFrame(DirectiveID.getLoc());
return true;
}
-bool X86AsmParser::parseDirectiveArch() {
- // Ignore .arch for now.
- getParser().parseStringToEndOfStatement();
- return false;
-}
-
-/// parseDirectiveNops
-/// ::= .nops size[, control]
-bool X86AsmParser::parseDirectiveNops(SMLoc L) {
- int64_t NumBytes = 0, Control = 0;
- SMLoc NumBytesLoc, ControlLoc;
- const MCSubtargetInfo STI = getSTI();
- NumBytesLoc = getTok().getLoc();
- if (getParser().checkForValidSection() ||
- getParser().parseAbsoluteExpression(NumBytes))
- return true;
-
- if (parseOptionalToken(AsmToken::Comma)) {
- ControlLoc = getTok().getLoc();
- if (getParser().parseAbsoluteExpression(Control))
- return true;
- }
- if (getParser().parseToken(AsmToken::EndOfStatement,
- "unexpected token in '.nops' directive"))
- return true;
-
- if (NumBytes <= 0) {
- Error(NumBytesLoc, "'.nops' directive with non-positive size");
- return false;
- }
-
- if (Control < 0) {
- Error(ControlLoc, "'.nops' directive with negative NOP size");
- return false;
- }
-
- /// Emit nops
- getParser().getStreamer().emitNops(NumBytes, Control, L);
-
- return false;
-}
-
+bool X86AsmParser::parseDirectiveArch() {
+ // Ignore .arch for now.
+ getParser().parseStringToEndOfStatement();
+ return false;
+}
+
+/// parseDirectiveNops
+/// ::= .nops size[, control]
+bool X86AsmParser::parseDirectiveNops(SMLoc L) {
+ int64_t NumBytes = 0, Control = 0;
+ SMLoc NumBytesLoc, ControlLoc;
+ const MCSubtargetInfo STI = getSTI();
+ NumBytesLoc = getTok().getLoc();
+ if (getParser().checkForValidSection() ||
+ getParser().parseAbsoluteExpression(NumBytes))
+ return true;
+
+ if (parseOptionalToken(AsmToken::Comma)) {
+ ControlLoc = getTok().getLoc();
+ if (getParser().parseAbsoluteExpression(Control))
+ return true;
+ }
+ if (getParser().parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.nops' directive"))
+ return true;
+
+ if (NumBytes <= 0) {
+ Error(NumBytesLoc, "'.nops' directive with non-positive size");
+ return false;
+ }
+
+ if (Control < 0) {
+ Error(ControlLoc, "'.nops' directive with negative NOP size");
+ return false;
+ }
+
+ /// Emit nops
+ getParser().getStreamer().emitNops(NumBytes, Control, L);
+
+ return false;
+}
+
/// parseDirectiveEven
/// ::= .even
bool X86AsmParser::parseDirectiveEven(SMLoc L) {
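
parseDirectiveNops, added above, reads '.nops size[, control]' and rejects a non-positive size or a negative per-NOP control value before asking the streamer to emit the padding. Those argument checks lifted out into plain C++ for illustration:

#include <cstdint>
#include <string>

// Mirror the argument checks in parseDirectiveNops above; Err receives the
// diagnostic text when the directive is malformed.
bool checkNopsArgs(int64_t NumBytes, int64_t Control, std::string &Err) {
  if (NumBytes <= 0) {
    Err = "'.nops' directive with non-positive size";
    return false;
  }
  if (Control < 0) {
    Err = "'.nops' directive with negative NOP size";
    return false;
  }
  return true; // the caller would then emit NumBytes bytes of NOP padding
}
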
diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make
index f88283f4e5..b3115c55b9 100644
--- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make
@@ -12,19 +12,19 @@ LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
- contrib/libs/llvm12
- contrib/libs/llvm12/include
- contrib/libs/llvm12/lib/MC
- contrib/libs/llvm12/lib/MC/MCParser
- contrib/libs/llvm12/lib/Support
- contrib/libs/llvm12/lib/Target/X86/MCTargetDesc
- contrib/libs/llvm12/lib/Target/X86/TargetInfo
+ contrib/libs/llvm12
+ contrib/libs/llvm12/include
+ contrib/libs/llvm12/lib/MC
+ contrib/libs/llvm12/lib/MC/MCParser
+ contrib/libs/llvm12/lib/Support
+ contrib/libs/llvm12/lib/Target/X86/MCTargetDesc
+ contrib/libs/llvm12/lib/Target/X86/TargetInfo
)
ADDINCL(
- ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86/AsmParser
+ ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86/AsmParser
)
NO_COMPILER_WARNINGS()
diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4e6d8e8e1a..1d396796e7 100644
--- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -492,7 +492,7 @@ static int readPrefixes(struct InternalInstruction *insn) {
insn->addressSize = (insn->hasAdSize ? 4 : 8);
insn->displacementSize = 4;
insn->immediateSize = 4;
- insn->hasOpSize = false;
+ insn->hasOpSize = false;
} else {
insn->registerSize = (insn->hasOpSize ? 2 : 4);
insn->addressSize = (insn->hasAdSize ? 4 : 8);
@@ -1663,9 +1663,9 @@ namespace X86 {
sib = 504,
sib64 = 505
};
-} // namespace X86
+} // namespace X86
-} // namespace llvm
+} // namespace llvm
static bool translateInstruction(MCInst &target,
InternalInstruction &source,
@@ -1690,7 +1690,7 @@ private:
DisassemblerMode fMode;
};
-} // namespace
+} // namespace
X86GenericDisassembler::X86GenericDisassembler(
const MCSubtargetInfo &STI,
diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make
index b55833692f..07b83642cc 100644
--- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make
@@ -12,17 +12,17 @@ LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
- contrib/libs/llvm12
- contrib/libs/llvm12/include
- contrib/libs/llvm12/lib/MC/MCDisassembler
- contrib/libs/llvm12/lib/Support
- contrib/libs/llvm12/lib/Target/X86/TargetInfo
+ contrib/libs/llvm12
+ contrib/libs/llvm12/include
+ contrib/libs/llvm12/lib/MC/MCDisassembler
+ contrib/libs/llvm12/lib/Support
+ contrib/libs/llvm12/lib/Target/X86/TargetInfo
)
ADDINCL(
- ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86/Disassembler
+ ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86/Disassembler
)
NO_COMPILER_WARNINGS()
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index c685d7e0db..fef6c33a90 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -16,7 +16,7 @@
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
@@ -385,16 +385,16 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- // Do not print the exact form of the memory operand if it references a known
- // binary object.
- if (SymbolizeOperands && MIA) {
- uint64_t Target;
- if (MIA->evaluateBranch(*MI, 0, 0, Target))
- return;
- if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
- return;
- }
-
+ // Do not print the exact form of the memory operand if it references a known
+ // binary object.
+ if (SymbolizeOperands && MIA) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(*MI, 0, 0, Target))
+ return;
+ if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ return;
+ }
+
const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg);
const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg);
const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp);
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index f7a8505712..5926418c57 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -36,7 +36,7 @@ public:
raw_ostream &O);
// Autogenerated by tblgen.
- std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 95012a148d..0b1812c935 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -109,7 +109,7 @@ cl::opt<unsigned> X86PadMaxPrefixSize(
cl::desc("Maximum number of prefixes to use for padding"));
cl::opt<bool> X86PadForAlign(
- "x86-pad-for-align", cl::init(false), cl::Hidden,
+ "x86-pad-for-align", cl::init(false), cl::Hidden,
cl::desc("Pad previous instructions to implement align directives"));
cl::opt<bool> X86PadForBranchAlign(
@@ -207,8 +207,8 @@ public:
void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
- unsigned getMaximumNopSize() const override;
-
+ unsigned getMaximumNopSize() const override;
+
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
@@ -957,9 +957,9 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
if (!X86PadForAlign && !X86PadForBranchAlign)
return;
- // The processed regions are delimited by LabeledFragments. -g may have more
- // MCSymbols and therefore different relaxation results. X86PadForAlign is
- // disabled by default to eliminate the -g vs non -g difference.
+ // The processed regions are delimited by LabeledFragments. -g may have more
+ // MCSymbols and therefore different relaxation results. X86PadForAlign is
+ // disabled by default to eliminate the -g vs non -g difference.
DenseSet<MCFragment *> LabeledFragments;
for (const MCSymbol &S : Asm.symbols())
LabeledFragments.insert(S.getFragment(false));
@@ -1072,21 +1072,21 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm,
}
}
-unsigned X86AsmBackend::getMaximumNopSize() const {
- if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
- return 1;
- if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
- return 7;
- if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
- return 15;
- if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
- return 11;
- // FIXME: handle 32-bit mode
- // 15-bytes is the longest single NOP instruction, but 10-bytes is
- // commonly the longest that can be efficiently decoded.
- return 10;
-}
-
+unsigned X86AsmBackend::getMaximumNopSize() const {
+ if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit))
+ return 1;
+ if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
+ return 7;
+ if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ return 15;
+ if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ return 11;
+ // FIXME: handle 32-bit mode
+ // 15-bytes is the longest single NOP instruction, but 10-bytes is
+ // commonly the longest that can be efficiently decoded.
+ return 10;
+}
+
/// Write a sequence of optimal nops to the output, covering \p Count
/// bytes.
/// \return - true on success, false on failure
@@ -1114,7 +1114,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
"\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00",
};
- uint64_t MaxNopLength = (uint64_t)getMaximumNopSize();
+ uint64_t MaxNopLength = (uint64_t)getMaximumNopSize();
// Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
// length.
@@ -1241,7 +1241,7 @@ namespace CU {
UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
};
-} // namespace CU
+} // namespace CU
class DarwinX86AsmBackend : public X86AsmBackend {
const MCRegisterInfo &MRI;
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 4db1bfc251..fc60496917 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -55,18 +55,18 @@ namespace X86 {
/// The constants to describe instr prefixes if there are
enum IPREFIXES {
IP_NO_PREFIX = 0,
- IP_HAS_OP_SIZE = 1U << 0,
- IP_HAS_AD_SIZE = 1U << 1,
- IP_HAS_REPEAT_NE = 1U << 2,
- IP_HAS_REPEAT = 1U << 3,
- IP_HAS_LOCK = 1U << 4,
- IP_HAS_NOTRACK = 1U << 5,
- IP_USE_VEX = 1U << 6,
- IP_USE_VEX2 = 1U << 7,
- IP_USE_VEX3 = 1U << 8,
- IP_USE_EVEX = 1U << 9,
- IP_USE_DISP8 = 1U << 10,
- IP_USE_DISP32 = 1U << 11,
+ IP_HAS_OP_SIZE = 1U << 0,
+ IP_HAS_AD_SIZE = 1U << 1,
+ IP_HAS_REPEAT_NE = 1U << 2,
+ IP_HAS_REPEAT = 1U << 3,
+ IP_HAS_LOCK = 1U << 4,
+ IP_HAS_NOTRACK = 1U << 5,
+ IP_USE_VEX = 1U << 6,
+ IP_USE_VEX2 = 1U << 7,
+ IP_USE_VEX3 = 1U << 8,
+ IP_USE_EVEX = 1U << 9,
+ IP_USE_DISP8 = 1U << 10,
+ IP_USE_DISP32 = 1U << 11,
};
enum OperandType : unsigned {
@@ -952,11 +952,11 @@ namespace X86II {
// NOTRACK prefix
NoTrackShift = EVEX_RCShift + 1,
- NOTRACK = 1ULL << NoTrackShift,
-
- // Force VEX encoding
- ExplicitVEXShift = NoTrackShift + 1,
- ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
+ NOTRACK = 1ULL << NoTrackShift,
+
+ // Force VEX encoding
+ ExplicitVEXShift = NoTrackShift + 1,
+ ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
};
/// \returns true if the instruction with given opcode is a prefix.
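
The IP_* values above are one-hot bits, so several prefix requests can live in a single instruction-flags word and be tested independently; that is how the {vex}/{evex}/{disp8}/{disp32} pseudo prefixes reach the encoder and printer. A minimal standalone use of the same pattern (bit positions copied from the enum above, names shortened):

#include <cstdint>

constexpr uint32_t USE_EVEX   = 1U << 9;  // IP_USE_EVEX
constexpr uint32_t USE_DISP8  = 1U << 10; // IP_USE_DISP8
constexpr uint32_t USE_DISP32 = 1U << 11; // IP_USE_DISP32

int main() {
  uint32_t Flags = 0;
  Flags |= USE_EVEX;   // e.g. a {evex} pseudo prefix was parsed
  Flags |= USE_DISP8;  // and a {disp8} request
  bool WantsDisp32 = (Flags & USE_DISP32) != 0; // false here
  return WantsDisp32 ? 1 : 0;
}
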
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index fa937d3816..177f8efdf3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -94,12 +94,12 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
"32 bit reloc applied to a field with a different size");
}
-static void checkIs64(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
- if (Type != RT64_64)
- Ctx.reportError(Loc,
- "64 bit reloc applied to a field with a different size");
-}
-
+static void checkIs64(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
+ if (Type != RT64_64)
+ Ctx.reportError(Loc,
+ "64 bit reloc applied to a field with a different size");
+}
+
static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
MCSymbolRefExpr::VariantKind Modifier,
X86_64RelType Type, bool IsPCRel,
@@ -218,9 +218,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
return ELF::R_X86_64_REX_GOTPCRELX;
}
llvm_unreachable("unexpected relocation type!");
- case MCSymbolRefExpr::VK_X86_PLTOFF:
- checkIs64(Ctx, Loc, Type);
- return ELF::R_X86_64_PLTOFF64;
+ case MCSymbolRefExpr::VK_X86_PLTOFF:
+ checkIs64(Ctx, Loc, Type);
+ return ELF::R_X86_64_PLTOFF64;
}
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index d8dbbbbf27..9c6db0fcb7 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -295,10 +295,10 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
/// \see MCInstPrinter::printInst
void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
unsigned OpNo, raw_ostream &O) {
- // Do not print the numeric target address when symbolizing.
- if (SymbolizeOperands)
- return;
-
+ // Do not print the numeric target address when symbolizing.
+ if (SymbolizeOperands)
+ return;
+
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm()) {
if (PrintBranchImmAsAddress) {
@@ -346,21 +346,21 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
O << "\trepne\t";
else if (Flags & X86::IP_HAS_REPEAT)
O << "\trep\t";
-
- // These all require a pseudo prefix
- if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
- O << "\t{vex}";
- else if (Flags & X86::IP_USE_VEX2)
- O << "\t{vex2}";
- else if (Flags & X86::IP_USE_VEX3)
- O << "\t{vex3}";
- else if (Flags & X86::IP_USE_EVEX)
- O << "\t{evex}";
-
- if (Flags & X86::IP_USE_DISP8)
- O << "\t{disp8}";
- else if (Flags & X86::IP_USE_DISP32)
- O << "\t{disp32}";
+
+ // These all require a pseudo prefix
+ if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
+ O << "\t{vex}";
+ else if (Flags & X86::IP_USE_VEX2)
+ O << "\t{vex2}";
+ else if (Flags & X86::IP_USE_VEX3)
+ O << "\t{vex3}";
+ else if (Flags & X86::IP_USE_EVEX)
+ O << "\t{evex}";
+
+ if (Flags & X86::IP_USE_DISP8)
+ O << "\t{disp8}";
+ else if (Flags & X86::IP_USE_DISP32)
+ O << "\t{disp32}";
}
void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo,
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index d5b205ad9a..371f3a223a 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -16,7 +16,7 @@
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -343,15 +343,15 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- // Do not print the exact form of the memory operand if it references a known
- // binary object.
- if (SymbolizeOperands && MIA) {
- uint64_t Target;
- if (MIA->evaluateBranch(*MI, 0, 0, Target))
- return;
- if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
- return;
- }
+ // Do not print the exact form of the memory operand if it references a known
+ // binary object.
+ if (SymbolizeOperands && MIA) {
+ uint64_t Target;
+ if (MIA->evaluateBranch(*MI, 0, 0, Target))
+ return;
+ if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0))
+ return;
+ }
const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index aa4d0545ea..48ee4fbfcf 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -37,7 +37,7 @@ public:
raw_ostream &O);
// Autogenerated by tblgen.
- std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
+ std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 260253a530..d1e4d3bd4f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -93,8 +93,8 @@ private:
bool emitOpcodePrefix(int MemOperand, const MCInst &MI,
const MCSubtargetInfo &STI, raw_ostream &OS) const;
- bool emitREXPrefix(int MemOperand, const MCInst &MI,
- const MCSubtargetInfo &STI, raw_ostream &OS) const;
+ bool emitREXPrefix(int MemOperand, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
};
} // end anonymous namespace
@@ -114,28 +114,28 @@ static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) {
}
}
-/// Determine if this immediate can fit in a disp8 or a compressed disp8 for
-/// EVEX instructions. \p ImmOffset will be set to the value to pass to the ImmOffset
-/// parameter of emitImmediate.
-static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
- bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
+/// Determine if this immediate can fit in a disp8 or a compressed disp8 for
+/// EVEX instructions. \p ImmOffset will be set to the value to pass to the ImmOffset
+/// parameter of emitImmediate.
+static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) {
+ bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
- int CD8_Scale =
+ int CD8_Scale =
(TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
- if (!HasEVEX || CD8_Scale == 0)
- return isInt<8>(Value);
+ if (!HasEVEX || CD8_Scale == 0)
+ return isInt<8>(Value);
- assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!");
- if (Value & (CD8_Scale - 1)) // Unaligned offset
+ assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!");
+ if (Value & (CD8_Scale - 1)) // Unaligned offset
return false;
- int CDisp8 = Value / CD8_Scale;
- if (!isInt<8>(CDisp8))
- return false;
-
- // ImmOffset will be added to Value in emitImmediate leaving just CDisp8.
- ImmOffset = CDisp8 - Value;
- return true;
+ int CDisp8 = Value / CD8_Scale;
+ if (!isInt<8>(CDisp8))
+ return false;
+
+ // ImmOffset will be added to Value in emitImmediate leaving just CDisp8.
+ ImmOffset = CDisp8 - Value;
+ return true;
}
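
The compressed-disp8 rule above can be read in isolation; here is a sketch with the CD8 scale passed in directly instead of decoded from TSFlags, which is the only liberty taken with the logic:

    #include <cassert>
    #include <cstdint>

    // Returns true if Value can be encoded as a (possibly compressed) disp8.
    // CD8Scale == 0 means a non-EVEX instruction, i.e. a plain disp8 check.
    static bool fitsInCDisp8(int Value, int CD8Scale, int &ImmOffset) {
      if (CD8Scale == 0)
        return Value >= -128 && Value <= 127;
      assert((CD8Scale & (CD8Scale - 1)) == 0 && "scale must be a power of two");
      if (Value % CD8Scale != 0)          // unaligned offsets cannot be compressed
        return false;
      int CDisp8 = Value / CD8Scale;
      if (CDisp8 < -128 || CDisp8 > 127)
        return false;
      // The emitter adds ImmOffset to Value, leaving just the compressed byte.
      ImmOffset = CDisp8 - Value;
      return true;
    }

    // e.g. a 64-byte-scaled EVEX load with displacement 256 compresses to 4:
    // int Off = 0; fitsInCDisp8(256, 64, Off); // true, Off == -252
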
/// \returns the appropriate fixup kind to use for an immediate in an
@@ -160,18 +160,18 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
/// \returns true if the specified instruction has a 16-bit memory operand.
static bool is16BitMemOperand(const MCInst &MI, unsigned Op,
const MCSubtargetInfo &STI) {
- const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
- const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg);
- unsigned BaseReg = Base.getReg();
- unsigned IndexReg = Index.getReg();
-
- if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0)
+ unsigned BaseReg = Base.getReg();
+ unsigned IndexReg = Index.getReg();
+
+ if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0)
return true;
- if ((BaseReg != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) ||
- (IndexReg != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)))
+ if ((BaseReg != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) ||
+ (IndexReg != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)))
return true;
return false;
}
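
The same test, sketched standalone with register names standing in for LLVM's GR16 register class (the name set below is illustrative, not the full class):

    #include <set>
    #include <string>

    // An address uses 16-bit addressing if we are in 16-bit mode and it has no
    // registers at all, or if any register it does use is a 16-bit GPR.
    static bool is16BitAddress(bool InMode16Bit, const std::string &BaseReg,
                               const std::string &IndexReg) {
      static const std::set<std::string> GR16 = {"ax", "bx", "cx", "dx",
                                                 "si", "di", "bp", "sp"};
      if (InMode16Bit && BaseReg.empty() && IndexReg.empty())
        return true;
      return (!BaseReg.empty() && GR16.count(BaseReg)) ||
             (!IndexReg.empty() && GR16.count(IndexReg));
    }
    // is16BitAddress(false, "bx", "si") == true
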
@@ -398,33 +398,33 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
emitByte(modRMByte(0, RegOpcodeField, 5), OS);
unsigned Opcode = MI.getOpcode();
- unsigned FixupKind = [&]() {
- // Enable relaxed relocation only for a MCSymbolRefExpr. We cannot use a
- // relaxed relocation if an offset is present (e.g. x@GOTPCREL+4).
- if (!(Disp.isExpr() && isa<MCSymbolRefExpr>(Disp.getExpr())))
- return X86::reloc_riprel_4byte;
-
- // Certain loads for GOT references can be relocated against the symbol
- // directly if the symbol ends up in the same linkage unit.
+ unsigned FixupKind = [&]() {
+ // Enable relaxed relocation only for a MCSymbolRefExpr. We cannot use a
+ // relaxed relocation if an offset is present (e.g. x@GOTPCREL+4).
+ if (!(Disp.isExpr() && isa<MCSymbolRefExpr>(Disp.getExpr())))
+ return X86::reloc_riprel_4byte;
+
+ // Certain loads for GOT references can be relocated against the symbol
+ // directly if the symbol ends up in the same linkage unit.
switch (Opcode) {
default:
return X86::reloc_riprel_4byte;
case X86::MOV64rm:
- // movq loads are a subset of reloc_riprel_4byte_relax_rex. It is a
- // special case because COFF and Mach-O don't support ELF's more
- // flexible R_X86_64_REX_GOTPCRELX relaxation.
+ // movq loads are a subset of reloc_riprel_4byte_relax_rex. It is a
+ // special case because COFF and Mach-O don't support ELF's more
+ // flexible R_X86_64_REX_GOTPCRELX relaxation.
assert(HasREX);
return X86::reloc_riprel_4byte_movq_load;
- case X86::ADC32rm:
- case X86::ADD32rm:
- case X86::AND32rm:
- case X86::CMP32rm:
- case X86::MOV32rm:
- case X86::OR32rm:
- case X86::SBB32rm:
- case X86::SUB32rm:
- case X86::TEST32mr:
- case X86::XOR32rm:
+ case X86::ADC32rm:
+ case X86::ADD32rm:
+ case X86::AND32rm:
+ case X86::CMP32rm:
+ case X86::MOV32rm:
+ case X86::OR32rm:
+ case X86::SBB32rm:
+ case X86::SUB32rm:
+ case X86::TEST32mr:
+ case X86::XOR32rm:
case X86::CALL64m:
case X86::JMP64m:
case X86::TAILJMPm64:
@@ -497,7 +497,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1);
}
- if (Disp.isImm() && isInt<8>(Disp.getImm())) {
+ if (Disp.isImm() && isInt<8>(Disp.getImm())) {
if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
emitByte(modRMByte(0, RegOpcodeField, RMfield), OS);
@@ -511,7 +511,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// This is the [REG]+disp16 case.
emitByte(modRMByte(2, RegOpcodeField, RMfield), OS);
} else {
- assert(IndexReg.getReg() == 0 && "Unexpected index register!");
+ assert(IndexReg.getReg() == 0 && "Unexpected index register!");
// There is no BaseReg; this is the plain [disp16] case.
emitByte(modRMByte(0, RegOpcodeField, 6), OS);
}
@@ -521,18 +521,18 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return;
}
- // Check for presence of {disp8} or {disp32} pseudo prefixes.
- bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8;
- bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32;
-
- // We only allow no displacement if no pseudo prefix is present.
- bool AllowNoDisp = !UseDisp8 && !UseDisp32;
- // Disp8 is allowed unless the {disp32} prefix is present.
- bool AllowDisp8 = !UseDisp32;
-
+ // Check for presence of {disp8} or {disp32} pseudo prefixes.
+ bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8;
+ bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32;
+
+ // We only allow no displacement if no pseudo prefix is present.
+ bool AllowNoDisp = !UseDisp8 && !UseDisp32;
+ // Disp8 is allowed unless the {disp32} prefix is present.
+ bool AllowDisp8 = !UseDisp32;
+
// Determine whether a SIB byte is needed.
- if (// The SIB byte must be used if there is an index register or the
- // encoding requires a SIB byte.
+ if (// The SIB byte must be used if there is an index register or the
+ // encoding requires a SIB byte.
!ForceSIB && IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
@@ -548,12 +548,12 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return;
}
- // If the base is not EBP/ESP/R12/R13 and there is no displacement, use
- // simple indirect register encoding, this handles addresses like [EAX].
- // The encoding for [EBP] or [R13] with no displacement means [disp32] so we
- // handle it by emitting a displacement of 0 later.
+ // If the base is not EBP/ESP/R12/R13 and there is no displacement, use
+ // simple indirect register encoding, this handles addresses like [EAX].
+ // The encoding for [EBP] or [R13] with no displacement means [disp32] so we
+ // handle it by emitting a displacement of 0 later.
if (BaseRegNo != N86::EBP) {
- if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) {
+ if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) {
emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
return;
}
@@ -572,22 +572,22 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
}
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
- // Including a compressed disp8 for EVEX instructions that support it.
- // This also handles the 0 displacement for [EBP] or [R13]. We can't use
- // disp8 if the {disp32} pseudo prefix is present.
- if (Disp.isImm() && AllowDisp8) {
- int ImmOffset = 0;
- if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
+ // Including a compressed disp8 for EVEX instructions that support it.
+ // This also handles the 0 displacement for [EBP] or [R13]. We can't use
+ // disp8 if the {disp32} pseudo prefix is present.
+ if (Disp.isImm() && AllowDisp8) {
+ int ImmOffset = 0;
+ if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
- ImmOffset);
+ ImmOffset);
return;
}
}
- // Otherwise, emit the most general non-SIB encoding: [REG+disp32].
- // Displacement may be 0 for [EBP] or [R13] case if {disp32} pseudo prefix
- // prevented using disp8 above.
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32].
+ // Displacement may be 0 for [EBP] or [R13] case if {disp32} pseudo prefix
+ // prevented using disp8 above.
emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS);
unsigned Opcode = MI.getOpcode();
unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
@@ -607,43 +607,43 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
- BaseRegNo = 5;
+ BaseRegNo = 5;
emitByte(modRMByte(0, RegOpcodeField, 4), OS);
ForceDisp32 = true;
- } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp &&
- // Base reg can't be EBP/RBP/R13 as that would end up with '5' as
- // the base field, but that is the magic [*] nomenclature that
- // indicates no base when mod=0. For these cases we'll emit a 0
- // displacement instead.
+ } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp &&
+ // Base reg can't be EBP/RBP/R13 as that would end up with '5' as
+ // the base field, but that is the magic [*] nomenclature that
+ // indicates no base when mod=0. For these cases we'll emit a 0
+ // displacement instead.
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
emitByte(modRMByte(0, RegOpcodeField, 4), OS);
- } else if (Disp.isImm() && AllowDisp8 &&
- isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
- // Displacement fits in a byte or matches an EVEX compressed disp8, use
- // disp8 encoding. This also handles EBP/R13 base with 0 displacement unless
- // {disp32} pseudo prefix was used.
+ } else if (Disp.isImm() && AllowDisp8 &&
+ isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) {
+ // Displacement fits in a byte or matches an EVEX compressed disp8, use
+ // disp8 encoding. This also handles EBP/R13 base with 0 displacement unless
+ // {disp32} pseudo prefix was used.
emitByte(modRMByte(1, RegOpcodeField, 4), OS);
- ForceDisp8 = true;
+ ForceDisp8 = true;
} else {
- // Otherwise, emit the normal disp32 encoding.
+ // Otherwise, emit the normal disp32 encoding.
emitByte(modRMByte(2, RegOpcodeField, 4), OS);
- ForceDisp32 = true;
+ ForceDisp32 = true;
}
// Calculate what the SS field value should be...
static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3};
unsigned SS = SSTable[Scale.getImm()];
- unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4;
-
- emitSIBByte(SS, IndexRegNo, BaseRegNo, OS);
+ unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4;
+ emitSIBByte(SS, IndexRegNo, BaseRegNo, OS);
+
// Do we need to output a displacement?
if (ForceDisp8)
emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
ImmOffset);
- else if (ForceDisp32)
+ else if (ForceDisp32)
emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
StartByte, OS, Fixups);
}
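
Pulling the displacement-size decisions of the preceding hunks together: for a [base + disp] address with no index whose base is not ESP/EBP/R12/R13, the mod field follows a small table that honors the {disp8}/{disp32} pseudo prefixes. A simplified sketch (EVEX disp8 compression folded into the plain 8-bit range check):

    #include <cstdint>
    #include <utility>

    // Returns {mod value, displacement width in bytes}.
    static std::pair<unsigned, unsigned>
    pickModAndDispSize(int32_t Disp, bool UseDisp8, bool UseDisp32) {
      bool AllowNoDisp = !UseDisp8 && !UseDisp32; // any pseudo prefix forces a disp
      bool AllowDisp8  = !UseDisp32;              // {disp32} rules out disp8

      if (Disp == 0 && AllowNoDisp)
        return {0, 0};                            // [reg], mod=0, no displacement
      if (AllowDisp8 && Disp >= -128 && Disp <= 127)
        return {1, 1};                            // [reg+disp8], mod=1
      return {2, 4};                              // [reg+disp32], mod=2
    }

    // pickModAndDispSize(0, /*UseDisp8=*/false, /*UseDisp32=*/true) yields {2, 4}
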
@@ -1201,7 +1201,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
///
/// \returns true if REX prefix is used, otherwise returns false.
bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
- const MCSubtargetInfo &STI,
+ const MCSubtargetInfo &STI,
raw_ostream &OS) const {
uint8_t REX = [&, MemOperand]() {
uint8_t REX = 0;
@@ -1222,28 +1222,28 @@ bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
for (unsigned i = CurOp; i != NumOps; ++i) {
const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg()) {
- unsigned Reg = MO.getReg();
- if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH ||
- Reg == X86::DH)
- UsesHighByteReg = true;
- if (X86II::isX86_64NonExtLowByteReg(Reg))
- // FIXME: The caller of determineREXPrefix slaps this prefix onto
- // anything that returns non-zero.
- REX |= 0x40; // REX fixed encoding prefix
- } else if (MO.isExpr() &&
- STI.getTargetTriple().getEnvironment() == Triple::GNUX32) {
- // GOTTPOFF and TLSDESC relocations require a REX prefix to allow
- // linker optimizations: even if the instructions we see may not require
- // any prefix, they may be replaced by instructions that do. This is
- // handled as a special case here so that it also works for hand-written
- // assembly without the user needing to write REX, as with GNU as.
- const auto *Ref = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
- if (Ref && (Ref->getKind() == MCSymbolRefExpr::VK_GOTTPOFF ||
- Ref->getKind() == MCSymbolRefExpr::VK_TLSDESC)) {
- REX |= 0x40; // REX fixed encoding prefix
- }
- }
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH ||
+ Reg == X86::DH)
+ UsesHighByteReg = true;
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ // FIXME: The caller of determineREXPrefix slaps this prefix onto
+ // anything that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ } else if (MO.isExpr() &&
+ STI.getTargetTriple().getEnvironment() == Triple::GNUX32) {
+ // GOTTPOFF and TLSDESC relocations require a REX prefix to allow
+ // linker optimizations: even if the instructions we see may not require
+ // any prefix, they may be replaced by instructions that do. This is
+ // handled as a special case here so that it also works for hand-written
+ // assembly without the user needing to write REX, as with GNU as.
+ const auto *Ref = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ if (Ref && (Ref->getKind() == MCSymbolRefExpr::VK_GOTTPOFF ||
+ Ref->getKind() == MCSymbolRefExpr::VK_TLSDESC)) {
+ REX |= 0x40; // REX fixed encoding prefix
+ }
+ }
}
switch (TSFlags & X86II::FormMask) {
@@ -1366,7 +1366,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) &&
"REX.W requires 64bit mode.");
bool HasREX = STI.hasFeature(X86::Mode64Bit)
- ? emitREXPrefix(MemOperand, MI, STI, OS)
+ ? emitREXPrefix(MemOperand, MI, STI, OS)
: false;
// 0x0F escape code must be emitted just before the opcode.
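
For context on emitREXPrefix: a REX prefix always has the fixed 0100WRXB layout, and the bare 0x40 ORed in above for SPL/BPL/SIL/DIL (and for GOTTPOFF/TLSDESC on x32) is that layout with all four bits clear. A one-line helper that assembles the byte:

    #include <cstdint>

    // 0100WRXB: W = 64-bit operand size, R/X/B = high bit of reg/index/base.
    static uint8_t makeREX(bool W, bool R, bool X, bool B) {
      return 0x40 | (uint8_t(W) << 3) | (uint8_t(R) << 2) |
             (uint8_t(X) << 1) | uint8_t(B);
    }
    // makeREX(true, false, false, false) == 0x48, the REX.W prefix on e.g. movq
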
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5cf8d77519..7214b80941 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -44,10 +44,10 @@ using namespace llvm;
std::string X86_MC::ParseX86Triple(const Triple &TT) {
std::string FS;
- // SSE2 should default to enabled in 64-bit mode, but can be turned off
- // explicitly.
- if (TT.isArch64Bit())
- FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2";
+ // SSE2 should default to enabled in 64-bit mode, but can be turned off
+ // explicitly.
+ if (TT.isArch64Bit())
+ FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2";
else if (TT.getEnvironment() != Triple::CODE16)
FS = "-64bit-mode,+32bit-mode,-16bit-mode";
else
@@ -292,10 +292,10 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (!FS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
- if (CPU.empty())
- CPU = "generic";
+ if (CPU.empty())
+ CPU = "generic";
- return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
+ return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
}
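
The mode selection in ParseX86Triple reduces to a three-way choice on the triple; a sketch covering the two branches visible in the hunk above (the CODE16 branch is truncated there, so it is left empty here rather than guessed):

    #include <string>

    static std::string modeFeatures(bool Is64Bit, bool IsCode16) {
      if (Is64Bit)
        return "+64bit-mode,-32bit-mode,-16bit-mode,+sse2";
      if (!IsCode16)
        return "-64bit-mode,+32bit-mode,-16bit-mode";
      return {}; // 16-bit-mode string elided in the hunk above
    }
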
static MCInstrInfo *createX86MCInstrInfo() {
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 35604cd3ec..69fc238074 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -85,11 +85,11 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T,
/// Implements X86-only directives for assembly emission.
MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
- MCInstPrinter *InstPrinter,
- bool IsVerboseAsm);
+ MCInstPrinter *InstPrinter,
+ bool IsVerboseAsm);
/// Implements X86-only directives for object files.
-MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S,
+MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S,
const MCSubtargetInfo &STI);
/// Construct an X86 Windows COFF machine code streamer which will generate
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index b98e58d653..40fc8527c3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -68,7 +68,7 @@ public:
FixedValue);
}
};
-} // namespace
+} // namespace
static bool isFixupKindRIPRel(unsigned Kind) {
return Kind == X86::reloc_riprel_4byte ||
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
index 201b22d623..7e3f6d8335 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
@@ -568,4 +568,4 @@ void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
}
}
-} // namespace llvm
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index c292112461..72bb41e94b 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -26,7 +26,7 @@ public:
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
void EmitWinEHHandlerData(SMLoc Loc) override;
- void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
+ void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override;
void EmitWindowsUnwindTables() override;
void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
void finishImpl() override;
@@ -38,13 +38,13 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section.
if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo())
- EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true);
-}
-
-void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
- EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
+ EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true);
}
+void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) {
+ EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false);
+}
+
void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
if (!getNumWinFrameInfos())
return;
@@ -63,7 +63,7 @@ void X86WinCOFFStreamer::finishImpl() {
MCWinCOFFStreamer::finishImpl();
}
-} // namespace
+} // namespace
MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
std::unique_ptr<MCAsmBackend> &&AB,
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make
index 8da0d02f5b..77b54a6412 100644
--- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make
@@ -12,19 +12,19 @@ LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
- contrib/libs/llvm12
- contrib/libs/llvm12/include
- contrib/libs/llvm12/lib/BinaryFormat
- contrib/libs/llvm12/lib/MC
- contrib/libs/llvm12/lib/MC/MCDisassembler
- contrib/libs/llvm12/lib/Support
- contrib/libs/llvm12/lib/Target/X86/TargetInfo
+ contrib/libs/llvm12
+ contrib/libs/llvm12/include
+ contrib/libs/llvm12/lib/BinaryFormat
+ contrib/libs/llvm12/lib/MC
+ contrib/libs/llvm12/lib/MC/MCDisassembler
+ contrib/libs/llvm12/lib/Support
+ contrib/libs/llvm12/lib/Target/X86/TargetInfo
)
ADDINCL(
- ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86/MCTargetDesc
+ ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86/MCTargetDesc
)
NO_COMPILER_WARNINGS()
diff --git a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make
index 2f30db941e..b21991ca46 100644
--- a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make
@@ -12,13 +12,13 @@ LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
- contrib/libs/llvm12
- contrib/libs/llvm12/lib/Support
+ contrib/libs/llvm12
+ contrib/libs/llvm12/lib/Support
)
ADDINCL(
- contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86/TargetInfo
+ contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86/TargetInfo
)
NO_COMPILER_WARNINGS()
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86.h b/contrib/libs/llvm12/lib/Target/X86/X86.h
index e17b9ba550..f5a9baefa2 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86.h
@@ -76,10 +76,10 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
-FunctionPass *createX86TileConfigPass();
-
-FunctionPass *createX86PreTileConfigPass();
-
+FunctionPass *createX86TileConfigPass();
+
+FunctionPass *createX86PreTileConfigPass();
+
/// Return a pass that inserts int3 at the end of the function if it ends with a
/// CALL instruction. The pass does the same for each funclet as well. This
/// ensures that the open interval of function start and end PCs contains all
@@ -166,9 +166,9 @@ void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
-void initializeX86PreTileConfigPass(PassRegistry &);
-void initializeX86TileConfigPass(PassRegistry &);
-void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
+void initializeX86PreTileConfigPass(PassRegistry &);
+void initializeX86TileConfigPass(PassRegistry &);
+void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86.td b/contrib/libs/llvm12/lib/Target/X86/X86.td
index c492d686c5..d17c7f4f9b 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86.td
@@ -171,9 +171,9 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
-def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
- "Support AVX_VNNI encoding",
- [FeatureAVX2]>;
+def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
+ "Support AVX_VNNI encoding",
+ [FeatureAVX2]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
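
Each SubtargetFeature def above carries an implied-feature list (e.g. avxvnni implies avx2, avx512bf16 implies avx512bw). Resolving such implications is a transitive closure; a small sketch over a toy feature graph, not the real X86 table:

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    // Enables F and, recursively, everything F implies.
    static void enableWithImplied(
        const std::string &F,
        const std::map<std::string, std::vector<std::string>> &Implies,
        std::set<std::string> &Enabled) {
      if (!Enabled.insert(F).second)
        return; // already enabled; also guards against cycles
      auto It = Implies.find(F);
      if (It != Implies.end())
        for (const std::string &Dep : It->second)
          enableWithImplied(Dep, Implies, Enabled);
    }

    // enableWithImplied("avxvnni", {{"avxvnni", {"avx2"}}, {"avx2", {"avx"}}}, S)
    // leaves S == {"avx", "avx2", "avxvnni"}.
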
@@ -237,8 +237,8 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
"Support RDSEED instruction">;
-def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true",
- "Support LAHF and SAHF instructions in 64-bit mode">;
+def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true",
+ "Support LAHF and SAHF instructions in 64-bit mode">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
@@ -282,20 +282,20 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
"Has ENQCMD instructions">;
-def FeatureKL : SubtargetFeature<"kl", "HasKL", "true",
- "Support Key Locker kl Instructions",
- [FeatureSSE2]>;
-def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true",
- "Support Key Locker wide Instructions",
- [FeatureKL]>;
-def FeatureHRESET : SubtargetFeature<"hreset", "HasHRESET", "true",
- "Has hreset instruction">;
+def FeatureKL : SubtargetFeature<"kl", "HasKL", "true",
+ "Support Key Locker kl Instructions",
+ [FeatureSSE2]>;
+def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true",
+ "Support Key Locker wide Instructions",
+ [FeatureKL]>;
+def FeatureHRESET : SubtargetFeature<"hreset", "HasHRESET", "true",
+ "Has hreset instruction">;
def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true",
"Has serialize instruction">;
def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true",
"Support TSXLDTRK instructions">;
-def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true",
- "Has UINTR Instructions">;
+def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true",
+ "Has UINTR Instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -385,12 +385,12 @@ def FeatureERMSB
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
-// Icelake and newer processors have Fast Short REP MOV.
-def FeatureFSRM
- : SubtargetFeature<
- "fsrm", "HasFSRM", "true",
- "REP MOVSB of short lengths is faster">;
-
+// Icelake and newer processors have Fast Short REP MOV.
+def FeatureFSRM
+ : SubtargetFeature<
+ "fsrm", "HasFSRM", "true",
+ "REP MOVSB of short lengths is faster">;
+
// Bulldozer and newer processors can merge CMP/TEST (but not other
// instructions) with conditional branches.
def FeatureBranchFusion
@@ -565,59 +565,59 @@ include "X86SchedSkylakeServer.td"
//===----------------------------------------------------------------------===//
def ProcessorFeatures {
- // x86-64 and x86-64-v[234]
- list<SubtargetFeature> X86_64V1Features = [
- FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, Feature64Bit
- ];
- list<SubtargetFeature> X86_64V2Features = !listconcat(
- X86_64V1Features,
- [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]);
- list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
- FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT,
- FeatureMOVBE, FeatureXSAVE
- ]);
- list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
- FeatureBWI,
- FeatureCDI,
- FeatureDQI,
- FeatureVLX,
- ]);
-
+ // x86-64 and x86-64-v[234]
+ list<SubtargetFeature> X86_64V1Features = [
+ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, Feature64Bit
+ ];
+ list<SubtargetFeature> X86_64V2Features = !listconcat(
+ X86_64V1Features,
+ [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]);
+ list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [
+ FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT,
+ FeatureMOVBE, FeatureXSAVE
+ ]);
+ list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
+ FeatureBWI,
+ FeatureCDI,
+ FeatureDQI,
+ FeatureVLX,
+ ]);
+
// Nehalem
- list<SubtargetFeature> NHMFeatures = X86_64V2Features;
- list<SubtargetFeature> NHMTuning = [FeatureMacroFusion,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> NHMFeatures = X86_64V2Features;
+ list<SubtargetFeature> NHMTuning = [FeatureMacroFusion,
+ FeatureInsertVZEROUPPER];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
- list<SubtargetFeature> WSMTuning = NHMTuning;
+ list<SubtargetFeature> WSMTuning = NHMTuning;
list<SubtargetFeature> WSMFeatures =
- !listconcat(NHMFeatures, WSMAdditionalFeatures);
+ !listconcat(NHMFeatures, WSMAdditionalFeatures);
// Sandybridge
list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX,
FeatureXSAVE,
- FeatureXSAVEOPT];
- list<SubtargetFeature> SNBTuning = [FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureSlowUAMem32,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ FeatureXSAVEOPT];
+ list<SubtargetFeature> SNBTuning = [FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowUAMem32,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SNBFeatures =
- !listconcat(WSMFeatures, SNBAdditionalFeatures);
+ !listconcat(WSMFeatures, SNBAdditionalFeatures);
// Ivybridge
list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND,
FeatureF16C,
FeatureFSGSBase];
- list<SubtargetFeature> IVBTuning = SNBTuning;
+ list<SubtargetFeature> IVBTuning = SNBTuning;
list<SubtargetFeature> IVBFeatures =
- !listconcat(SNBFeatures, IVBAdditionalFeatures);
+ !listconcat(SNBFeatures, IVBAdditionalFeatures);
// Haswell
list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2,
@@ -627,86 +627,86 @@ def ProcessorFeatures {
FeatureFMA,
FeatureINVPCID,
FeatureLZCNT,
- FeatureMOVBE];
- list<SubtargetFeature> HSWTuning = [FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
- FeaturePOPCNTFalseDeps,
- FeatureLZCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ FeatureMOVBE];
+ list<SubtargetFeature> HSWTuning = [FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePOPCNTFalseDeps,
+ FeatureLZCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> HSWFeatures =
- !listconcat(IVBFeatures, HSWAdditionalFeatures);
+ !listconcat(IVBFeatures, HSWAdditionalFeatures);
// Broadwell
list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX,
FeatureRDSEED,
FeaturePRFCHW];
- list<SubtargetFeature> BDWTuning = HSWTuning;
+ list<SubtargetFeature> BDWTuning = HSWTuning;
list<SubtargetFeature> BDWFeatures =
- !listconcat(HSWFeatures, BDWAdditionalFeatures);
+ !listconcat(HSWFeatures, BDWAdditionalFeatures);
// Skylake
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
- FeatureSGX];
- list<SubtargetFeature> SKLTuning = [FeatureHasFastGather,
- FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastVectorFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ FeatureSGX];
+ list<SubtargetFeature> SKLTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKLFeatures =
- !listconcat(BDWFeatures, SKLAdditionalFeatures);
+ !listconcat(BDWFeatures, SKLAdditionalFeatures);
// Skylake-AVX512
- list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAES,
- FeatureXSAVEC,
- FeatureXSAVES,
- FeatureCLFLUSHOPT,
- FeatureAVX512,
+ list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAES,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT,
+ FeatureAVX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
FeatureVLX,
FeaturePKU,
FeatureCLWB];
- list<SubtargetFeature> SKXTuning = [FeatureHasFastGather,
- FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastVectorFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
- FeaturePrefer256Bit,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SKXTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePrefer256Bit,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SKXFeatures =
- !listconcat(BDWFeatures, SKXAdditionalFeatures);
+ !listconcat(BDWFeatures, SKXAdditionalFeatures);
// Cascadelake
list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI];
- list<SubtargetFeature> CLXTuning = SKXTuning;
+ list<SubtargetFeature> CLXTuning = SKXTuning;
list<SubtargetFeature> CLXFeatures =
- !listconcat(SKXFeatures, CLXAdditionalFeatures);
+ !listconcat(SKXFeatures, CLXAdditionalFeatures);
// Cooperlake
list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16];
- list<SubtargetFeature> CPXTuning = SKXTuning;
+ list<SubtargetFeature> CPXTuning = SKXTuning;
list<SubtargetFeature> CPXFeatures =
- !listconcat(CLXFeatures, CPXAdditionalFeatures);
+ !listconcat(CLXFeatures, CPXAdditionalFeatures);
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
@@ -717,20 +717,20 @@ def ProcessorFeatures {
FeaturePKU,
FeatureVBMI,
FeatureIFMA,
- FeatureSHA];
- list<SubtargetFeature> CNLTuning = [FeatureHasFastGather,
- FeatureMacroFusion,
- FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureFastScalarFSQRT,
- FeatureFastVectorFSQRT,
- FeatureFastSHLDRotate,
- FeatureFast15ByteNOP,
- FeatureFastVariableShuffle,
- FeaturePrefer256Bit,
- FeatureInsertVZEROUPPER];
+ FeatureSHA];
+ list<SubtargetFeature> CNLTuning = [FeatureHasFastGather,
+ FeatureMacroFusion,
+ FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureFastScalarFSQRT,
+ FeatureFastVectorFSQRT,
+ FeatureFastSHLDRotate,
+ FeatureFast15ByteNOP,
+ FeatureFastVariableShuffle,
+ FeaturePrefer256Bit,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> CNLFeatures =
- !listconcat(SKLFeatures, CNLAdditionalFeatures);
+ !listconcat(SKLFeatures, CNLAdditionalFeatures);
// Icelake
list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG,
@@ -741,81 +741,81 @@ def ProcessorFeatures {
FeatureVPOPCNTDQ,
FeatureGFNI,
FeatureCLWB,
- FeatureRDPID,
- FeatureFSRM];
- list<SubtargetFeature> ICLTuning = CNLTuning;
+ FeatureRDPID,
+ FeatureFSRM];
+ list<SubtargetFeature> ICLTuning = CNLTuning;
list<SubtargetFeature> ICLFeatures =
- !listconcat(CNLFeatures, ICLAdditionalFeatures);
+ !listconcat(CNLFeatures, ICLAdditionalFeatures);
// Icelake Server
- list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
- FeatureWBNOINVD];
- list<SubtargetFeature> ICXTuning = CNLTuning;
+ list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG,
+ FeatureWBNOINVD];
+ list<SubtargetFeature> ICXTuning = CNLTuning;
list<SubtargetFeature> ICXFeatures =
- !listconcat(ICLFeatures, ICXAdditionalFeatures);
+ !listconcat(ICLFeatures, ICXAdditionalFeatures);
//Tigerlake
list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureSHSTK];
- list<SubtargetFeature> TGLTuning = CNLTuning;
+ list<SubtargetFeature> TGLTuning = CNLTuning;
list<SubtargetFeature> TGLFeatures =
- !listconcat(ICLFeatures, TGLAdditionalFeatures);
-
- //Sapphirerapids
- list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE,
- FeatureAMXINT8,
- FeatureAMXBF16,
- FeatureBF16,
- FeatureSERIALIZE,
- FeatureCLDEMOTE,
- FeatureWAITPKG,
- FeaturePTWRITE,
- FeatureAVXVNNI,
- FeatureTSXLDTRK,
- FeatureENQCMD,
- FeatureSHSTK,
- FeatureVP2INTERSECT,
- FeatureMOVDIRI,
- FeatureMOVDIR64B,
- FeatureUINTR];
- list<SubtargetFeature> SPRTuning = ICXTuning;
- list<SubtargetFeature> SPRFeatures =
- !listconcat(ICXFeatures, SPRAdditionalFeatures);
-
- // Alderlake
- list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
- FeatureCLDEMOTE,
- FeatureHRESET,
- FeaturePTWRITE,
- FeatureSERIALIZE,
- FeatureWAITPKG];
- list<SubtargetFeature> ADLTuning = SKLTuning;
- list<SubtargetFeature> ADLFeatures =
- !listconcat(SKLFeatures, ADLAdditionalFeatures);
-
+ !listconcat(ICLFeatures, TGLAdditionalFeatures);
+
+ //Sapphirerapids
+ list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE,
+ FeatureAMXINT8,
+ FeatureAMXBF16,
+ FeatureBF16,
+ FeatureSERIALIZE,
+ FeatureCLDEMOTE,
+ FeatureWAITPKG,
+ FeaturePTWRITE,
+ FeatureAVXVNNI,
+ FeatureTSXLDTRK,
+ FeatureENQCMD,
+ FeatureSHSTK,
+ FeatureVP2INTERSECT,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureUINTR];
+ list<SubtargetFeature> SPRTuning = ICXTuning;
+ list<SubtargetFeature> SPRFeatures =
+ !listconcat(ICXFeatures, SPRAdditionalFeatures);
+
+ // Alderlake
+ list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
+ FeatureCLDEMOTE,
+ FeatureHRESET,
+ FeaturePTWRITE,
+ FeatureSERIALIZE,
+ FeatureWAITPKG];
+ list<SubtargetFeature> ADLTuning = SKLTuning;
+ list<SubtargetFeature> ADLFeatures =
+ !listconcat(SKLFeatures, ADLAdditionalFeatures);
+
// Atom
- list<SubtargetFeature> AtomFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureMOVBE,
- FeatureLAHFSAHF];
- list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
- FeatureSlowUAMem16,
- FeatureLEAForSP,
- FeatureSlowDivide32,
- FeatureSlowDivide64,
- FeatureSlowTwoMemOps,
- FeatureLEAUsesAG,
- FeaturePadShortFunctions,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> AtomFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> AtomTuning = [ProcIntelAtom,
+ FeatureSlowUAMem16,
+ FeatureLEAForSP,
+ FeatureSlowDivide32,
+ FeatureSlowDivide64,
+ FeatureSlowTwoMemOps,
+ FeatureLEAUsesAG,
+ FeaturePadShortFunctions,
+ FeatureInsertVZEROUPPER];
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
@@ -823,17 +823,17 @@ def ProcessorFeatures {
FeaturePCLMUL,
FeaturePRFCHW,
FeatureRDRAND];
- list<SubtargetFeature> SLMTuning = [ProcIntelSLM,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureSlowDivide64,
- FeatureSlowPMULLD,
- FeatureFast7ByteNOP,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> SLMTuning = [ProcIntelSLM,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowDivide64,
+ FeatureSlowPMULLD,
+ FeatureFast7ByteNOP,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> SLMFeatures =
- !listconcat(AtomFeatures, SLMAdditionalFeatures);
+ !listconcat(AtomFeatures, SLMAdditionalFeatures);
// Goldmont
list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
@@ -845,33 +845,33 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
- list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeaturePOPCNTFalseDeps,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeaturePOPCNTFalseDeps,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLMFeatures =
- !listconcat(SLMFeatures, GLMAdditionalFeatures);
+ !listconcat(SLMFeatures, GLMAdditionalFeatures);
// Goldmont Plus
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID,
FeatureSGX];
- list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
- FeatureSlowTwoMemOps,
- FeatureSlowLEA,
- FeatureSlowIncDec,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts,
+ FeatureSlowTwoMemOps,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> GLPFeatures =
- !listconcat(GLMFeatures, GLPAdditionalFeatures);
+ !listconcat(GLMFeatures, GLPAdditionalFeatures);
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
- list<SubtargetFeature> TRMTuning = GLPTuning;
+ list<SubtargetFeature> TRMTuning = GLPTuning;
list<SubtargetFeature> TRMFeatures =
- !listconcat(GLPFeatures, TRMAdditionalFeatures);
+ !listconcat(GLPFeatures, TRMAdditionalFeatures);
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
@@ -903,56 +903,56 @@ def ProcessorFeatures {
FeatureBMI,
FeatureBMI2,
FeatureFMA,
- FeaturePRFCHW];
- list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64,
- FeatureSlow3OpsLEA,
- FeatureSlowIncDec,
- FeatureSlowTwoMemOps,
- FeaturePreferMaskRegisters,
- FeatureHasFastGather,
- FeatureSlowPMADDWD];
+ FeaturePRFCHW];
+ list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64,
+ FeatureSlow3OpsLEA,
+ FeatureSlowIncDec,
+ FeatureSlowTwoMemOps,
+ FeaturePreferMaskRegisters,
+ FeatureHasFastGather,
+ FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
// Barcelona
- list<SubtargetFeature> BarcelonaFeatures = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureSSE4A,
- Feature3DNowA,
- FeatureFXSR,
- FeatureNOPL,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureLAHFSAHF,
- FeatureCMOV,
- Feature64Bit];
- list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BarcelonaFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureSSE4A,
+ Feature3DNowA,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF,
+ FeatureCMOV,
+ Feature64Bit];
+ list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
// Bobcat
- list<SubtargetFeature> BtVer1Features = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureMMX,
- FeatureSSSE3,
- FeatureSSE4A,
- FeatureFXSR,
- FeatureNOPL,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeaturePRFCHW,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureLAHFSAHF];
- list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureFastVectorShiftMasks,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BtVer1Features = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureMMX,
+ FeatureSSSE3,
+ FeatureSSE4A,
+ FeatureFXSR,
+ FeatureNOPL,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeaturePRFCHW,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
@@ -963,39 +963,39 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureXSAVE,
FeatureXSAVEOPT];
- list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT,
- FeatureFastBEXTR,
- FeatureFastHorizontalOps,
- FeatureFast15ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureFastVectorShiftMasks,
- FeatureSlowSHLD];
+ list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFastHorizontalOps,
+ FeatureFast15ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureFastVectorShiftMasks,
+ FeatureSlowSHLD];
list<SubtargetFeature> BtVer2Features =
- !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
+ !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
// Bulldozer
- list<SubtargetFeature> BdVer1Features = [FeatureX87,
- FeatureCMPXCHG8B,
- FeatureCMOV,
- FeatureXOP,
- Feature64Bit,
- FeatureCMPXCHG16B,
- FeatureAES,
- FeaturePRFCHW,
- FeaturePCLMUL,
- FeatureMMX,
- FeatureFXSR,
- FeatureNOPL,
- FeatureLZCNT,
- FeaturePOPCNT,
- FeatureXSAVE,
- FeatureLWP,
- FeatureLAHFSAHF];
- list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD,
- FeatureFast11ByteNOP,
- FeatureFastScalarShiftMasks,
- FeatureBranchFusion,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BdVer1Features = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureCMOV,
+ FeatureXOP,
+ Feature64Bit,
+ FeatureCMPXCHG16B,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeaturePCLMUL,
+ FeatureMMX,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureXSAVE,
+ FeatureLWP,
+ FeatureLAHFSAHF];
+ list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD,
+ FeatureFast11ByteNOP,
+ FeatureFastScalarShiftMasks,
+ FeatureBranchFusion,
+ FeatureInsertVZEROUPPER];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1003,16 +1003,16 @@ def ProcessorFeatures {
FeatureTBM,
FeatureFMA,
FeatureFastBEXTR];
- list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
- list<SubtargetFeature> BdVer2Features =
- !listconcat(BdVer1Features, BdVer2AdditionalFeatures);
+ list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning;
+ list<SubtargetFeature> BdVer2Features =
+ !listconcat(BdVer1Features, BdVer2AdditionalFeatures);
// Steamroller
list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT,
FeatureFSGSBase];
- list<SubtargetFeature> BdVer3Tuning = BdVer2Tuning;
- list<SubtargetFeature> BdVer3Features =
- !listconcat(BdVer2Features, BdVer3AdditionalFeatures);
+ list<SubtargetFeature> BdVer3Tuning = BdVer2Tuning;
+ list<SubtargetFeature> BdVer3Features =
+ !listconcat(BdVer2Features, BdVer3AdditionalFeatures);
// Excavator
list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
@@ -1020,9 +1020,9 @@ def ProcessorFeatures {
FeatureMOVBE,
FeatureRDRAND,
FeatureMWAITX];
- list<SubtargetFeature> BdVer4Tuning = BdVer3Tuning;
- list<SubtargetFeature> BdVer4Features =
- !listconcat(BdVer3Features, BdVer4AdditionalFeatures);
+ list<SubtargetFeature> BdVer4Tuning = BdVer3Tuning;
+ list<SubtargetFeature> BdVer4Features =
+ !listconcat(BdVer3Features, BdVer4AdditionalFeatures);
// AMD Zen Processors common ISAs
@@ -1058,80 +1058,80 @@ def ProcessorFeatures {
FeatureXSAVEC,
FeatureXSAVEOPT,
FeatureXSAVES];
- list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT,
- FeatureFastBEXTR,
- FeatureFast15ByteNOP,
- FeatureBranchFusion,
- FeatureFastScalarShiftMasks,
- FeatureSlowSHLD,
- FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT,
+ FeatureFastBEXTR,
+ FeatureFast15ByteNOP,
+ FeatureBranchFusion,
+ FeatureFastScalarShiftMasks,
+ FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureWBNOINVD];
- list<SubtargetFeature> ZN2Tuning = ZNTuning;
+ list<SubtargetFeature> ZN2Tuning = ZNTuning;
list<SubtargetFeature> ZN2Features =
!listconcat(ZNFeatures, ZN2AdditionalFeatures);
- list<SubtargetFeature> ZN3AdditionalFeatures = [FeatureFSRM,
- FeatureINVPCID,
- FeaturePKU,
- FeatureVAES,
- FeatureVPCLMULQDQ];
- list<SubtargetFeature> ZN3Tuning = ZNTuning;
- list<SubtargetFeature> ZN3Features =
- !listconcat(ZN2Features, ZN3AdditionalFeatures);
+ list<SubtargetFeature> ZN3AdditionalFeatures = [FeatureFSRM,
+ FeatureINVPCID,
+ FeaturePKU,
+ FeatureVAES,
+ FeatureVPCLMULQDQ];
+ list<SubtargetFeature> ZN3Tuning = ZNTuning;
+ list<SubtargetFeature> ZN3Features =
+ !listconcat(ZN2Features, ZN3AdditionalFeatures);
}
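
ProcessorFeatures above builds every CPU's list as a base list plus an "AdditionalFeatures" delta via !listconcat, so each x86-64 micro-architecture level and each CPU generation strictly extends its predecessor. The same construction in C++, with feature-string names used purely for illustration:

    #include <string>
    #include <vector>

    using FeatureList = std::vector<std::string>;

    // Mirrors TableGen's !listconcat: base list followed by the delta.
    static FeatureList concat(FeatureList Base, const FeatureList &Extra) {
      Base.insert(Base.end(), Extra.begin(), Extra.end());
      return Base;
    }

    int main() {
      FeatureList X86_64V1 = {"x87", "cmpxchg8b", "cmov", "mmx", "sse2",
                              "fxsr", "nopl", "64bit"};
      FeatureList X86_64V2 =
          concat(X86_64V1, {"cx16", "sahf", "popcnt", "sse4.2"});
      FeatureList X86_64V3 = concat(
          X86_64V2,
          {"avx2", "bmi", "bmi2", "f16c", "fma", "lzcnt", "movbe", "xsave"});
      (void)X86_64V3; // each level is a strict superset of the previous one
    }
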
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
-class Proc<string Name, list<SubtargetFeature> Features,
- list<SubtargetFeature> TuneFeatures>
- : ProcessorModel<Name, GenericModel, Features, TuneFeatures>;
-
-class ProcModel<string Name, SchedMachineModel Model,
- list<SubtargetFeature> Features,
- list<SubtargetFeature> TuneFeatures>
- : ProcessorModel<Name, Model, Features, TuneFeatures>;
+class Proc<string Name, list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures>
+ : ProcessorModel<Name, GenericModel, Features, TuneFeatures>;
+class ProcModel<string Name, SchedMachineModel Model,
+ list<SubtargetFeature> Features,
+ list<SubtargetFeature> TuneFeatures>
+ : ProcessorModel<Name, Model, Features, TuneFeatures>;
+
// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
-// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget
-// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled.
-// It has no effect on code generation.
-def : ProcModel<"generic", SandyBridgeModel,
- [FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
- [FeatureSlow3OpsLEA,
- FeatureSlowDivide64,
- FeatureSlowIncDec,
- FeatureMacroFusion,
- FeatureInsertVZEROUPPER]>;
-
-def : Proc<"i386", [FeatureX87],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"i486", [FeatureX87],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-
-def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
- FeatureNOPL],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-
-def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
- FeatureFXSR, FeatureNOPL],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-
+// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget
+// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled.
+// It has no effect on code generation.
+def : ProcModel<"generic", SandyBridgeModel,
+ [FeatureX87, FeatureCMPXCHG8B, Feature64Bit],
+ [FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
+ FeatureSlowIncDec,
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER]>;
+
+def : Proc<"i386", [FeatureX87],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"i486", [FeatureX87],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureNOPL],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV,
+ FeatureFXSR, FeatureNOPL],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
foreach P = ["pentium3", "pentium3m"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -1144,34 +1144,34 @@ foreach P = ["pentium3", "pentium3m"] in {
// measure to avoid performance surprises, in case clang's default cpu
// changes slightly.
-def : ProcModel<"pentium-m", GenericPostRAModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"pentium-m", GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
- def : ProcModel<P, GenericPostRAModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ def : ProcModel<P, GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
// Intel Quark.
-def : Proc<"lakemont", [FeatureCMPXCHG8B],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"lakemont", [FeatureCMPXCHG8B],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
// Intel Core Duo.
-def : ProcModel<"yonah", SandyBridgeModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"yonah", SandyBridgeModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
// NetBurst.
-def : ProcModel<"prescott", GenericPostRAModel,
- [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureNOPL, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : ProcModel<"nocona", GenericPostRAModel, [
+def : ProcModel<"prescott", GenericPostRAModel,
+ [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3,
+ FeatureFXSR, FeatureNOPL, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : ProcModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
@@ -1181,14 +1181,14 @@ def : ProcModel<"nocona", GenericPostRAModel, [
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
-],
-[
- FeatureSlowUAMem16,
+],
+[
+ FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
-def : ProcModel<"core2", SandyBridgeModel, [
+def : ProcModel<"core2", SandyBridgeModel, [
FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
@@ -1198,14 +1198,14 @@ def : ProcModel<"core2", SandyBridgeModel, [
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
- FeatureLAHFSAHF
-],
-[
+ FeatureLAHFSAHF
+],
+[
FeatureMacroFusion,
- FeatureSlowUAMem16,
+ FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
]>;
-def : ProcModel<"penryn", SandyBridgeModel, [
+def : ProcModel<"penryn", SandyBridgeModel, [
FeatureX87,
FeatureCMPXCHG8B,
FeatureCMOV,
@@ -1215,171 +1215,171 @@ def : ProcModel<"penryn", SandyBridgeModel, [
FeatureNOPL,
Feature64Bit,
FeatureCMPXCHG16B,
- FeatureLAHFSAHF
-],
-[
+ FeatureLAHFSAHF
+],
+[
FeatureMacroFusion,
- FeatureSlowUAMem16,
+ FeatureSlowUAMem16,
FeatureInsertVZEROUPPER
]>;
// Atom CPUs.
foreach P = ["bonnell", "atom"] in {
- def : ProcModel<P, AtomModel, ProcessorFeatures.AtomFeatures,
- ProcessorFeatures.AtomTuning>;
+ def : ProcModel<P, AtomModel, ProcessorFeatures.AtomFeatures,
+ ProcessorFeatures.AtomTuning>;
}
foreach P = ["silvermont", "slm"] in {
- def : ProcModel<P, SLMModel, ProcessorFeatures.SLMFeatures,
- ProcessorFeatures.SLMTuning>;
+ def : ProcModel<P, SLMModel, ProcessorFeatures.SLMFeatures,
+ ProcessorFeatures.SLMTuning>;
}
-def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures,
- ProcessorFeatures.GLMTuning>;
-def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures,
- ProcessorFeatures.GLPTuning>;
-def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures,
- ProcessorFeatures.TRMTuning>;
+def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures,
+ ProcessorFeatures.GLMTuning>;
+def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures,
+ ProcessorFeatures.GLPTuning>;
+def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures,
+ ProcessorFeatures.TRMTuning>;
// "Arrandale" along with corei3 and corei5
foreach P = ["nehalem", "corei7"] in {
- def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures,
- ProcessorFeatures.NHMTuning>;
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures,
+ ProcessorFeatures.NHMTuning>;
}
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures,
- ProcessorFeatures.WSMTuning>;
+def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures,
+ ProcessorFeatures.WSMTuning>;
foreach P = ["sandybridge", "corei7-avx"] in {
- def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures,
- ProcessorFeatures.SNBTuning>;
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures,
+ ProcessorFeatures.SNBTuning>;
}
foreach P = ["ivybridge", "core-avx-i"] in {
- def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures,
- ProcessorFeatures.IVBTuning>;
+ def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures,
+ ProcessorFeatures.IVBTuning>;
}
foreach P = ["haswell", "core-avx2"] in {
- def : ProcModel<P, HaswellModel, ProcessorFeatures.HSWFeatures,
- ProcessorFeatures.HSWTuning>;
+ def : ProcModel<P, HaswellModel, ProcessorFeatures.HSWFeatures,
+ ProcessorFeatures.HSWTuning>;
}
-def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures,
- ProcessorFeatures.BDWTuning>;
+def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures,
+ ProcessorFeatures.BDWTuning>;
-def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures,
- ProcessorFeatures.SKLTuning>;
+def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures,
+ ProcessorFeatures.SKLTuning>;
// FIXME: define KNL scheduler model
-def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures,
- ProcessorFeatures.KNLTuning>;
-def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures,
- ProcessorFeatures.KNLTuning>;
+def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures,
+ ProcessorFeatures.KNLTuning>;
+def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures,
+ ProcessorFeatures.KNLTuning>;
foreach P = ["skylake-avx512", "skx"] in {
- def : ProcModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures,
- ProcessorFeatures.SKXTuning>;
+ def : ProcModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures,
+ ProcessorFeatures.SKXTuning>;
}
-def : ProcModel<"cascadelake", SkylakeServerModel,
- ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>;
-def : ProcModel<"cooperlake", SkylakeServerModel,
- ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>;
-def : ProcModel<"cannonlake", SkylakeServerModel,
- ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>;
-def : ProcModel<"icelake-client", SkylakeServerModel,
- ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
-def : ProcModel<"icelake-server", SkylakeServerModel,
- ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>;
-def : ProcModel<"tigerlake", SkylakeServerModel,
- ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>;
-def : ProcModel<"sapphirerapids", SkylakeServerModel,
- ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
-def : ProcModel<"alderlake", SkylakeClientModel,
- ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>;
+def : ProcModel<"cascadelake", SkylakeServerModel,
+ ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>;
+def : ProcModel<"cooperlake", SkylakeServerModel,
+ ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>;
+def : ProcModel<"cannonlake", SkylakeServerModel,
+ ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>;
+def : ProcModel<"icelake-client", SkylakeServerModel,
+ ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>;
+def : ProcModel<"icelake-server", SkylakeServerModel,
+ ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>;
+def : ProcModel<"tigerlake", SkylakeServerModel,
+ ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>;
+def : ProcModel<"sapphirerapids", SkylakeServerModel,
+ ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>;
+def : ProcModel<"alderlake", SkylakeClientModel,
+ ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>;
// AMD CPUs.
-def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
- FeatureNOPL],
- [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA,
+ FeatureNOPL],
+ [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
- FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
- [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV,
+ FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
+ [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
- [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
+ [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
- Feature64Bit],
- [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
- FeatureInsertVZEROUPPER]>;
+ def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
+ Feature64Bit],
+ [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
- def : Proc<P, ProcessorFeatures.BarcelonaFeatures,
- ProcessorFeatures.BarcelonaTuning>;
+ def : Proc<P, ProcessorFeatures.BarcelonaFeatures,
+ ProcessorFeatures.BarcelonaTuning>;
}
// Bobcat
-def : Proc<"btver1", ProcessorFeatures.BtVer1Features,
- ProcessorFeatures.BtVer1Tuning>;
+def : Proc<"btver1", ProcessorFeatures.BtVer1Features,
+ ProcessorFeatures.BtVer1Tuning>;
// Jaguar
-def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features,
- ProcessorFeatures.BtVer2Tuning>;
+def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features,
+ ProcessorFeatures.BtVer2Tuning>;
// Bulldozer
-def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features,
- ProcessorFeatures.BdVer1Tuning>;
+def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features,
+ ProcessorFeatures.BdVer1Tuning>;
// Piledriver
-def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features,
- ProcessorFeatures.BdVer2Tuning>;
+def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features,
+ ProcessorFeatures.BdVer2Tuning>;
// Steamroller
-def : Proc<"bdver3", ProcessorFeatures.BdVer3Features,
- ProcessorFeatures.BdVer3Tuning>;
+def : Proc<"bdver3", ProcessorFeatures.BdVer3Features,
+ ProcessorFeatures.BdVer3Tuning>;
// Excavator
-def : Proc<"bdver4", ProcessorFeatures.BdVer4Features,
- ProcessorFeatures.BdVer4Tuning>;
-
-def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures,
- ProcessorFeatures.ZNTuning>;
-def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
- ProcessorFeatures.ZN2Tuning>;
-def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features,
- ProcessorFeatures.ZN3Tuning>;
-
-def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-
-def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"winchip2", [FeatureX87, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"c3", [FeatureX87, Feature3DNow],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
-def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureCMOV],
- [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"bdver4", ProcessorFeatures.BdVer4Features,
+ ProcessorFeatures.BdVer4Tuning>;
+
+def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures,
+ ProcessorFeatures.ZNTuning>;
+def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
+ ProcessorFeatures.ZN2Tuning>;
+def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features,
+ ProcessorFeatures.ZN3Tuning>;
+
+def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"winchip-c6", [FeatureX87, FeatureMMX],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"winchip2", [FeatureX87, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"c3", [FeatureX87, Feature3DNow],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
+def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
+ FeatureSSE1, FeatureFXSR, FeatureCMOV],
+ [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1391,8 +1391,8 @@ def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX,
// covers a huge swath of x86 processors. If there are specific scheduling
// knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
-def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
-[
+def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
+[
FeatureSlow3OpsLEA,
FeatureSlowDivide64,
FeatureSlowIncDec,
@@ -1400,16 +1400,16 @@ def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features,
FeatureInsertVZEROUPPER
]>;
-// x86-64 micro-architecture levels.
-def : ProcModel<"x86-64-v2", SandyBridgeModel, ProcessorFeatures.X86_64V2Features,
- ProcessorFeatures.SNBTuning>;
-// Close to Haswell.
-def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features,
- ProcessorFeatures.HSWTuning>;
-// Close to the AVX-512 level implemented by Xeon Scalable Processors.
-def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
- ProcessorFeatures.SKXTuning>;
-
+// x86-64 micro-architecture levels.
+def : ProcModel<"x86-64-v2", SandyBridgeModel, ProcessorFeatures.X86_64V2Features,
+ ProcessorFeatures.SNBTuning>;
+// Close to Haswell.
+def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features,
+ ProcessorFeatures.HSWTuning>;
+// Close to the AVX-512 level implemented by Xeon Scalable Processors.
+def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
+ ProcessorFeatures.SKXTuning>;
+
//===----------------------------------------------------------------------===//
// Calling Conventions
//===----------------------------------------------------------------------===//
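
For context on the X86.td hunks above: every ProcModel/Proc row pairs a CPU name and scheduler model with two independent lists, the ISA features the CPU can execute and the tuning features that only steer codegen heuristics. A minimal standalone C++ sketch of that shape (hypothetical types, not LLVM's API; the concrete feature strings are illustrative):

#include <string>
#include <vector>

struct CpuModel {
  std::string Name;                      // e.g. "x86-64-v3"
  std::string SchedModel;                // e.g. "HaswellModel"
  std::vector<std::string> IsaFeatures;  // what the CPU can execute
  std::vector<std::string> TuneFeatures; // how code should be shaped for it
};

// Illustrative instance mirroring the x86-64-v3 entry above.
const CpuModel X86_64_V3 = {
    "x86-64-v3", "HaswellModel",
    {"avx2", "bmi2", "fma", "movbe"},
    {"macrofusion"}};
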
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp
index 2d434bda55..7086ee858b 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp
@@ -404,7 +404,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
O << ']';
}
-static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
+static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
Register Reg = MO.getReg();
bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
@@ -446,9 +446,9 @@ static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
return false;
}
-static bool printAsmVRegister(const MachineOperand &MO, char Mode,
- raw_ostream &O) {
- Register Reg = MO.getReg();
+static bool printAsmVRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ Register Reg = MO.getReg();
bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
unsigned Index;
@@ -560,7 +560,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 't': // Print V8SFmode register
case 'g': // Print V16SFmode register
if (MO.isReg())
- return printAsmVRegister(MO, ExtraCode[0], O);
+ return printAsmVRegister(MO, ExtraCode[0], O);
PrintOperand(MI, OpNo, O);
return false;
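
The printAsmMRegister/printAsmVRegister changes above handle GCC-style inline-asm operand modifiers; per the cases shown, 't' and 'g' select the V8SF (ymm) and V16SF (zmm) names of a vector operand. A hypothetical sketch of how that surfaces in extended inline asm (assumes AVX is enabled at compile time):

#include <immintrin.h>

static __m256 zero_ymm() {
  __m256 v;
  // "%t0" asks the asm printer for the ymm name of operand 0 instead of xmm.
  __asm__("vxorps %t0, %t0, %t0" : "=x"(v)); // zeroes the full 256-bit register
  return v;
}
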
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h
index a3b74c8ee3..fe0a4d551a 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h
@@ -134,9 +134,9 @@ public:
}
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- const char *ExtraCode, raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- const char *ExtraCode, raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
@@ -145,7 +145,7 @@ public:
return AsmPrinter::doInitialization(M);
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
void emitFunctionBodyStart() override;
void emitFunctionBodyEnd() override;
};
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index fdc65acffe..f95e27173f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -154,7 +154,7 @@ static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}
-static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
+static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
switch (LdOpcode) {
case X86::MOVUPSrm:
case X86::MOVAPSrm:
@@ -206,7 +206,7 @@ static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
}
}
-static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
+static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
bool PBlock = false;
PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp
index fae4e688c8..46e18b6c46 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -105,7 +105,7 @@ private:
void adjustCallSequence(MachineFunction &MF, const CallContext &Context);
MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
- Register Reg);
+ Register Reg);
enum InstClassification { Convert, Skip, Exit };
@@ -202,7 +202,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
Align StackAlign = TFL->getStackAlign();
int64_t Advantage = 0;
- for (const auto &CC : CallSeqVector) {
+ for (const auto &CC : CallSeqVector) {
// Call sites where no parameters are passed on the stack
// do not affect the cost, since there needs to be no
// stack adjustment.
@@ -265,7 +265,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
if (!isProfitable(MF, CallSeqVector))
return false;
- for (const auto &CC : CallSeqVector) {
+ for (const auto &CC : CallSeqVector) {
if (CC.UsePush) {
adjustCallSequence(MF, CC);
Changed = true;
@@ -288,13 +288,13 @@ X86CallFrameOptimization::classifyInstruction(
case X86::AND16mi8:
case X86::AND32mi8:
case X86::AND64mi8: {
- const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
+ const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
return ImmOp.getImm() == 0 ? Convert : Exit;
}
case X86::OR16mi8:
case X86::OR32mi8:
case X86::OR64mi8: {
- const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
+ const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
return ImmOp.getImm() == -1 ? Convert : Exit;
}
case X86::MOV32mi:
@@ -336,7 +336,7 @@ X86CallFrameOptimization::classifyInstruction(
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (!Reg.isPhysical())
+ if (!Reg.isPhysical())
continue;
if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
return Exit;
@@ -454,7 +454,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (Reg.isPhysical())
+ if (Reg.isPhysical())
UsedRegs.insert(Reg);
}
}
@@ -506,7 +506,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// replace uses.
for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx];
- const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands);
+ const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands);
MachineBasicBlock::iterator Push = nullptr;
unsigned PushOpcode;
switch (Store->getOpcode()) {
@@ -563,7 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
unsigned NumOps = DefMov->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
Push->addOperand(DefMov->getOperand(i));
- Push->cloneMergedMemRefs(MF, {DefMov, &*Store});
+ Push->cloneMergedMemRefs(MF, {DefMov, &*Store});
DefMov->eraseFromParent();
} else {
PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
@@ -599,7 +599,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
}
MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
- MachineBasicBlock::iterator FrameSetup, Register Reg) {
+ MachineBasicBlock::iterator FrameSetup, Register Reg) {
// Do an extremely restricted form of load folding.
// ISel will often create patterns like:
// movl 4(%edi), %eax
@@ -610,7 +610,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
// movl %eax, (%esp)
// call
// Get rid of those with prejudice.
- if (!Reg.isVirtual())
+ if (!Reg.isVirtual())
return nullptr;
// Make sure this is the only use of Reg.
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp
index 53f57565d5..a497375bd0 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp
@@ -95,11 +95,11 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
namespace {
-struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
- X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
- CCAssignFn *AssignFn)
- : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
+ X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, MachineInstrBuilder &MIB,
+ CCAssignFn *AssignFn)
+ : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
@@ -134,10 +134,10 @@ struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
unsigned ValSize = VA.getValVT().getSizeInBits();
unsigned LocSize = VA.getLocVT().getSizeInBits();
if (PhysRegSize > ValSize && LocSize == ValSize) {
- assert((PhysRegSize == 128 || PhysRegSize == 80) &&
- "We expect that to be 128 bit");
- ExtReg =
- MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0);
+ assert((PhysRegSize == 128 || PhysRegSize == 80) &&
+ "We expect that to be 128 bit");
+ ExtReg =
+ MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0);
} else
ExtReg = extendRegister(ValVReg, VA);
@@ -149,9 +149,9 @@ struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler {
MachineFunction &MF = MIRBuilder.getMF();
Register ExtReg = extendRegister(ValVReg, VA);
- auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
- VA.getLocVT().getStoreSize(),
- inferAlignFromPtrInfo(MF, MPO));
+ auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
+ VA.getLocVT().getStoreSize(),
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -184,9 +184,9 @@ protected:
} // end anonymous namespace
-bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val, ArrayRef<Register> VRegs,
- FunctionLoweringInfo &FLI) const {
+bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+ const Value *Val, ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const {
assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) &&
"Return value without a vreg");
auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
@@ -195,7 +195,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const DataLayout &DL = MF.getDataLayout();
+ const DataLayout &DL = MF.getDataLayout();
LLVMContext &Ctx = Val->getType()->getContext();
const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
@@ -215,7 +215,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
return false;
}
- X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
}
@@ -226,10 +226,10 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
namespace {
-struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
- X86IncomingValueHandler(MachineIRBuilder &MIRBuilder,
- MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
- : IncomingValueHandler(MIRBuilder, MRI, AssignFn),
+struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
+ X86IncomingValueHandler(MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn),
DL(MIRBuilder.getMF().getDataLayout()) {}
Register getStackAddress(uint64_t Size, int64_t Offset,
@@ -246,7 +246,7 @@ struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
- auto *MMO = MF.getMachineMemOperand(
+ auto *MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
@@ -296,10 +296,10 @@ protected:
const DataLayout &DL;
};
-struct FormalArgHandler : public X86IncomingValueHandler {
+struct FormalArgHandler : public X86IncomingValueHandler {
FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
CCAssignFn *AssignFn)
- : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+ : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMRI()->addLiveIn(PhysReg);
@@ -307,10 +307,10 @@ struct FormalArgHandler : public X86IncomingValueHandler {
}
};
-struct CallReturnHandler : public X86IncomingValueHandler {
+struct CallReturnHandler : public X86IncomingValueHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
CCAssignFn *AssignFn, MachineInstrBuilder &MIB)
- : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+ : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -322,10 +322,10 @@ protected:
} // end anonymous namespace
-bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
- const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs,
- FunctionLoweringInfo &FLI) const {
+bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
+ const Function &F,
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const {
if (F.arg_empty())
return true;
@@ -339,7 +339,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
unsigned Idx = 0;
- for (const auto &Arg : F.args()) {
+ for (const auto &Arg : F.args()) {
// TODO: handle not simple cases.
if (Arg.hasAttribute(Attribute::ByVal) ||
Arg.hasAttribute(Attribute::InReg) ||
@@ -378,10 +378,10 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- const DataLayout &DL = F.getParent()->getDataLayout();
+ const DataLayout &DL = F.getParent()->getDataLayout();
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const TargetInstrInfo &TII = *STI.getInstrInfo();
- const X86RegisterInfo *TRI = STI.getRegisterInfo();
+ const X86RegisterInfo *TRI = STI.getRegisterInfo();
// Handle only Linux C, X86_64_SysV calling conventions for now.
if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C ||
@@ -419,7 +419,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
// Do the actual argument marshalling.
- X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
+ X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h
index 9390122d76..33d2143ef8 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h
@@ -29,12 +29,12 @@ public:
X86CallLowering(const X86TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
- ArrayRef<Register> VRegs,
- FunctionLoweringInfo &FLI) const override;
+ ArrayRef<Register> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
- ArrayRef<ArrayRef<Register>> VRegs,
- FunctionLoweringInfo &FLI) const override;
+ ArrayRef<ArrayRef<Register>> VRegs,
+ FunctionLoweringInfo &FLI) const override;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp
index c80a5d5bb3..eada1f1540 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp
@@ -330,15 +330,15 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
return true;
}
-static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- if (LocVT != MVT::i64) {
- LocVT = MVT::i64;
- LocInfo = CCValAssign::ZExt;
- }
- return false;
-}
-
+static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ if (LocVT != MVT::i64) {
+ LocVT = MVT::i64;
+ LocInfo = CCValAssign::ZExt;
+ }
+ return false;
+}
+
// Provides entry points of CC_X86 and RetCC_X86.
#include "X86GenCallingConv.inc"
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td
index 3735fab818..2fbd0f5cfd 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td
@@ -336,9 +336,9 @@ def RetCC_X86_64_C : CallingConv<[
// MMX vector types are always returned in XMM0.
CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
- // Pointers are always returned in full 64-bit registers.
- CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
-
+ // Pointers are always returned in full 64-bit registers.
+ CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
+
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
CCDelegateTo<RetCC_X86Common>
@@ -521,9 +521,9 @@ def CC_X86_64_C : CallingConv<[
CCIfCC<"CallingConv::Swift",
CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
- // Pointers are always passed in full 64-bit registers.
- CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
-
+ // Pointers are always passed in full 64-bit registers.
+ CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
+
// The first 6 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
@@ -1102,7 +1102,7 @@ def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)
// All GPRs - except r11
def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
- R8, R9, R10)>;
+ R8, R9, R10)>;
// All registers - except r11
def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs,
@@ -1160,16 +1160,16 @@ def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15,
def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
// Register calling convention preserves few GPR and XMM8-15
-def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
(sequence "XMM%u", 4, 7))>;
def CSR_Win32_CFGuard_Check_NoSSE : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, ECX)>;
def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>;
-def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
+def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
(sequence "R%u", 10, 15))>;
def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
(sequence "XMM%u", 8, 15))>;
-def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
+def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP,
(sequence "R%u", 12, 15))>;
def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE,
(sequence "XMM%u", 8, 15))>;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp
index a2de0dc082..e840d30ce3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp
@@ -439,7 +439,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
if (!MO.isReg() || !MO.isUse())
continue;
Register Reg = MO.getReg();
- auto &RDM = RegDefMaps[Reg.isVirtual()];
+ auto &RDM = RegDefMaps[Reg.isVirtual()];
if (MachineInstr *DefMI = RDM.lookup(Reg)) {
OperandToDefMap[&MO] = DefMI;
DepthInfo Info = DepthMap.lookup(DefMI);
@@ -459,7 +459,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
if (!MO.isReg() || !MO.isDef())
continue;
Register Reg = MO.getReg();
- RegDefMaps[Reg.isVirtual()][Reg] = &MI;
+ RegDefMaps[Reg.isVirtual()][Reg] = &MI;
}
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
@@ -537,7 +537,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// This is another conservative check to avoid converting CMOV instruction
// used with tree-search like algorithm, where the branch is unpredicted.
auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
- if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
+ if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
unsigned Op = UIs.begin()->getOpcode();
if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
WorthOpGroup = false;
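
The "!UIs.empty() && ++UIs.begin() == UIs.end()" test above is the usual forward-iterator idiom for "this range has exactly one element" when no size() is available. A generic sketch of the same check (hypothetical helper name, not an LLVM API):

#include <iterator>

template <typename Range>
bool hasSingleElement(const Range &R) {
  auto B = std::begin(R);
  auto E = std::end(R);
  return B != E && std::next(B) == E; // non-empty and the second position is end()
}
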
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp b/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp
index a2ae6345c0..8fd8b38f70 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp
@@ -141,7 +141,7 @@ public:
return false;
// It's illegal to replace an instruction that implicitly defines a register
// with an instruction that doesn't, unless that register dead.
- for (const auto &MO : MI->implicit_operands())
+ for (const auto &MO : MI->implicit_operands())
if (MO.isReg() && MO.isDef() && !MO.isDead() &&
!TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
return false;
@@ -180,7 +180,7 @@ public:
MachineRegisterInfo *MRI) const override {
assert(isLegal(MI, TII) && "Cannot convert instruction");
MachineBasicBlock *MBB = MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
+ const DebugLoc &DL = MI->getDebugLoc();
Register Reg = MRI->createVirtualRegister(
TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
@@ -220,12 +220,12 @@ public:
// Don't allow copies to/flow GR8/GR16 physical registers.
// FIXME: Is there some better way to support this?
Register DstReg = MI->getOperand(0).getReg();
- if (DstReg.isPhysical() && (X86::GR8RegClass.contains(DstReg) ||
- X86::GR16RegClass.contains(DstReg)))
+ if (DstReg.isPhysical() && (X86::GR8RegClass.contains(DstReg) ||
+ X86::GR16RegClass.contains(DstReg)))
return false;
Register SrcReg = MI->getOperand(1).getReg();
- if (SrcReg.isPhysical() && (X86::GR8RegClass.contains(SrcReg) ||
- X86::GR16RegClass.contains(SrcReg)))
+ if (SrcReg.isPhysical() && (X86::GR8RegClass.contains(SrcReg) ||
+ X86::GR16RegClass.contains(SrcReg)))
return false;
return true;
@@ -235,7 +235,7 @@ public:
MachineRegisterInfo *MRI) const override {
assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");
- for (const auto &MO : MI->operands()) {
+ for (const auto &MO : MI->operands()) {
// Physical registers will not be converted. Assume that converting the
// COPY to the destination domain will eventually result in a actual
// instruction.
@@ -298,7 +298,7 @@ typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>>
class Closure {
private:
/// Virtual registers in the closure.
- DenseSet<Register> Edges;
+ DenseSet<Register> Edges;
/// Instructions in the closure.
SmallVector<MachineInstr *, 8> Instrs;
@@ -330,9 +330,9 @@ public:
bool empty() const { return Edges.empty(); }
- bool insertEdge(Register Reg) { return Edges.insert(Reg).second; }
+ bool insertEdge(Register Reg) { return Edges.insert(Reg).second; }
- using const_edge_iterator = DenseSet<Register>::const_iterator;
+ using const_edge_iterator = DenseSet<Register>::const_iterator;
iterator_range<const_edge_iterator> edges() const {
return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end());
}
@@ -348,7 +348,7 @@ public:
LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const {
dbgs() << "Registers: ";
bool First = true;
- for (Register Reg : Edges) {
+ for (Register Reg : Edges) {
if (!First)
dbgs() << ", ";
First = false;
@@ -403,10 +403,10 @@ private:
void initConverters();
/// Starting from \Reg, expand the closure as much as possible.
- void buildClosure(Closure &, Register Reg);
+ void buildClosure(Closure &, Register Reg);
/// Enqueue \p Reg to be considered for addition to the closure.
- void visitRegister(Closure &, Register Reg, RegDomain &Domain,
+ void visitRegister(Closure &, Register Reg, RegDomain &Domain,
SmallVectorImpl<unsigned> &Worklist);
/// Reassign the closure to \p Domain.
@@ -426,13 +426,13 @@ char X86DomainReassignment::ID = 0;
} // End anonymous namespace.
-void X86DomainReassignment::visitRegister(Closure &C, Register Reg,
+void X86DomainReassignment::visitRegister(Closure &C, Register Reg,
RegDomain &Domain,
SmallVectorImpl<unsigned> &Worklist) {
if (EnclosedEdges.count(Reg))
return;
- if (!Reg.isVirtual())
+ if (!Reg.isVirtual())
return;
if (!MRI->hasOneDef(Reg))
@@ -503,7 +503,7 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
// Iterate all registers in the closure, replace them with registers in the
// destination domain.
- for (Register Reg : C.edges()) {
+ for (Register Reg : C.edges()) {
MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
for (auto &MO : MRI->use_operands(Reg)) {
if (MO.isReg())
@@ -513,13 +513,13 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
}
}
- for (auto *MI : ToErase)
+ for (auto *MI : ToErase)
MI->eraseFromParent();
}
/// \returns true when \p Reg is used as part of an address calculation in \p
/// MI.
-static bool usedAsAddr(const MachineInstr &MI, Register Reg,
+static bool usedAsAddr(const MachineInstr &MI, Register Reg,
const TargetInstrInfo *TII) {
if (!MI.mayLoadOrStore())
return false;
@@ -533,14 +533,14 @@ static bool usedAsAddr(const MachineInstr &MI, Register Reg,
for (unsigned MemOpIdx = MemOpStart,
MemOpEnd = MemOpStart + X86::AddrNumOperands;
MemOpIdx < MemOpEnd; ++MemOpIdx) {
- const MachineOperand &Op = MI.getOperand(MemOpIdx);
+ const MachineOperand &Op = MI.getOperand(MemOpIdx);
if (Op.isReg() && Op.getReg() == Reg)
return true;
}
return false;
}
-void X86DomainReassignment::buildClosure(Closure &C, Register Reg) {
+void X86DomainReassignment::buildClosure(Closure &C, Register Reg) {
SmallVector<unsigned, 4> Worklist;
RegDomain Domain = NoDomain;
visitRegister(C, Reg, Domain, Worklist);
@@ -590,7 +590,7 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) {
continue;
Register DefReg = DefOp.getReg();
- if (!DefReg.isVirtual()) {
+ if (!DefReg.isVirtual()) {
C.setAllIllegal();
continue;
}
@@ -749,7 +749,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
// Go over all virtual registers and calculate a closure.
unsigned ClosureID = 0;
for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
- Register Reg = Register::index2VirtReg(Idx);
+ Register Reg = Register::index2VirtReg(Idx);
// GPR only current source domain supported.
if (!isGPR(MRI->getRegClass(Reg)))
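
buildClosure/visitRegister in the hunks above perform a worklist expansion: start from a seed register, pull in the instructions that touch it, then the registers those instructions touch, until nothing new appears. A minimal standalone sketch of that pattern with plain STL types (hypothetical, not the pass's real data structures):

#include <map>
#include <queue>
#include <set>

using Reg = int;
// Reg -> registers tied to it through some instruction.
using Graph = std::map<Reg, std::set<Reg>>;

std::set<Reg> buildClosure(const Graph &G, Reg Seed) {
  std::set<Reg> Closure;
  std::queue<Reg> Worklist;
  Worklist.push(Seed);
  while (!Worklist.empty()) {
    Reg R = Worklist.front();
    Worklist.pop();
    if (!Closure.insert(R).second)
      continue; // already in the closure
    auto It = G.find(R);
    if (It == G.end())
      continue;
    for (Reg N : It->second) // enqueue every register reachable from R
      Worklist.push(N);
  }
  return Closure;
}
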
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp b/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp
index 97f843fa24..1ac8851ecd 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp
@@ -85,8 +85,8 @@ public:
private:
/// Machine instruction info used throughout the class.
const X86InstrInfo *TII = nullptr;
-
- const X86Subtarget *ST = nullptr;
+
+ const X86Subtarget *ST = nullptr;
};
} // end anonymous namespace
@@ -96,8 +96,8 @@ char EvexToVexInstPass::ID = 0;
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- ST = &MF.getSubtarget<X86Subtarget>();
- if (!ST->hasAVX512())
+ ST = &MF.getSubtarget<X86Subtarget>();
+ if (!ST->hasAVX512())
return false;
bool Changed = false;
@@ -146,29 +146,29 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
}
// Do any custom cleanup needed to finalize the conversion.
-static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
- const X86Subtarget *ST) {
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
+ const X86Subtarget *ST) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
- case X86::VPDPBUSDSZ256m:
- case X86::VPDPBUSDSZ256r:
- case X86::VPDPBUSDSZ128m:
- case X86::VPDPBUSDSZ128r:
- case X86::VPDPBUSDZ256m:
- case X86::VPDPBUSDZ256r:
- case X86::VPDPBUSDZ128m:
- case X86::VPDPBUSDZ128r:
- case X86::VPDPWSSDSZ256m:
- case X86::VPDPWSSDSZ256r:
- case X86::VPDPWSSDSZ128m:
- case X86::VPDPWSSDSZ128r:
- case X86::VPDPWSSDZ256m:
- case X86::VPDPWSSDZ256r:
- case X86::VPDPWSSDZ128m:
- case X86::VPDPWSSDZ128r:
- // These can only VEX convert if AVXVNNI is enabled.
- return ST->hasAVXVNNI();
+ case X86::VPDPBUSDSZ256m:
+ case X86::VPDPBUSDSZ256r:
+ case X86::VPDPBUSDSZ128m:
+ case X86::VPDPBUSDSZ128r:
+ case X86::VPDPBUSDZ256m:
+ case X86::VPDPBUSDZ256r:
+ case X86::VPDPBUSDZ128m:
+ case X86::VPDPBUSDZ128r:
+ case X86::VPDPWSSDSZ256m:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ128m:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDZ256m:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ128m:
+ case X86::VPDPWSSDZ128r:
+ // These can only VEX convert if AVXVNNI is enabled.
+ return ST->hasAVXVNNI();
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
@@ -271,7 +271,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
(Desc.TSFlags & X86II::VEX_L) ? makeArrayRef(X86EvexToVex256CompressTable)
: makeArrayRef(X86EvexToVex128CompressTable);
- const auto *I = llvm::lower_bound(Table, MI.getOpcode());
+ const auto *I = llvm::lower_bound(Table, MI.getOpcode());
if (I == Table.end() || I->EvexOpcode != MI.getOpcode())
return false;
@@ -280,7 +280,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (usesExtendedRegister(MI))
return false;
- if (!performCustomAdjustments(MI, NewOpc, ST))
+ if (!performCustomAdjustments(MI, NewOpc, ST))
return false;
MI.setDesc(TII->get(NewOpc));
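
As the hunk above shows, EVEX-to-VEX compression is a binary search (llvm::lower_bound) over a table sorted by EVEX opcode. A standalone sketch of that lookup with hypothetical types, not the real X86EvexToVex128/256CompressTable entries:

#include <algorithm>
#include <cstdint>
#include <vector>

struct CompressEntry {
  uint16_t EvexOpcode; // table is sorted ascending on this key
  uint16_t VexOpcode;  // replacement encoding
};

inline bool operator<(const CompressEntry &E, uint16_t Opc) {
  return E.EvexOpcode < Opc;
}

// Returns the VEX opcode for Opc, or 0 if no compressible form exists.
uint16_t lookupVexOpcode(const std::vector<CompressEntry> &Table,
                         uint16_t Opc) {
  auto I = std::lower_bound(Table.begin(), Table.end(), Opc);
  if (I == Table.end() || I->EvexOpcode != Opc)
    return 0;
  return I->VexOpcode;
}
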
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp
index 15af0fb2e8..9998b30754 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp
@@ -338,24 +338,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Perform the following transformation.
// SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
// =>
- // RBX = InArg
+ // RBX = InArg
// actualcmpxchg Addr
- // RBX = SaveRbx
+ // RBX = SaveRbx
const MachineOperand &InArg = MBBI->getOperand(6);
Register SaveRbx = MBBI->getOperand(7).getReg();
// Copy the input argument of the pseudo into the argument of the
// actual instruction.
- // NOTE: We don't copy the kill flag since the input might be the same reg
- // as one of the other operands of LCMPXCHG16B.
- TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false);
+ // NOTE: We don't copy the kill flag since the input might be the same reg
+ // as one of the other operands of LCMPXCHG16B.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false);
// Create the actual instruction.
- MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B));
+ MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B));
// Copy the operands related to the address.
for (unsigned Idx = 1; Idx < 6; ++Idx)
NewInstr->addOperand(MBBI->getOperand(Idx));
// Finally, restore the value of RBX.
- TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx,
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx,
/*SrcIsKill*/ true);
// Delete the pseudo.
@@ -438,69 +438,69 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
- case X86::MWAITX_SAVE_RBX: {
- // Perform the following transformation.
- // SaveRbx = pseudomwaitx InArg, SaveRbx
- // =>
- // [E|R]BX = InArg
- // actualmwaitx
- // [E|R]BX = SaveRbx
- const MachineOperand &InArg = MBBI->getOperand(1);
- // Copy the input argument of the pseudo into the argument of the
- // actual instruction.
- TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill());
- // Create the actual instruction.
- BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr));
- // Finally, restore the value of RBX.
- Register SaveRbx = MBBI->getOperand(2).getReg();
- TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true);
- // Delete the pseudo.
- MBBI->eraseFromParent();
- return true;
- }
+ case X86::MWAITX_SAVE_RBX: {
+ // Perform the following transformation.
+ // SaveRbx = pseudomwaitx InArg, SaveRbx
+ // =>
+ // [E|R]BX = InArg
+ // actualmwaitx
+ // [E|R]BX = SaveRbx
+ const MachineOperand &InArg = MBBI->getOperand(1);
+ // Copy the input argument of the pseudo into the argument of the
+ // actual instruction.
+ TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill());
+ // Create the actual instruction.
+ BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr));
+ // Finally, restore the value of RBX.
+ Register SaveRbx = MBBI->getOperand(2).getReg();
+ TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true);
+ // Delete the pseudo.
+ MBBI->eraseFromParent();
+ return true;
+ }
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
- case X86::PLDTILECFG: {
- MI.RemoveOperand(0);
- MI.setDesc(TII->get(X86::LDTILECFG));
- return true;
- }
- case X86::PSTTILECFG: {
- MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
- MI.setDesc(TII->get(X86::STTILECFG));
- return true;
- }
- case X86::PTILELOADDV: {
- MI.RemoveOperand(8); // Remove $tmmcfg
- for (unsigned i = 2; i > 0; --i)
- MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TILELOADD));
- return true;
- }
- case X86::PTDPBSSDV: {
- MI.RemoveOperand(7); // Remove $tmmcfg
- MI.untieRegOperand(4);
- for (unsigned i = 3; i > 0; --i)
- MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TDPBSSD));
- MI.tieOperands(0, 1);
- return true;
- }
- case X86::PTILESTOREDV: {
- MI.RemoveOperand(8); // Remove $tmmcfg
- for (int i = 1; i >= 0; --i)
- MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TILESTORED));
- return true;
- }
- case X86::PTILEZEROV: {
- for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
- MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TILEZERO));
- return true;
- }
+ case X86::PLDTILECFG: {
+ MI.RemoveOperand(0);
+ MI.setDesc(TII->get(X86::LDTILECFG));
+ return true;
}
+ case X86::PSTTILECFG: {
+ MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
+ MI.setDesc(TII->get(X86::STTILECFG));
+ return true;
+ }
+ case X86::PTILELOADDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (unsigned i = 2; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILELOADD));
+ return true;
+ }
+ case X86::PTDPBSSDV: {
+ MI.RemoveOperand(7); // Remove $tmmcfg
+ MI.untieRegOperand(4);
+ for (unsigned i = 3; i > 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TDPBSSD));
+ MI.tieOperands(0, 1);
+ return true;
+ }
+ case X86::PTILESTOREDV: {
+ MI.RemoveOperand(8); // Remove $tmmcfg
+ for (int i = 1; i >= 0; --i)
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILESTORED));
+ return true;
+ }
+ case X86::PTILEZEROV: {
+ for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+ MI.RemoveOperand(i);
+ MI.setDesc(TII->get(X86::TILEZERO));
+ return true;
+ }
+ }
llvm_unreachable("Previous switch has a fallthrough?");
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp
index a1a16a19f5..b53aa41575 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp
@@ -284,14 +284,14 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
return false;
}
- // Make sure no potentially eflags clobbering phi moves can be inserted in
- // between.
- auto HasPhis = [](const BasicBlock *Succ) {
- return !llvm::empty(Succ->phis());
- };
- if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
- return false;
-
+ // Make sure no potentially eflags clobbering phi moves can be inserted in
+ // between.
+ auto HasPhis = [](const BasicBlock *Succ) {
+ return !llvm::empty(Succ->phis());
+ };
+ if (I->isTerminator() && llvm::any_of(successors(I), HasPhis))
+ return false;
+
CC = TmpCC;
return true;
}
@@ -792,9 +792,9 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
RC = &X86::GR32RegClass;
}
- if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL)
- StubAM.Base.Reg = X86::RIP;
-
+ if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL)
+ StubAM.Base.Reg = X86::RIP;
+
LoadReg = createResultReg(RC);
MachineInstrBuilder LoadMI =
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
@@ -1090,35 +1090,35 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
// If all else fails, try to materialize the value in a register.
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
- auto GetCallRegForValue = [this](const Value *V) {
- Register Reg = getRegForValue(V);
-
- // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
- if (Reg && Subtarget->isTarget64BitILP32()) {
- Register CopyReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr),
- CopyReg)
- .addReg(Reg);
-
- Register ExtReg = createResultReg(&X86::GR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
- .addImm(0)
- .addReg(CopyReg)
- .addImm(X86::sub_32bit);
- Reg = ExtReg;
- }
-
- return Reg;
- };
-
+ auto GetCallRegForValue = [this](const Value *V) {
+ Register Reg = getRegForValue(V);
+
+ // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits.
+ if (Reg && Subtarget->isTarget64BitILP32()) {
+ Register CopyReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr),
+ CopyReg)
+ .addReg(Reg);
+
+ Register ExtReg = createResultReg(&X86::GR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg)
+ .addImm(0)
+ .addReg(CopyReg)
+ .addImm(X86::sub_32bit);
+ Reg = ExtReg;
+ }
+
+ return Reg;
+ };
+
if (AM.Base.Reg == 0) {
- AM.Base.Reg = GetCallRegForValue(V);
+ AM.Base.Reg = GetCallRegForValue(V);
return AM.Base.Reg != 0;
}
if (AM.IndexReg == 0) {
assert(AM.Scale == 1 && "Scale with no index!");
- AM.IndexReg = GetCallRegForValue(V);
+ AM.IndexReg = GetCallRegForValue(V);
return AM.IndexReg != 0;
}
}
@@ -1261,15 +1261,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
- // TODO
- SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false);
+ // TODO
+ SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false);
SrcVT = MVT::i8;
}
unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
ISD::SIGN_EXTEND;
- // TODO
- SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg,
- /*Op0IsKill=*/false);
+ // TODO
+ SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg,
+ /*Op0IsKill=*/false);
}
// Make the copy.
@@ -1463,8 +1463,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
ResultReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg,
- /*Op0IsKill=*/true, X86::sub_8bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg,
+ /*Op0IsKill=*/true, X86::sub_8bit);
if (!ResultReg)
return false;
break;
@@ -1587,11 +1587,11 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
Result32).addReg(ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
- /*Op0IsKill=*/true, X86::sub_16bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
+ /*Op0IsKill=*/true, X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
- ResultReg, /*Op0IsKill=*/true);
+ ResultReg, /*Op0IsKill=*/true);
if (ResultReg == 0)
return false;
}
@@ -1633,11 +1633,11 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
Result32).addReg(ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
- /*Op0IsKill=*/true, X86::sub_16bit);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32,
+ /*Op0IsKill=*/true, X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
- ResultReg, /*Op0IsKill=*/true);
+ ResultReg, /*Op0IsKill=*/true);
if (ResultReg == 0)
return false;
}
@@ -1789,7 +1789,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), OpReg)
.addReg(KOpReg);
- OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true,
+ OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
@@ -2021,7 +2021,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
// Now reference the 8-bit subreg of the result.
ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
- /*Op0IsKill=*/true, X86::sub_8bit);
+ /*Op0IsKill=*/true, X86::sub_8bit);
}
// Copy the result out of the physreg if we haven't already.
if (!ResultReg) {
@@ -2135,7 +2135,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
- CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
@@ -2289,12 +2289,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg,
- /*Op0IsKill=*/false, LHSReg, LHSIsKill);
- Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg,
- /*Op0IsKill=*/true, RHSReg, RHSIsKill);
- Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true,
- AndReg, /*Op1IsKill=*/true);
+ Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg,
+ /*Op0IsKill=*/false, LHSReg, LHSIsKill);
+ Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg,
+ /*Op0IsKill=*/true, RHSReg, RHSIsKill);
+ Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true,
+ AndReg, /*Op1IsKill=*/true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg);
@@ -2353,7 +2353,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
- CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true,
X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
@@ -2610,7 +2610,7 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
unsigned Reg;
bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
- RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM);
+ RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM);
assert(RV && "Failed to emit load or store??");
unsigned Size = VT.getSizeInBits()/8;
@@ -2674,15 +2674,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
// Explicitly zero-extend the input to 32-bit.
InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg,
- /*Op0IsKill=*/false);
+ /*Op0IsKill=*/false);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
- InputReg, /*Op0IsKill=*/true);
+ InputReg, /*Op0IsKill=*/true);
unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
: X86::VCVTPH2PSrr;
- InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true);
+ InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
@@ -2740,7 +2740,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// ...
unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
while (Depth--) {
- Register DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg), SrcReg);
SrcReg = DestReg;
@@ -2910,7 +2910,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
const Value *RHS = II->getArgOperand(1);
// Canonicalize immediate to the RHS.
- if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative())
std::swap(LHS, RHS);
unsigned BaseOpc, CondCode;
@@ -3723,10 +3723,10 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
case MVT::i8:
- return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true,
+ return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true,
X86::sub_8bit);
case MVT::i16:
- return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true,
+ return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true,
X86::sub_16bit);
case MVT::i32:
return SrcReg;
@@ -3823,7 +3823,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
.addConstantPoolIndex(CPI, 0, OpFlag);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
- addRegReg(MIB, AddrReg, false, PICBase, false);
+ addRegReg(MIB, AddrReg, false, PICBase, false);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp
index f8d822aebc..ddf96bc7b6 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp
@@ -187,7 +187,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
/// If so, return that super register in \p SuperDestReg.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
Register &SuperDestReg) const {
- const X86RegisterInfo *TRI = &TII->getRegisterInfo();
+ const X86RegisterInfo *TRI = &TII->getRegisterInfo();
Register OrigDestReg = OrigMI->getOperand(0).getReg();
SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
@@ -319,7 +319,7 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
// This is only correct if we access the same subregister index: otherwise,
// we could try to replace "movb %ah, %al" with "movl %eax, %eax".
- const X86RegisterInfo *TRI = &TII->getRegisterInfo();
+ const X86RegisterInfo *TRI = &TII->getRegisterInfo();
if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) !=
TRI->getSubRegIndex(NewDestReg, OldDest.getReg()))
return nullptr;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp
index 0054d5818a..482708e30d 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp
@@ -376,8 +376,8 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 ||
- MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I) !=
- MachineBasicBlock::LQR_Dead)
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I) !=
+ MachineBasicBlock::LQR_Dead)
return false;
Register DestReg = MI.getOperand(0).getReg();
@@ -450,7 +450,7 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
} else
return false;
- MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
return true;
@@ -486,7 +486,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
// now to replace with an equivalent LEA...
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
- MBB.getParent()->substituteDebugValuesForInst(*MBI, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*MBI, *NewMI, 1);
MBB.erase(MBI);
MachineBasicBlock::iterator J =
static_cast<MachineBasicBlock::iterator>(NewMI);
@@ -508,8 +508,8 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (Segment.getReg() != 0 || !Offset.isImm() ||
- MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
- MachineBasicBlock::LQR_Dead)
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
+ MachineBasicBlock::LQR_Dead)
return;
const Register DstR = Dst.getReg();
const Register SrcR1 = Base.getReg();
@@ -540,7 +540,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
LLVM_DEBUG(NewMI->dump(););
}
if (NewMI) {
- MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
}
@@ -560,8 +560,8 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
- MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
- MachineBasicBlock::LQR_Dead ||
+ MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) !=
+ MachineBasicBlock::LQR_Dead ||
Segment.getReg() != X86::NoRegister)
return;
@@ -647,7 +647,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
}
}
- MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
return;
@@ -673,7 +673,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
.add(Index);
LLVM_DEBUG(NewMI->dump(););
- MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
return;
@@ -696,7 +696,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
.add(Base);
LLVM_DEBUG(NewMI->dump(););
- MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
+ MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1);
MBB.erase(I);
I = NewMI;
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp
index 269f8ce6bd..a46da8aa48 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp
@@ -101,24 +101,24 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
? &X86::GR32RegClass
: &X86::GR32_ABCDRegClass;
- if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
- // If we cannot constrain the register, we would need an additional copy
- // and are better off keeping the MOVZX32rr8 we have now.
- continue;
- }
-
- ++NumSubstZexts;
- Changed = true;
-
+ if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) {
+ // If we cannot constrain the register, we would need an additional copy
+ // and are better off keeping the MOVZX32rr8 we have now.
+ continue;
+ }
+
+ ++NumSubstZexts;
+ Changed = true;
+
// Initialize a register with 0. This must go before the eflags def
- Register ZeroReg = MRI->createVirtualRegister(RC);
+ Register ZeroReg = MRI->createVirtualRegister(RC);
BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
ZeroReg);
// X86 setcc only takes an output GR8, so fake a GR32 input by inserting
// the setcc result into the low byte of the zeroed register.
BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(),
- TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
+ TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg())
.addReg(ZeroReg)
.addReg(MI.getOperand(0).getReg())
.addImm(X86::sub_8bit);
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp
index d43fd807a5..539a3e2a20 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -97,7 +97,7 @@ private:
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
MachineBasicBlock::iterator CopyDefI);
- Register promoteCondToReg(MachineBasicBlock &MBB,
+ Register promoteCondToReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, X86::CondCode Cond);
std::pair<unsigned, bool>
@@ -739,7 +739,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETCC(MI);
if (Cond != X86::COND_INVALID && !MI.mayStore() &&
- MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) {
+ MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) {
assert(MI.getOperand(0).isDef() &&
"A non-storing SETcc should always define a register!");
CondRegs[Cond] = MI.getOperand(0).getReg();
@@ -753,7 +753,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
return CondRegs;
}
-Register X86FlagsCopyLoweringPass::promoteCondToReg(
+Register X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, X86::CondCode Cond) {
Register Reg = MRI->createVirtualRegister(PromoteRC);
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp
index 866f113640..54003a72c1 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp
@@ -28,7 +28,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
@@ -235,7 +235,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
if (isSub && !isEAXLiveIn(MBB))
Reg = Rax;
else
- Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
+ Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
unsigned AddSubRROpc =
@@ -292,7 +292,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
// need to find a dead register when using pop.
unsigned Reg = isSub
? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
- : TRI->findDeadCallerSavedReg(MBB, MBBI);
+ : TRI->findDeadCallerSavedReg(MBB, MBBI);
if (Reg) {
unsigned Opc = isSub
? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
@@ -437,9 +437,9 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
}
const MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const Register FramePtr = TRI->getFrameRegister(MF);
- const Register MachineFramePtr =
- STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
+ const Register FramePtr = TRI->getFrameRegister(MF);
+ const Register MachineFramePtr =
+ STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64))
: FramePtr;
unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
// Offset = space for return address + size of the frame pointer itself.
@@ -1690,7 +1690,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
assert(Personality == EHPersonality::MSVC_CXX);
Register FrameReg;
int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
- int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();
+ int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed();
// ESP is the first field, so no extra displacement is needed.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg,
false, EHRegOffset)
@@ -1711,9 +1711,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (IsWin64Prologue && IsFunclet)
Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
else
- Offset =
- getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +
- SEHFrameOffset;
+ Offset =
+ getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() +
+ SEHFrameOffset;
HasWinCFI = true;
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
@@ -1785,8 +1785,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
Register UsedReg;
int Offset =
- getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
- .getFixed();
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
+ .getFixed();
assert(UsedReg == BasePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset)
.addReg(FramePtr)
@@ -1864,8 +1864,8 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
Register SPReg;
int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
- /*IgnoreSPUpdates*/ true)
- .getFixed();
+ /*IgnoreSPUpdates*/ true)
+ .getFixed();
assert(Offset >= 0 && SPReg == TRI->getStackRegister());
return static_cast<unsigned>(Offset);
}
@@ -1920,7 +1920,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
const bool Is64BitILP32 = STI.isTarget64BitILP32();
Register FramePtr = TRI->getFrameRegister(MF);
- Register MachineFramePtr =
+ Register MachineFramePtr =
Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
@@ -2091,16 +2091,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
-
- // Emit tilerelease for AMX kernel.
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (!MRI.reg_nodbg_empty(X86::TMMCFG))
- BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+
+ // Emit tilerelease for AMX kernel.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!MRI.reg_nodbg_empty(X86::TMMCFG))
+ BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
}
-StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- Register &FrameReg) const {
+StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsFixed = MFI.isFixedObjectIndex(FI);
@@ -2147,7 +2147,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes);
if (FI && FI == X86FI->getFAIndex())
- return StackOffset::getFixed(-SEHFrameOffset);
+ return StackOffset::getFixed(-SEHFrameOffset);
// FPDelta is the offset from the "traditional" FP location of the old base
// pointer followed by return address and the location required by the
@@ -2163,23 +2163,23 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
if (FI < 0) {
// Skip the saved EBP.
- return StackOffset::getFixed(Offset + SlotSize + FPDelta);
+ return StackOffset::getFixed(Offset + SlotSize + FPDelta);
} else {
assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
- return StackOffset::getFixed(Offset + StackSize);
+ return StackOffset::getFixed(Offset + StackSize);
}
} else if (TRI->needsStackRealignment(MF)) {
if (FI < 0) {
// Skip the saved EBP.
- return StackOffset::getFixed(Offset + SlotSize + FPDelta);
+ return StackOffset::getFixed(Offset + SlotSize + FPDelta);
} else {
assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
- return StackOffset::getFixed(Offset + StackSize);
+ return StackOffset::getFixed(Offset + StackSize);
}
// FIXME: Support tail calls
} else {
if (!HasFP)
- return StackOffset::getFixed(Offset + StackSize);
+ return StackOffset::getFixed(Offset + StackSize);
// Skip the saved EBP.
Offset += SlotSize;
@@ -2190,7 +2190,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
Offset -= TailCallReturnAddrDelta;
}
- return StackOffset::getFixed(Offset + FPDelta);
+ return StackOffset::getFixed(Offset + FPDelta);
}
int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
@@ -2201,27 +2201,27 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
const auto it = WinEHXMMSlotInfo.find(FI);
if (it == WinEHXMMSlotInfo.end())
- return getFrameIndexReference(MF, FI, FrameReg).getFixed();
+ return getFrameIndexReference(MF, FI, FrameReg).getFixed();
FrameReg = TRI->getStackRegister();
return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +
it->second;
}
-StackOffset
-X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
- Register &FrameReg,
- int Adjustment) const {
+StackOffset
+X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ int Adjustment) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = TRI->getStackRegister();
- return StackOffset::getFixed(MFI.getObjectOffset(FI) -
- getOffsetOfLocalArea() + Adjustment);
+ return StackOffset::getFixed(MFI.getObjectOffset(FI) -
+ getOffsetOfLocalArea() + Adjustment);
}
-StackOffset
-X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
- int FI, Register &FrameReg,
- bool IgnoreSPUpdates) const {
+StackOffset
+X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
+ int FI, Register &FrameReg,
+ bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
@@ -2898,8 +2898,8 @@ static unsigned getHiPELiteral(
// non-meta instructions between MBBI and MBB.end().
static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
MachineBasicBlock::const_iterator MBBI) {
- return llvm::all_of(
- MBB.successors(),
+ return llvm::all_of(
+ MBB.successors(),
[](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
return MI.isMetaInstruction();
@@ -3082,8 +3082,8 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
unsigned Regs[2];
unsigned FoundRegs = 0;
- const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- const MachineOperand &RegMask = Prev->getOperand(1);
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const MachineOperand &RegMask = Prev->getOperand(1);
auto &RegClass =
Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass;
@@ -3269,14 +3269,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
// If we may need to emit frameless compact unwind information, give
// up as this is currently broken: PR25614.
- bool CompactUnwind =
- MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() !=
- nullptr;
- return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
- !CompactUnwind) &&
- // The lowering of segmented stack and HiPE only support entry
- // blocks as prologue blocks: PR26107. This limitation may be
- // lifted if we fix:
+ bool CompactUnwind =
+ MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() !=
+ nullptr;
+ return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) ||
+ !CompactUnwind) &&
+ // The lowering of segmented stack and HiPE only support entry
+ // blocks as prologue blocks: PR26107. This limitation may be
+ // lifted if we fix:
// - adjustForSegmentedStacks
// - adjustForHiPEPrologue
MF.getFunction().getCallingConv() != CallingConv::HiPE &&
@@ -3311,7 +3311,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
}
Register UsedReg;
- int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
+ int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed();
int EndOffset = -EHRegOffset - EHRegSize;
FuncInfo.EHRegNodeEndOffset = EndOffset;
@@ -3334,8 +3334,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
// MOV32rm SavedEBPOffset(%esi), %ebp
assert(X86FI->getHasSEHFramePtrSave());
int Offset =
- getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
- .getFixed();
+ getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg)
+ .getFixed();
assert(UsedReg == BasePtr);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr),
UsedReg, true, Offset)
@@ -3380,7 +3380,7 @@ struct X86FrameSortingObject {
// at the end of our list.
struct X86FrameSortingComparator {
inline bool operator()(const X86FrameSortingObject &A,
- const X86FrameSortingObject &B) const {
+ const X86FrameSortingObject &B) const {
uint64_t DensityAScaled, DensityBScaled;
// For consistency in our comparison, all invalid objects are placed
@@ -3516,21 +3516,21 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// emitPrologue if it gets called and emits CFI.
MF.setHasWinCFI(false);
- // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
- // aligned. The format doesn't support misaligned stack adjustments.
- if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
- MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));
-
+ // If we are using Windows x64 CFI, ensure that the stack is always 8 byte
+ // aligned. The format doesn't support misaligned stack adjustments.
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
+ MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize));
+
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
- if (STI.is64Bit() && MF.hasEHFunclets() &&
- classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
- EHPersonality::MSVC_CXX) {
- adjustFrameForMsvcCxxEh(MF);
- }
-}
-
-void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
+ if (STI.is64Bit() && MF.hasEHFunclets() &&
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()) ==
+ EHPersonality::MSVC_CXX) {
+ adjustFrameForMsvcCxxEh(MF);
+ }
+}
+
+void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const {
// Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
// relative to RSP after the prologue. Find the offset of the last fixed
// object, so that we can allocate a slot immediately following it. If there
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h
index 26e80811af..a8c6320afa 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,7 @@
#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
#include "llvm/CodeGen/TargetFrameLowering.h"
-#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/TypeSize.h"
namespace llvm {
@@ -103,17 +103,17 @@ public:
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
bool needsFrameIndexResolution(const MachineFunction &MF) const override;
- StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
- Register &FrameReg) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
Register &SPReg) const;
- StackOffset getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
- Register &SPReg, int Adjustment) const;
- StackOffset
- getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- Register &FrameReg,
- bool IgnoreSPUpdates) const override;
+ StackOffset getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &SPReg, int Adjustment) const;
+ StackOffset
+ getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
+ Register &FrameReg,
+ bool IgnoreSPUpdates) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -224,7 +224,7 @@ private:
const DebugLoc &DL, uint64_t Offset,
uint64_t Align) const;
- void adjustFrameForMsvcCxxEh(MachineFunction &MF) const;
+ void adjustFrameForMsvcCxxEh(MachineFunction &MF) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1df9a0d170..c4f21ed402 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -17,7 +17,7 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
@@ -45,8 +45,8 @@ static cl::opt<bool> EnablePromoteAnyextLoad(
"x86-promote-anyext-load", cl::init(true),
cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
-extern cl::opt<bool> IndirectBranchTracking;
-
+extern cl::opt<bool> IndirectBranchTracking;
+
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
@@ -207,8 +207,8 @@ namespace {
void Select(SDNode *N) override;
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
- bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
- bool AllowSegmentRegForX32 = false);
+ bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
+ bool AllowSegmentRegForX32 = false);
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
@@ -503,8 +503,8 @@ namespace {
bool tryShiftAmountMod(SDNode *N);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTERNLOG(SDNode *N);
- bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC,
- SDValue A, SDValue B, SDValue C, uint8_t Imm);
+ bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC,
+ SDValue A, SDValue B, SDValue C, uint8_t Imm);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
@@ -527,9 +527,9 @@ namespace {
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
- if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
- Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
- Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
+ if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
+ Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
+ Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
@@ -801,69 +801,69 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
return false;
}
-static bool isEndbrImm64(uint64_t Imm) {
-// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
-// i.g: 0xF3660F1EFA, 0xF3670F1EFA
- if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
- return false;
-
- uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
- 0x65, 0x66, 0x67, 0xf0, 0xf2};
-  int i = 24; // the low 24 bits (0x0F1EFA) already matched
- while (i < 64) {
- uint8_t Byte = (Imm >> i) & 0xFF;
- if (Byte == 0xF3)
- return true;
- if (!llvm::is_contained(OptionalPrefixBytes, Byte))
- return false;
- i += 8;
- }
-
- return false;
-}
-
+static bool isEndbrImm64(uint64_t Imm) {
+// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
+// e.g. 0xF3660F1EFA, 0xF3670F1EFA
+ if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
+ return false;
+
+ uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
+ 0x65, 0x66, 0x67, 0xf0, 0xf2};
+  int i = 24; // the low 24 bits (0x0F1EFA) already matched
+ while (i < 64) {
+ uint8_t Byte = (Imm >> i) & 0xFF;
+ if (Byte == 0xF3)
+ return true;
+ if (!llvm::is_contained(OptionalPrefixBytes, Byte))
+ return false;
+ i += 8;
+ }
+
+ return false;
+}
+
void X86DAGToDAGISel::PreprocessISelDAG() {
bool MadeChange = false;
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
- // This is for CET enhancement.
- //
- // ENDBR32 and ENDBR64 have specific opcodes:
- // ENDBR32: F3 0F 1E FB
- // ENDBR64: F3 0F 1E FA
-    // And we want to make sure that attackers will not find unintended
-    // ENDBR32/64 opcode matches in the binary
- // Here’s an example:
- // If the compiler had to generate asm for the following code:
- // a = 0xF30F1EFA
- // it could, for example, generate:
- // mov 0xF30F1EFA, dword ptr[a]
- // In such a case, the binary would include a gadget that starts
- // with a fake ENDBR64 opcode. Therefore, we split such generation
-    // into multiple operations so that it does not show up in the binary
- if (N->getOpcode() == ISD::Constant) {
- MVT VT = N->getSimpleValueType(0);
- int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
- int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
- if (Imm == EndbrImm || isEndbrImm64(Imm)) {
- // Check that the cf-protection-branch is enabled.
- Metadata *CFProtectionBranch =
- MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
- if (CFProtectionBranch || IndirectBranchTracking) {
- SDLoc dl(N);
- SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
- Complement = CurDAG->getNOT(dl, Complement, VT);
- --I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
- ++I;
- MadeChange = true;
- continue;
- }
- }
- }
-
+ // This is for CET enhancement.
+ //
+ // ENDBR32 and ENDBR64 have specific opcodes:
+ // ENDBR32: F3 0F 1E FB
+ // ENDBR64: F3 0F 1E FA
+    // And we want to make sure that attackers will not find unintended
+    // ENDBR32/64 opcode matches in the binary
+ // Here’s an example:
+ // If the compiler had to generate asm for the following code:
+ // a = 0xF30F1EFA
+ // it could, for example, generate:
+ // mov 0xF30F1EFA, dword ptr[a]
+ // In such a case, the binary would include a gadget that starts
+ // with a fake ENDBR64 opcode. Therefore, we split such generation
+    // into multiple operations so that it does not show up in the binary
+ if (N->getOpcode() == ISD::Constant) {
+ MVT VT = N->getSimpleValueType(0);
+ int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
+ int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
+ if (Imm == EndbrImm || isEndbrImm64(Imm)) {
+ // Check that the cf-protection-branch is enabled.
+ Metadata *CFProtectionBranch =
+ MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
+ if (CFProtectionBranch || IndirectBranchTracking) {
+ SDLoc dl(N);
+ SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
+ Complement = CurDAG->getNOT(dl, Complement, VT);
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+ }
+
// If this is a target specific AND node with no flag usages, turn it back
// into ISD::AND to enable test instruction matching.
if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
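The CET-related block above avoids materializing a constant whose encoding contains the ENDBR32/ENDBR64 byte pattern: it emits the complemented value plus a NOT, so the literal is only reconstructed at run time. A small standalone illustration of that rewrite (values taken from the comment above; not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // 0xF30F1EFA is the ENDBR64 byte sequence; if it were emitted verbatim as
  // an immediate, the binary would contain an unintended landing pad.
  const uint32_t EndbrImm = 0xF30F1EFAu;
  // The DAG rewrite materializes ~Imm and then applies a NOT, so the
  // forbidden byte pattern never appears in the instruction stream.
  uint32_t Complement = ~EndbrImm; // 0x0CF0E105, contains no ENDBR bytes
  uint32_t Rebuilt = ~Complement;  // equals the original value at run time
  assert(Rebuilt == EndbrImm);
  std::printf("%08X -> %08X -> %08X\n", EndbrImm, Complement, Rebuilt);
  return 0;
}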
@@ -1068,8 +1068,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
case ISD::STRICT_FFLOOR:
case ISD::FTRUNC:
case ISD::STRICT_FTRUNC:
- case ISD::FROUNDEVEN:
- case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN:
+ case ISD::STRICT_FROUNDEVEN:
case ISD::FNEARBYINT:
case ISD::STRICT_FNEARBYINT:
case ISD::FRINT:
@@ -1085,8 +1085,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
case ISD::FFLOOR: Imm = 0x9; break;
case ISD::STRICT_FTRUNC:
case ISD::FTRUNC: Imm = 0xB; break;
- case ISD::STRICT_FROUNDEVEN:
- case ISD::FROUNDEVEN: Imm = 0x8; break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: Imm = 0x8; break;
case ISD::STRICT_FNEARBYINT:
case ISD::FNEARBYINT: Imm = 0xC; break;
case ISD::STRICT_FRINT:
@@ -1099,11 +1099,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
{N->getValueType(0), MVT::Other},
{N->getOperand(0), N->getOperand(1),
- CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
else
Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
N->getOperand(0),
- CurDAG->getTargetConstant(Imm, dl, MVT::i32));
+ CurDAG->getTargetConstant(Imm, dl, MVT::i32));
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
@@ -1614,26 +1614,26 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
}
-bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
- bool AllowSegmentRegForX32) {
+bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
+ bool AllowSegmentRegForX32) {
SDValue Address = N->getOperand(1);
// load gs:0 -> GS segment register.
// load fs:0 -> FS segment register.
//
- // This optimization is generally valid because the GNU TLS model defines that
- // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
- // with 32-bit registers, as we get in ILP32 mode, those registers are first
-  // zero-extended to 64 bits and then added to the base address, which gives
- // unwanted results when the register holds a negative value.
+ // This optimization is generally valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
+ // with 32-bit registers, as we get in ILP32 mode, those registers are first
+  // zero-extended to 64 bits and then added to the base address, which gives
+ // unwanted results when the register holds a negative value.
// For more information see http://people.redhat.com/drepper/tls.pdf
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) {
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
!IndirectTlsSegRefs &&
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
- Subtarget->isTargetFuchsia())) {
- if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
- return true;
+ Subtarget->isTargetFuchsia())) {
+ if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
+ return true;
switch (N->getPointerInfo().getAddrSpace()) {
case X86AS::GS:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -1644,8 +1644,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
// Address space X86AS::SS is not handled here, because it is not used to
// address TLS areas.
}
- }
- }
+ }
+ }
return true;
}
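The AllowSegmentRegForX32 guard above exists because in ILP32 mode a 32-bit address register is zero-extended to 64 bits before being added to the segment base, so a negative 32-bit value becomes a huge positive displacement. A tiny arithmetic illustration of that pitfall (standalone, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  int32_t Off32 = -8;                      // a small negative offset
  uint64_t ZeroExtended = uint32_t(Off32); // what happens to a 32-bit register
  uint64_t SignExtended = uint64_t(int64_t(Off32));
  // Zero extension turns -8 into 0xFFFFFFF8, which is why folding the
  // gs:0/fs:0 load is unsafe here without the extra check.
  assert(ZeroExtended == 0xFFFFFFF8ull);
  assert(SignExtended == 0xFFFFFFFFFFFFFFF8ull);
  assert(ZeroExtended != SignExtended);
  return 0;
}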
@@ -1729,21 +1729,21 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
if (matchAddressRecursively(N, AM, 0))
return true;
- // Post-processing: Make a second attempt to fold a load, if we now know
- // that there will not be any other register. This is only performed for
- // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
- // any foldable load the first time.
- if (Subtarget->isTarget64BitILP32() &&
- AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
- SDValue Save_Base_Reg = AM.Base_Reg;
- if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
- AM.Base_Reg = SDValue();
- if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
- AM.Base_Reg = Save_Base_Reg;
- }
- }
-
+ // Post-processing: Make a second attempt to fold a load, if we now know
+ // that there will not be any other register. This is only performed for
+ // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
+ // any foldable load the first time.
+ if (Subtarget->isTarget64BitILP32() &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
+ SDValue Save_Base_Reg = AM.Base_Reg;
+ if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
+ AM.Base_Reg = SDValue();
+ if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
+ AM.Base_Reg = Save_Base_Reg;
+ }
+ }
+
// Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
// a smaller encoding and avoids a scaled-index.
if (AM.Scale == 2 &&
@@ -2718,12 +2718,12 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
AM.Disp += GA->getOffset();
AM.SymbolFlags = GA->getTargetFlags();
- if (Subtarget->is32Bit()) {
+ if (Subtarget->is32Bit()) {
AM.Scale = 1;
AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
}
- MVT VT = N.getSimpleValueType();
+ MVT VT = N.getSimpleValueType();
getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
return true;
}
@@ -2813,10 +2813,10 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
return false;
Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
- if (!CR)
- return Width == 32 && TM.getCodeModel() == CodeModel::Small;
-
- return CR->getSignedMin().sge(-1ull << Width) &&
+ if (!CR)
+ return Width == 32 && TM.getCodeModel() == CodeModel::Small;
+
+ return CR->getSignedMin().sge(-1ull << Width) &&
CR->getSignedMax().slt(1ull << Width);
}
@@ -3210,7 +3210,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
      // ADD/SUB with 1/-1 whose carry flag isn't used can use inc/dec.
if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
- unsigned NewOpc =
+ unsigned NewOpc =
((Opc == X86ISD::ADD) == IsOne)
? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
: SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
@@ -3466,7 +3466,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// Match the shift amount as: (bitwidth - y). It should go away, too.
if (ShiftAmt.getOpcode() != ISD::SUB)
return false;
- auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
+ auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
if (!V0 || V0->getZExtValue() != Bitwidth)
return false;
NBits = ShiftAmt.getOperand(1);
@@ -3589,7 +3589,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
// Shift NBits left by 8 bits, thus producing 'control'.
      // This leaves the low 8 bits zero.
SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
- insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
@@ -4019,129 +4019,129 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
-bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
- SDNode *ParentBC, SDValue A, SDValue B,
- SDValue C, uint8_t Imm) {
- assert(A.isOperandOf(ParentA));
- assert(B.isOperandOf(ParentBC));
- assert(C.isOperandOf(ParentBC));
-
- auto tryFoldLoadOrBCast =
- [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
- SDValue &Index, SDValue &Disp, SDValue &Segment) {
- if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
- return true;
-
- // Not a load, check for broadcast which may be behind a bitcast.
- if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
- P = L.getNode();
- L = L.getOperand(0);
- }
-
- if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
- return false;
-
- // Only 32 and 64 bit broadcasts are supported.
- auto *MemIntr = cast<MemIntrinsicSDNode>(L);
- unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
- if (Size != 32 && Size != 64)
- return false;
-
- return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
- };
-
- bool FoldedLoad = false;
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- FoldedLoad = true;
- } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
- Tmp4)) {
- FoldedLoad = true;
- std::swap(A, C);
- // Swap bits 1/4 and 3/6.
- uint8_t OldImm = Imm;
- Imm = OldImm & 0xa5;
- if (OldImm & 0x02) Imm |= 0x10;
- if (OldImm & 0x10) Imm |= 0x02;
- if (OldImm & 0x08) Imm |= 0x40;
- if (OldImm & 0x40) Imm |= 0x08;
- } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3,
- Tmp4)) {
- FoldedLoad = true;
- std::swap(B, C);
- // Swap bits 1/2 and 5/6.
- uint8_t OldImm = Imm;
- Imm = OldImm & 0x99;
- if (OldImm & 0x02) Imm |= 0x04;
- if (OldImm & 0x04) Imm |= 0x02;
- if (OldImm & 0x20) Imm |= 0x40;
- if (OldImm & 0x40) Imm |= 0x20;
- }
-
- SDLoc DL(Root);
-
- SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
-
- MVT NVT = Root->getSimpleValueType(0);
-
- MachineSDNode *MNode;
- if (FoldedLoad) {
- SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
-
- unsigned Opc;
- if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
- auto *MemIntr = cast<MemIntrinsicSDNode>(C);
- unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
- assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
-
- bool UseD = EltSize == 32;
- if (NVT.is128BitVector())
- Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
- else if (NVT.is256BitVector())
- Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
- else if (NVT.is512BitVector())
- Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
- else
- llvm_unreachable("Unexpected vector size!");
- } else {
- bool UseD = NVT.getVectorElementType() == MVT::i32;
- if (NVT.is128BitVector())
- Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
- else if (NVT.is256BitVector())
- Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
- else if (NVT.is512BitVector())
- Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
- else
- llvm_unreachable("Unexpected vector size!");
- }
-
- SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
- MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
-
- // Update the chain.
- ReplaceUses(C.getValue(1), SDValue(MNode, 1));
- // Record the mem-refs
- CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
- } else {
- bool UseD = NVT.getVectorElementType() == MVT::i32;
- unsigned Opc;
- if (NVT.is128BitVector())
- Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
- else if (NVT.is256BitVector())
- Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
- else if (NVT.is512BitVector())
- Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
- else
- llvm_unreachable("Unexpected vector size!");
-
- MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
- }
-
- ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
- CurDAG->RemoveDeadNode(Root);
- return true;
-}
-
+bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
+ SDNode *ParentBC, SDValue A, SDValue B,
+ SDValue C, uint8_t Imm) {
+ assert(A.isOperandOf(ParentA));
+ assert(B.isOperandOf(ParentBC));
+ assert(C.isOperandOf(ParentBC));
+
+ auto tryFoldLoadOrBCast =
+ [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment) {
+ if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
+ return true;
+
+ // Not a load, check for broadcast which may be behind a bitcast.
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
+ P = L.getNode();
+ L = L.getOperand(0);
+ }
+
+ if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
+
+ // Only 32 and 64 bit broadcasts are supported.
+ auto *MemIntr = cast<MemIntrinsicSDNode>(L);
+ unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
+ if (Size != 32 && Size != 64)
+ return false;
+
+ return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
+ };
+
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ FoldedLoad = true;
+ } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4)) {
+ FoldedLoad = true;
+ std::swap(A, C);
+ // Swap bits 1/4 and 3/6.
+ uint8_t OldImm = Imm;
+ Imm = OldImm & 0xa5;
+ if (OldImm & 0x02) Imm |= 0x10;
+ if (OldImm & 0x10) Imm |= 0x02;
+ if (OldImm & 0x08) Imm |= 0x40;
+ if (OldImm & 0x40) Imm |= 0x08;
+ } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4)) {
+ FoldedLoad = true;
+ std::swap(B, C);
+ // Swap bits 1/2 and 5/6.
+ uint8_t OldImm = Imm;
+ Imm = OldImm & 0x99;
+ if (OldImm & 0x02) Imm |= 0x04;
+ if (OldImm & 0x04) Imm |= 0x02;
+ if (OldImm & 0x20) Imm |= 0x40;
+ if (OldImm & 0x40) Imm |= 0x20;
+ }
+
+ SDLoc DL(Root);
+
+ SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
+
+ MVT NVT = Root->getSimpleValueType(0);
+
+ MachineSDNode *MNode;
+ if (FoldedLoad) {
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+
+ unsigned Opc;
+ if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(C);
+ unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
+ assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
+
+ bool UseD = EltSize == 32;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
+ else
+ llvm_unreachable("Unexpected vector size!");
+ } else {
+ bool UseD = NVT.getVectorElementType() == MVT::i32;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
+ else
+ llvm_unreachable("Unexpected vector size!");
+ }
+
+ SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
+ MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+
+ // Update the chain.
+ ReplaceUses(C.getValue(1), SDValue(MNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
+ } else {
+ bool UseD = NVT.getVectorElementType() == MVT::i32;
+ unsigned Opc;
+ if (NVT.is128BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
+ else if (NVT.is256BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
+ else if (NVT.is512BitVector())
+ Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
+ else
+ llvm_unreachable("Unexpected vector size!");
+
+ MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle inverted inputs?
// FIXME: Handle more complex patterns that use an operand more than once?
@@ -4160,62 +4160,62 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- auto getFoldableLogicOp = [](SDValue Op) {
- // Peek through single use bitcast.
- if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
- Op = Op.getOperand(0);
-
- if (!Op.hasOneUse())
- return SDValue();
-
- unsigned Opc = Op.getOpcode();
- if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
- Opc == X86ISD::ANDNP)
- return Op;
-
- return SDValue();
+ auto getFoldableLogicOp = [](SDValue Op) {
+ // Peek through single use bitcast.
+ if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
+ Op = Op.getOperand(0);
+
+ if (!Op.hasOneUse())
+ return SDValue();
+
+ unsigned Opc = Op.getOpcode();
+ if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+ Opc == X86ISD::ANDNP)
+ return Op;
+
+ return SDValue();
};
- SDValue A, FoldableOp;
- if ((FoldableOp = getFoldableLogicOp(N1))) {
+ SDValue A, FoldableOp;
+ if ((FoldableOp = getFoldableLogicOp(N1))) {
A = N0;
- } else if ((FoldableOp = getFoldableLogicOp(N0))) {
+ } else if ((FoldableOp = getFoldableLogicOp(N0))) {
A = N1;
} else
return false;
- SDValue B = FoldableOp.getOperand(0);
- SDValue C = FoldableOp.getOperand(1);
-
- // We can build the appropriate control immediate by performing the logic
- // operation we're matching using these constants for A, B, and C.
- const uint8_t TernlogMagicA = 0xf0;
- const uint8_t TernlogMagicB = 0xcc;
- const uint8_t TernlogMagicC = 0xaa;
-
- uint8_t Imm;
- switch (FoldableOp.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode!");
- case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
- case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
- case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
- case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
- }
-
- switch (N->getOpcode()) {
+ SDValue B = FoldableOp.getOperand(0);
+ SDValue C = FoldableOp.getOperand(1);
+
+ // We can build the appropriate control immediate by performing the logic
+ // operation we're matching using these constants for A, B, and C.
+ const uint8_t TernlogMagicA = 0xf0;
+ const uint8_t TernlogMagicB = 0xcc;
+ const uint8_t TernlogMagicC = 0xaa;
+
+ uint8_t Imm;
+ switch (FoldableOp.getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
- case X86ISD::ANDNP:
- if (A == N0)
- Imm &= ~TernlogMagicA;
- else
- Imm = ~(Imm) & TernlogMagicA;
+ case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
+ case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
+ case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
+ case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
+ }
+
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::ANDNP:
+ if (A == N0)
+ Imm &= ~TernlogMagicA;
+ else
+ Imm = ~(Imm) & TernlogMagicA;
break;
- case ISD::AND: Imm &= TernlogMagicA; break;
- case ISD::OR: Imm |= TernlogMagicA; break;
- case ISD::XOR: Imm ^= TernlogMagicA; break;
+ case ISD::AND: Imm &= TernlogMagicA; break;
+ case ISD::OR: Imm |= TernlogMagicA; break;
+ case ISD::XOR: Imm ^= TernlogMagicA; break;
}
- return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm);
+ return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm);
}
/// If the high bits of an 'and' operand are known zero, try setting the
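The TernlogMagic constants above encode each input's truth-table column (A=0xF0, B=0xCC, C=0xAA), so the VPTERNLOG immediate is obtained simply by evaluating the matched expression on those bytes. A short check of that arithmetic (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t A = 0xF0, B = 0xCC, C = 0xAA; // per-input truth tables
  // and(A, or(B, C))  -> 0xF0 & (0xCC | 0xAA) = 0xE0
  assert(uint8_t(A & (B | C)) == 0xE0);
  // xor(A, xor(B, C)) -> 0xF0 ^ 0xCC ^ 0xAA = 0x96 (three-way parity)
  assert(uint8_t(A ^ B ^ C) == 0x96);
  // or(A, and(B, C))  -> 0xF0 | (0xCC & 0xAA) = 0xF8
  assert(uint8_t(A | (B & C)) == 0xF8);
  return 0;
}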
@@ -4282,7 +4282,7 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
// A negative mask allows a smaller encoding. Create a new 'and' node.
SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
- insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
+ insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
ReplaceNode(And, NewAnd.getNode());
SelectCode(NewAnd.getNode());
@@ -4316,15 +4316,15 @@ VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)
- if (FoldedBCast) {
+ if (FoldedBCast) {
switch (TestVT.SimpleTy) {
- VPTESTM_BROADCAST_CASES(rmb)
+ VPTESTM_BROADCAST_CASES(rmb)
}
}
- if (FoldedLoad) {
+ if (FoldedLoad) {
switch (TestVT.SimpleTy) {
- VPTESTM_FULL_CASES(rm)
+ VPTESTM_FULL_CASES(rm)
}
}
@@ -4383,56 +4383,56 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
}
}
- // Without VLX we need to widen the operation.
+ // Without VLX we need to widen the operation.
bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
- auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
- SDValue &Base, SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
- // If we need to widen, we can't fold the load.
- if (!Widen)
- if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
- return true;
+ auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
+ SDValue &Base, SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ // If we need to widen, we can't fold the load.
+ if (!Widen)
+ if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
+ return true;
- // If we didn't fold a load, try to match broadcast. No widening limitation
- // for this. But only 32 and 64 bit types are supported.
- if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
- return false;
+ // If we didn't fold a load, try to match broadcast. No widening limitation
+ // for this. But only 32 and 64 bit types are supported.
+ if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
+ return false;
// Look through single use bitcasts.
- if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
- P = L.getNode();
- L = L.getOperand(0);
+ if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
+ P = L.getNode();
+ L = L.getOperand(0);
}
- if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
- return false;
-
- auto *MemIntr = cast<MemIntrinsicSDNode>(L);
- if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
- return false;
+ if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
- return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
+ auto *MemIntr = cast<MemIntrinsicSDNode>(L);
+ if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
+ return false;
+
+ return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
};
- // We can only fold loads if the sources are unique.
- bool CanFoldLoads = Src0 != Src1;
-
- bool FoldedLoad = false;
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (CanFoldLoads) {
- FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
- Tmp3, Tmp4);
- if (!FoldedLoad) {
- // And is commutative.
- FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
- Tmp2, Tmp3, Tmp4);
- if (FoldedLoad)
- std::swap(Src0, Src1);
+ // We can only fold loads if the sources are unique.
+ bool CanFoldLoads = Src0 != Src1;
+
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (CanFoldLoads) {
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4);
+ if (!FoldedLoad) {
+ // And is commutative.
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
+ Tmp2, Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(Src0, Src1);
}
}
- bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
+ bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
bool IsMasked = InMask.getNode() != nullptr;
@@ -4456,7 +4456,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
if (IsMasked) {
// Widen the mask.
- unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
+ unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, MaskVT, InMask, RC), 0);
@@ -4468,23 +4468,23 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
IsMasked);
MachineSDNode *CNode;
- if (FoldedLoad) {
+ if (FoldedLoad) {
SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
if (IsMasked) {
SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
- Src1.getOperand(0) };
+ Src1.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
} else {
SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
- Src1.getOperand(0) };
+ Src1.getOperand(0) };
CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
}
// Update the chain.
- ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
+ ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
// Record the mem-refs
- CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
+ CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
} else {
if (IsMasked)
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
@@ -4494,7 +4494,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
// If we widened, we need to shrink the mask VT.
if (Widen) {
- unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
+ unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
dl, ResVT, SDValue(CNode, 0), RC);
@@ -4550,9 +4550,9 @@ bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
ReplaceNode(N, Ternlog.getNode());
-
- return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
- A, B, C, 0xCA);
+
+ return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
+ A, B, C, 0xCA);
}
void X86DAGToDAGISel::Select(SDNode *Node) {
@@ -4568,95 +4568,95 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (Opcode) {
default: break;
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = Node->getConstantOperandVal(1);
- switch (IntNo) {
- default: break;
- case Intrinsic::x86_encodekey128:
- case Intrinsic::x86_encodekey256: {
- if (!Subtarget->hasKL())
- break;
-
- unsigned Opcode;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
- case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break;
- case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break;
- }
-
- SDValue Chain = Node->getOperand(0);
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
- SDValue());
- if (Opcode == X86::ENCODEKEY256)
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
- Chain.getValue(1));
-
- MachineSDNode *Res = CurDAG->getMachineNode(
- Opcode, dl, Node->getVTList(),
- {Node->getOperand(2), Chain, Chain.getValue(1)});
- ReplaceNode(Node, Res);
- return;
- }
- case Intrinsic::x86_tileloadd64_internal: {
- if (!Subtarget->hasAMXTILE())
- break;
- unsigned Opc = X86::PTILELOADDV;
- // _tile_loadd_internal(row, col, buf, STRIDE)
- SDValue Base = Node->getOperand(4);
- SDValue Scale = getI8Imm(1, dl);
- SDValue Index = Node->getOperand(5);
- SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
- SDValue Chain = Node->getOperand(0);
- MachineSDNode *CNode;
- SDValue Ops[] = {Node->getOperand(2),
- Node->getOperand(3),
- Base,
- Scale,
- Index,
- Disp,
- Segment,
- CFG,
- Chain};
- CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
- case Intrinsic::x86_tdpbssd_internal: {
- if (!Subtarget->hasAMXTILE())
- break;
- SDValue Chain = Node->getOperand(0);
- unsigned Opc = X86::PTDPBSSDV;
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
- SDValue Ops[] = {Node->getOperand(2),
- Node->getOperand(3),
- Node->getOperand(4),
- Node->getOperand(5),
- Node->getOperand(6),
- Node->getOperand(7),
- CFG,
- Chain};
- MachineSDNode *CNode =
- CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
- case Intrinsic::x86_tilezero_internal: {
- if (!Subtarget->hasAMXTILE())
- break;
- unsigned Opc = X86::PTILEZEROV;
- SDValue Chain = Node->getOperand(0);
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
- SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
- MachineSDNode *CNode =
- CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
- }
- break;
- }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = Node->getConstantOperandVal(1);
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_encodekey128:
+ case Intrinsic::x86_encodekey256: {
+ if (!Subtarget->hasKL())
+ break;
+
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break;
+ case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break;
+ }
+
+ SDValue Chain = Node->getOperand(0);
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
+ SDValue());
+ if (Opcode == X86::ENCODEKEY256)
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
+ Chain.getValue(1));
+
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ Opcode, dl, Node->getVTList(),
+ {Node->getOperand(2), Chain, Chain.getValue(1)});
+ ReplaceNode(Node, Res);
+ return;
+ }
+ case Intrinsic::x86_tileloadd64_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILELOADDV;
+ // _tile_loadd_internal(row, col, buf, STRIDE)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tdpbssd_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ SDValue Chain = Node->getOperand(0);
+ unsigned Opc = X86::PTDPBSSDV;
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Node->getOperand(4),
+ Node->getOperand(5),
+ Node->getOperand(6),
+ Node->getOperand(7),
+ CFG,
+ Chain};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ case Intrinsic::x86_tilezero_internal: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc = X86::PTILEZEROV;
+ SDValue Chain = Node->getOperand(0);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
+ MachineSDNode *CNode =
+ CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
+ }
+ break;
+ }
case ISD::INTRINSIC_VOID: {
unsigned IntNo = Node->getConstantOperandVal(1);
switch (IntNo) {
@@ -4711,31 +4711,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
- case Intrinsic::x86_tilestored64_internal: {
- unsigned Opc = X86::PTILESTOREDV;
- // _tile_stored_internal(row, col, buf, STRIDE, c)
- SDValue Base = Node->getOperand(4);
- SDValue Scale = getI8Imm(1, dl);
- SDValue Index = Node->getOperand(5);
- SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i16);
- SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
- SDValue Chain = Node->getOperand(0);
- MachineSDNode *CNode;
- SDValue Ops[] = {Node->getOperand(2),
- Node->getOperand(3),
- Base,
- Scale,
- Index,
- Disp,
- Segment,
- Node->getOperand(6),
- CFG,
- Chain};
- CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
- ReplaceNode(Node, CNode);
- return;
- }
+ case Intrinsic::x86_tilestored64_internal: {
+ unsigned Opc = X86::PTILESTOREDV;
+ // _tile_stored_internal(row, col, buf, STRIDE, c)
+ SDValue Base = Node->getOperand(4);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(5);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ SDValue Ops[] = {Node->getOperand(2),
+ Node->getOperand(3),
+ Base,
+ Scale,
+ Index,
+ Disp,
+ Segment,
+ Node->getOperand(6),
+ CFG,
+ Chain};
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ ReplaceNode(Node, CNode);
+ return;
+ }
case Intrinsic::x86_tileloadd64:
case Intrinsic::x86_tileloaddt164:
case Intrinsic::x86_tilestored64: {
@@ -4816,19 +4816,19 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
return;
break;
- case X86ISD::VPTERNLOG: {
- uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
- if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0),
- Node->getOperand(1), Node->getOperand(2), Imm))
- return;
- break;
- }
-
- case X86ISD::ANDNP:
- if (tryVPTERNLOG(Node))
- return;
- break;
-
+ case X86ISD::VPTERNLOG: {
+ uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
+ if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2), Imm))
+ return;
+ break;
+ }
+
+ case X86ISD::ANDNP:
+ if (tryVPTERNLOG(Node))
+ return;
+ break;
+
case ISD::AND:
if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
// Try to form a masked VPTESTM. Operands can be in either order.
@@ -5927,63 +5927,63 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->RemoveDeadNode(Node);
return;
}
- case X86ISD::AESENCWIDE128KL:
- case X86ISD::AESDECWIDE128KL:
- case X86ISD::AESENCWIDE256KL:
- case X86ISD::AESDECWIDE256KL: {
- if (!Subtarget->hasWIDEKL())
- break;
-
- unsigned Opcode;
- switch (Node->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode!");
- case X86ISD::AESENCWIDE128KL:
- Opcode = X86::AESENCWIDE128KL;
- break;
- case X86ISD::AESDECWIDE128KL:
- Opcode = X86::AESDECWIDE128KL;
- break;
- case X86ISD::AESENCWIDE256KL:
- Opcode = X86::AESENCWIDE256KL;
- break;
- case X86ISD::AESDECWIDE256KL:
- Opcode = X86::AESDECWIDE256KL;
- break;
- }
-
- SDValue Chain = Node->getOperand(0);
- SDValue Addr = Node->getOperand(1);
-
- SDValue Base, Scale, Index, Disp, Segment;
- if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
- break;
-
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
- SDValue());
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
- Chain.getValue(1));
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
- Chain.getValue(1));
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
- Chain.getValue(1));
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
- Chain.getValue(1));
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
- Chain.getValue(1));
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
- Chain.getValue(1));
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
- Chain.getValue(1));
-
- MachineSDNode *Res = CurDAG->getMachineNode(
- Opcode, dl, Node->getVTList(),
- {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
- CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
- ReplaceNode(Node, Res);
- return;
- }
+ case X86ISD::AESENCWIDE128KL:
+ case X86ISD::AESDECWIDE128KL:
+ case X86ISD::AESENCWIDE256KL:
+ case X86ISD::AESDECWIDE256KL: {
+ if (!Subtarget->hasWIDEKL())
+ break;
+
+ unsigned Opcode;
+ switch (Node->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case X86ISD::AESENCWIDE128KL:
+ Opcode = X86::AESENCWIDE128KL;
+ break;
+ case X86ISD::AESDECWIDE128KL:
+ Opcode = X86::AESDECWIDE128KL;
+ break;
+ case X86ISD::AESENCWIDE256KL:
+ Opcode = X86::AESENCWIDE256KL;
+ break;
+ case X86ISD::AESDECWIDE256KL:
+ Opcode = X86::AESDECWIDE256KL;
+ break;
+ }
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue Addr = Node->getOperand(1);
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
+ break;
+
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
+ SDValue());
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
+ Chain.getValue(1));
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
+ Chain.getValue(1));
+
+ MachineSDNode *Res = CurDAG->getMachineNode(
+ Opcode, dl, Node->getVTList(),
+ {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
+ CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
+ ReplaceNode(Node, Res);
+ return;
}
+ }
SelectCode(Node);
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp
index 1e2407c7e7..c1320facd0 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp
@@ -35,7 +35,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -77,14 +77,14 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
-static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
- "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
- cl::desc(
- "Sets the preferable loop alignment for experiments (as log2 bytes) "
- "for innermost loops only. If specified, this option overrides "
- "alignment set by x86-experimental-pref-loop-alignment."),
- cl::Hidden);
-
+static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
+ "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
+ cl::desc(
+ "Sets the preferable loop alignment for experiments (as log2 bytes) "
+ "for innermost loops only. If specified, this option overrides "
+ "alignment set by x86-experimental-pref-loop-alignment."),
+ cl::Hidden);
+
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -144,24 +144,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addBypassSlowDiv(64, 32);
}
- // Set up Windows compiler runtime calls.
- if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
- static const struct {
- const RTLIB::Libcall Op;
- const char * const Name;
- const CallingConv::ID CC;
- } LibraryCalls[] = {
- { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
- { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
- { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
- { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
- { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
- };
-
- for (const auto &LC : LibraryCalls) {
- setLibcallName(LC.Op, LC.Name);
- setLibcallCallingConv(LC.Op, LC.CC);
- }
+ // Set up Windows compiler runtime calls.
+ if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
+ static const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const CallingConv::ID CC;
+ } LibraryCalls[] = {
+ { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
+ { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
+ { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
+ { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
+ { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ setLibcallCallingConv(LC.Op, LC.CC);
+ }
}
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
@@ -207,8 +207,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
setOperationAction(ISD::ABS , MVT::i32 , Custom);
- if (Subtarget.is64Bit())
- setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
}
// Funnel shifts.
@@ -293,19 +293,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (Subtarget.hasSSE2()) {
- // Custom lowering for saturating float to int conversions.
- // We handle promotion to larger result types manually.
- for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
- setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
- setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
- }
- if (Subtarget.is64Bit()) {
- setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
- setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
- }
- }
-
+ if (Subtarget.hasSSE2()) {
+ // Custom lowering for saturating float to int conversions.
+ // We handle promotion to larger result types manually.
+ for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
+ }
+ if (Subtarget.is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
+ }
+ }
+
// Handle address space casts between mixed sized pointers.
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
@@ -412,7 +412,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
- setOperationAction(ISD::PARITY, MVT::i8, Custom);
+ setOperationAction(ISD::PARITY, MVT::i8, Custom);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
@@ -423,11 +423,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
else
setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
-
- setOperationAction(ISD::PARITY, MVT::i16, Custom);
- setOperationAction(ISD::PARITY, MVT::i32, Custom);
- if (Subtarget.is64Bit())
- setOperationAction(ISD::PARITY, MVT::i64, Custom);
+
+ setOperationAction(ISD::PARITY, MVT::i16, Custom);
+ setOperationAction(ISD::PARITY, MVT::i32, Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::PARITY, MVT::i64, Custom);
}
setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
@@ -521,7 +521,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
- setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -1114,8 +1114,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
- setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
+ setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
setOperationAction(ISD::FROUND, RoundedTy, Custom);
}
@@ -1129,8 +1129,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
- setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
-
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1171,10 +1171,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
- setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
- }
-
+ if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -1216,8 +1216,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FROUNDEVEN, VT, Legal);
- setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
@@ -1345,10 +1345,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
- setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
@@ -1607,8 +1607,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FROUNDEVEN, VT, Legal);
- setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
setOperationAction(ISD::FROUND, VT, Custom);
}
@@ -1737,17 +1737,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasVBMI2()) {
- for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
- MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
+ MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
-
- setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
- setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
- setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
- setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
+
+ setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
}
}// useAVX512Regs
@@ -1919,10 +1919,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
- if (Subtarget.hasAMXTILE()) {
- addRegisterClass(MVT::x86amx, &X86::TILERegClass);
- }
-
+ if (Subtarget.hasAMXTILE()) {
+ addRegisterClass(MVT::x86amx, &X86::TILERegClass);
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -1952,8 +1952,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ADDCARRY, VT, Custom);
setOperationAction(ISD::SUBCARRY, VT, Custom);
setOperationAction(ISD::SETCCCARRY, VT, Custom);
- setOperationAction(ISD::SADDO_CARRY, VT, Custom);
- setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
+ setOperationAction(ISD::SADDO_CARRY, VT, Custom);
+ setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
@@ -2507,21 +2507,21 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
- unsigned AddressSpace = getAddressSpace();
- // Specifically, some users may customize the base reg and offset.
- unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset;
- // If we don't set -stack-protector-guard-offset value:
+ unsigned AddressSpace = getAddressSpace();
+ // Specifically, some users may customize the base reg and offset.
+ unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset;
+ // If we don't set -stack-protector-guard-offset value:
// %fs:0x28, unless we're using a Kernel code model, in which case
// it's %gs:0x28. gs:0x14 on i386.
- if (Offset == (unsigned)-1)
- Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
-
- const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
- if (GuardReg == "fs")
- AddressSpace = X86AS::FS;
- else if (GuardReg == "gs")
- AddressSpace = X86AS::GS;
- return SegmentOffset(IRB, Offset, AddressSpace);
+ if (Offset == (unsigned)-1)
+ Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+
+ const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
+ if (GuardReg == "fs")
+ AddressSpace = X86AS::FS;
+ else if (GuardReg == "gs")
+ AddressSpace = X86AS::GS;
+ return SegmentOffset(IRB, Offset, AddressSpace);
}
}
return TargetLowering::getIRStackGuard(IRB);
@@ -2545,13 +2545,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
}
return;
}
-
- auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
-
+
+ auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
+
// glibc, bionic, and Fuchsia have a special slot for the stack guard.
- if ((GuardMode == llvm::StackProtectorGuards::TLS ||
- GuardMode == llvm::StackProtectorGuards::None)
- && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
+ if ((GuardMode == llvm::StackProtectorGuards::TLS ||
+ GuardMode == llvm::StackProtectorGuards::None)
+ && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -3101,9 +3101,9 @@ SDValue X86TargetLowering::LowerCallResult(
// This truncation won't change the value.
DAG.getIntPtrConstant(1, dl));
- if (VA.isExtInLoc()) {
+ if (VA.isExtInLoc()) {
if (VA.getValVT().isVector() &&
- VA.getValVT().getScalarType() == MVT::i1 &&
+ VA.getValVT().getScalarType() == MVT::i1 &&
((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
(VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
@@ -3171,7 +3171,7 @@ argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
- SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
+ SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
return DAG.getMemcpy(
Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
@@ -3420,8 +3420,8 @@ private:
void forwardMustTailParameters(SDValue &Chain);
- bool is64Bit() const { return Subtarget.is64Bit(); }
- bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
+ bool is64Bit() const { return Subtarget.is64Bit(); }
+ bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
X86MachineFunctionInfo *FuncInfo;
const SDLoc &DL;
@@ -3532,10 +3532,10 @@ void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(
- DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
+ DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
SaveXMMOps.push_back(
- DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
- llvm::append_range(SaveXMMOps, LiveXMMRegs);
+ DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
+ llvm::append_range(SaveXMMOps, LiveXMMRegs);
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
MVT::Other, SaveXMMOps));
}
@@ -3809,7 +3809,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
- int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -3916,7 +3916,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
- bool IsIndirectCall = (CI && CI->isIndirectCall());
+ bool IsIndirectCall = (CI && CI->isIndirectCall());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
@@ -4156,13 +4156,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
- // GOT pointer (except regcall).
+ // GOT pointer (except regcall).
if (!isTailCall) {
- // Indirect call with RegCall calling convention may use up all the
- // general registers, so it is not suitable to bind EBX register for
- // GOT address, just let register allocator handle it.
- if (CallConv != CallingConv::X86_RegCall)
- RegsToPass.push_back(std::make_pair(
+ // Indirect call with RegCall calling convention may use up all the
+ // general registers, so it is not suitable to bind EBX register for
+ // GOT address, just let register allocator handle it.
+ if (CallConv != CallingConv::X86_RegCall)
+ RegsToPass.push_back(std::make_pair(
Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
@@ -4329,7 +4329,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Ops.push_back(Callee);
if (isTailCall)
- Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
+ Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
@@ -4403,7 +4403,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return Ret;
}
- if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
+ if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
} else {
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
@@ -4522,7 +4522,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!VR.isVirtual())
+ if (!VR.isVirtual())
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -4574,8 +4574,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
return false;
- if (VA.getLocVT().getFixedSizeInBits() >
- Arg.getValueSizeInBits().getFixedSize()) {
+ if (VA.getLocVT().getFixedSizeInBits() >
+ Arg.getValueSizeInBits().getFixedSize()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
@@ -5083,47 +5083,47 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
unsigned Intrinsic) const {
- Info.flags = MachineMemOperand::MONone;
- Info.offset = 0;
+ Info.flags = MachineMemOperand::MONone;
+ Info.offset = 0;
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
- if (!IntrData) {
- switch (Intrinsic) {
- case Intrinsic::x86_aesenc128kl:
- case Intrinsic::x86_aesdec128kl:
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = I.getArgOperand(1);
- Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
- Info.align = Align(1);
- Info.flags |= MachineMemOperand::MOLoad;
- return true;
- case Intrinsic::x86_aesenc256kl:
- case Intrinsic::x86_aesdec256kl:
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = I.getArgOperand(1);
- Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
- Info.align = Align(1);
- Info.flags |= MachineMemOperand::MOLoad;
- return true;
- case Intrinsic::x86_aesencwide128kl:
- case Intrinsic::x86_aesdecwide128kl:
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
- Info.align = Align(1);
- Info.flags |= MachineMemOperand::MOLoad;
- return true;
- case Intrinsic::x86_aesencwide256kl:
- case Intrinsic::x86_aesdecwide256kl:
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.ptrVal = I.getArgOperand(0);
- Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
- Info.align = Align(1);
- Info.flags |= MachineMemOperand::MOLoad;
- return true;
- }
+ if (!IntrData) {
+ switch (Intrinsic) {
+ case Intrinsic::x86_aesenc128kl:
+ case Intrinsic::x86_aesdec128kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesenc256kl:
+ case Intrinsic::x86_aesdec256kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(1);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesencwide128kl:
+ case Intrinsic::x86_aesdecwide128kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::x86_aesencwide256kl:
+ case Intrinsic::x86_aesdecwide256kl:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
+ Info.align = Align(1);
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
return false;
- }
+ }
switch (IntrData->Type) {
case TRUNCATE_TO_MEM_VI8:
@@ -5193,7 +5193,7 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
-
+
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@@ -5366,7 +5366,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
// width.
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return false;
-
+
return true;
}
@@ -5510,14 +5510,14 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}
-/// Return true if every element in Mask is the undef sentinel value or equal to
-/// the specified value.
-static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
- return llvm::all_of(Mask, [CmpVal](int M) {
- return (M == SM_SentinelUndef) || (M == CmpVal);
- });
-}
-
+/// Return true if every element in Mask is the undef sentinel value or equal to
+/// the specified value.
+static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
+ return llvm::all_of(Mask, [CmpVal](int M) {
+ return (M == SM_SentinelUndef) || (M == CmpVal);
+ });
+}
+
/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
@@ -5924,7 +5924,7 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl) {
- assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
+ assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
Vec.getValueType().getScalarType() == VT.getScalarType() &&
"Unsupported vector widening type");
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
@@ -6288,22 +6288,22 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
return DAG.getBitcast(VT, Vec);
}
-// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
-static unsigned getOpcode_EXTEND(unsigned Opcode) {
- switch (Opcode) {
- case ISD::ANY_EXTEND:
- case ISD::ANY_EXTEND_VECTOR_INREG:
- return ISD::ANY_EXTEND;
- case ISD::ZERO_EXTEND:
- case ISD::ZERO_EXTEND_VECTOR_INREG:
- return ISD::ZERO_EXTEND;
- case ISD::SIGN_EXTEND:
- case ISD::SIGN_EXTEND_VECTOR_INREG:
- return ISD::SIGN_EXTEND;
- }
- llvm_unreachable("Unknown opcode");
-}
-
+// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
+static unsigned getOpcode_EXTEND(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ISD::ANY_EXTEND;
+ case ISD::ZERO_EXTEND:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ISD::ZERO_EXTEND;
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ISD::SIGN_EXTEND;
+ }
+ llvm_unreachable("Unknown opcode");
+}
+
// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
switch (Opcode) {
@@ -6320,8 +6320,8 @@ static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
llvm_unreachable("Unknown opcode");
}
-static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
- SDValue In, SelectionDAG &DAG) {
+static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue In, SelectionDAG &DAG) {
EVT InVT = In.getValueType();
assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
@@ -6373,10 +6373,10 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
return SDValue();
}
-void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
+void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
bool Lo, bool Unary) {
- assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
- "Illegal vector type to unpack");
+ assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
+ "Illegal vector type to unpack");
assert(Mask.empty() && "Expected an empty shuffle mask vector");
int NumElts = VT.getVectorNumElements();
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
@@ -6405,7 +6405,7 @@ void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
}
/// Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
@@ -6413,7 +6413,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
}
/// Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
+static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
SDValue V1, SDValue V2) {
SmallVector<int, 8> Mask;
createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
@@ -6660,30 +6660,30 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a subvector broadcast.
- if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
- auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
- SDValue Ptr = MemIntr->getBasePtr();
- if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
- Type *CstTy = Cst->getType();
- unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
- if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
- return false;
- unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
- unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
- unsigned NumSubVecs = SizeInBits / CstSizeInBits;
- APInt UndefSubElts(NumSubElts, 0);
- SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
- APInt(SubEltSizeInBits, 0));
- for (unsigned i = 0; i != NumSubElts; ++i) {
- if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
- UndefSubElts, i))
- return false;
- for (unsigned j = 1; j != NumSubVecs; ++j)
- SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
- }
- UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
- UndefSubElts);
- return CastBitData(UndefSubElts, SubEltBits);
+ if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
+ Type *CstTy = Cst->getType();
+ unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+ if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+ return false;
+ unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
+ unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+ APInt UndefSubElts(NumSubElts, 0);
+ SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
+ APInt(SubEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSubElts; ++i) {
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
+ UndefSubElts, i))
+ return false;
+ for (unsigned j = 1; j != NumSubVecs; ++j)
+ SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
+ }
+ UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
+ UndefSubElts);
+ return CastBitData(UndefSubElts, SubEltBits);
}
}
@@ -6704,26 +6704,26 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
// Insert constant bits from a base and sub vector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
- // If bitcasts to larger elements we might lose track of undefs - don't
- // allow any to be safe.
- unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
- bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
-
- APInt UndefSrcElts, UndefSubElts;
- SmallVector<APInt, 32> EltSrcBits, EltSubBits;
- if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
+ // If bitcasts to larger elements we might lose track of undefs - don't
+ // allow any to be safe.
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
+
+ APInt UndefSrcElts, UndefSubElts;
+ SmallVector<APInt, 32> EltSrcBits, EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
UndefSubElts, EltSubBits,
- AllowWholeUndefs && AllowUndefs,
- AllowPartialUndefs && AllowUndefs) &&
- getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
- UndefSrcElts, EltSrcBits,
- AllowWholeUndefs && AllowUndefs,
- AllowPartialUndefs && AllowUndefs)) {
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
+ UndefSrcElts, EltSrcBits,
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
- UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
+ UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
- EltSrcBits[BaseIdx + i] = EltSubBits[i];
- return CastBitData(UndefSrcElts, EltSrcBits);
+ EltSrcBits[BaseIdx + i] = EltSubBits[i];
+ return CastBitData(UndefSrcElts, EltSrcBits);
}
}
@@ -6836,7 +6836,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
return false;
// Insert the extracted elements into the mask.
- for (const APInt &Elt : EltBits)
+ for (const APInt &Elt : EltBits)
RawMask.push_back(Elt.getZExtValue());
return true;
@@ -7517,8 +7517,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
case ISD::OR: {
// Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
// is a valid shuffle index.
- SDValue N0 = peekThroughBitcasts(N.getOperand(0));
- SDValue N1 = peekThroughBitcasts(N.getOperand(1));
+ SDValue N0 = peekThroughBitcasts(N.getOperand(0));
+ SDValue N1 = peekThroughBitcasts(N.getOperand(1));
if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
@@ -7533,20 +7533,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVector<int, 64> Mask0, Mask1;
narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
- for (int i = 0; i != (int)MaskSize; ++i) {
+ for (int i = 0; i != (int)MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
Mask.push_back(SM_SentinelZero);
else if (Mask1[i] == SM_SentinelZero)
- Mask.push_back(i);
+ Mask.push_back(i);
else if (Mask0[i] == SM_SentinelZero)
- Mask.push_back(i + MaskSize);
+ Mask.push_back(i + MaskSize);
else
return false;
}
- Ops.push_back(N0);
- Ops.push_back(N1);
+ Ops.push_back(N0);
+ Ops.push_back(N1);
return true;
}
case ISD::INSERT_SUBVECTOR: {
@@ -7578,8 +7578,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// Subvector shuffle inputs must not be larger than the subvector.
if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
- return SubVT.getFixedSizeInBits() <
- SubInput.getValueSizeInBits().getFixedSize();
+ return SubVT.getFixedSizeInBits() <
+ SubInput.getValueSizeInBits().getFixedSize();
}))
return false;
@@ -7600,11 +7600,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
Ops.push_back(Src);
Ops.append(SubInputs.begin(), SubInputs.end());
- if (ISD::isBuildVectorAllZeros(Src.getNode()))
- Mask.append(NumElts, SM_SentinelZero);
- else
- for (int i = 0; i != (int)NumElts; ++i)
- Mask.push_back(i);
+ if (ISD::isBuildVectorAllZeros(Src.getNode()))
+ Mask.append(NumElts, SM_SentinelZero);
+ else
+ for (int i = 0; i != (int)NumElts; ++i)
+ Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
int M = SubMask[i];
if (0 <= M) {
@@ -7705,33 +7705,33 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
APInt EltsLHS, EltsRHS;
getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
- // If we know input saturation won't happen (or we don't care for particular
- // lanes), we can treat this as a truncation shuffle.
- bool Offset0 = false, Offset1 = false;
+ // If we know input saturation won't happen (or we don't care for particular
+ // lanes), we can treat this as a truncation shuffle.
+ bool Offset0 = false, Offset1 = false;
if (Opcode == X86ISD::PACKSS) {
- if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
+ if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
- (!(N1.isUndef() || EltsRHS.isNullValue()) &&
+ (!(N1.isUndef() || EltsRHS.isNullValue()) &&
DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
- // We can't easily fold ASHR into a shuffle, but if it was feeding a
- // PACKSS then it was likely being used for sign-extension for a
- // truncation, so just peek through and adjust the mask accordingly.
- if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
- N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
- Offset0 = true;
- N0 = N0.getOperand(0);
- }
- if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
- N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
- Offset1 = true;
- N1 = N1.getOperand(0);
- }
+ // We can't easily fold ASHR into a shuffle, but if it was feeding a
+ // PACKSS then it was likely being used for sign-extension for a
+ // truncation, so just peek through and adjust the mask accordingly.
+ if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
+ N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset0 = true;
+ N0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
+ N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
+ Offset1 = true;
+ N1 = N1.getOperand(0);
+ }
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
+ if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
!DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
- (!(N1.isUndef() || EltsRHS.isNullValue()) &&
+ (!(N1.isUndef() || EltsRHS.isNullValue()) &&
!DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
@@ -7743,13 +7743,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Ops.push_back(N1);
createPackShuffleMask(VT, Mask, IsUnary);
-
- if (Offset0 || Offset1) {
- for (int &M : Mask)
- if ((Offset0 && isInRange(M, 0, NumElts)) ||
- (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
- ++M;
- }
+
+ if (Offset0 || Offset1) {
+ for (int &M : Mask)
+ if ((Offset0 && isInRange(M, 0, NumElts)) ||
+ (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
+ ++M;
+ }
return true;
}
case X86ISD::VTRUNC: {
@@ -8037,7 +8037,7 @@ static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
}
// Use PINSRB/PINSRW/PINSRD to create a build vector.
-static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
+static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8052,7 +8052,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
bool First = true;
for (unsigned i = 0; i < NumElts; ++i) {
- bool IsNonZero = NonZeroMask[i];
+ bool IsNonZero = NonZeroMask[i];
if (!IsNonZero)
continue;
@@ -8079,7 +8079,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
}
/// Custom lower build_vector of v16i8.
-static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
+static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8088,7 +8088,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41())
- return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
+ return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
Subtarget);
SDLoc dl(Op);
@@ -8096,8 +8096,8 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; i += 2) {
- bool ThisIsNonZero = NonZeroMask[i];
- bool NextIsNonZero = NonZeroMask[i + 1];
+ bool ThisIsNonZero = NonZeroMask[i];
+ bool NextIsNonZero = NonZeroMask[i + 1];
if (!ThisIsNonZero && !NextIsNonZero)
continue;
@@ -8145,7 +8145,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
}
/// Custom lower build_vector of v8i16.
-static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
+static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8153,7 +8153,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
return SDValue();
// Use PINSRW to insert each byte directly.
- return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
+ return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
Subtarget);
}
@@ -8487,8 +8487,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
- int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
- int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
+ int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
+ int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
// TODO: Support offsetting the base load.
@@ -8550,7 +8550,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// base pointer. If the vector contains zeros, then attempt to shuffle those
// elements.
if (FirstLoadedElt == 0 &&
- (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
+ (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
@@ -8638,11 +8638,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (!Subtarget.hasAVX2() && ScalarSize < 32)
continue;
- // Don't attempt a 1:N subvector broadcast - it should be caught by
- // combineConcatVectorOps, else will cause infinite loops.
- if (RepeatSize > ScalarSize && SubElems == 1)
- continue;
-
+ // Don't attempt a 1:N subvector broadcast - it should be caught by
+ // combineConcatVectorOps, else will cause infinite loops.
+ if (RepeatSize > ScalarSize && SubElems == 1)
+ continue;
+
bool Match = true;
SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
for (unsigned i = 0; i != NumElems && Match; ++i) {
@@ -8674,14 +8674,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (TLI.isTypeLegal(BroadcastVT)) {
if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
- SDValue Broadcast = RepeatLoad;
- if (RepeatSize > ScalarSize) {
- while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
- Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
- } else {
- Broadcast =
- DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
- }
+ SDValue Broadcast = RepeatLoad;
+ if (RepeatSize > ScalarSize) {
+ while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
+ Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
+ } else {
+ Broadcast =
+ DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
+ }
return DAG.getBitcast(VT, Broadcast);
}
}
@@ -8769,21 +8769,21 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
return SDValue();
MVT VT = BVOp->getSimpleValueType(0);
- unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumElts = VT.getVectorNumElements();
SDLoc dl(BVOp);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Unsupported vector type for broadcast.");
- // See if the build vector is a repeating sequence of scalars (inc. splat).
- SDValue Ld;
+ // See if the build vector is a repeating sequence of scalars (inc. splat).
+ SDValue Ld;
BitVector UndefElements;
- SmallVector<SDValue, 16> Sequence;
- if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
- assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
- if (Sequence.size() == 1)
- Ld = Sequence[0];
- }
+ SmallVector<SDValue, 16> Sequence;
+ if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
+ assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
+ if (Sequence.size() == 1)
+ Ld = Sequence[0];
+ }
// Attempt to use VBROADCASTM
// From this pattern:
@@ -8791,34 +8791,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// b. t1 = (build_vector t0 t0)
//
// Create (VBROADCASTM v2i1 X)
- if (!Sequence.empty() && Subtarget.hasCDI()) {
- // If not a splat, are the upper sequence values zeroable?
- unsigned SeqLen = Sequence.size();
- bool UpperZeroOrUndef =
- SeqLen == 1 ||
- llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
- return !V || V.isUndef() || isNullConstant(V);
- });
- SDValue Op0 = Sequence[0];
- if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
- (Op0.getOpcode() == ISD::ZERO_EXTEND &&
- Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
- SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
- ? Op0.getOperand(0)
- : Op0.getOperand(0).getOperand(0);
+ if (!Sequence.empty() && Subtarget.hasCDI()) {
+ // If not a splat, are the upper sequence values zeroable?
+ unsigned SeqLen = Sequence.size();
+ bool UpperZeroOrUndef =
+ SeqLen == 1 ||
+ llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
+ return !V || V.isUndef() || isNullConstant(V);
+ });
+ SDValue Op0 = Sequence[0];
+ if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
+ (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
+ SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
+ ? Op0.getOperand(0)
+ : Op0.getOperand(0).getOperand(0);
MVT MaskVT = BOperand.getSimpleValueType();
- MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
- if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
+ MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
+ if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
(EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
- MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
- if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
- unsigned Scale = 512 / VT.getSizeInBits();
- BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
- }
- SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
- if (BcstVT.getSizeInBits() != VT.getSizeInBits())
- Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
- return DAG.getBitcast(VT, Bcst);
+ MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ unsigned Scale = 512 / VT.getSizeInBits();
+ BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
+ }
+ SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
+ if (BcstVT.getSizeInBits() != VT.getSizeInBits())
+ Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
+ return DAG.getBitcast(VT, Bcst);
}
}
}
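The VBROADCASTM path above chooses an integer element type of scalar-size × SeqLen and, when the vector is narrower than 512 bits and VLX is unavailable, scales the element count so the broadcast is built at 512 bits and extracted afterwards. A hedged sketch of just that type arithmetic (the struct and function names are made up for illustration):

#include <cstdio>

struct BcstType { unsigned EltBits; unsigned NumElts; bool NeedsExtract; };

// EltType = scalar-size * SeqLen; without VLX a sub-512-bit result is
// first formed at 512 bits (scale = 512 / VTBits) and extracted afterwards.
static BcstType pickBroadcastMType(unsigned VTBits, unsigned ScalarBits,
                                   unsigned NumElts, unsigned SeqLen,
                                   bool HasVLX) {
  unsigned EltBits = ScalarBits * SeqLen;
  unsigned BcstElts = NumElts / SeqLen;
  bool Widen = (VTBits != 512) && !HasVLX;
  if (Widen)
    BcstElts *= 512 / VTBits;
  return {EltBits, BcstElts, Widen};
}

int main() {
  // 256-bit vector, 32-bit scalars, 2-element repeating sequence, no VLX:
  // broadcast as 8 x i64 at 512 bits, then extract the low 256 bits.
  BcstType T = pickBroadcastMType(256, 32, 8, 2, /*HasVLX=*/false);
  std::printf("i%u x %u, extract=%d\n", T.EltBits, T.NumElts, T.NeedsExtract);
  return 0;
}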
@@ -8868,15 +8868,15 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
- MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
+ MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = {DAG.getEntryNode(), VCP};
- MachinePointerInfo MPI =
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
- return DAG.getMemIntrinsicNode(
- X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
- MachineMemOperand::MOLoad);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), VCP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(
+ X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
}
}
}
@@ -8897,8 +8897,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
- // TODO: Handle broadcasts of non-constant sequences.
-
+ // TODO: Handle broadcasts of non-constant sequences.
+
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
// FIXME: Is the use count needed for non-constant, non-load case?
@@ -10233,69 +10233,69 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
unsigned EVTBits = EltVT.getSizeInBits();
- APInt UndefMask = APInt::getNullValue(NumElems);
- APInt ZeroMask = APInt::getNullValue(NumElems);
- APInt NonZeroMask = APInt::getNullValue(NumElems);
+ APInt UndefMask = APInt::getNullValue(NumElems);
+ APInt ZeroMask = APInt::getNullValue(NumElems);
+ APInt NonZeroMask = APInt::getNullValue(NumElems);
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
- if (Elt.isUndef()) {
- UndefMask.setBit(i);
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
continue;
- }
+ }
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
NumConstants--;
}
- if (X86::isZeroNode(Elt)) {
- ZeroMask.setBit(i);
- } else {
- NonZeroMask.setBit(i);
+ if (X86::isZeroNode(Elt)) {
+ ZeroMask.setBit(i);
+ } else {
+ NonZeroMask.setBit(i);
}
}
- // All undef vector. Return an UNDEF. All zero vectors were handled above.
- if (NonZeroMask == 0) {
- assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ if (NonZeroMask == 0) {
+ assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
return DAG.getUNDEF(VT);
- }
-
- BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
-
- // If the upper elts of a ymm/zmm are undef/zero then we might be better off
- // lowering to a smaller build vector and padding with undef/zero.
- if ((VT.is256BitVector() || VT.is512BitVector()) &&
- !isFoldableUseOfShuffle(BV)) {
- unsigned UpperElems = NumElems / 2;
- APInt UndefOrZeroMask = UndefMask | ZeroMask;
- unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
- if (NumUpperUndefsOrZeros >= UpperElems) {
- if (VT.is512BitVector() &&
- NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
- UpperElems = NumElems - (NumElems / 4);
- bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
- MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
- SDValue NewBV =
- DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
- return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
- }
- }
-
- if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
- return AddSub;
- if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
- return HorizontalOp;
- if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
- return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
- return BitOp;
-
- unsigned NumZero = ZeroMask.countPopulation();
- unsigned NumNonZero = NonZeroMask.countPopulation();
-
+ }
+
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+
+ // If the upper elts of a ymm/zmm are undef/zero then we might be better off
+ // lowering to a smaller build vector and padding with undef/zero.
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ !isFoldableUseOfShuffle(BV)) {
+ unsigned UpperElems = NumElems / 2;
+ APInt UndefOrZeroMask = UndefMask | ZeroMask;
+ unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
+ if (NumUpperUndefsOrZeros >= UpperElems) {
+ if (VT.is512BitVector() &&
+ NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
+ UpperElems = NumElems - (NumElems / 4);
+ bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
+ MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
+ SDValue NewBV =
+ DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
+ return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
+ }
+ }
+
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
+ return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
+ return BitOp;
+
+ unsigned NumZero = ZeroMask.countPopulation();
+ unsigned NumNonZero = NonZeroMask.countPopulation();
+
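The loop above tags every build_vector operand as undef, zero, or non-zero in three per-element bitmasks before any lowering strategy is chosen. The same bookkeeping over a plain array, as an illustrative standalone sketch (std::optional stands in for an undef operand):

#include <cstdint>
#include <optional>
#include <vector>

struct ElementMasks { uint64_t Undef = 0, Zero = 0, NonZero = 0; };

// Mirror the classification loop: one bit per element, mutually exclusive.
static ElementMasks classifyElements(const std::vector<std::optional<int>> &Elts) {
  ElementMasks M;
  for (int i = 0; i < (int)Elts.size(); ++i) {
    if (!Elts[i])              { M.Undef   |= uint64_t(1) << i; continue; }
    if (*Elts[i] == 0)           M.Zero    |= uint64_t(1) << i;
    else                         M.NonZero |= uint64_t(1) << i;
  }
  return M;
}

int main() {
  ElementMasks M = classifyElements({5, std::nullopt, 0, 7});
  // undef = 0b0010, zero = 0b0100, nonzero = 0b1001
  return (M.Undef == 0x2 && M.Zero == 0x4 && M.NonZero == 0x9) ? 0 : 1;
}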
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
@@ -10358,7 +10358,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
- unsigned Idx = NonZeroMask.countTrailingZeros();
+ unsigned Idx = NonZeroMask.countTrailingZeros();
SDValue Item = Op.getOperand(Idx);
// If we have a constant or non-constant insertion into the low element of
@@ -10422,7 +10422,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
- unsigned Idx = NonZeroMask.countTrailingZeros();
+ unsigned Idx = NonZeroMask.countTrailingZeros();
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
@@ -10491,7 +10491,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
- unsigned Idx = NonZeroMask.countTrailingZeros();
+ unsigned Idx = NonZeroMask.countTrailingZeros();
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
@@ -10501,12 +10501,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
- if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
- if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
DAG, Subtarget))
return V;
@@ -10519,7 +10519,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumElems == 4 && NumZero > 0) {
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < 4; ++i) {
- bool isZero = !NonZeroMask[i];
+ bool isZero = !NonZeroMask[i];
if (isZero)
Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
else
@@ -10527,7 +10527,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
for (unsigned i = 0; i < 2; ++i) {
- switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
+ switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
@@ -10544,8 +10544,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
}
- bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
- bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
+ bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
+ bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
int MaskVec[] = {
Reverse1 ? 1 : 0,
Reverse1 ? 0 : 1,
@@ -10817,35 +10817,35 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
-/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
-/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
-/// better support 'repeated mask + lane permute' style shuffles.
-static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
- unsigned ScalarSizeInBits,
- ArrayRef<int> Mask) {
- assert(LaneSizeInBits && ScalarSizeInBits &&
- (LaneSizeInBits % ScalarSizeInBits) == 0 &&
- "Illegal shuffle lane size");
- int NumElts = Mask.size();
- int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
- int NumLanes = NumElts / NumEltsPerLane;
- if (NumLanes > 1) {
- for (int i = 0; i != NumLanes; ++i) {
- int SrcLane = -1;
- for (int j = 0; j != NumEltsPerLane; ++j) {
- int M = Mask[(i * NumEltsPerLane) + j];
- if (M < 0)
- continue;
- int Lane = (M % NumElts) / NumEltsPerLane;
- if (SrcLane >= 0 && SrcLane != Lane)
- return true;
- SrcLane = Lane;
- }
- }
- }
- return false;
-}
-
+/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
+/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
+/// better support 'repeated mask + lane permute' style shuffles.
+static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int NumElts = Mask.size();
+ int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
+ int NumLanes = NumElts / NumEltsPerLane;
+ if (NumLanes > 1) {
+ for (int i = 0; i != NumLanes; ++i) {
+ int SrcLane = -1;
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ int M = Mask[(i * NumEltsPerLane) + j];
+ if (M < 0)
+ continue;
+ int Lane = (M % NumElts) / NumEltsPerLane;
+ if (SrcLane >= 0 && SrcLane != Lane)
+ return true;
+ SrcLane = Lane;
+ }
+ }
+ }
+ return false;
+}
+
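isMultiLaneShuffleMask, restored above, answers one question: does any 128-bit destination lane gather elements from more than one source lane? A standalone rewrite of the same check over a plain mask vector (illustrative only; -1 marks an undef element and the lane size is given in elements):

#include <cassert>
#include <vector>

// True if any destination lane of the shuffle pulls elements from more
// than one source lane.
static bool multiLaneShuffle(const std::vector<int> &Mask, int EltsPerLane) {
  int NumElts = (int)Mask.size();
  int NumLanes = NumElts / EltsPerLane;
  for (int Lane = 0; Lane < NumLanes; ++Lane) {
    int SrcLane = -1;
    for (int j = 0; j < EltsPerLane; ++j) {
      int M = Mask[Lane * EltsPerLane + j];
      if (M < 0)
        continue;                                 // undef, ignore
      int ThisSrc = (M % NumElts) / EltsPerLane;  // fold a second operand in
      if (SrcLane >= 0 && SrcLane != ThisSrc)
        return true;                              // this lane mixes sources
      SrcLane = ThisSrc;
    }
  }
  return false;
}

int main() {
  // v8i32: the lower lane reads lane 0 only, the upper lane mixes lanes 0 and 1.
  assert(multiLaneShuffle({0, 1, 2, 3, 0, 1, 6, 7}, 4));
  assert(!multiLaneShuffle({4, 5, 6, 7, 0, 1, 2, 3}, 4));
  return 0;
}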
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
@@ -10907,11 +10907,11 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
-static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
- unsigned EltSizeInBits,
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
+ unsigned EltSizeInBits,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = LaneSizeInBits / EltSizeInBits;
+ int LaneSize = LaneSizeInBits / EltSizeInBits;
RepeatedMask.assign(LaneSize, SM_SentinelUndef);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
@@ -10942,67 +10942,67 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
return true;
}
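The repeated-mask helpers around here collapse a full shuffle mask into a single lane-sized pattern when every lane performs the same permutation. A simplified single-operand sketch of that collapse (the real helpers also track a second operand and an explicit zero sentinel):

#include <vector>

// Collapse a shuffle mask to one lane-sized repeated pattern, or fail.
// -1 = undef and matches anything.
static bool getRepeatedLaneMask(const std::vector<int> &Mask, int LaneSize,
                                std::vector<int> &Repeated) {
  Repeated.assign(LaneSize, -1);
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                       // undef slot
    if ((M / LaneSize) != (i / LaneSize))
      return false;                   // crosses into another lane
    int Local = M % LaneSize;         // element index inside its lane
    int &Slot = Repeated[i % LaneSize];
    if (Slot >= 0 && Slot != Local)
      return false;                   // lanes disagree on this slot
    Slot = Local;
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // v8i32 mask <1,0,3,2, 5,4,7,6> repeats <1,0,3,2> in every 128-bit lane.
  return getRepeatedLaneMask({1, 0, 3, 2, 5, 4, 7, 6}, 4, Rep) ? 0 : 1;
}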
-/// Test whether a target shuffle mask is equivalent within each sub-lane.
-/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
-static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
- ArrayRef<int> Mask,
- SmallVectorImpl<int> &RepeatedMask) {
- return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
- Mask, RepeatedMask);
-}
-
-/// Checks whether the vector elements referenced by two shuffle masks are
-/// equivalent.
-static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
- int Idx, int ExpectedIdx) {
- assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
- ExpectedIdx < MaskSize && "Out of range element index");
- if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
- return false;
-
- switch (Op.getOpcode()) {
- case ISD::BUILD_VECTOR:
- // If the values are build vectors, we can look through them to find
- // equivalent inputs that make the shuffles equivalent.
- // TODO: Handle MaskSize != Op.getNumOperands()?
- if (MaskSize == (int)Op.getNumOperands() &&
- MaskSize == (int)ExpectedOp.getNumOperands())
- return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
- break;
- case X86ISD::VBROADCAST:
- case X86ISD::VBROADCAST_LOAD:
- // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
- return (Op == ExpectedOp &&
- (int)Op.getValueType().getVectorNumElements() == MaskSize);
- case X86ISD::HADD:
- case X86ISD::HSUB:
- case X86ISD::FHADD:
- case X86ISD::FHSUB:
- case X86ISD::PACKSS:
- case X86ISD::PACKUS:
- // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
- // TODO: Handle MaskSize != NumElts?
- // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
- if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
- MVT VT = Op.getSimpleValueType();
- int NumElts = VT.getVectorNumElements();
- if (MaskSize == NumElts) {
- int NumLanes = VT.getSizeInBits() / 128;
- int NumEltsPerLane = NumElts / NumLanes;
- int NumHalfEltsPerLane = NumEltsPerLane / 2;
- bool SameLane =
- (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
- bool SameElt =
- (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
- return SameLane && SameElt;
- }
- }
- break;
- }
-
- return false;
-}
-
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
+ Mask, RepeatedMask);
+}
+
+/// Checks whether the vector elements referenced by two shuffle masks are
+/// equivalent.
+static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
+ int Idx, int ExpectedIdx) {
+ assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
+ ExpectedIdx < MaskSize && "Out of range element index");
+ if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
+ return false;
+
+ switch (Op.getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ // TODO: Handle MaskSize != Op.getNumOperands()?
+ if (MaskSize == (int)Op.getNumOperands() &&
+ MaskSize == (int)ExpectedOp.getNumOperands())
+ return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
+ break;
+ case X86ISD::VBROADCAST:
+ case X86ISD::VBROADCAST_LOAD:
+ // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
+ return (Op == ExpectedOp &&
+ (int)Op.getValueType().getVectorNumElements() == MaskSize);
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
+ // TODO: Handle MaskSize != NumElts?
+ // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
+ if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
+ MVT VT = Op.getSimpleValueType();
+ int NumElts = VT.getVectorNumElements();
+ if (MaskSize == NumElts) {
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+ bool SameLane =
+ (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
+ bool SameElt =
+ (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
+ return SameLane && SameElt;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
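For horizontal ops and packs with identical operands, IsElementEquivalent above treats two indices as the same value when they fall in the same 128-bit lane and agree modulo half a lane. The index arithmetic in isolation, as a hedged sketch:

#include <cassert>

// For a horizontal op H(X, X), the low and high halves of each 128-bit
// lane repeat the same values, so two element indices are interchangeable
// when they sit in the same lane and agree modulo half a lane.
static bool hopElementsEquivalent(int Idx, int ExpectedIdx, int NumElts,
                                  int VectorBits) {
  int NumLanes = VectorBits / 128;
  int EltsPerLane = NumElts / NumLanes;
  int HalfLane = EltsPerLane / 2;
  bool SameLane = (Idx / EltsPerLane) == (ExpectedIdx / EltsPerLane);
  bool SameElt = (Idx % HalfLane) == (ExpectedIdx % HalfLane);
  return SameLane && SameElt;
}

int main() {
  // 256-bit v8i32 HADD(X, X): elements 0 and 2 hold the same value,
  // elements 0 and 4 sit in different 128-bit lanes.
  assert(hopElementsEquivalent(0, 2, 8, 256));
  assert(!hopElementsEquivalent(0, 4, 8, 256));
  return 0;
}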
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -11013,23 +11013,23 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
-static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
- SDValue V1 = SDValue(),
- SDValue V2 = SDValue()) {
- int Size = Mask.size();
- if (Size != (int)ExpectedMask.size())
+static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
+ SDValue V1 = SDValue(),
+ SDValue V2 = SDValue()) {
+ int Size = Mask.size();
+ if (Size != (int)ExpectedMask.size())
return false;
for (int i = 0; i < Size; ++i) {
assert(Mask[i] >= -1 && "Out of bound mask element!");
- int MaskIdx = Mask[i];
- int ExpectedIdx = ExpectedMask[i];
- if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
- SDValue MaskV = MaskIdx < Size ? V1 : V2;
- SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
- MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
- ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
- if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
+ int MaskIdx = Mask[i];
+ int ExpectedIdx = ExpectedMask[i];
+ if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
+ SDValue MaskV = MaskIdx < Size ? V1 : V2;
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
+ ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
return false;
}
}
@@ -11045,7 +11045,7 @@ static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
-static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
+static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
ArrayRef<int> ExpectedMask,
SDValue V1 = SDValue(),
SDValue V2 = SDValue()) {
@@ -11059,23 +11059,23 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
return false;
- // Don't use V1/V2 if they're not the same size as the shuffle mask type.
- if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
- V1 = SDValue();
- if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
- V2 = SDValue();
+ // Don't use V1/V2 if they're not the same size as the shuffle mask type.
+ if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
+ V1 = SDValue();
+ if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
+ V2 = SDValue();
for (int i = 0; i < Size; ++i) {
- int MaskIdx = Mask[i];
- int ExpectedIdx = ExpectedMask[i];
- if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
+ int MaskIdx = Mask[i];
+ int ExpectedIdx = ExpectedMask[i];
+ if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
continue;
- if (0 <= MaskIdx && 0 <= ExpectedIdx) {
- SDValue MaskV = MaskIdx < Size ? V1 : V2;
- SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
- MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
- ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
- if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
+ if (0 <= MaskIdx && 0 <= ExpectedIdx) {
+ SDValue MaskV = MaskIdx < Size ? V1 : V2;
+ SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
+ MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
+ ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
+ if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
continue;
}
// TODO - handle SM_Sentinel equivalences.
@@ -11087,25 +11087,25 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
// Attempt to create a shuffle mask from a VSELECT condition mask.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
SDValue Cond) {
- EVT CondVT = Cond.getValueType();
- unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
- unsigned NumElts = CondVT.getVectorNumElements();
-
- APInt UndefElts;
- SmallVector<APInt, 32> EltBits;
- if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
- true, false))
+ EVT CondVT = Cond.getValueType();
+ unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
+ unsigned NumElts = CondVT.getVectorNumElements();
+
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
+ true, false))
return false;
- Mask.resize(NumElts, SM_SentinelUndef);
+ Mask.resize(NumElts, SM_SentinelUndef);
- for (int i = 0; i != (int)NumElts; ++i) {
+ for (int i = 0; i != (int)NumElts; ++i) {
Mask[i] = i;
// Arbitrarily choose from the 2nd operand if the select condition element
// is undef.
// TODO: Can we do better by matching patterns such as even/odd?
- if (UndefElts[i] || EltBits[i].isNullValue())
- Mask[i] += NumElts;
+ if (UndefElts[i] || EltBits[i].isNullValue())
+ Mask[i] += NumElts;
}
return true;
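createShuffleMaskFromVSELECT above turns a constant select condition into a two-input blend mask: a set bit keeps element i of the first operand, a clear bit picks element i of the second (index i + NumElts). A standalone sketch of that mapping (undef condition elements are folded into the clear case here, matching the arbitrary choice noted in the code):

#include <vector>

// Build a blend shuffle mask from a per-element boolean condition.
static std::vector<int> maskFromSelectCondition(const std::vector<bool> &Cond) {
  int NumElts = (int)Cond.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i < NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;  // first or second operand
  return Mask;
}

int main() {
  // <1,0,0,1> selects elements {0, 5, 6, 3} of concat(V1, V2).
  std::vector<int> M = maskFromSelectCondition({true, false, false, true});
  return (M[1] == 5 && M[3] == 3) ? 0 : 1;
}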
@@ -11123,8 +11123,8 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
SmallVector<int, 8> Unpckhwd;
createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
/* Unary = */ false);
- bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
- isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
return IsUnpackwdMask;
}
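The unpack-mask checks above compare against the patterns produced by createUnpackShuffleMask. A small illustrative generator for that pattern (this sketch models a single 128-bit lane; wider vectors repeat it independently in each lane):

#include <vector>

// x86 unpack within one lane: lo interleaves the low halves of both
// inputs, hi the high halves; unary unpacks reuse the first input.
static std::vector<int> unpackMask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Half = NumElts / 2;
  int Base = Lo ? 0 : Half;
  for (int i = 0; i < Half; ++i) {
    Mask.push_back(Base + i);                          // from V1
    Mask.push_back(Base + i + (Unary ? 0 : NumElts));  // from V2 (or V1 again)
  }
  return Mask;
}

int main() {
  // Binary unpcklwd on v8i16 gives <0,8,1,9,2,10,3,11>.
  std::vector<int> M = unpackMask(8, /*Lo=*/true, /*Unary=*/false);
  return (M[1] == 8 && M[7] == 11) ? 0 : 1;
}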
@@ -11141,8 +11141,8 @@ static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
for (unsigned i = 0; i != 4; ++i) {
SmallVector<int, 16> UnpackMask;
createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
- if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
- isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
+ if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
+ isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
return true;
}
return false;
@@ -11177,15 +11177,15 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
- // If the mask only uses one non-undef element, then fully 'splat' it to
- // improve later broadcast matching.
- int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
- assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
-
- int FirstElt = Mask[FirstIndex];
- if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
- return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
-
+ // If the mask only uses one non-undef element, then fully 'splat' it to
+ // improve later broadcast matching.
+ int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
+ assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
+
+ int FirstElt = Mask[FirstIndex];
+ if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
+ return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
+
unsigned Imm = 0;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
@@ -11335,8 +11335,8 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// Attempt to match the target mask against the unpack lo/hi mask patterns.
SmallVector<int, 64> Unpckl, Unpckh;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
- (IsUnary ? V1 : V2))) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
+ (IsUnary ? V1 : V2))) {
UnpackOpcode = X86ISD::UNPCKL;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
@@ -11344,8 +11344,8 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
}
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
- (IsUnary ? V1 : V2))) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
+ (IsUnary ? V1 : V2))) {
UnpackOpcode = X86ISD::UNPCKH;
V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
@@ -11383,14 +11383,14 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
// If a binary shuffle, commute and try again.
if (!IsUnary) {
ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
UnpackOpcode = X86ISD::UNPCKL;
std::swap(V1, V2);
return true;
}
ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
+ if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
UnpackOpcode = X86ISD::UNPCKH;
std::swap(V1, V2);
return true;
@@ -11407,21 +11407,21 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
SelectionDAG &DAG) {
SmallVector<int, 8> Unpckl;
createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
- if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
SmallVector<int, 8> Unpckh;
createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
- if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
+ if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
// Commute and try again.
ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
+ if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
return SDValue();
@@ -11437,9 +11437,9 @@ static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
unsigned UnpackOpcode;
- if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
+ if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
UnpackOpcode = X86ISD::UNPCKL;
- else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
+ else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
UnpackOpcode = X86ISD::UNPCKH;
else
return SDValue();
@@ -11491,51 +11491,51 @@ static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
return false;
}
-// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
-// element padding to the final DstVT.
-static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG, bool ZeroUppers) {
- MVT SrcVT = Src.getSimpleValueType();
- MVT DstSVT = DstVT.getScalarType();
- unsigned NumDstElts = DstVT.getVectorNumElements();
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
-
- if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
- return SDValue();
-
- // Perform a direct ISD::TRUNCATE if possible.
- if (NumSrcElts == NumDstElts)
- return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
-
- if (NumSrcElts > NumDstElts) {
- MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
- return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
- }
-
- if ((NumSrcElts * DstEltSizeInBits) >= 128) {
- MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
- return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
- DstVT.getSizeInBits());
- }
-
- // Non-VLX targets must truncate from a 512-bit type, so we need to
- // widen, truncate and then possibly extract the original subvector.
- if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
- SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
- return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
- }
-
- // Fallback to a X86ISD::VTRUNC, padding if necessary.
- MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
- SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
- if (DstVT != TruncVT)
- Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
- DstVT.getSizeInBits());
- return Trunc;
+// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
+// element padding to the final DstVT.
+static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, bool ZeroUppers) {
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstSVT = DstVT.getScalarType();
+ unsigned NumDstElts = DstVT.getVectorNumElements();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
+
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
+ return SDValue();
+
+ // Perform a direct ISD::TRUNCATE if possible.
+ if (NumSrcElts == NumDstElts)
+ return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
+
+ if (NumSrcElts > NumDstElts) {
+ MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+ return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
+ }
+
+ if ((NumSrcElts * DstEltSizeInBits) >= 128) {
+ MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+ return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+ DstVT.getSizeInBits());
+ }
+
+ // Non-VLX targets must truncate from a 512-bit type, so we need to
+ // widen, truncate and then possibly extract the original subvector.
+ if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
+ SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
+ return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
+ }
+
+ // Fallback to a X86ISD::VTRUNC, padding if necessary.
+ MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
+ SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
+ if (DstVT != TruncVT)
+ Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+ DstVT.getSizeInBits());
+ return Trunc;
}
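getAVX512TruncNode above walks a fixed ladder: a plain truncate when element counts match, truncate-and-extract or truncate-and-widen otherwise, widening to 512 bits on non-VLX targets, and a VTRUNC fallback. A hedged sketch of just that decision order (it omits the type-legality bail-out and uses made-up enum names):

#include <cstdio>

enum class TruncStrategy { DirectTrunc, TruncThenExtract, TruncThenWiden,
                           WidenTo512, VTruncFallback };

// Mirror the decision ladder in the helper above.
static TruncStrategy pickTruncStrategy(unsigned NumSrcElts, unsigned NumDstElts,
                                       unsigned DstEltBits, unsigned SrcBits,
                                       bool HasVLX) {
  if (NumSrcElts == NumDstElts)
    return TruncStrategy::DirectTrunc;
  if (NumSrcElts > NumDstElts)
    return TruncStrategy::TruncThenExtract;
  if (NumSrcElts * DstEltBits >= 128)
    return TruncStrategy::TruncThenWiden;
  if (!HasVLX && SrcBits != 512)
    return TruncStrategy::WidenTo512;      // recurse on a 512-bit source
  return TruncStrategy::VTruncFallback;
}

int main() {
  // v8i64 -> v8i16 with VLX: same element count, a single truncate works.
  return pickTruncStrategy(8, 8, 16, 512, true) == TruncStrategy::DirectTrunc
             ? 0 : 1;
}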
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
@@ -11551,99 +11551,99 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
// t18: v2i64 = bitcast t51
//
-// One can just use a single vpmovdw instruction; without avx512vl we need to
-// use the zmm variant and extract the lower subvector, padding with zeroes.

-// TODO: Merge with lowerShuffleAsVTRUNC.
-static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
- if (!Subtarget.hasAVX512())
- return SDValue();
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned EltSizeInBits = VT.getScalarSizeInBits();
- unsigned MaxScale = 64 / EltSizeInBits;
- for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
- unsigned NumSrcElts = NumElts / Scale;
- unsigned UpperElts = NumElts - NumSrcElts;
- if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
- !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
- continue;
-
- SDValue Src = V1;
- if (!Src.hasOneUse())
- return SDValue();
-
- Src = peekThroughOneUseBitcasts(Src);
- if (Src.getOpcode() != ISD::TRUNCATE ||
- Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
- return SDValue();
- Src = Src.getOperand(0);
-
- // VPMOVWB is only available with avx512bw.
- MVT SrcVT = Src.getSimpleValueType();
- if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
- !Subtarget.hasBWI())
+// One can just use a single vpmovdw instruction; without avx512vl we need to
+// use the zmm variant and extract the lower subvector, padding with zeroes.
+// TODO: Merge with lowerShuffleAsVTRUNC.
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned NumSrcElts = NumElts / Scale;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+
+ SDValue Src = V1;
+ if (!Src.hasOneUse())
+ return SDValue();
+
+ Src = peekThroughOneUseBitcasts(Src);
+ if (Src.getOpcode() != ISD::TRUNCATE ||
+ Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
return SDValue();
-
- bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
- return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
- }
-
- return SDValue();
-}
-
-// Attempt to match binary shuffle patterns as a truncate.
-static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unexpected VTRUNC type");
- if (!Subtarget.hasAVX512())
- return SDValue();
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned EltSizeInBits = VT.getScalarSizeInBits();
- unsigned MaxScale = 64 / EltSizeInBits;
- for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
- // TODO: Support non-BWI VPMOVWB truncations?
- unsigned SrcEltBits = EltSizeInBits * Scale;
- if (SrcEltBits < 32 && !Subtarget.hasBWI())
- continue;
-
- // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
- // Bail if the V2 elements are undef.
- unsigned NumHalfSrcElts = NumElts / Scale;
- unsigned NumSrcElts = 2 * NumHalfSrcElts;
- if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
- isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
- continue;
-
- // The elements beyond the truncation must be undef/zero.
- unsigned UpperElts = NumElts - NumSrcElts;
- if (UpperElts > 0 &&
- !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
- continue;
- bool UndefUppers =
- UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
-
- // As we're using both sources then we need to concat them together
- // and truncate from the double-sized src.
- MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
- SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
-
- MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
- MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
- Src = DAG.getBitcast(SrcVT, Src);
- return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
- }
-
- return SDValue();
+ Src = Src.getOperand(0);
+
+ // VPMOVWB is only available with avx512bw.
+ MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
+ !Subtarget.hasBWI())
+ return SDValue();
+
+ bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
+ return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
+ }
+
+ return SDValue();
+}
+
+// Attempt to match binary shuffle patterns as a truncate.
+static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ // TODO: Support non-BWI VPMOVWB truncations?
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+
+ // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
+ // Bail if the V2 elements are undef.
+ unsigned NumHalfSrcElts = NumElts / Scale;
+ unsigned NumSrcElts = 2 * NumHalfSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
+ continue;
+
+ // The elements beyond the truncation must be undef/zero.
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (UpperElts > 0 &&
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ bool UndefUppers =
+ UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
+
+ // As we're using both sources then we need to concat them together
+ // and truncate from the double-sized src.
+ MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
+ SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
+
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ Src = DAG.getBitcast(SrcVT, Src);
+ return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
+ }
+
+ return SDValue();
}
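Both truncation lowerings above first check that the mask has the shape a truncate produces: every Scale'th source element in order, with the remaining destination elements undef or known zero. A simplified standalone matcher (it ignores the Zeroable bookkeeping and the binary two-source variant):

#include <vector>

// Does the mask take every Scale'th element of the source in order and
// leave the rest undef (-1)? That is the shape a truncate produces.
static bool looksLikeTruncate(const std::vector<int> &Mask, int Scale) {
  int NumElts = (int)Mask.size();
  int NumSrcElts = NumElts / Scale;
  for (int i = 0; i < NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * Scale)
      return false;                  // low part must be 0, Scale, 2*Scale, ...
  for (int i = NumSrcElts; i < NumElts; ++i)
    if (Mask[i] >= 0)
      return false;                  // upper part must be undef (or zeroable)
  return true;
}

int main() {
  // v8i16 mask <0,2,4,6,u,u,u,u> matches a 2:1 truncate of a wider source.
  return looksLikeTruncate({0, 2, 4, 6, -1, -1, -1, -1}, 2) ? 0 : 1;
}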
/// Check whether a compaction lowering can be done by dropping even
@@ -11761,14 +11761,14 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false, NumStages);
- if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
+ if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
if (MatchPACK(V1, V2, PackVT))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true, NumStages);
- if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
+ if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
if (MatchPACK(V1, V1, PackVT))
return true;
}
@@ -12317,32 +12317,32 @@ static SDValue lowerShuffleAsByteRotateAndPermute(
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
-/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
-static SDValue lowerShuffleAsDecomposedShuffleMerge(
+/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
+static SDValue lowerShuffleAsDecomposedShuffleMerge(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget, SelectionDAG &DAG) {
- int NumElts = Mask.size();
- int NumLanes = VT.getSizeInBits() / 128;
- int NumEltsPerLane = NumElts / NumLanes;
-
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+
// Shuffle the input elements into the desired positions in V1 and V2 and
- // unpack/blend them together.
- bool IsAlternating = true;
- SmallVector<int, 32> V1Mask(NumElts, -1);
- SmallVector<int, 32> V2Mask(NumElts, -1);
- SmallVector<int, 32> FinalMask(NumElts, -1);
- for (int i = 0; i < NumElts; ++i) {
- int M = Mask[i];
- if (M >= 0 && M < NumElts) {
- V1Mask[i] = M;
- FinalMask[i] = i;
- IsAlternating &= (i & 1) == 0;
- } else if (M >= NumElts) {
- V2Mask[i] = M - NumElts;
- FinalMask[i] = i + NumElts;
- IsAlternating &= (i & 1) == 1;
- }
- }
+ // unpack/blend them together.
+ bool IsAlternating = true;
+ SmallVector<int, 32> V1Mask(NumElts, -1);
+ SmallVector<int, 32> V2Mask(NumElts, -1);
+ SmallVector<int, 32> FinalMask(NumElts, -1);
+ for (int i = 0; i < NumElts; ++i) {
+ int M = Mask[i];
+ if (M >= 0 && M < NumElts) {
+ V1Mask[i] = M;
+ FinalMask[i] = i;
+ IsAlternating &= (i & 1) == 0;
+ } else if (M >= NumElts) {
+ V2Mask[i] = M - NumElts;
+ FinalMask[i] = i + NumElts;
+ IsAlternating &= (i & 1) == 1;
+ }
+ }
// Try to lower with the simpler initial blend/unpack/rotate strategies unless
// one of the input shuffles would be a no-op. We prefer to shuffle inputs as
@@ -12366,30 +12366,30 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
return BlendPerm;
}
- // If the final mask is an alternating blend of vXi8/vXi16, convert to an
- // UNPCKL(SHUFFLE, SHUFFLE) pattern.
- // TODO: It doesn't have to be alternating - but each lane mustn't have more
- // than half the elements coming from each source.
- if (IsAlternating && VT.getScalarSizeInBits() < 32) {
- V1Mask.assign(NumElts, -1);
- V2Mask.assign(NumElts, -1);
- FinalMask.assign(NumElts, -1);
- for (int i = 0; i != NumElts; i += NumEltsPerLane)
- for (int j = 0; j != NumEltsPerLane; ++j) {
- int M = Mask[i + j];
- if (M >= 0 && M < NumElts) {
- V1Mask[i + (j / 2)] = M;
- FinalMask[i + j] = i + (j / 2);
- } else if (M >= NumElts) {
- V2Mask[i + (j / 2)] = M - NumElts;
- FinalMask[i + j] = i + (j / 2) + NumElts;
- }
- }
- }
-
+ // If the final mask is an alternating blend of vXi8/vXi16, convert to an
+ // UNPCKL(SHUFFLE, SHUFFLE) pattern.
+ // TODO: It doesn't have to be alternating - but each lane mustn't have more
+ // than half the elements coming from each source.
+ if (IsAlternating && VT.getScalarSizeInBits() < 32) {
+ V1Mask.assign(NumElts, -1);
+ V2Mask.assign(NumElts, -1);
+ FinalMask.assign(NumElts, -1);
+ for (int i = 0; i != NumElts; i += NumEltsPerLane)
+ for (int j = 0; j != NumEltsPerLane; ++j) {
+ int M = Mask[i + j];
+ if (M >= 0 && M < NumElts) {
+ V1Mask[i + (j / 2)] = M;
+ FinalMask[i + j] = i + (j / 2);
+ } else if (M >= NumElts) {
+ V2Mask[i + (j / 2)] = M - NumElts;
+ FinalMask[i + j] = i + (j / 2) + NumElts;
+ }
+ }
+ }
+
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
- return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
}
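lowerShuffleAsDecomposedShuffleMerge above splits a two-input shuffle into one single-input shuffle per source plus a final blend mask. The mask bookkeeping on its own, as an illustrative sketch (the alternating vXi8/vXi16 unpack refinement is left out):

#include <vector>

struct DecomposedShuffle {
  std::vector<int> V1Mask, V2Mask, FinalMask;
};

// Split a two-input mask into a shuffle per source and a blend (-1 = undef).
static DecomposedShuffle decompose(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  DecomposedShuffle D{std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1),
                      std::vector<int>(NumElts, -1)};
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {       // element comes from V1
      D.V1Mask[i] = M;
      D.FinalMask[i] = i;
    } else if (M >= NumElts) {         // element comes from V2
      D.V2Mask[i] = M - NumElts;
      D.FinalMask[i] = i + NumElts;
    }
  }
  return D;
}

int main() {
  // <0,5,2,7>: shuffle V1 into slots 0/2, V2 into slots 1/3, then blend.
  DecomposedShuffle D = decompose({0, 5, 2, 7});
  return (D.V2Mask[1] == 1 && D.FinalMask[1] == 5) ? 0 : 1;
}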
/// Try to lower a vector shuffle as a bit rotation.
@@ -13047,8 +13047,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
- DL, ExtVT, InputV, DAG);
+ InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
+ DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -13656,8 +13656,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
MVT SVT = VT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
- SDValue NewAddr =
- DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
+ SDValue NewAddr =
+ DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
// Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
// than MOVDDUP.
@@ -13830,7 +13830,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
// Attempt to match the insertps pattern.
- unsigned InsertPSMask = 0;
+ unsigned InsertPSMask = 0;
if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
return SDValue();
@@ -14018,8 +14018,8 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
- isShuffleEquivalent(Mask, {1, 3}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
+ isShuffleEquivalent(Mask, {1, 3}, V1, V2))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
@@ -14065,10 +14065,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
V1 = DAG.getBitcast(MVT::v4i32, V1);
- int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
- Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
- Mask[1] < 0 ? -1 : (Mask[1] * 2),
- Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
+ int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
+ Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
+ Mask[1] < 0 ? -1 : (Mask[1] * 2),
+ Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
return DAG.getBitcast(
MVT::v2i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -14128,7 +14128,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG);
// We implement this with SHUFPD which is pretty lame because it will likely
@@ -14222,12 +14222,12 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
NewMask[2] = Mask[2] < 4 ? 1 : 3;
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
- } else if (NumV2Elements == 3) {
- // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
- // we can get here due to other paths (e.g repeated mask matching) that we
- // don't want to do another round of lowerVECTOR_SHUFFLE.
- ShuffleVectorSDNode::commuteMask(NewMask);
- return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
+ } else if (NumV2Elements == 3) {
+ // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
+ // we can get here due to other paths (e.g repeated mask matching) that we
+ // don't want to do another round of lowerVECTOR_SHUFFLE.
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
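getV4X86ShuffleImm8ForMask, used in the return above, packs a 4-element mask into the 2-bits-per-element immediate consumed by SHUFPS/PSHUFD. A standalone sketch of the encoding (the single-element splat tweak added elsewhere in this diff is only noted in a comment):

#include <cassert>

// Two bits per destination element; undef (-1) slots default to their
// identity index. (The LLVM helper additionally splats a lone defined
// element across all four slots to aid later broadcast matching.)
static unsigned shuffleImm4(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] < 0 ? i : Mask[i];
    assert(M < 4 && "2-bit field per element");
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}

int main() {
  const int Reverse[4] = {3, 2, 1, 0};
  return shuffleImm4(Reverse) == 0x1B ? 0 : 1;  // 0b00'01'10'11
}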
@@ -14256,9 +14256,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget.hasSSE3()) {
- if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
- if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
+ if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
@@ -14272,9 +14272,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
// in SSE1 because otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
- if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
- if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
+ if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
}
@@ -14316,9 +14316,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use low/high mov instructions. These are only valid in SSE1 because
// otherwise they are widened to v2f64 and never get here.
if (!Subtarget.hasSSE2()) {
- if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
+ if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
}
@@ -14366,9 +14366,9 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
+ else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
@@ -14426,7 +14426,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
if (IsBlendSupported)
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG);
// Try to lower by permuting the inputs into an unpack instruction.
@@ -15035,11 +15035,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return ZExt;
-  // Try to lower using a truncation.
- if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
- return V;
-
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
@@ -15120,11 +15120,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
-  // Try to lower using a truncation.
- if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
- return V;
-
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
@@ -15176,49 +15176,49 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
// We can always bit-blend if we have to so the fallback strategy is to
- // decompose into single-input permutes and blends/unpacks.
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
+ // decompose into single-input permutes and blends/unpacks.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG);
}
-// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
-// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
-// the active subvector is extracted.
+// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
+// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
+// the active subvector is extracted.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1, SDValue V2,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- MVT MaskVT = VT.changeTypeToInteger();
- SDValue MaskNode;
- MVT ShuffleVT = VT;
- if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
- V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
- V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
- ShuffleVT = V1.getSimpleValueType();
-
- // Adjust mask to correct indices for the second input.
- int NumElts = VT.getVectorNumElements();
- unsigned Scale = 512 / VT.getSizeInBits();
- SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
- for (int &M : AdjustedMask)
- if (NumElts <= M)
- M += (Scale - 1) * NumElts;
- MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
- MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
- } else {
- MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
- }
-
- SDValue Result;
+ ArrayRef<int> Mask, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ MVT MaskVT = VT.changeTypeToInteger();
+ SDValue MaskNode;
+ MVT ShuffleVT = VT;
+ if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
+ V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
+ V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
+ ShuffleVT = V1.getSimpleValueType();
+
+ // Adjust mask to correct indices for the second input.
+ int NumElts = VT.getVectorNumElements();
+ unsigned Scale = 512 / VT.getSizeInBits();
+ SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
+ for (int &M : AdjustedMask)
+ if (NumElts <= M)
+ M += (Scale - 1) * NumElts;
+ MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
+ MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
+ } else {
+ MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
+ }
+
+ SDValue Result;
if (V2.isUndef())
- Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
- else
- Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
+ Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
+ else
+ Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
- if (VT != ShuffleVT)
- Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
-
- return Result;
+ if (VT != ShuffleVT)
+ Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
+
+ return Result;
}
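When lowerShuffleWithPERMV above pads a sub-512-bit shuffle to 512 bits on a non-VLX target, indices that referred to the second source must be rebased, because that source now starts Scale×NumElts elements into the concatenated input. The index adjustment in isolation, as a hedged sketch:

#include <vector>

// After widening, the second source of VPERMV3 begins at Scale * NumElts,
// so every index that pointed at V2 is shifted up by (Scale - 1) * NumElts.
static std::vector<int> widenPermv3Mask(std::vector<int> Mask, int NumElts,
                                        int Scale) {
  for (int &M : Mask)
    if (M >= NumElts)                 // index into the second source
      M += (Scale - 1) * NumElts;
  return Mask;
}

int main() {
  // 256-bit v8i32 widened to v16i32: old V2 index 9 becomes 17.
  std::vector<int> M = widenPermv3Mask({0, 9, 3, 12}, 8, 2);
  return (M[1] == 17 && M[3] == 20) ? 0 : 1;
}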
/// Generic lowering of v16i8 shuffles.
@@ -15256,15 +15256,15 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return ZExt;
-  // Try to lower using a truncation.
- if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
- return V;
-
- if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
- return V;
-
+  // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
@@ -15447,17 +15447,17 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Unpack;
- // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
- if (Subtarget.hasVBMI())
- return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
- DAG);
-
- // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
- if (Subtarget.hasXOP()) {
- SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
- return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
- }
-
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
+ DAG);
+
+ // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
+ if (Subtarget.hasXOP()) {
+ SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
+ }
+
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
if (SDValue V = lowerShuffleAsByteRotateAndPermute(
@@ -15512,9 +15512,9 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
}
- // Handle multi-input cases by blending/unpacking single-input shuffles.
+ // Handle multi-input cases by blending/unpacking single-input shuffles.
if (NumV2Elements > 0)
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
Subtarget, DAG);
// The fallback path for single-input shuffles widens this into two v8i16
@@ -15694,7 +15694,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
}
 /// Either split a vector into halves or decompose the shuffles and the
-/// blend/unpack.
+/// blend/unpack.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
@@ -15729,8 +15729,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
return true;
};
if (DoBothBroadcast())
- return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
+ DAG);
// If the inputs all stem from a single 128-bit lane of each input, then we
// split them rather than blending because the split will decompose to
@@ -15746,9 +15746,9 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
- // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
- // requires that the decomposed single-input shuffles don't end up here.
- return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
+ // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
+ // requires that the decomposed single-input shuffles don't end up here.
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
DAG);
}
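The split-versus-blend decision above hinges on how many 128-bit lanes each input is read from. A minimal sketch of that test, assuming a 256-bit mask with NumElts = 8 and NumLanes = 2 (helper names here are illustrative, not LLVM APIs):

#include <array>
#include <bitset>
#include <cstdio>
#include <vector>

// Rough model of the split-vs-blend heuristic above: for each input, record
// which 128-bit lanes its referenced elements live in. If each input is read
// from at most one lane, splitting the shuffle in half is cheap.
bool preferSplit(const std::vector<int> &Mask, int NumElts, int NumLanes) {
  int NumEltsPerLane = NumElts / NumLanes;
  std::array<std::bitset<8>, 2> LaneInputs; // lanes used by V1 and V2
  for (int M : Mask) {
    if (M < 0)
      continue;                               // undef element
    int Input = M / NumElts;                  // 0 -> V1, 1 -> V2
    int Lane = (M % NumElts) / NumEltsPerLane;
    LaneInputs[Input].set(Lane);
  }
  return LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1;
}

int main() {
  // Both inputs only touch their low 128-bit lane -> splitting is fine.
  std::vector<int> Mask = {0, 8, 1, 9, 2, 10, 3, 11};
  std::printf("prefer split: %s\n",
              preferSplit(Mask, /*NumElts=*/8, /*NumLanes=*/2) ? "yes" : "no");
}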
@@ -15796,94 +15796,94 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
int NumElts = VT.getVectorNumElements();
int NumLanes = VT.getSizeInBits() / 128;
int NumEltsPerLane = NumElts / NumLanes;
- bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
-
- /// Attempts to find a sublane permute with the given size
- /// that gets all elements into their target lanes.
- ///
- /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffle.
- /// If unsuccessful, returns SDValue() and may overwrite InLaneMask.
- auto getSublanePermute = [&](int NumSublanes) -> SDValue {
- int NumSublanesPerLane = NumSublanes / NumLanes;
- int NumEltsPerSublane = NumElts / NumSublanes;
-
- SmallVector<int, 16> CrossLaneMask;
- SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
- // CrossLaneMask but one entry == one sublane.
- SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
-
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
- if (M < 0)
- continue;
-
- int SrcSublane = M / NumEltsPerSublane;
- int DstLane = i / NumEltsPerLane;
-
- // We only need to get the elements into the right lane, not sublane.
- // So search all sublanes that make up the destination lane.
- bool Found = false;
- int DstSubStart = DstLane * NumSublanesPerLane;
- int DstSubEnd = DstSubStart + NumSublanesPerLane;
- for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
- if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
- continue;
-
- Found = true;
- CrossLaneMaskLarge[DstSublane] = SrcSublane;
- int DstSublaneOffset = DstSublane * NumEltsPerSublane;
- InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
- break;
- }
- if (!Found)
- return SDValue();
- }
-
- // Fill CrossLaneMask using CrossLaneMaskLarge.
- narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
-
- if (!CanUseSublanes) {
- // If we're only shuffling a single lowest lane and the rest are identity
- // then don't bother.
- // TODO - isShuffleMaskInputInPlace could be extended to something like
- // this.
- int NumIdentityLanes = 0;
- bool OnlyShuffleLowestLane = true;
- for (int i = 0; i != NumLanes; ++i) {
- int LaneOffset = i * NumEltsPerLane;
- if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
- i * NumEltsPerLane))
- NumIdentityLanes++;
- else if (CrossLaneMask[LaneOffset] != 0)
- OnlyShuffleLowestLane = false;
- }
- if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
- return SDValue();
- }
-
- SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
- return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
- InLaneMask);
- };
-
- // First attempt a solution with full lanes.
- if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
- return V;
-
- // The rest of the solutions use sublanes.
- if (!CanUseSublanes)
- return SDValue();
-
- // Then attempt a solution with 64-bit sublanes (vpermq).
- if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
- return V;
-
- // If that doesn't work and we have fast variable shuffle,
- // attempt 32-bit sublanes (vpermd).
- if (!Subtarget.hasFastVariableShuffle())
- return SDValue();
-
- return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
+ bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
+
+ /// Attempts to find a sublane permute with the given size
+ /// that gets all elements into their target lanes.
+ ///
+ /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffle.
+ /// If unsuccessful, returns SDValue() and may overwrite InLaneMask.
+ auto getSublanePermute = [&](int NumSublanes) -> SDValue {
+ int NumSublanesPerLane = NumSublanes / NumLanes;
+ int NumEltsPerSublane = NumElts / NumSublanes;
+
+ SmallVector<int, 16> CrossLaneMask;
+ SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
+ // CrossLaneMask but one entry == one sublane.
+ SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+
+ int SrcSublane = M / NumEltsPerSublane;
+ int DstLane = i / NumEltsPerLane;
+
+ // We only need to get the elements into the right lane, not sublane.
+ // So search all sublanes that make up the destination lane.
+ bool Found = false;
+ int DstSubStart = DstLane * NumSublanesPerLane;
+ int DstSubEnd = DstSubStart + NumSublanesPerLane;
+ for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
+ if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
+ continue;
+
+ Found = true;
+ CrossLaneMaskLarge[DstSublane] = SrcSublane;
+ int DstSublaneOffset = DstSublane * NumEltsPerSublane;
+ InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
+ break;
+ }
+ if (!Found)
+ return SDValue();
+ }
+
+ // Fill CrossLaneMask using CrossLaneMaskLarge.
+ narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
+
+ if (!CanUseSublanes) {
+ // If we're only shuffling a single lowest lane and the rest are identity
+ // then don't bother.
+ // TODO - isShuffleMaskInputInPlace could be extended to something like
+ // this.
+ int NumIdentityLanes = 0;
+ bool OnlyShuffleLowestLane = true;
+ for (int i = 0; i != NumLanes; ++i) {
+ int LaneOffset = i * NumEltsPerLane;
+ if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
+ i * NumEltsPerLane))
+ NumIdentityLanes++;
+ else if (CrossLaneMask[LaneOffset] != 0)
+ OnlyShuffleLowestLane = false;
+ }
+ if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+ return SDValue();
+ }
+
+ SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
+ return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
+ InLaneMask);
+ };
+
+ // First attempt a solution with full lanes.
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
+ return V;
+
+ // The rest of the solutions use sublanes.
+ if (!CanUseSublanes)
+ return SDValue();
+
+ // Then attempt a solution with 64-bit sublanes (vpermq).
+ if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
+ return V;
+
+ // If that doesn't work and we have fast variable shuffle,
+ // attempt 32-bit sublanes (vpermd).
+ if (!Subtarget.hasFastVariableShuffle())
+ return SDValue();
+
+ return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
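The getSublanePermute lambda above greedily routes each source sublane into some sublane of the destination lane and records the remaining in-lane movement. A standalone sketch of the same decomposition, with sizes assumed for a v8f32 mask split into four 2-element sublanes:

#include <cstdio>
#include <vector>

// Sketch of the greedy sublane decomposition above for a single-input
// shuffle: stage 1 moves whole sublanes across lanes (CrossLaneLarge),
// stage 2 permutes elements inside each lane (InLane). Returns false if
// some element cannot be routed into its destination lane.
bool decompose(const std::vector<int> &Mask, int NumElts, int NumLanes,
               int NumSublanes, std::vector<int> &CrossLaneLarge,
               std::vector<int> &InLane) {
  int NumEltsPerLane = NumElts / NumLanes;
  int NumEltsPerSublane = NumElts / NumSublanes;
  int NumSublanesPerLane = NumSublanes / NumLanes;
  CrossLaneLarge.assign(NumSublanes, -1);
  InLane.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int SrcSublane = M / NumEltsPerSublane;
    int DstLane = i / NumEltsPerLane;
    bool Found = false;
    for (int S = DstLane * NumSublanesPerLane,
             E = S + NumSublanesPerLane; S != E; ++S) {
      if (CrossLaneLarge[S] != -1 && CrossLaneLarge[S] != SrcSublane)
        continue;
      CrossLaneLarge[S] = SrcSublane;
      InLane[i] = S * NumEltsPerSublane + M % NumEltsPerSublane;
      Found = true;
      break;
    }
    if (!Found)
      return false;
  }
  return true;
}

int main() {
  std::vector<int> Mask = {4, 5, 6, 7, 0, 1, 2, 3}; // swap 128-bit halves
  std::vector<int> Cross, In;
  if (decompose(Mask, 8, 2, 4, Cross, In)) {
    for (int S : Cross) std::printf("%d ", S);  // sublane moves: 2 3 0 1
    std::printf("| ");
    for (int M : In) std::printf("%d ", M);     // then identity within lanes
    std::printf("\n");
  }
}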
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
@@ -15996,8 +15996,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
- if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
+ bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
+ if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
@@ -16739,7 +16739,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
@@ -16800,7 +16800,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -16828,7 +16828,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 // If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
@@ -16910,7 +16910,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
@@ -16930,7 +16930,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG);
}
@@ -16963,9 +16963,9 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
+ if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
- if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
+ if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
@@ -17019,13 +17019,13 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 // since after split we get more efficient code using vpunpcklwd and
// vpunpckhwd instrs than vblend.
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
+ DAG);
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG);
// Otherwise fall back on generic lowering.
@@ -17058,8 +17058,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// vpunpcklwd and vpunpckhwd instrs.
if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
!Subtarget.hasAVX512())
- return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
- DAG);
+ return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
+ DAG);
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -17144,7 +17144,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
// Otherwise fall back on generic blend lowering.
- return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG);
}
@@ -17186,11 +17186,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
- // Try to lower using a truncation.
- if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
- return V;
-
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -17243,9 +17243,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
- if (Subtarget.hasBWI())
- return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
+ // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
+ if (Subtarget.hasBWI())
+ return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -17301,11 +17301,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
- // Try to lower using a truncation.
- if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
- return V;
-
+ // Try to lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -17348,9 +17348,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
- if (Subtarget.hasVBMI())
- return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
+ // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
+ if (Subtarget.hasVBMI())
+ return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
@@ -17477,9 +17477,9 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
+ bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
if (OnlyUsesV1 ||
- isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
+ isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
SDValue SubVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
@@ -17564,7 +17564,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (V2.isUndef()) {
// Use low duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
+ if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
@@ -17604,7 +17604,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 16-lane 32-bit floating point shuffles.
@@ -17623,9 +17623,9 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
+ if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
- if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
+ if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
@@ -17663,7 +17663,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
- return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 8-lane 64-bit integer shuffles.
@@ -17711,14 +17711,14 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
// Try to use PALIGNR.
- if (Subtarget.hasBWI())
- if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
- Subtarget, DAG))
- return Rotate;
+ if (Subtarget.hasBWI())
+ if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Rotate;
if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
-
+
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
DAG, Subtarget))
@@ -17728,7 +17728,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 16-lane 32-bit integer shuffles.
@@ -17805,7 +17805,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
- return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 32-lane 16-bit integer shuffles.
@@ -17868,7 +17868,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return PSHUFB;
- return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
}
/// Handle lowering of 64-lane 8-bit integer shuffles.
@@ -17924,7 +17924,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// VBMI can use VPERMV/VPERMV3 byte shuffles.
if (Subtarget.hasVBMI())
- return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
+ return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -18378,7 +18378,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
// Modify the new Mask to take all zeros from the all-zero vector.
// Choose indices that are blend-friendly.
bool UsedZeroVector = false;
- assert(is_contained(WidenedMask, SM_SentinelZero) &&
+ assert(is_contained(WidenedMask, SM_SentinelZero) &&
"V2's non-undef elements are used?!");
for (int i = 0; i != NewNumElts; ++i)
if (WidenedMask[i] == SM_SentinelZero) {
@@ -18431,11 +18431,11 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
// Only non-legal VSELECTs reach this lowering, convert those into generic
// shuffles and re-use the shuffle lowering path for blends.
- if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
- SmallVector<int, 32> Mask;
- if (createShuffleMaskFromVSELECT(Mask, Cond))
- return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
- }
+ if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
+ SmallVector<int, 32> Mask;
+ if (createShuffleMaskFromVSELECT(Mask, Cond))
+ return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
+ }
return SDValue();
}
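The conversion above relies on turning a constant VSELECT condition into a shuffle mask. A sketch under the assumed convention that a true lane selects LHS element i and a false lane selects RHS element i (index i + NumElts in shuffle-mask numbering):

#include <cstdio>
#include <vector>

// Model of createShuffleMaskFromVSELECT as used above: each condition lane
// picks either the LHS element i or the RHS element i + NumElts.
std::vector<int> maskFromVSelect(const std::vector<bool> &Cond) {
  int NumElts = (int)Cond.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;
  return Mask;
}

int main() {
  for (int M : maskFromVSelect({true, false, true, false}))
    std::printf("%d ", M);   // 0 5 2 7
  std::printf("\n");
}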
@@ -18549,9 +18549,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
- DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -18706,8 +18706,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
- DAG.getTargetConstant(IdxVal, dl, MVT::i8));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -18901,9 +18901,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
Opc = X86ISD::PINSRB;
}
- assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
+ assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
@@ -19151,12 +19151,12 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
if (GV) {
// Create a target global address if this is a global. If possible, fold the
// offset into the global address reference. Otherwise, ADD it on later.
- // Suppress the folding if Offset is negative: movl foo-1, %eax is not
- // allowed because if the address of foo is 0, the ELF R_X86_64_32
- // relocation will compute to a negative value, which is invalid.
+ // Suppress the folding if Offset is negative: movl foo-1, %eax is not
+ // allowed because if the address of foo is 0, the ELF R_X86_64_32
+ // relocation will compute to a negative value, which is invalid.
int64_t GlobalOffset = 0;
- if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
- X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
+ if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
+ X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
std::swap(GlobalOffset, Offset);
}
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
@@ -19243,7 +19243,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
-// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT) {
@@ -19251,17 +19251,17 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
X86::RAX, X86II::MO_TLSGD);
}
-// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
-static SDValue
-LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const EVT PtrVT) {
- return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
- X86::EAX, X86II::MO_TLSGD);
-}
-
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
+static SDValue
+LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
+ X86::EAX, X86II::MO_TLSGD);
+}
+
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
- SelectionDAG &DAG, const EVT PtrVT,
- bool Is64Bit, bool Is64BitLP64) {
+ SelectionDAG &DAG, const EVT PtrVT,
+ bool Is64Bit, bool Is64BitLP64) {
SDLoc dl(GA);
// Get the start address of the TLS block for this module.
@@ -19270,9 +19270,9 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
MFI->incNumLocalDynamicTLSAccesses();
SDValue Base;
- if (Is64Bit) {
- unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
- Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
+ if (Is64Bit) {
+ unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
X86II::MO_TLSLD, /*LocalDynamic=*/true);
} else {
SDValue InFlag;
@@ -19369,15 +19369,15 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- if (Subtarget.is64Bit()) {
- if (Subtarget.isTarget64BitLP64())
- return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
- return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
- }
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTarget64BitLP64())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
+ }
return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
case TLSModel::LocalDynamic:
- return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
- Subtarget.isTarget64BitLP64());
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
+ Subtarget.isTarget64BitLP64());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
@@ -19477,7 +19477,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
else
IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
- const DataLayout &DL = DAG.getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
SDValue Scale =
DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
@@ -19570,29 +19570,29 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
if (IsFSHR)
std::swap(Op0, Op1);
- // With AVX512, but not VLX we need to widen to get a 512-bit result type.
- if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
- Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
- Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
- }
-
- SDValue Funnel;
+ // With AVX512, but not VLX we need to widen to get a 512-bit result type.
+ if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
+ Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
+ Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
+ }
+
+ SDValue Funnel;
APInt APIntShiftAmt;
- MVT ResultVT = Op0.getSimpleValueType();
+ MVT ResultVT = Op0.getSimpleValueType();
if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
- Funnel =
- DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
- Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
- } else {
- if (!Subtarget.hasVLX() && !VT.is512BitVector())
- Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
- Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
- ResultVT, Op0, Op1, Amt);
- }
- if (!Subtarget.hasVLX() && !VT.is512BitVector())
- Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
- return Funnel;
+ Funnel =
+ DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
+ Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ } else {
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
+ Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
+ ResultVT, Op0, Op1, Amt);
+ }
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
+ return Funnel;
}
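For reference, the funnel shifts being lowered above concatenate the two operands and keep one element-width window; a scalar model of the assumed FSHL/FSHR semantics for 32-bit elements, with the amount reduced modulo the width as the code does with urem():

#include <cstdint>
#include <cstdio>

// Minimal scalar model of a funnel shift: treat Hi:Lo as one 64-bit value,
// shift the window, and keep 32 bits.
uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  Amt %= 32;
  return Amt ? (Hi << Amt) | (Lo >> (32 - Amt)) : Hi;
}
uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  Amt %= 32;
  return Amt ? (Lo >> Amt) | (Hi << (32 - Amt)) : Lo;
}

int main() {
  std::printf("%08x\n", fshl32(0x12345678, 0x9abcdef0, 8)); // 3456789a
  std::printf("%08x\n", fshr32(0x12345678, 0x9abcdef0, 8)); // 789abcde
}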
assert(
(VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -19944,7 +19944,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
}
if (VT == MVT::f128)
- return SDValue();
+ return SDValue();
SDValue ValueToStore = Src;
if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
@@ -20025,10 +20025,10 @@ static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
- // when converting 0 while rounding toward negative infinity. Caller will
- // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
- assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
+ // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
+ // when converting 0 while rounding toward negative infinity. Caller will
+ // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
+ assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
// This algorithm is not obvious. Here it is what we're trying to output:
/*
movq %rax, %xmm0
@@ -20063,27 +20063,27 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
// Load the 64-bit value into an XMM register.
SDValue XR1 =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
- SDValue CLod0 = DAG.getLoad(
- MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
+ SDValue CLod0 = DAG.getLoad(
+ MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
SDValue Unpck1 =
getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
- SDValue CLod1 = DAG.getLoad(
- MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
+ SDValue CLod1 = DAG.getLoad(
+ MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
// TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3() &&
+ if (Subtarget.hasSSE3() &&
shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
- Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
DAG.getIntPtrConstant(0, dl));
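The constant pool values used above ({0x43300000, 0x45300000} and {2^52, 2^84}) implement the classic u64-to-f64 splice-and-subtract trick. A scalar sketch of the same arithmetic (illustration only; the real lowering stays in SSE vector registers):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Splice the low and high 32-bit halves into the mantissas of 2^52 and 2^84
// (exponent words 0x43300000 and 0x45300000), subtract the biases, and add
// the two doubles; the single rounding happens at the final add.
double u64ToDouble(uint64_t V) {
  uint64_t LoBits = 0x4330000000000000ULL | (V & 0xffffffffULL);  // 2^52 + lo
  uint64_t HiBits = 0x4530000000000000ULL | (V >> 32);            // 2^84 + hi*2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  return (Hi - 0x1p84) + (Lo - 0x1p52);
}

int main() {
  uint64_t V = 0xfedcba9876543210ULL;
  std::printf("%.17g vs %.17g\n", u64ToDouble(V), (double)V);
}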
@@ -20385,7 +20385,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
- return SDValue();
+ return SDValue();
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -20412,30 +20412,30 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
- // The transform for i64->f64 isn't correct for 0 when rounding to negative
- // infinity. It produces -0.0, so disable under strictfp.
- if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
+ // The transform for i64->f64 isn't correct for 0 when rounding to negative
+ // infinity. It produces -0.0, so disable under strictfp.
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
- if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
- (DstVT == MVT::f32 || DstVT == MVT::f64))
+ if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
+ (DstVT == MVT::f32 || DstVT == MVT::f64))
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
- Align SlotAlign(8);
+ Align SlotAlign(8);
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
- SDValue OffsetSlot =
- DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
- SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
+ SDValue OffsetSlot =
+ DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
+ SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MPI.getWithOffset(4), SlotAlign);
+ OffsetSlot, MPI.getWithOffset(4), SlotAlign);
std::pair<SDValue, SDValue> Tmp =
- BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
+ BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -20451,15 +20451,15 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
- DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
// For i64 source, we need to add the appropriate power of 2 if the input
- // was negative. We must be careful to do the computation in x87 extended
- // precision, not in SSE.
+ // was negative. We must be careful to do the computation in x87 extended
+ // precision, not in SSE.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
SDValue Fild =
DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
- SlotAlign, MachineMemOperand::MOLoad);
+ SlotAlign, MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
@@ -20562,8 +20562,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
- // FltOfs = (Value >= Thresh) ? Thresh : 0;
+ // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
+ // FltOfs = (Value >= Thresh) ? Thresh : 0;
// FistSrc = (Value - FltOfs);
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
@@ -20593,31 +20593,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
*DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
- Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
- /*IsSignaling*/ true);
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
+ /*IsSignaling*/ true);
Chain = Cmp.getValue(1);
} else {
- Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
- }
-
- // Our preferred lowering of
- //
- // (Value >= Thresh) ? 0x8000000000000000ULL : 0
- //
- // is
- //
- // (Value >= Thresh) << 63
- //
- // but since we can get here after LegalOperations, DAGCombine might do the
- // wrong thing if we create a select. So, directly create the preferred
- // version.
- SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
- SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
- Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
-
- SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
- DAG.getConstantFP(0.0, DL, TheVT));
-
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
+ }
+
+ // Our preferred lowering of
+ //
+ // (Value >= Thresh) ? 0x8000000000000000ULL : 0
+ //
+ // is
+ //
+ // (Value >= Thresh) << 63
+ //
+ // but since we can get here after LegalOperations, DAGCombine might do the
+ // wrong thing if we create a select. So, directly create the preferred
+ // version.
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
+ SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
+ Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
+
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
+ DAG.getConstantFP(0.0, DL, TheVT));
+
if (IsStrict) {
Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
{ Chain, Value, FltOfs });
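The comments above describe converting to unsigned i64 through the signed converter by subtracting the 2^63 threshold and adding the top bit back afterwards. A scalar sketch of that idea, ignoring NaN and overflow handling:

#include <cstdint>
#include <cstdio>

// Values below 2^63 convert directly via the signed path; values at or above
// 2^63 are first reduced by 2^63 and the top bit is added back in the
// integer domain.
uint64_t fpToUint64(double Value) {
  const double Thresh = 0x1p63;                 // 2^63 as a double
  uint64_t Adjust = Value >= Thresh ? (1ULL << 63) : 0;
  double FltOfs = Value >= Thresh ? Thresh : 0.0;
  int64_t Fist = (int64_t)(Value - FltOfs);     // signed conversion
  return (uint64_t)Fist + Adjust;
}

int main() {
  std::printf("%llu\n", (unsigned long long)fpToUint64(1.0e19)); // 10000000000000000000
  std::printf("%llu\n", (unsigned long long)fpToUint64(12345.0));
}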
@@ -21075,8 +21075,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
- In = DAG.getBitcast(MVT::v8i32, In);
-
+ In = DAG.getBitcast(MVT::v8i32, In);
+
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
@@ -21085,17 +21085,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(0, DL));
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
DAG.getIntPtrConstant(0, DL));
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(4, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4, DL));
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
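The v4i64 -> v4i32 path above truncates by viewing the value as eight 32-bit lanes and keeping indices {0, 2, 4, 6}. A small host-side sketch of that shuffle, assuming a little-endian element layout:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret the 256-bit value as eight 32-bit lanes and keep the low half
// of each 64-bit element, i.e. shuffle indices {0, 2, 4, 6}.
void truncV4I64(const uint64_t In[4], uint32_t Out[4]) {
  uint32_t In32[8];
  std::memcpy(In32, In, sizeof(In32));
  static const int ShufMask[4] = {0, 2, 4, 6};
  for (int i = 0; i != 4; ++i)
    Out[i] = In32[ShufMask[i]];
}

int main() {
  uint64_t In[4] = {0x1111111100000001ULL, 0x2222222200000002ULL,
                    0x3333333300000003ULL, 0x4444444400000004ULL};
  uint32_t Out[4];
  truncV4I64(In, Out);
  for (uint32_t V : Out)
    std::printf("%u ", V);   // 1 2 3 4 on a little-endian host
  std::printf("\n");
}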
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
- In = DAG.getBitcast(MVT::v32i8, In);
-
+ In = DAG.getBitcast(MVT::v32i8, In);
+
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
// The PSHUFB mask:
@@ -21106,17 +21106,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
- static const int ShufMask2[] = {0, 2, -1, -1};
- In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
- DAG.getBitcast(MVT::v16i16, In),
- DAG.getIntPtrConstant(0, DL));
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v16i16, In),
+ DAG.getIntPtrConstant(0, DL));
}
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
DAG.getIntPtrConstant(0, DL));
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
- DAG.getIntPtrConstant(16, DL));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
+ DAG.getIntPtrConstant(16, DL));
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
@@ -21452,155 +21452,155 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}
-SDValue
-X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
- // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
- // but making use of X86 specifics to produce better instruction sequences.
- SDNode *Node = Op.getNode();
- bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
- unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
- SDLoc dl(SDValue(Node, 0));
- SDValue Src = Node->getOperand(0);
-
- // There are three types involved here: SrcVT is the source floating point
- // type, DstVT is the type of the result, and TmpVT is the result of the
- // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
- // DstVT).
- EVT SrcVT = Src.getValueType();
- EVT DstVT = Node->getValueType(0);
- EVT TmpVT = DstVT;
-
- // This code is only for floats and doubles. Fall back to generic code for
- // anything else.
- if (!isScalarFPTypeInSSEReg(SrcVT))
- return SDValue();
-
- unsigned SatWidth = Node->getConstantOperandVal(1);
- unsigned DstWidth = DstVT.getScalarSizeInBits();
- unsigned TmpWidth = TmpVT.getScalarSizeInBits();
- assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
- "Expected saturation width smaller than result width");
-
- // Promote result of FP_TO_*INT to at least 32 bits.
- if (TmpWidth < 32) {
- TmpVT = MVT::i32;
- TmpWidth = 32;
- }
-
- // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
- // us to use a native signed conversion instead.
- if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
- TmpVT = MVT::i64;
- TmpWidth = 64;
- }
-
- // If the saturation width is smaller than the size of the temporary result,
- // we can always use signed conversion, which is native.
- if (SatWidth < TmpWidth)
- FpToIntOpcode = ISD::FP_TO_SINT;
-
- // Determine minimum and maximum integer values and their corresponding
- // floating-point values.
- APInt MinInt, MaxInt;
- if (IsSigned) {
- MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
- MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
- } else {
- MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
- MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
- }
-
- APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
- APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
-
- APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
- MinInt, IsSigned, APFloat::rmTowardZero);
- APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
- MaxInt, IsSigned, APFloat::rmTowardZero);
- bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
- && !(MaxStatus & APFloat::opStatus::opInexact);
-
- SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
- SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
-
- // If the integer bounds are exactly representable as floats, emit a
- // min+max+fptoi sequence. Otherwise use comparisons and selects.
- if (AreExactFloatBounds) {
- if (DstVT != TmpVT) {
- // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
- SDValue MinClamped = DAG.getNode(
- X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
- // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
- SDValue BothClamped = DAG.getNode(
- X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
- // Convert clamped value to integer.
- SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
-
- // NaN will become INDVAL, with the top bit set and the rest zero.
- // Truncation will discard the top bit, resulting in zero.
- return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
- }
-
- // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
- SDValue MinClamped = DAG.getNode(
- X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
- // Clamp by MaxFloat from above. NaN cannot occur.
- SDValue BothClamped = DAG.getNode(
- X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
- // Convert clamped value to integer.
- SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
-
- if (!IsSigned) {
- // In the unsigned case we're done, because we mapped NaN to MinFloat,
- // which is zero.
- return FpToInt;
- }
-
- // Otherwise, select zero if Src is NaN.
- SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
- return DAG.getSelectCC(
- dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
- }
-
- SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
- SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
-
- // Result of direct conversion, which may be selected away.
- SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
-
- if (DstVT != TmpVT) {
- // NaN will become INDVAL, with the top bit set and the rest zero.
- // Truncation will discard the top bit, resulting in zero.
- FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
- }
-
- SDValue Select = FpToInt;
- // For signed conversions where we saturate to the same size as the
- // result type of the fptoi instructions, INDVAL coincides with integer
- // minimum, so we don't need to explicitly check it.
- if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
- // If Src ULT MinFloat, select MinInt. In particular, this also selects
- // MinInt if Src is NaN.
- Select = DAG.getSelectCC(
- dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
- }
-
- // If Src OGT MaxFloat, select MaxInt.
- Select = DAG.getSelectCC(
- dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
-
- // In the unsigned case we are done, because we mapped NaN to MinInt, which
- // is already zero. The promoted case was already handled above.
- if (!IsSigned || DstVT != TmpVT) {
- return Select;
- }
-
- // Otherwise, select 0 if Src is NaN.
- SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
- return DAG.getSelectCC(
- dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
-}
-
+SDValue
+X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
+ // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
+ // but making use of X86 specifics to produce better instruction sequences.
+ SDNode *Node = Op.getNode();
+ bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
+ unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ SDLoc dl(SDValue(Node, 0));
+ SDValue Src = Node->getOperand(0);
+
+ // There are three types involved here: SrcVT is the source floating point
+ // type, DstVT is the type of the result, and TmpVT is the result of the
+ // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
+ // DstVT).
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+ EVT TmpVT = DstVT;
+
+ // This code is only for floats and doubles. Fall back to generic code for
+ // anything else.
+ if (!isScalarFPTypeInSSEReg(SrcVT))
+ return SDValue();
+
+ unsigned SatWidth = Node->getConstantOperandVal(1);
+ unsigned DstWidth = DstVT.getScalarSizeInBits();
+ unsigned TmpWidth = TmpVT.getScalarSizeInBits();
+ assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
+ "Expected saturation width smaller than result width");
+
+ // Promote result of FP_TO_*INT to at least 32 bits.
+ if (TmpWidth < 32) {
+ TmpVT = MVT::i32;
+ TmpWidth = 32;
+ }
+
+ // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
+ // us to use a native signed conversion instead.
+ if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
+ TmpVT = MVT::i64;
+ TmpWidth = 64;
+ }
+
+ // If the saturation width is smaller than the size of the temporary result,
+ // we can always use signed conversion, which is native.
+ if (SatWidth < TmpWidth)
+ FpToIntOpcode = ISD::FP_TO_SINT;
+
+ // Determine minimum and maximum integer values and their corresponding
+ // floating-point values.
+ APInt MinInt, MaxInt;
+ if (IsSigned) {
+ MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
+ MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
+ } else {
+ MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
+ MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
+ }
+
+ APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+ APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+
+ APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
+ MinInt, IsSigned, APFloat::rmTowardZero);
+ APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
+ MaxInt, IsSigned, APFloat::rmTowardZero);
+ bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
+ && !(MaxStatus & APFloat::opStatus::opInexact);
+
+ SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
+ SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
+
+ // If the integer bounds are exactly representable as floats, emit a
+ // min+max+fptoi sequence. Otherwise use comparisons and selects.
+ if (AreExactFloatBounds) {
+ if (DstVT != TmpVT) {
+ // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
+ SDValue MinClamped = DAG.getNode(
+ X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
+ // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
+ SDValue BothClamped = DAG.getNode(
+ X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
+
+ // NaN will become INDVAL, with the top bit set and the rest zero.
+ // Truncation will discard the top bit, resulting in zero.
+ return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
+ }
+
+ // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
+ SDValue MinClamped = DAG.getNode(
+ X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
+ // Clamp by MaxFloat from above. NaN cannot occur.
+ SDValue BothClamped = DAG.getNode(
+ X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
+
+ if (!IsSigned) {
+ // In the unsigned case we're done, because we mapped NaN to MinFloat,
+ // which is zero.
+ return FpToInt;
+ }
+
+ // Otherwise, select zero if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ return DAG.getSelectCC(
+ dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+ }
+
+ SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
+ SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
+
+ // Result of direct conversion, which may be selected away.
+ SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
+
+ if (DstVT != TmpVT) {
+ // NaN will become INDVAL, with the top bit set and the rest zero.
+ // Truncation will discard the top bit, resulting in zero.
+ FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
+ }
+
+ SDValue Select = FpToInt;
+ // For signed conversions where we saturate to the same size as the
+ // result type of the fptoi instructions, INDVAL coincides with integer
+ // minimum, so we don't need to explicitly check it.
+ if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
+ // If Src ULT MinFloat, select MinInt. In particular, this also selects
+ // MinInt if Src is NaN.
+ Select = DAG.getSelectCC(
+ dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+ }
+
+ // If Src OGT MaxFloat, select MaxInt.
+ Select = DAG.getSelectCC(
+ dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+
+ // In the unsigned case we are done, because we mapped NaN to MinInt, which
+ // is already zero. The promoted case was already handled above.
+ if (!IsSigned || DstVT != TmpVT) {
+ return Select;
+ }
+
+ // Otherwise, select 0 if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ return DAG.getSelectCC(
+ dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
+}
+
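A scalar model of the exact-float-bounds branch of LowerFP_TO_INT_SAT above, for an assumed f64 -> i32 saturating conversion (DstVT == TmpVT, signed): clamp into range, convert, and force NaN to zero, mirroring the FMAX/FMIN plus select-on-unordered sequence:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Clamp into [MinFloat, MaxFloat], convert, and map NaN to zero. Both i32
// bounds are exactly representable as doubles, so this corresponds to the
// "exact float bounds" case in the code above.
int32_t fpToSint32Sat(double Src) {
  const double MinFloat = -2147483648.0;   // (double)INT32_MIN, exact
  const double MaxFloat = 2147483647.0;    // (double)INT32_MAX, exact
  if (std::isnan(Src))
    return 0;                              // NaN -> 0, as in the final select
  double Clamped = std::fmax(Src, MinFloat); // clamp from below
  Clamped = std::fmin(Clamped, MaxFloat);    // clamp from above
  return (int32_t)Clamped;
}

int main() {
  std::printf("%d %d %d\n", fpToSint32Sat(1e30), fpToSint32Sat(-1e30),
              fpToSint32Sat(NAN));         // 2147483647 -2147483648 0
}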
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -21609,8 +21609,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
- if (VT == MVT::f128)
- return SDValue();
+ if (VT == MVT::f128)
+ return SDValue();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -21626,10 +21626,10 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
// It's legal except when f128 is involved
- if (In.getSimpleValueType() != MVT::f128)
+ if (In.getSimpleValueType() != MVT::f128)
return Op;
- return SDValue();
+ return SDValue();
}
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -21994,7 +21994,7 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
if (M == SrcOpMap.end()) {
VT = Src.getValueType();
// Quit if not the same type.
- if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
+ if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
return false;
unsigned NumElts = VT.getVectorNumElements();
APInt EltCount = APInt::getNullValue(NumElts);
@@ -22032,11 +22032,11 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, X86::CondCode &X86CC) {
EVT VT = V.getValueType();
- unsigned ScalarSize = VT.getScalarSizeInBits();
- if (Mask.getBitWidth() != ScalarSize) {
- assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
- return SDValue();
- }
+ unsigned ScalarSize = VT.getScalarSizeInBits();
+ if (Mask.getBitWidth() != ScalarSize) {
+ assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
+ return SDValue();
+ }
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
@@ -22940,8 +22940,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
- if (VT.getFixedSizeInBits() >
- Op.getSimpleValueType().getFixedSizeInBits()) {
+ if (VT.getFixedSizeInBits() >
+ Op.getSimpleValueType().getFixedSizeInBits()) {
// We emitted a compare with an XMM/YMM result. Finish converting to a
// mask register using a vptestm.
EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
@@ -23116,10 +23116,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
// Try to use SUBUS and PCMPEQ.
- if (FlipSigns)
- if (SDValue V =
- LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
- return V;
+ if (FlipSigns)
+ if (SDValue V =
+ LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
+ return V;
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
@@ -23914,7 +23914,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
MVT SVT = VT.getVectorElementType();
MVT InSVT = InVT.getVectorElementType();
- assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
+ assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
return SDValue();
@@ -24089,8 +24089,8 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
- SDValue Ptr1 =
- DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
+ SDValue Ptr1 =
+ DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
Store->getOriginalAlign(),
@@ -24125,8 +24125,8 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i != NumElems; ++i) {
unsigned Offset = i * ScalarSize;
- SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
- TypeSize::Fixed(Offset), DL);
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
@@ -24147,22 +24147,22 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
if (StoredVal.getValueType().isVector() &&
StoredVal.getValueType().getVectorElementType() == MVT::i1) {
- unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
- assert(NumElts <= 8 && "Unexpected VT");
+ unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
+ assert(NumElts <= 8 && "Unexpected VT");
assert(!St->isTruncatingStore() && "Expected non-truncating store");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
- // We must pad with zeros to ensure we store zeroes to any unused bits.
+ // We must pad with zeros to ensure we store zeroes to any unused bits.
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
DAG.getUNDEF(MVT::v16i1), StoredVal,
DAG.getIntPtrConstant(0, dl));
StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
- // Make sure we store zeros in the extra bits.
- if (NumElts < 8)
- StoredVal = DAG.getZeroExtendInReg(
- StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
+ // Make sure we store zeros in the extra bits.
+ if (NumElts < 8)
+ StoredVal = DAG.getZeroExtendInReg(
+ StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getOriginalAlign(),
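A rough scalar model of the v*i1 store path above (assumed, illustrative code, not from the patch): the mask lanes are packed into one byte and every bit beyond NumElts is forced to zero, which is what the insert-into-v16i1 padding plus the zero-extend-in-register step guarantee.

#include <cstdint>
#include <cassert>
#include <cstdio>

// Pack up to 8 boolean lanes into one byte; unused high bits are stored as 0,
// mirroring the "store zeros in the extra bits" guarantee of the lowering.
static uint8_t packMaskStore(const bool *lanes, unsigned numElts) {
  assert(numElts <= 8 && "Unexpected VT");
  uint8_t packed = 0;
  for (unsigned i = 0; i != numElts; ++i)
    packed |= static_cast<uint8_t>(lanes[i]) << i;
  // Bits [numElts, 8) were never set, so they are stored as zero.
  return packed;
}

int main() {
  bool m[3] = {true, false, true};
  std::printf("0x%02x\n", packMaskStore(m, 3)); // 0x05
}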
@@ -24418,7 +24418,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Result;
if (!Lower) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
@@ -24519,7 +24519,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store fp_offset
- FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
+ FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
Store = DAG.getStore(
Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
@@ -24584,18 +24584,18 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Subtarget.hasSSE1());
}
- // Insert VAARG node into the DAG
- // VAARG returns two values: Variable Argument Address, Chain
- SDValue InstOps[] = {Chain, SrcPtr,
- DAG.getTargetConstant(ArgSize, dl, MVT::i32),
- DAG.getTargetConstant(ArgMode, dl, MVT::i8),
- DAG.getTargetConstant(Align, dl, MVT::i32)};
+ // Insert VAARG node into the DAG
+ // VAARG returns two values: Variable Argument Address, Chain
+ SDValue InstOps[] = {Chain, SrcPtr,
+ DAG.getTargetConstant(ArgSize, dl, MVT::i32),
+ DAG.getTargetConstant(ArgMode, dl, MVT::i8),
+ DAG.getTargetConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
- Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
- VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
- /*Alignment=*/None,
- MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
+ Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
+ VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
+ /*Alignment=*/None,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
@@ -24619,11 +24619,11 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
- return DAG.getMemcpy(
- Chain, DL, DstPtr, SrcPtr,
- DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
- Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
- false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+ return DAG.getMemcpy(
+ Chain, DL, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
+ Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
+ false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
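The memcpy above copies the entire va_list object: 24 bytes under LP64 and 16 under x32. A hedged illustration of why 24 bytes suffices on LP64 (the struct is the conventional System V x86-64 layout, not an identifier from this file, and the sketch assumes an LP64 host):

#include <cstdint>
#include <cstring>

// System V x86-64 va_list element; 4 + 4 + 8 + 8 = 24 bytes.
struct VaListTag {
  uint32_t gp_offset;        // next integer register slot
  uint32_t fp_offset;        // next vector register slot
  void *overflow_arg_area;   // stack spill area
  void *reg_save_area;       // saved register block
};

static_assert(sizeof(VaListTag) == 24, "LP64 va_list is 24 bytes");

// va_copy is just a structure copy -- exactly what the 24-byte memcpy emits.
static void vaCopy(VaListTag &dst, const VaListTag &src) {
  std::memcpy(&dst, &src, sizeof(VaListTag));
}

int main() {
  VaListTag a{}, b{8, 48, nullptr, nullptr};
  vaCopy(a, b);
  return a.gp_offset == 8 ? 0 : 1;
}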
// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
@@ -25070,12 +25070,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
- if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
- Src3.getValueType() != MVT::i8) {
- Src3 = DAG.getTargetConstant(
- cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
- }
-
+ if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
+ Src3.getValueType() != MVT::i8) {
+ Src3 = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
+ }
+
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -25094,18 +25094,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
{Src1, Src2, Src3});
}
- case INTR_TYPE_4OP_IMM8: {
- assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
- SDValue Src4 = Op.getOperand(4);
- if (Src4.getValueType() != MVT::i8) {
- Src4 = DAG.getTargetConstant(
- cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
- }
-
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
- Src4);
- }
+ case INTR_TYPE_4OP_IMM8: {
+ assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
+ SDValue Src4 = Op.getOperand(4);
+ if (Src4.getValueType() != MVT::i8) {
+ Src4 = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
+ }
+
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Src4);
+ }
case INTR_TYPE_1OP_MASK: {
SDValue Src = Op.getOperand(1);
SDValue PassThru = Op.getOperand(2);
@@ -25338,21 +25338,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(4);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
if (IntrData->Opc1 != 0) {
- SDValue Sae = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(5);
if (isRoundModeSAE(Sae))
return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC, Mask, Sae);
+ Op.getOperand(2), CC, Mask, Sae);
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
//default rounding mode
return DAG.getNode(IntrData->Opc0, dl, MaskVT,
- {Op.getOperand(1), Op.getOperand(2), CC, Mask});
+ {Op.getOperand(1), Op.getOperand(2), CC, Mask});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -25507,11 +25507,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
case BEXTRI: {
- assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
+ assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
uint64_t Imm = Op.getConstantOperandVal(2);
- SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
- Op.getValueType());
+ SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
+ Op.getValueType());
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Control);
}
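BEXTR/BEXTRI take a 16-bit control word, bits [7:0] being the start position and bits [15:8] the length, which is why the immediate above is masked with 0xffff. A small hedged sketch of the extraction semantics (the helper name is illustrative):

#include <cstdint>
#include <cstdio>

// Extract 'len' bits starting at bit 'start', as BEXTR does with a control
// word of (len << 8) | start. Out-of-range values simply shift everything out.
static uint64_t bextr(uint64_t value, unsigned start, unsigned len) {
  if (start >= 64 || len == 0)
    return 0;
  value >>= start;                       // drop the low 'start' bits
  if (len < 64)
    value &= (uint64_t(1) << len) - 1;   // keep only 'len' bits
  return value;
}

int main() {
  uint16_t control = (8u << 8) | 4u;     // len = 8, start = 4
  std::printf("%llx\n",
              (unsigned long long)bextr(0xABCD, control & 0xff, control >> 8)); // bc
}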
@@ -25902,8 +25902,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// MMX register.
ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
- DAG.getTargetConstant(NewIntrinsic, DL,
- getPointerTy(DAG.getDataLayout())),
+ DAG.getTargetConstant(NewIntrinsic, DL,
+ getPointerTy(DAG.getDataLayout())),
Op.getOperand(1), ShAmt);
}
}
@@ -26273,97 +26273,97 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
}
- case Intrinsic::x86_aesenc128kl:
- case Intrinsic::x86_aesdec128kl:
- case Intrinsic::x86_aesenc256kl:
- case Intrinsic::x86_aesdec256kl: {
- SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
- SDValue Chain = Op.getOperand(0);
- unsigned Opcode;
-
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
- case Intrinsic::x86_aesenc128kl:
- Opcode = X86ISD::AESENC128KL;
- break;
- case Intrinsic::x86_aesdec128kl:
- Opcode = X86ISD::AESDEC128KL;
- break;
- case Intrinsic::x86_aesenc256kl:
- Opcode = X86ISD::AESENC256KL;
- break;
- case Intrinsic::x86_aesdec256kl:
- Opcode = X86ISD::AESDEC256KL;
- break;
- }
-
- MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- MachineMemOperand *MMO = MemIntr->getMemOperand();
- EVT MemVT = MemIntr->getMemoryVT();
- SDValue Operation = DAG.getMemIntrinsicNode(
- Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
- MMO);
- SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
-
- return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
- {ZF, Operation.getValue(0), Operation.getValue(2)});
- }
- case Intrinsic::x86_aesencwide128kl:
- case Intrinsic::x86_aesdecwide128kl:
- case Intrinsic::x86_aesencwide256kl:
- case Intrinsic::x86_aesdecwide256kl: {
- SDLoc DL(Op);
- SDVTList VTs = DAG.getVTList(
- {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
- MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
- SDValue Chain = Op.getOperand(0);
- unsigned Opcode;
-
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
- case Intrinsic::x86_aesencwide128kl:
- Opcode = X86ISD::AESENCWIDE128KL;
- break;
- case Intrinsic::x86_aesdecwide128kl:
- Opcode = X86ISD::AESDECWIDE128KL;
- break;
- case Intrinsic::x86_aesencwide256kl:
- Opcode = X86ISD::AESENCWIDE256KL;
- break;
- case Intrinsic::x86_aesdecwide256kl:
- Opcode = X86ISD::AESDECWIDE256KL;
- break;
- }
-
- MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- MachineMemOperand *MMO = MemIntr->getMemOperand();
- EVT MemVT = MemIntr->getMemoryVT();
- SDValue Operation = DAG.getMemIntrinsicNode(
- Opcode, DL, VTs,
- {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
- Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
- Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
- MemVT, MMO);
- SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
-
- return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
- {ZF, Operation.getValue(1), Operation.getValue(2),
- Operation.getValue(3), Operation.getValue(4),
- Operation.getValue(5), Operation.getValue(6),
- Operation.getValue(7), Operation.getValue(8),
- Operation.getValue(9)});
- }
- case Intrinsic::x86_testui: {
- SDLoc dl(Op);
- SDValue Chain = Op.getOperand(0);
- SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
- SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
- SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
- Operation.getValue(1));
- }
- }
+ case Intrinsic::x86_aesenc128kl:
+ case Intrinsic::x86_aesdec128kl:
+ case Intrinsic::x86_aesenc256kl:
+ case Intrinsic::x86_aesdec256kl: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
+ SDValue Chain = Op.getOperand(0);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_aesenc128kl:
+ Opcode = X86ISD::AESENC128KL;
+ break;
+ case Intrinsic::x86_aesdec128kl:
+ Opcode = X86ISD::AESDEC128KL;
+ break;
+ case Intrinsic::x86_aesenc256kl:
+ Opcode = X86ISD::AESENC256KL;
+ break;
+ case Intrinsic::x86_aesdec256kl:
+ Opcode = X86ISD::AESDEC256KL;
+ break;
+ }
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ MachineMemOperand *MMO = MemIntr->getMemOperand();
+ EVT MemVT = MemIntr->getMemoryVT();
+ SDValue Operation = DAG.getMemIntrinsicNode(
+ Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
+ MMO);
+ SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ {ZF, Operation.getValue(0), Operation.getValue(2)});
+ }
+ case Intrinsic::x86_aesencwide128kl:
+ case Intrinsic::x86_aesdecwide128kl:
+ case Intrinsic::x86_aesencwide256kl:
+ case Intrinsic::x86_aesdecwide256kl: {
+ SDLoc DL(Op);
+ SDVTList VTs = DAG.getVTList(
+ {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
+ MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
+ SDValue Chain = Op.getOperand(0);
+ unsigned Opcode;
+
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic");
+ case Intrinsic::x86_aesencwide128kl:
+ Opcode = X86ISD::AESENCWIDE128KL;
+ break;
+ case Intrinsic::x86_aesdecwide128kl:
+ Opcode = X86ISD::AESDECWIDE128KL;
+ break;
+ case Intrinsic::x86_aesencwide256kl:
+ Opcode = X86ISD::AESENCWIDE256KL;
+ break;
+ case Intrinsic::x86_aesdecwide256kl:
+ Opcode = X86ISD::AESDECWIDE256KL;
+ break;
+ }
+
+ MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ MachineMemOperand *MMO = MemIntr->getMemOperand();
+ EVT MemVT = MemIntr->getMemoryVT();
+ SDValue Operation = DAG.getMemIntrinsicNode(
+ Opcode, DL, VTs,
+ {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+ Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
+ Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
+ MemVT, MMO);
+ SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
+
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+ {ZF, Operation.getValue(1), Operation.getValue(2),
+ Operation.getValue(3), Operation.getValue(4),
+ Operation.getValue(5), Operation.getValue(6),
+ Operation.getValue(7), Operation.getValue(8),
+ Operation.getValue(9)});
+ }
+ case Intrinsic::x86_testui: {
+ SDLoc dl(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
+ SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
+ Operation.getValue(1));
+ }
+ }
return SDValue();
}
@@ -26733,8 +26733,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(2, dl, MVT::i64));
- OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
- MachinePointerInfo(TrmpAddr, 2), Align(2));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+ MachinePointerInfo(TrmpAddr, 2), Align(2));
// Load the 'nest' parameter value into R10.
// R10 is specified in X86CallingConv.td
@@ -26746,8 +26746,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
DAG.getConstant(12, dl, MVT::i64));
- OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 12), Align(2));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 12), Align(2));
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
@@ -26789,7 +26789,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
- const DataLayout &DL = DAG.getDataLayout();
+ const DataLayout &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
@@ -26827,20 +26827,20 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(1, dl, MVT::i32));
- OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
- MachinePointerInfo(TrmpAddr, 1), Align(1));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+ MachinePointerInfo(TrmpAddr, 1), Align(1));
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(5, dl, MVT::i32));
- OutChains[2] =
- DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
- MachinePointerInfo(TrmpAddr, 5), Align(1));
+ OutChains[2] =
+ DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
+ MachinePointerInfo(TrmpAddr, 5), Align(1));
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
DAG.getConstant(6, dl, MVT::i32));
- OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
- MachinePointerInfo(TrmpAddr, 6), Align(1));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+ MachinePointerInfo(TrmpAddr, 6), Align(1));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}
@@ -27134,47 +27134,47 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
MVT VT = Op.getSimpleValueType();
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
unsigned Opcode = Op.getOpcode();
- SDLoc DL(Op);
-
+ SDLoc DL(Op);
+
if (VT.getScalarType() == MVT::i1) {
switch (Opcode) {
default: llvm_unreachable("Expected saturated arithmetic opcode");
case ISD::UADDSAT:
case ISD::SADDSAT:
// *addsat i1 X, Y --> X | Y
- return DAG.getNode(ISD::OR, DL, VT, X, Y);
+ return DAG.getNode(ISD::OR, DL, VT, X, Y);
case ISD::USUBSAT:
case ISD::SSUBSAT:
// *subsat i1 X, Y --> X & ~Y
- return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
+ return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT));
}
}
- if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
- (VT.is256BitVector() && !Subtarget.hasInt256())) {
- assert(Op.getSimpleValueType().isInteger() &&
- "Only handle AVX vector integer operation");
- return splitVectorIntBinary(Op, DAG);
+ if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
+ (VT.is256BitVector() && !Subtarget.hasInt256())) {
+ assert(Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX vector integer operation");
+ return splitVectorIntBinary(Op, DAG);
}
- // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT SetCCResultType =
- TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
-
- if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
- // usubsat X, Y --> (X >u Y) ? X - Y : 0
- SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
- SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
- // TODO: Move this to DAGCombiner?
- if (SetCCResultType == VT &&
- DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
- return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
- return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
- }
+ // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT SetCCResultType =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- // Use default expansion.
- return SDValue();
+ if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
+ // usubsat X, Y --> (X >u Y) ? X - Y : 0
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
+ SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
+ // TODO: Move this to DAGCombiner?
+ if (SetCCResultType == VT &&
+ DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
+ return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
+ return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
+ }
+
+ // Use default expansion.
+ return SDValue();
}
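The USUBSAT expansion above relies on the identity usubsat(X, Y) = (X >u Y) ? X - Y : 0, and when the setcc result is an all-ones/all-zeros mask of the same width, the select collapses to an AND. A scalar sketch under those assumptions (illustrative code, not from the patch):

#include <cstdint>
#include <cstdio>

// usubsat X, Y --> (X >u Y) ? X - Y : 0
static uint32_t usubsatSelect(uint32_t x, uint32_t y) {
  return x > y ? x - y : 0;
}

// Same result via the mask-and-AND form used when the comparison yields an
// all-ones mask: sub & mask.
static uint32_t usubsatMask(uint32_t x, uint32_t y) {
  uint32_t sub = x - y;              // may wrap; masked away below
  uint32_t mask = x > y ? ~0u : 0u;  // stand-in for the all-ones setcc result
  return sub & mask;
}

int main() {
  std::printf("%u %u\n", usubsatSelect(3, 10), usubsatMask(3, 10));   // 0 0
  std::printf("%u %u\n", usubsatSelect(10, 3), usubsatMask(10, 3));   // 7 7
}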
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
@@ -27224,8 +27224,8 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
if (VT == MVT::v32i16 || VT == MVT::v64i8)
return splitVectorIntBinary(Op, DAG);
- // Default to expand.
- return SDValue();
+ // Default to expand.
+ return SDValue();
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
@@ -27597,8 +27597,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
- InChain =
- DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
+ InChain =
+ DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
@@ -27889,7 +27889,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
MVT VT = Amt.getSimpleValueType();
if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
(Subtarget.hasInt256() && VT == MVT::v16i16) ||
- (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
+ (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
(!Subtarget.hasAVX512() && VT == MVT::v16i8)))
return SDValue();
@@ -28467,12 +28467,12 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return Op;
}
- // AVX512 VBMI2 vXi16 - lower to funnel shifts.
- if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
- unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
- return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
- }
-
+ // AVX512 VBMI2 vXi16 - lower to funnel shifts.
+ if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
+ unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
+ return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
+ }
+
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
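With VBMI2 the vXi16 rotate above is rewritten as a funnel shift of a value with itself: rotl(x, s) == fshl(x, x, s) and rotr(x, s) == fshr(x, x, s). A scalar 16-bit model of that identity (hedged sketch, names are illustrative):

#include <cstdint>
#include <cstdio>

// fshl(hi, lo, s): concatenate hi:lo, shift left by s (mod 16), take the high half.
static uint16_t fshl16(uint16_t hi, uint16_t lo, unsigned s) {
  s &= 15;
  if (s == 0)
    return hi;
  return (uint16_t)((hi << s) | (lo >> (16 - s)));
}

// A rotate is just a funnel shift of x with itself.
static uint16_t rotl16(uint16_t x, unsigned s) { return fshl16(x, x, s); }

int main() {
  std::printf("%04x\n", rotl16(0x8001, 1)); // 0003
}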
@@ -28499,8 +28499,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
- ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
- VT == MVT::v32i16) &&
+ ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
+ VT == MVT::v32i16) &&
Subtarget.hasAVX2())) &&
"Only vXi32/vXi16/vXi8 vector rotates supported");
@@ -28797,8 +28797,8 @@ bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
/// a) very likely accessed only by a single thread to minimize cache traffic,
/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
- const X86Subtarget &Subtarget, SDValue Chain,
- const SDLoc &DL) {
+ const X86Subtarget &Subtarget, SDValue Chain,
+ const SDLoc &DL) {
// Implementation notes:
// 1) LOCK prefix creates a full read/write reordering barrier for memory
// operations issued by the current processor. As such, the location
@@ -29236,28 +29236,28 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
- assert(VT.getScalarType() == MVT::i8 &&
- "Only byte vector BITREVERSE supported");
-
+ assert(VT.getScalarType() == MVT::i8 &&
+ "Only byte vector BITREVERSE supported");
+
// Split v64i8 without BWI so that we can still use the PSHUFB lowering.
if (VT == MVT::v64i8 && !Subtarget.hasBWI())
return splitVectorIntUnary(Op, DAG);
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
- if (VT == MVT::v32i8 && !Subtarget.hasInt256())
+ if (VT == MVT::v32i8 && !Subtarget.hasInt256())
return splitVectorIntUnary(Op, DAG);
- unsigned NumElts = VT.getVectorNumElements();
-
- // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
- if (Subtarget.hasGFNI()) {
- MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
- SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
- Matrix = DAG.getBitcast(VT, Matrix);
- return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
- DAG.getTargetConstant(0, DL, MVT::i8));
- }
-
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
+ if (Subtarget.hasGFNI()) {
+ MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
+ SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
+ Matrix = DAG.getBitcast(VT, Matrix);
+ return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
+ DAG.getTargetConstant(0, DL, MVT::i8));
+ }
+
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
// 0-15 value (moved to the other nibble).
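Both strategies above reverse the bits inside each byte: GFNI does it in one GF2P8AFFINEQB against the bit-mirror matrix 0x8040201008040201, and the fallback splits every byte into two nibbles and looks each up in a 16-entry reverse table (the PSHUFB). A scalar sketch of the nibble-table form (assumed illustrative code):

#include <cstdint>
#include <cstdio>

// Bit-reverse of each 4-bit value 0..15 -- the constant the PSHUFB LUT holds.
static const uint8_t RevNibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                      0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};

// Reverse the bits of one byte: reverse each nibble, then swap the nibbles.
static uint8_t bitreverse8(uint8_t b) {
  uint8_t lo = RevNibble[b & 0xF];   // reversed low nibble
  uint8_t hi = RevNibble[b >> 4];    // reversed high nibble
  return (uint8_t)((lo << 4) | hi);  // OR of the two shuffled halves
}

int main() {
  std::printf("%02x %02x\n", bitreverse8(0x01), bitreverse8(0xB4)); // 80 2d
}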
@@ -29289,58 +29289,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
-static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(Op);
- SDValue X = Op.getOperand(0);
- MVT VT = Op.getSimpleValueType();
-
- // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
- if (VT == MVT::i8 ||
- DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
- SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
- DAG.getConstant(0, DL, MVT::i8));
- // Copy the inverse of the parity flag into a register with setcc.
- SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
- // Extend to the original type.
- return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
- }
-
- if (VT == MVT::i64) {
- // Xor the high and low 16-bits together using a 32-bit operation.
- SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
- DAG.getNode(ISD::SRL, DL, MVT::i64, X,
- DAG.getConstant(32, DL, MVT::i8)));
- SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
- }
-
- if (VT != MVT::i16) {
- // Xor the high and low 16-bits together using a 32-bit operation.
- SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
- DAG.getConstant(16, DL, MVT::i8));
- X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
- } else {
- // If the input is 16-bits, we need to extend to use an i32 shift below.
- X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
- }
-
- // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
- // This should allow an h-reg to be used to save a shift.
- SDValue Hi = DAG.getNode(
- ISD::TRUNCATE, DL, MVT::i8,
- DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
- SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
- SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
- SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
-
- // Copy the inverse of the parity flag into a register with setcc.
- SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
- // Extend to the original type.
- return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
-}
-
+static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ MVT VT = Op.getSimpleValueType();
+
+ // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
+ if (VT == MVT::i8 ||
+ DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
+ X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+ SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
+ DAG.getConstant(0, DL, MVT::i8));
+ // Copy the inverse of the parity flag into a register with setcc.
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+ // Extend to the original type.
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+ }
+
+ if (VT == MVT::i64) {
+ // Xor the high and low 16-bits together using a 32-bit operation.
+ SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i64, X,
+ DAG.getConstant(32, DL, MVT::i8)));
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+ X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
+ }
+
+ if (VT != MVT::i16) {
+ // Xor the high and low 16-bits together using a 32-bit operation.
+ SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
+ DAG.getConstant(16, DL, MVT::i8));
+ X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
+ } else {
+ // If the input is 16-bits, we need to extend to use an i32 shift below.
+ X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
+ }
+
+ // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
+ // This should allow an h-reg to be used to save a shift.
+ SDValue Hi = DAG.getNode(
+ ISD::TRUNCATE, DL, MVT::i8,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
+ SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
+ SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
+
+ // Copy the inverse of the parity flag into a register with setcc.
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
+ // Extend to the original type.
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
+}
+
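LowerPARITY folds the value down with XORs (64 -> 32 -> 16 bits), then XORs the two low bytes with an 8-bit flag-setting XOR and reads the inverted parity flag via SETNP. A portable scalar sketch of the same fold (hedged; there is no direct PF access in C++, so the final flag read is modelled with a popcount of the last byte):

#include <cstdint>
#include <cstdio>

// Parity of the whole value: XOR halves together until one byte remains, then
// take its bit count mod 2 -- the stand-in for SETNP on the final 8-bit XOR.
static unsigned parity64(uint64_t x) {
  uint32_t v = (uint32_t)(x >> 32) ^ (uint32_t)x;  // fold 64 -> 32
  v ^= v >> 16;                                    // fold 32 -> 16
  uint8_t b = (uint8_t)((v >> 8) ^ v);             // final 8-bit XOR
  unsigned ones = 0;
  for (; b; b &= (uint8_t)(b - 1))
    ++ones;                                        // popcount of the low byte
  return ones & 1u;                                // 1 for odd parity
}

int main() {
  std::printf("%u %u\n", parity64(0), parity64(0xF000000000000001ULL)); // 0 1
}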
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned NewOpc = 0;
@@ -29477,7 +29477,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Chain =
DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
- MPI, MaybeAlign(), MachineMemOperand::MOStore);
+ MPI, MaybeAlign(), MachineMemOperand::MOStore);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue LdOps[] = {Chain, StackPtr};
SDValue Value =
@@ -29517,7 +29517,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
MVT VT = N->getSimpleValueType(0);
- unsigned Opc = Op.getOpcode();
+ unsigned Opc = Op.getOpcode();
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -29532,14 +29532,14 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
Carry, DAG.getAllOnesConstant(DL, CarryVT));
- bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
- SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
- Op.getOperand(0), Op.getOperand(1),
- Carry.getValue(1));
+ bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
+ SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
+ Op.getOperand(0), Op.getOperand(1),
+ Carry.getValue(1));
- bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
- SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
- Sum.getValue(1), DL, DAG);
+ bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
+ SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
+ Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
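ADDCARRY/SADDO_CARRY are mapped to ADC/SBB: the incoming carry is first materialized into EFLAGS (the add-with-all-ones trick above), and the real add then consumes and regenerates it. A scalar model of unsigned add-with-carry (illustrative sketch):

#include <cstdint>
#include <cstdio>

// 32-bit ADC: returns the sum and writes the carry-out, i.e. the value the
// lowering leaves in CF and copies out with SETB.
static uint32_t adc32(uint32_t a, uint32_t b, unsigned carryIn, unsigned *carryOut) {
  uint64_t wide = (uint64_t)a + b + (carryIn & 1u);
  *carryOut = (unsigned)(wide >> 32);  // 1 if the 32-bit add overflowed
  return (uint32_t)wide;
}

int main() {
  unsigned c = 0;
  uint32_t s = adc32(0xFFFFFFFFu, 0, 1, &c);
  std::printf("%08x carry=%u\n", s, c); // 00000000 carry=1
}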
@@ -29944,7 +29944,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
- case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
+ case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
@@ -29979,8 +29979,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
- case ISD::FP_TO_SINT_SAT:
- case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
@@ -30047,8 +30047,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
- case ISD::SADDO_CARRY:
- case ISD::SSUBO_CARRY:
+ case ISD::SADDO_CARRY:
+ case ISD::SSUBO_CARRY:
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::ADD:
@@ -30116,9 +30116,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Chain);
return;
}
- case X86ISD::CVTPS2PH:
- Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
- return;
+ case X86ISD::CVTPS2PH:
+ Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
+ return;
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
@@ -30360,7 +30360,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
assert(isTypeLegal(LoVT) && "Split VT not legal?");
- SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
+ SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
// We need to shift the input over by half the number of elements.
unsigned NumElts = InVT.getVectorNumElements();
@@ -30370,7 +30370,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ShufMask[i] = i + HalfNumElts;
SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
- Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
+ Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
Results.push_back(Res);
@@ -30721,30 +30721,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
swapInH =
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
swapInH, cpInH.getValue(1));
-
- // In 64-bit mode we might need the base pointer in RBX, but we can't know
- // until later. So we keep the RBX input in a vreg and use a custom
- // inserter.
- // Since RBX will be a reserved register the register allocator will not
- // make sure its value will be properly saved and restored around this
- // live-range.
+
+ // In 64-bit mode we might need the base pointer in RBX, but we can't know
+ // until later. So we keep the RBX input in a vreg and use a custom
+ // inserter.
+ // Since RBX will be a reserved register the register allocator will not
+ // make sure its value will be properly saved and restored around this
+ // live-range.
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- if (Regs64bit) {
- SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
- swapInH.getValue(1)};
- Result =
- DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
+ if (Regs64bit) {
+ SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
+ swapInH.getValue(1)};
+ Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
} else {
- swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
+ swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
swapInH.getValue(1));
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
swapInL.getValue(1)};
- Result =
- DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
+ Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
}
-
+
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
HalfT, Result.getValue(1));
@@ -30989,9 +30989,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(COMI)
NODE_NAME_CASE(UCOMI)
NODE_NAME_CASE(CMPM)
- NODE_NAME_CASE(CMPMM)
+ NODE_NAME_CASE(CMPMM)
NODE_NAME_CASE(STRICT_CMPM)
- NODE_NAME_CASE(CMPMM_SAE)
+ NODE_NAME_CASE(CMPMM_SAE)
NODE_NAME_CASE(SETCC)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
@@ -31109,7 +31109,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(XOR)
NODE_NAME_CASE(AND)
NODE_NAME_CASE(BEXTR)
- NODE_NAME_CASE(BEXTRI)
+ NODE_NAME_CASE(BEXTRI)
NODE_NAME_CASE(BZHI)
NODE_NAME_CASE(PDEP)
NODE_NAME_CASE(PEXT)
@@ -31147,7 +31147,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VBROADCAST)
NODE_NAME_CASE(VBROADCAST_LOAD)
NODE_NAME_CASE(VBROADCASTM)
- NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
+ NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
NODE_NAME_CASE(VPERMILPV)
NODE_NAME_CASE(VPERMILPI)
NODE_NAME_CASE(VPERM2X128)
@@ -31169,7 +31169,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DBPSADBW)
NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
NODE_NAME_CASE(VAARG_64)
- NODE_NAME_CASE(VAARG_X32)
+ NODE_NAME_CASE(VAARG_X32)
NODE_NAME_CASE(WIN_ALLOCA)
NODE_NAME_CASE(MEMBARRIER)
NODE_NAME_CASE(MFENCE)
@@ -31326,15 +31326,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ENQCMD)
NODE_NAME_CASE(ENQCMDS)
NODE_NAME_CASE(VP2INTERSECT)
- NODE_NAME_CASE(AESENC128KL)
- NODE_NAME_CASE(AESDEC128KL)
- NODE_NAME_CASE(AESENC256KL)
- NODE_NAME_CASE(AESDEC256KL)
- NODE_NAME_CASE(AESENCWIDE128KL)
- NODE_NAME_CASE(AESDECWIDE128KL)
- NODE_NAME_CASE(AESENCWIDE256KL)
- NODE_NAME_CASE(AESDECWIDE256KL)
- NODE_NAME_CASE(TESTUI)
+ NODE_NAME_CASE(AESENC128KL)
+ NODE_NAME_CASE(AESDEC128KL)
+ NODE_NAME_CASE(AESENC256KL)
+ NODE_NAME_CASE(AESDEC256KL)
+ NODE_NAME_CASE(AESENCWIDE128KL)
+ NODE_NAME_CASE(AESDECWIDE128KL)
+ NODE_NAME_CASE(AESENCWIDE256KL)
+ NODE_NAME_CASE(AESDECWIDE256KL)
+ NODE_NAME_CASE(TESTUI)
}
return nullptr;
#undef NODE_NAME_CASE
@@ -31680,7 +31680,7 @@ static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
const TargetInstrInfo *TII) {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = ++MBB->getIterator();
@@ -31760,8 +31760,8 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
@@ -31772,8 +31772,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// 8 ) Align : Alignment of type
// 9 ) EFLAGS (implicit-def)
- assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
- static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
+ assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
+ static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
@@ -31788,7 +31788,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
MachineFunction *MF = MBB->getParent();
// Memory Reference
- assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
+ assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
MachineMemOperand *OldMMO = MI.memoperands().front();
@@ -31801,10 +31801,10 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// Machine Information
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- const TargetRegisterClass *AddrRegClass =
- getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
// struct va_list {
// i32 gp_offset
@@ -31913,35 +31913,35 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// Read the reg_save_area address.
Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(
- offsetMBB, DL,
- TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
- RegSaveReg)
+ BuildMI(
+ offsetMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
+ RegSaveReg)
.add(Base)
.add(Scale)
.add(Index)
- .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
+ .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
.add(Segment)
.setMemRefs(LoadOnlyMMO);
- if (Subtarget.isTarget64BitLP64()) {
- // Zero-extend the offset
- Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
- .addImm(0)
- .addReg(OffsetReg)
- .addImm(X86::sub_32bit);
-
- // Add the offset to the reg_save_area to get the final address.
- BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
- .addReg(OffsetReg64)
- .addReg(RegSaveReg);
- } else {
- // Add the offset to the reg_save_area to get the final address.
- BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
- .addReg(OffsetReg)
- .addReg(RegSaveReg);
- }
+ if (Subtarget.isTarget64BitLP64()) {
+ // Zero-extend the offset
+ Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ .addImm(0)
+ .addReg(OffsetReg)
+ .addImm(X86::sub_32bit);
+
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+ .addReg(OffsetReg64)
+ .addReg(RegSaveReg);
+ } else {
+ // Add the offset to the reg_save_area to get the final address.
+ BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
+ .addReg(OffsetReg)
+ .addReg(RegSaveReg);
+ }
// Compute the offset for the next argument
Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
@@ -31970,9 +31970,9 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// Load the overflow_area address into a register.
Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(overflowMBB, DL,
- TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
- OverflowAddrReg)
+ BuildMI(overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
+ OverflowAddrReg)
.add(Base)
.add(Scale)
.add(Index)
@@ -31987,17 +31987,17 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
- BuildMI(
- overflowMBB, DL,
- TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
- TmpReg)
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
+ TmpReg)
.addReg(OverflowAddrReg)
.addImm(Alignment.value() - 1);
- BuildMI(
- overflowMBB, DL,
- TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
- OverflowDestReg)
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
+ OverflowDestReg)
.addReg(TmpReg)
.addImm(~(uint64_t)(Alignment.value() - 1));
} else {
@@ -32008,16 +32008,16 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(
- overflowMBB, DL,
- TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
- NextAddrReg)
- .addReg(OverflowDestReg)
- .addImm(ArgSizeA8);
+ BuildMI(
+ overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
+ NextAddrReg)
+ .addReg(OverflowDestReg)
+ .addImm(ArgSizeA8);
// Store the new overflow address.
- BuildMI(overflowMBB, DL,
- TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
+ BuildMI(overflowMBB, DL,
+ TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
.add(Base)
.add(Scale)
.add(Index)
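Putting the custom inserter above together: va_arg first tries the register save area (indexed by gp_offset/fp_offset), otherwise it takes the argument from overflow_arg_area, aligning that pointer up with addr = (addr + align - 1) & ~(align - 1) and bumping it by the 8-byte-rounded argument size. A hedged C++ model for a GP-register argument (struct and field names are the conventional SysV ones, not identifiers from this file):

#include <cstdint>
#include <cstring>
#include <cstdio>

struct VaList {             // System V x86-64 va_list layout
  uint32_t gp_offset;       // 0..48 in 8-byte steps
  uint32_t fp_offset;       // 48..176 in 16-byte steps
  char *overflow_arg_area;  // stack spill area
  char *reg_save_area;      // saved GP + XMM registers
};

// Fetch the next integer-class argument, mirroring offsetMBB/overflowMBB above.
static uint64_t vaArgGP(VaList &ap, unsigned align = 8) {
  if (ap.gp_offset <= 48 - 8) {                    // register slot still free?
    char *addr = ap.reg_save_area + ap.gp_offset;  // reg_save_area + gp_offset
    ap.gp_offset += 8;                             // advance to the next GP slot
    uint64_t v; std::memcpy(&v, addr, 8); return v;
  }
  // Overflow path: align the overflow pointer, read, then advance it by the
  // 8-byte-rounded argument size (here exactly 8).
  uintptr_t p = (uintptr_t)ap.overflow_arg_area;
  p = (p + align - 1) & ~(uintptr_t)(align - 1);   // aligned_addr formula
  uint64_t v; std::memcpy(&v, (void *)p, 8);
  ap.overflow_arg_area = (char *)(p + 8);
  return v;
}

int main() {
  uint64_t regs[6] = {11, 22, 33, 44, 55, 66}, spill[2] = {77, 88};
  VaList ap{0, 48, (char *)spill, (char *)regs};
  for (int i = 0; i < 8; ++i)
    std::printf("%llu ", (unsigned long long)vaArgGP(ap));
  std::printf("\n");  // 11 22 33 44 55 66 77 88
}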
@@ -32073,10 +32073,10 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
// Now add the instructions.
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
Register CountReg = MI.getOperand(0).getReg();
- int RegSaveFrameIndex = MI.getOperand(1).getImm();
+ int RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
@@ -32385,7 +32385,7 @@ MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
MachineBasicBlock *ThisMBB) const {
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
@@ -32540,7 +32540,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
const unsigned ProbeSize = getStackProbeSize(*MF);
@@ -32633,7 +32633,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
assert(MF->shouldSplitStack());
@@ -32668,7 +32668,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MF->getDataLayout()));
- Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
@@ -32768,7 +32768,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
@@ -32806,7 +32806,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// inside MC, therefore without the two markers shrink-wrapping
// may push the prologue/epilogue pass them.
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction &MF = *BB->getParent();
// Emit CALLSEQ_START right before the instruction.
@@ -32835,7 +32835,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
// be in the normal return register.
MachineFunction *F = BB->getParent();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
assert(MI.getOperand(3).isGlobal() && "This should be a global");
@@ -32974,7 +32974,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
@@ -33036,7 +33036,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
/// \param [in] MBB The Machine Basic Block that will be modified.
void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -33079,7 +33079,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -33239,7 +33239,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -33420,7 +33420,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -33504,7 +33504,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
@@ -33553,7 +33553,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *BB) const {
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
@@ -33783,7 +33783,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- const DebugLoc &DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
auto TMMImmToTMMReg = [](unsigned Imm) {
assert (Imm < 8 && "Illegal tmm index");
@@ -33793,10 +33793,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
case X86::TLS_addr64:
- case X86::TLS_addrX32:
+ case X86::TLS_addrX32:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
- case X86::TLS_base_addrX32:
+ case X86::TLS_base_addrX32:
return EmitLoweredTLSAddr(MI, BB);
case X86::INDIRECT_THUNK_CALL32:
case X86::INDIRECT_THUNK_CALL64:
@@ -33952,8 +33952,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
- case X86::VAARG_X32:
- return EmitVAARGWithCustomInserter(MI, BB);
+ case X86::VAARG_X32:
+ return EmitVAARGWithCustomInserter(MI, BB);
case X86::EH_SjLj_SetJmp32:
case X86::EH_SjLj_SetJmp64:
@@ -33977,7 +33977,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::PATCHABLE_EVENT_CALL:
case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
- return BB;
+ return BB;
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -34032,75 +34032,75 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}
- case X86::LCMPXCHG16B_NO_RBX: {
- const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
- Register BasePtr = TRI->getBaseRegister();
- if (TRI->hasBasePointer(*MF) &&
- (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
- if (!BB->isLiveIn(BasePtr))
- BB->addLiveIn(BasePtr);
- // Save RBX into a virtual register.
- Register SaveRBX =
- MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
- .addReg(X86::RBX);
- Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- MachineInstrBuilder MIB =
- BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
- for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
- MIB.add(MI.getOperand(Idx));
- MIB.add(MI.getOperand(X86::AddrNumOperands));
- MIB.addReg(SaveRBX);
- } else {
- // Simple case, just copy the virtual register to RBX.
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
- .add(MI.getOperand(X86::AddrNumOperands));
- MachineInstrBuilder MIB =
- BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
- for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
- MIB.add(MI.getOperand(Idx));
- }
- MI.eraseFromParent();
+ case X86::LCMPXCHG16B_NO_RBX: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ Register BasePtr = TRI->getBaseRegister();
+ if (TRI->hasBasePointer(*MF) &&
+ (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
+ if (!BB->isLiveIn(BasePtr))
+ BB->addLiveIn(BasePtr);
+ // Save RBX into a virtual register.
+ Register SaveRBX =
+ MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
+ .addReg(X86::RBX);
+ Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
+ for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+ MIB.add(MI.getOperand(Idx));
+ MIB.add(MI.getOperand(X86::AddrNumOperands));
+ MIB.addReg(SaveRBX);
+ } else {
+ // Simple case, just copy the virtual register to RBX.
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
+ .add(MI.getOperand(X86::AddrNumOperands));
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
+ for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
+ MIB.add(MI.getOperand(Idx));
+ }
+ MI.eraseFromParent();
return BB;
- }
- case X86::MWAITX: {
- const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
- Register BasePtr = TRI->getBaseRegister();
- bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
- // If no need to save the base pointer, we generate MWAITXrrr,
- // else we generate pseudo MWAITX_SAVE_RBX.
- if (!IsRBX || !TRI->hasBasePointer(*MF)) {
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI.getOperand(0).getReg());
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI.getOperand(1).getReg());
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
- .addReg(MI.getOperand(2).getReg());
- BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
- MI.eraseFromParent();
- } else {
- if (!BB->isLiveIn(BasePtr)) {
- BB->addLiveIn(BasePtr);
- }
- // Parameters can be copied into ECX and EAX but not EBX yet.
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI.getOperand(0).getReg());
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI.getOperand(1).getReg());
- assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
- // Save RBX into a virtual register.
- Register SaveRBX =
- MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
- .addReg(X86::RBX);
- // Generate mwaitx pseudo.
- Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
- .addDef(Dst) // Destination tied in with SaveRBX.
- .addReg(MI.getOperand(2).getReg()) // input value of EBX.
- .addUse(SaveRBX); // Save of base pointer.
- MI.eraseFromParent();
- }
+ }
+ case X86::MWAITX: {
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ Register BasePtr = TRI->getBaseRegister();
+ bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
+ // If no need to save the base pointer, we generate MWAITXrrr,
+ // else we generate pseudo MWAITX_SAVE_RBX.
+ if (!IsRBX || !TRI->hasBasePointer(*MF)) {
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI.getOperand(1).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
+ .addReg(MI.getOperand(2).getReg());
+ BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
+ MI.eraseFromParent();
+ } else {
+ if (!BB->isLiveIn(BasePtr)) {
+ BB->addLiveIn(BasePtr);
+ }
+ // Parameters can be copied into ECX and EAX but not EBX yet.
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
+ .addReg(MI.getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI.getOperand(1).getReg());
+ assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
+ // Save RBX into a virtual register.
+ Register SaveRBX =
+ MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
+ .addReg(X86::RBX);
+ // Generate mwaitx pseudo.
+ Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
+ .addDef(Dst) // Destination tied in with SaveRBX.
+ .addReg(MI.getOperand(2).getReg()) // input value of EBX.
+ .addUse(SaveRBX); // Save of base pointer.
+ MI.eraseFromParent();
+ }
return BB;
}
case TargetOpcode::PREALLOCATED_SETUP: {
@@ -34365,11 +34365,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits Known2;
if (!!DemandedLHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
- Known = KnownBits::commonBits(Known, Known2);
+ Known = KnownBits::commonBits(Known, Known2);
}
if (!!DemandedRHS) {
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
- Known = KnownBits::commonBits(Known, Known2);
+ Known = KnownBits::commonBits(Known, Known2);
}
if (Known.countMinLeadingZeros() < BitWidth)
@@ -34412,11 +34412,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
- Known = KnownBits::commonBits(Known, Known2);
+ Known = KnownBits::commonBits(Known, Known2);
break;
}
- case X86ISD::BEXTR:
- case X86ISD::BEXTRI: {
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -34438,28 +34438,28 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
break;
}
- case X86ISD::PDEP: {
- KnownBits Known2;
- Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
- Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
-    // Zeros are retained from the mask operand, but not ones.
- Known.One.clearAllBits();
- // The result will have at least as many trailing zeros as the non-mask
- // operand since bits can only map to the same or higher bit position.
- Known.Zero.setLowBits(Known2.countMinTrailingZeros());
- break;
- }
- case X86ISD::PEXT: {
- Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    // The result has as many leading zeros as the number of zeros in the mask.
- unsigned Count = Known.Zero.countPopulation();
- Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
- Known.One.clearAllBits();
- break;
- }
- case X86ISD::VTRUNC:
- case X86ISD::VTRUNCS:
- case X86ISD::VTRUNCUS:
+ case X86ISD::PDEP: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    // Zeros are retained from the mask operand, but not ones.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ break;
+ }
+ case X86ISD::PEXT: {
+ Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    // The result has as many leading zeros as the number of zeros in the mask.
+ unsigned Count = Known.Zero.countPopulation();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
+ Known.One.clearAllBits();
+ break;
+ }
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P:
case X86ISD::CVTP2SI:
@@ -34476,7 +34476,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::VMFPROUND:
case X86ISD::CVTPS2PH:
case X86ISD::MCVTPS2PH: {
- // Truncations/Conversions - upper elements are known zero.
+ // Truncations/Conversions - upper elements are known zero.
EVT SrcVT = Op.getOperand(0).getValueType();
if (SrcVT.isVector()) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
@@ -34554,7 +34554,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
continue;
KnownBits Known2 =
DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
- Known = KnownBits::commonBits(Known, Known2);
+ Known = KnownBits::commonBits(Known, Known2);
}
}
}
@@ -34733,18 +34733,18 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
- if (MaskEltSize == 32 && Mask[0] == 0) {
- if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
- if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
+ if (MaskEltSize == 32 && Mask[0] == 0) {
+ if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+ if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
}
  // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
@@ -34798,17 +34798,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
@@ -34817,17 +34817,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
@@ -34837,21 +34837,21 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
if (isTargetShuffleEquivalent(
- MaskVT, Mask,
- {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
+ MaskVT, Mask,
+ {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
if (isTargetShuffleEquivalent(
- MaskVT, Mask,
- {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
+ MaskVT, Mask,
+ {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
@@ -34933,10 +34933,10 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
- if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
- ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
- (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
@@ -35006,31 +35006,31 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
- unsigned NumMaskElts = Mask.size();
+ unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
- Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
+ Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
SrcVT = DstVT = MVT::v2f64;
return true;
}
- if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
+ if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
SrcVT = DstVT = MVT::v4f32;
@@ -35064,46 +35064,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
-  // Attempt to match against an OR if we're performing a blend shuffle and the
-  // non-blended source element is zero in each case.
- if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
- (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
- bool IsBlend = true;
- unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
- unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
- unsigned Scale1 = NumV1Elts / NumMaskElts;
- unsigned Scale2 = NumV2Elts / NumMaskElts;
- APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
- APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- if (M == SM_SentinelZero) {
- DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
- DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
- continue;
- }
- if (M == (int)i) {
- DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
- continue;
- }
- if (M == (int)(i + NumMaskElts)) {
- DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
- continue;
- }
- IsBlend = false;
- break;
- }
- if (IsBlend &&
- DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
- DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
- Shuffle = ISD::OR;
- SrcVT = DstVT = MaskVT.changeTypeToInteger();
- return true;
- }
- }
-
+  // Attempt to match against an OR if we're performing a blend shuffle and the
+  // non-blended source element is zero in each case.
+ if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
+ bool IsBlend = true;
+ unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
+ unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
+ unsigned Scale1 = NumV1Elts / NumMaskElts;
+ unsigned Scale2 = NumV2Elts / NumMaskElts;
+ APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
+ APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ if (M == SM_SentinelZero) {
+ DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+ DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+ continue;
+ }
+ if (M == (int)i) {
+ DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
+ continue;
+ }
+ if (M == (int)(i + NumMaskElts)) {
+ DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
+ continue;
+ }
+ IsBlend = false;
+ break;
+ }
+ if (IsBlend &&
+ DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
+ DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
+ Shuffle = ISD::OR;
+ SrcVT = DstVT = MaskVT.changeTypeToInteger();
+ return true;
+ }
+ }
+
return false;
}
@@ -35292,16 +35292,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
- MVT RootVT = Root.getSimpleValueType();
- unsigned RootSizeInBits = RootVT.getSizeInBits();
- unsigned NumRootElts = RootVT.getVectorNumElements();
-
- // Canonicalize shuffle input op to the requested type.
- // TODO: Support cases where Op is smaller than VT.
- auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
- return DAG.getBitcast(VT, Op);
- };
-
+ MVT RootVT = Root.getSimpleValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ unsigned NumRootElts = RootVT.getVectorNumElements();
+
+ // Canonicalize shuffle input op to the requested type.
+ // TODO: Support cases where Op is smaller than VT.
+ auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
+ return DAG.getBitcast(VT, Op);
+ };
+
// Find the inputs that enter the chain. Note that multiple uses are OK
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
@@ -35311,8 +35311,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
- assert(VT1.getSizeInBits() == RootSizeInBits &&
- VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
+ assert(VT1.getSizeInBits() == RootSizeInBits &&
+ VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
SDLoc DL(Root);
SDValue Res;
@@ -35320,7 +35320,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
- return CanonicalizeShuffleInput(RootVT, V1);
+ return CanonicalizeShuffleInput(RootVT, V1);
}
bool OptForSize = DAG.shouldOptForSize();
@@ -35344,9 +35344,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// we can just use the broadcast directly. This works for smaller broadcast
  // elements as well, as they already repeat across each mask element
if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
- (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
- V1.getValueSizeInBits() >= RootSizeInBits) {
- return CanonicalizeShuffleInput(RootVT, V1);
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+ V1.getValueSizeInBits() >= RootSizeInBits) {
+ return CanonicalizeShuffleInput(RootVT, V1);
}
// Handle 128/256-bit lane shuffles of 512-bit vectors.
@@ -35360,11 +35360,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
"Unexpected lane shuffle");
- Res = CanonicalizeShuffleInput(RootVT, V1);
- unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
bool UseZero = isAnyZero(BaseMask);
Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
- return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
+ return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
}
// Narrow shuffle mask to v4x128.
@@ -35373,8 +35373,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
// Try to lower to vshuf64x2/vshuf32x4.
- auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
- SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
unsigned PermMask = 0;
    // Ensure elements came from the same Op.
SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
@@ -35397,8 +35397,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
- CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
- CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
+ CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
+ CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
DAG.getTargetConstant(PermMask, DL, MVT::i8));
};
@@ -35413,9 +35413,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
if (!isAnyZero(Mask) && !PreferPERMQ) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
- return SDValue(); // Nothing to do!
- MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+ MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
return DAG.getBitcast(RootVT, V);
}
@@ -35430,10 +35430,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
return SDValue(); // Nothing to do!
assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
- Res = CanonicalizeShuffleInput(RootVT, V1);
- Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
- return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
- DL, 256);
+ Res = CanonicalizeShuffleInput(RootVT, V1);
+ Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+ return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
}
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
@@ -35448,9 +35448,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
- return DAG.getNode(
- X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
- DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getNode(
+ X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
+ DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
@@ -35466,12 +35466,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] & 3) << 0);
PermMask |= ((BaseMask[1] & 3) << 4);
- SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
- SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
- return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
- CanonicalizeShuffleInput(RootVT, LHS),
- CanonicalizeShuffleInput(RootVT, RHS),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+ SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
+ return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
+ CanonicalizeShuffleInput(RootVT, LHS),
+ CanonicalizeShuffleInput(RootVT, RHS),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
@@ -35533,7 +35533,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if ((Subtarget.hasAVX2() ||
(Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
(!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
- if (isUndefOrEqual(Mask, 0)) {
+ if (isUndefOrEqual(Mask, 0)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
@@ -35546,7 +35546,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Subtarget.hasAVX2()) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
- Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
@@ -35561,7 +35561,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
+ Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
return DAG.getBitcast(RootVT, Res);
}
@@ -35573,7 +35573,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = CanonicalizeShuffleInput(ShuffleVT, V1);
+ Res = CanonicalizeShuffleInput(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
@@ -35584,32 +35584,32 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// from a scalar.
// TODO: Handle other insertions here as well?
if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
- Subtarget.hasSSE41() &&
- !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
- if (MaskEltSizeInBits == 32) {
- SDValue SrcV1 = V1, SrcV2 = V2;
- if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
- DAG) &&
- SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
- return SDValue(); // Nothing to do!
- Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
- CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
- CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
- DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
- }
- }
- if (MaskEltSizeInBits == 64 &&
- isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
- V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- V2.getScalarValueSizeInBits() <= 32) {
+ Subtarget.hasSSE41() &&
+ !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
+ if (MaskEltSizeInBits == 32) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
+ DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
+ CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+ if (MaskEltSizeInBits == 64 &&
+ isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
+ V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V2.getScalarValueSizeInBits() <= 32) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
return SDValue(); // Nothing to do!
- PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
+ PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
- CanonicalizeShuffleInput(MVT::v4f32, V1),
- CanonicalizeShuffleInput(MVT::v4f32, V2),
+ CanonicalizeShuffleInput(MVT::v4f32, V1),
+ CanonicalizeShuffleInput(MVT::v4f32, V2),
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -35623,8 +35623,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
- NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
+ NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
+ NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
return DAG.getBitcast(RootVT, Res);
}
@@ -35637,8 +35637,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
- NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
+ NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
+ NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
@@ -35655,7 +35655,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Zeroable)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
- V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
+ V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
@@ -35665,8 +35665,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
- V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
- V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
+ V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
+ V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
DAG.getTargetConstant(BitLen, DL, MVT::i8),
DAG.getTargetConstant(BitIdx, DL, MVT::i8));
@@ -35685,7 +35685,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
if (Depth == 0 && Root.getOpcode() == Opc)
return SDValue(); // Nothing to do!
- V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
+ V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
if (ShuffleVT.getSizeInBits() < RootSizeInBits)
Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
@@ -35702,8 +35702,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return SDValue(); // Nothing to do!
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
- V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
- V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
+ V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
+ V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
@@ -35720,56 +35720,56 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Depth threshold above which we can efficiently use variable mask shuffles.
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
- // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
- // higher depth before combining them.
- bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
+ // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
+ // higher depth before combining them.
+ bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
- if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
- if (Subtarget.hasAVX2() &&
- (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
- SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
- Res = CanonicalizeShuffleInput(MaskVT, V1);
- Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
- return DAG.getBitcast(RootVT, Res);
- }
- // AVX512 variants (non-VLX will pad to 512-bit shuffles).
- if ((Subtarget.hasAVX512() &&
- (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() &&
- (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() &&
- (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
- V1 = CanonicalizeShuffleInput(MaskVT, V1);
- V2 = DAG.getUNDEF(MaskVT);
- Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
- return DAG.getBitcast(RootVT, Res);
- }
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) {
+ if (Subtarget.hasAVX2() &&
+ (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ // AVX512 variants (non-VLX will pad to 512-bit shuffles).
+ if ((Subtarget.hasAVX512() &&
+ (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = DAG.getUNDEF(MaskVT);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ return DAG.getBitcast(RootVT, Res);
+ }
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
- // vector as the second source (non-VLX will pad to 512-bit shuffles).
+ // vector as the second source (non-VLX will pad to 512-bit shuffles).
if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
- MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
// Adjust shuffle mask - replace SM_SentinelZero with second source index.
for (unsigned i = 0; i != NumMaskElts; ++i)
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
- V1 = CanonicalizeShuffleInput(MaskVT, V1);
- V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
- Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
@@ -35780,21 +35780,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget))
return WideShuffle;
- // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
- // (non-VLX will pad to 512-bit shuffles).
+ // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
+ // (non-VLX will pad to 512-bit shuffles).
if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
- MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
+ MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
+ MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
- (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
- V1 = CanonicalizeShuffleInput(MaskVT, V1);
- V2 = CanonicalizeShuffleInput(MaskVT, V2);
- Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
return SDValue();
@@ -35820,7 +35820,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
EltBits[i] = AllOnes;
}
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
- Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
unsigned AndOpcode =
MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
@@ -35840,7 +35840,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPermIdx.push_back(Idx);
}
SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
- Res = CanonicalizeShuffleInput(MaskVT, V1);
+ Res = CanonicalizeShuffleInput(MaskVT, V1);
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
return DAG.getBitcast(RootVT, Res);
}
@@ -35872,8 +35872,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
VPerm2Idx.push_back(Index);
}
- V1 = CanonicalizeShuffleInput(MaskVT, V1);
- V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
@@ -35907,7 +35907,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
- Res = CanonicalizeShuffleInput(ByteVT, V1);
+ Res = CanonicalizeShuffleInput(ByteVT, V1);
SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
return DAG.getBitcast(RootVT, Res);
@@ -35937,8 +35937,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::v16i8;
- V1 = CanonicalizeShuffleInput(ByteVT, V1);
- V2 = CanonicalizeShuffleInput(ByteVT, V2);
+ V1 = CanonicalizeShuffleInput(ByteVT, V1);
+ V2 = CanonicalizeShuffleInput(ByteVT, V2);
SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
return DAG.getBitcast(RootVT, Res);
@@ -35951,22 +35951,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG, Subtarget))
return WideShuffle;
- // If we have a dual input shuffle then lower to VPERMV3,
- // (non-VLX will pad to 512-bit shuffles)
+ // If we have a dual input shuffle then lower to VPERMV3,
+ // (non-VLX will pad to 512-bit shuffles)
if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
- (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
- MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
- MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
- MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
- MaskVT == MVT::v16i32)) ||
- (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
- (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
- (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
- V1 = CanonicalizeShuffleInput(MaskVT, V1);
- V2 = CanonicalizeShuffleInput(MaskVT, V2);
- Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
+ (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
+ MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
+ MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
+ MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
+ MaskVT == MVT::v16i32)) ||
+ (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+ (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+ (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
+ V1 = CanonicalizeShuffleInput(MaskVT, V1);
+ V2 = CanonicalizeShuffleInput(MaskVT, V2);
+ Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
return DAG.getBitcast(RootVT, Res);
}
@@ -35991,16 +35991,16 @@ static SDValue combineX86ShuffleChainWithExtract(
if (NumInputs == 0)
return SDValue();
- EVT RootVT = Root.getValueType();
- unsigned RootSizeInBits = RootVT.getSizeInBits();
- assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
-
+ EVT RootVT = Root.getValueType();
+ unsigned RootSizeInBits = RootVT.getSizeInBits();
+ assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
+
SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
SmallVector<unsigned, 4> Offsets(NumInputs, 0);
// Peek through subvectors.
// TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
- unsigned WideSizeInBits = RootSizeInBits;
+ unsigned WideSizeInBits = RootSizeInBits;
for (unsigned i = 0; i != NumInputs; ++i) {
SDValue &Src = WideInputs[i];
unsigned &Offset = Offsets[i];
@@ -36082,149 +36082,149 @@ static SDValue combineX86ShuffleChainWithExtract(
return SDValue();
}
-// Canonicalize the combined shuffle mask chain with horizontal ops.
-// NOTE: This may update the Ops and Mask.
-static SDValue canonicalizeShuffleMaskWithHorizOp(
- MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
- unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (Mask.empty() || Ops.empty())
- return SDValue();
-
- SmallVector<SDValue> BC;
- for (SDValue Op : Ops)
- BC.push_back(peekThroughBitcasts(Op));
-
- // All ops must be the same horizop + type.
- SDValue BC0 = BC[0];
- EVT VT0 = BC0.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
- return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
- }))
- return SDValue();
-
- bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
- bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
- if (!isHoriz && !isPack)
- return SDValue();
-
- int NumElts = VT0.getVectorNumElements();
- int NumLanes = VT0.getSizeInBits() / 128;
- int NumEltsPerLane = NumElts / NumLanes;
- int NumHalfEltsPerLane = NumEltsPerLane / 2;
-
-  // See if we can remove the shuffle by re-sorting the HOP chain so that
- // the HOP args are pre-shuffled.
- // TODO: Generalize to any sized/depth chain.
- // TODO: Add support for PACKSS/PACKUS.
- if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() &&
- shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) {
- SmallVector<int> ScaledMask;
- if (scaleShuffleElements(Mask, 4, ScaledMask)) {
- // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
- auto GetHOpSrc = [&](int M) {
- if (M == SM_SentinelUndef)
- return DAG.getUNDEF(VT0);
- if (M == SM_SentinelZero)
- return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
- SDValue Src0 = BC[M / NumElts];
- SDValue Src1 = Src0.getOperand((M % 4) >= 2);
- if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
- return Src1.getOperand(M % 2);
- return SDValue();
- };
- SDValue M0 = GetHOpSrc(ScaledMask[0]);
- SDValue M1 = GetHOpSrc(ScaledMask[1]);
- SDValue M2 = GetHOpSrc(ScaledMask[2]);
- SDValue M3 = GetHOpSrc(ScaledMask[3]);
- if (M0 && M1 && M2 && M3) {
- SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
- SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
- return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
- }
- }
- }
-
- if (2 < Ops.size())
- return SDValue();
-
- SDValue BC1 = BC[BC.size() - 1];
- if (Mask.size() == VT0.getVectorNumElements()) {
- // Canonicalize binary shuffles of horizontal ops that use the
-    // same sources to a unary shuffle.
- // TODO: Try to perform this fold even if the shuffle remains.
- if (Ops.size() == 2) {
- auto ContainsOps = [](SDValue HOp, SDValue Op) {
- return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
- };
- // Commute if all BC0's ops are contained in BC1.
- if (ContainsOps(BC1, BC0.getOperand(0)) &&
- ContainsOps(BC1, BC0.getOperand(1))) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(Ops[0], Ops[1]);
- std::swap(BC0, BC1);
- }
-
- // If BC1 can be represented by BC0, then convert to unary shuffle.
- if (ContainsOps(BC0, BC1.getOperand(0)) &&
- ContainsOps(BC0, BC1.getOperand(1))) {
- for (int &M : Mask) {
- if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
- continue;
- int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
- M -= NumElts + (SubLane * NumHalfEltsPerLane);
- if (BC1.getOperand(SubLane) != BC0.getOperand(0))
- M += NumHalfEltsPerLane;
- }
- }
- }
-
- // Canonicalize unary horizontal ops to only refer to lower halves.
- for (int i = 0; i != NumElts; ++i) {
- int &M = Mask[i];
- if (isUndefOrZero(M))
- continue;
- if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
- (M % NumEltsPerLane) >= NumHalfEltsPerLane)
- M -= NumHalfEltsPerLane;
- if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
- (M % NumEltsPerLane) >= NumHalfEltsPerLane)
- M -= NumHalfEltsPerLane;
- }
- }
-
- // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
- // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
- // represents the LHS/RHS inputs for the lower/upper halves.
- unsigned EltSizeInBits = RootSizeInBits / Mask.size();
- SmallVector<int, 16> TargetMask128, WideMask128;
- if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
- scaleShuffleElements(TargetMask128, 2, WideMask128)) {
- assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
- bool SingleOp = (Ops.size() == 1);
- if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
- SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
- SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
- Lo = Lo.getOperand(WideMask128[0] & 1);
- Hi = Hi.getOperand(WideMask128[1] & 1);
- if (SingleOp) {
- MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
- SDValue Undef = DAG.getUNDEF(SrcVT);
- SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
- Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
- Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
- Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
- Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
- }
- return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- }
- }
-
- return SDValue();
-}
-
+// Canonicalize the combined shuffle mask chain with horizontal ops.
+// NOTE: This may update the Ops and Mask.
+static SDValue canonicalizeShuffleMaskWithHorizOp(
+ MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
+ unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Mask.empty() || Ops.empty())
+ return SDValue();
+
+ SmallVector<SDValue> BC;
+ for (SDValue Op : Ops)
+ BC.push_back(peekThroughBitcasts(Op));
+
+ // All ops must be the same horizop + type.
+ SDValue BC0 = BC[0];
+ EVT VT0 = BC0.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
+ return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
+ }))
+ return SDValue();
+
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
+ if (!isHoriz && !isPack)
+ return SDValue();
+
+ int NumElts = VT0.getVectorNumElements();
+ int NumLanes = VT0.getSizeInBits() / 128;
+ int NumEltsPerLane = NumElts / NumLanes;
+ int NumHalfEltsPerLane = NumEltsPerLane / 2;
+
+  // See if we can remove the shuffle by re-sorting the HOP chain so that
+ // the HOP args are pre-shuffled.
+ // TODO: Generalize to any sized/depth chain.
+ // TODO: Add support for PACKSS/PACKUS.
+ if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() &&
+ shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) {
+ SmallVector<int> ScaledMask;
+ if (scaleShuffleElements(Mask, 4, ScaledMask)) {
+ // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
+ auto GetHOpSrc = [&](int M) {
+ if (M == SM_SentinelUndef)
+ return DAG.getUNDEF(VT0);
+ if (M == SM_SentinelZero)
+ return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
+ SDValue Src0 = BC[M / NumElts];
+ SDValue Src1 = Src0.getOperand((M % 4) >= 2);
+ if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
+ return Src1.getOperand(M % 2);
+ return SDValue();
+ };
+ SDValue M0 = GetHOpSrc(ScaledMask[0]);
+ SDValue M1 = GetHOpSrc(ScaledMask[1]);
+ SDValue M2 = GetHOpSrc(ScaledMask[2]);
+ SDValue M3 = GetHOpSrc(ScaledMask[3]);
+ if (M0 && M1 && M2 && M3) {
+ SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1);
+ SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3);
+ return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
+ }
+ }
+ }
+
+ if (2 < Ops.size())
+ return SDValue();
+
+ SDValue BC1 = BC[BC.size() - 1];
+ if (Mask.size() == VT0.getVectorNumElements()) {
+ // Canonicalize binary shuffles of horizontal ops that use the
+    // same sources to a unary shuffle.
+ // TODO: Try to perform this fold even if the shuffle remains.
+ if (Ops.size() == 2) {
+ auto ContainsOps = [](SDValue HOp, SDValue Op) {
+ return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
+ };
+ // Commute if all BC0's ops are contained in BC1.
+ if (ContainsOps(BC1, BC0.getOperand(0)) &&
+ ContainsOps(BC1, BC0.getOperand(1))) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ std::swap(BC0, BC1);
+ }
+
+ // If BC1 can be represented by BC0, then convert to unary shuffle.
+ if (ContainsOps(BC0, BC1.getOperand(0)) &&
+ ContainsOps(BC0, BC1.getOperand(1))) {
+ for (int &M : Mask) {
+ if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
+ continue;
+ int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
+ M -= NumElts + (SubLane * NumHalfEltsPerLane);
+ if (BC1.getOperand(SubLane) != BC0.getOperand(0))
+ M += NumHalfEltsPerLane;
+ }
+ }
+ }
+
+ // Canonicalize unary horizontal ops to only refer to lower halves.
+ for (int i = 0; i != NumElts; ++i) {
+ int &M = Mask[i];
+ if (isUndefOrZero(M))
+ continue;
+ if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
+ (M % NumEltsPerLane) >= NumHalfEltsPerLane)
+ M -= NumHalfEltsPerLane;
+ }
+ }
+
+ // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ unsigned EltSizeInBits = RootSizeInBits / Mask.size();
+ SmallVector<int, 16> TargetMask128, WideMask128;
+ if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
+ scaleShuffleElements(TargetMask128, 2, WideMask128)) {
+ assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
+ bool SingleOp = (Ops.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WideMask128[0] & 1);
+ Hi = Hi.getOperand(WideMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
@@ -36316,14 +36316,14 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
return DAG.getBitcast(VT, CstOp);
}
-namespace llvm {
- namespace X86 {
- enum {
- MaxShuffleCombineDepth = 8
- };
- }
-} // namespace llvm
-
+namespace llvm {
+ namespace X86 {
+ enum {
+ MaxShuffleCombineDepth = 8
+ };
+ }
+} // namespace llvm
+
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
@@ -36356,30 +36356,30 @@ namespace llvm {
static SDValue combineX86ShufflesRecursively(
ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
- unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
- SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
assert(RootMask.size() > 0 &&
(RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
"Illegal shuffle root mask");
- assert(Root.getSimpleValueType().isVector() &&
- "Shuffles operate on vector types!");
- unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+ assert(Root.getSimpleValueType().isVector() &&
+ "Shuffles operate on vector types!");
+ unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
- if (Depth >= MaxDepth)
+ if (Depth >= MaxDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
Op = peekThroughOneUseBitcasts(Op);
- EVT VT = Op.getValueType();
- if (!VT.isVector() || !VT.isSimple())
- return SDValue(); // Bail if we hit a non-simple non-vector.
+ EVT VT = Op.getValueType();
+ if (!VT.isVector() || !VT.isSimple())
+ return SDValue(); // Bail if we hit a non-simple non-vector.
- assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
-         "Can only combine shuffles up to the size of the root op.");
+ assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
+         "Can only combine shuffles up to the size of the root op.");
// Extract target shuffle mask and resolve sentinels and inputs.
// TODO - determine Op's demanded elts from RootMask.
@@ -36392,32 +36392,32 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- // Shuffle inputs must not be larger than the shuffle result.
- // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
- if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
- return OpInput.getValueSizeInBits() > VT.getSizeInBits();
+ // Shuffle inputs must not be larger than the shuffle result.
+ // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
+ if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
+ return OpInput.getValueSizeInBits() > VT.getSizeInBits();
}))
return SDValue();
- // If the shuffle result was smaller than the root, we need to adjust the
- // mask indices and pad the mask with undefs.
- if (RootSizeInBits > VT.getSizeInBits()) {
- unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
- unsigned OpMaskSize = OpMask.size();
- if (OpInputs.size() > 1) {
- unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
- for (int &M : OpMask) {
- if (M < 0)
- continue;
- int EltIdx = M % OpMaskSize;
- int OpIdx = M / OpMaskSize;
- M = (PaddedMaskSize * OpIdx) + EltIdx;
- }
- }
- OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
- OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
- OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
- }
+ // If the shuffle result was smaller than the root, we need to adjust the
+ // mask indices and pad the mask with undefs.
+ if (RootSizeInBits > VT.getSizeInBits()) {
+ unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
+ unsigned OpMaskSize = OpMask.size();
+ if (OpInputs.size() > 1) {
+ unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
+ for (int &M : OpMask) {
+ if (M < 0)
+ continue;
+ int EltIdx = M % OpMaskSize;
+ int OpIdx = M / OpMaskSize;
+ M = (PaddedMaskSize * OpIdx) + EltIdx;
+ }
+ }
+ OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
+ OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
+ OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
+ }
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
@@ -36577,7 +36577,7 @@ static SDValue combineX86ShufflesRecursively(
// shuffles to avoid constant pool bloat.
// Don't recurse if we already have more source ops than we can combine in
// the remaining recursion depth.
- if (Ops.size() < (MaxDepth - Depth)) {
+ if (Ops.size() < (MaxDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
// For empty roots, we need to resolve zeroable elements before combining
// them with other shuffles.
@@ -36589,7 +36589,7 @@ static SDValue combineX86ShufflesRecursively(
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
@@ -36600,24 +36600,24 @@ static SDValue combineX86ShufflesRecursively(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
return Cst;
- // Canonicalize the combined shuffle mask chain with horizontal ops.
- // NOTE: This will update the Ops and Mask.
- if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
- Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
- return DAG.getBitcast(Root.getValueType(), HOp);
-
- // Widen any subvector shuffle inputs we've collected.
- if (any_of(Ops, [RootSizeInBits](SDValue Op) {
- return Op.getValueSizeInBits() < RootSizeInBits;
- })) {
- for (SDValue &Op : Ops)
- if (Op.getValueSizeInBits() < RootSizeInBits)
- Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
- RootSizeInBits);
- // Reresolve - we might have repeated subvector sources.
- resolveTargetShuffleInputsAndMask(Ops, Mask);
- }
-
+ // Canonicalize the combined shuffle mask chain with horizontal ops.
+ // NOTE: This will update the Ops and Mask.
+ if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+ Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
+ return DAG.getBitcast(Root.getValueType(), HOp);
+
+ // Widen any subvector shuffle inputs we've collected.
+ if (any_of(Ops, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() < RootSizeInBits;
+ })) {
+ for (SDValue &Op : Ops)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
+ RootSizeInBits);
+ // Reresolve - we might have repeated subvector sources.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ }
+
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() <= 2) {
// Minor canonicalization of the accumulated shuffle mask to make it easier
@@ -36625,10 +36625,10 @@ static SDValue combineX86ShufflesRecursively(
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- while (Mask.size() > 1) {
- SmallVector<int, 64> WidenedMask;
- if (!canWidenShuffleElements(Mask, WidenedMask))
- break;
+ while (Mask.size() > 1) {
+ SmallVector<int, 64> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ break;
Mask = std::move(WidenedMask);
}
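The halving loop above stops once canWidenShuffleElements fails; a rough standalone model of one widening step is sketched below. widenOnce is an invented helper and is deliberately stricter than the real LLVM routine (it rejects pairs that mix undef with a live index), which is enough to show the idea.

#include <cstddef>
#include <cstdio>
#include <vector>

// Returns true and fills Widened if every even/odd pair of mask elements can
// be merged into one element of a mask with half as many, twice-as-wide lanes.
bool widenOnce(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (std::size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0)
      Widened.push_back(-1);                       // fully undef pair
    else if (Lo >= 0 && Hi == Lo + 1 && (Lo % 2) == 0)
      Widened.push_back(Lo / 2);                   // adjacent {2K, 2K+1} -> K
    else
      return false;                                // cannot widen this pair
  }
  return true;
}

int main() {
  std::vector<int> Mask = {4, 5, 6, 7, 0, 1, 2, 3}; // v8 lane-swap mask
  std::vector<int> Widened;
  while (Mask.size() > 1 && widenOnce(Mask, Widened))
    Mask = Widened;               // shrinks to {2, 3, 0, 1}, then to {1, 0}
  for (int M : Mask)
    std::printf("%d ", M);        // prints: 1 0
  std::printf("\n");
}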
@@ -36655,7 +36655,7 @@ static SDValue combineX86ShufflesRecursively(
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
- X86::MaxShuffleCombineDepth,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
@@ -36889,61 +36889,61 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
return SDValue();
}
-/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
-static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
- SelectionDAG &DAG,
- const SDLoc &DL) {
- assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
-
- MVT VT = V.getSimpleValueType();
- SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
- SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
- unsigned SrcOpc0 = Src0.getOpcode();
- unsigned SrcOpc1 = Src1.getOpcode();
- EVT SrcVT0 = Src0.getValueType();
- EVT SrcVT1 = Src1.getValueType();
-
- if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
- return SDValue();
-
- switch (SrcOpc0) {
- case X86ISD::MOVDDUP: {
- SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
- SDValue RHS =
- DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
- SDValue Res =
- DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
- Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
- return DAG.getBitcast(VT, Res);
- }
- case X86ISD::VPERMILPI:
- // TODO: Handle v4f64 permutes with different low/high lane masks.
- if (SrcVT0 == MVT::v4f64) {
- uint64_t Mask = Src0.getConstantOperandVal(1);
- if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
- break;
- }
- LLVM_FALLTHROUGH;
- case X86ISD::VSHLI:
- case X86ISD::VSRLI:
- case X86ISD::VSRAI:
- case X86ISD::PSHUFD:
- if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
- SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
- SDValue RHS =
- DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
- SDValue Res =
- DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
- Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
- Src0.getOperand(1));
- return DAG.getBitcast(VT, Res);
- }
- break;
- }
-
- return SDValue();
-}
-
+/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
+static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
+ SelectionDAG &DAG,
+ const SDLoc &DL) {
+ assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
+
+ MVT VT = V.getSimpleValueType();
+ SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
+ SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
+ unsigned SrcOpc0 = Src0.getOpcode();
+ unsigned SrcOpc1 = Src1.getOpcode();
+ EVT SrcVT0 = Src0.getValueType();
+ EVT SrcVT1 = Src1.getValueType();
+
+ if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
+ return SDValue();
+
+ switch (SrcOpc0) {
+ case X86ISD::MOVDDUP: {
+ SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
+ SDValue RHS =
+ DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res));
+ return DAG.getBitcast(VT, Res);
+ }
+ case X86ISD::VPERMILPI:
+ // TODO: Handle v4f64 permutes with different low/high lane masks.
+ if (SrcVT0 == MVT::v4f64) {
+ uint64_t Mask = Src0.getConstantOperandVal(1);
+ if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ case X86ISD::PSHUFD:
+ if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
+ SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0));
+ SDValue RHS =
+ DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2));
+ Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
+ Src0.getOperand(1));
+ return DAG.getBitcast(VT, Res);
+ }
+ break;
+ }
+
+ return SDValue();
+}
+
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -37016,7 +37016,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
- X86::MaxShuffleCombineDepth,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
@@ -37046,8 +37046,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
for (SDNode *User : Src->uses())
if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
Src == User->getOperand(0) &&
- User->getValueSizeInBits(0).getFixedSize() >
- VT.getFixedSizeInBits()) {
+ User->getValueSizeInBits(0).getFixedSize() >
+ VT.getFixedSizeInBits()) {
return extractSubVector(SDValue(User, 0), 0, DAG, DL,
VT.getSizeInBits());
}
@@ -37133,8 +37133,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
LN->isSimple()) {
unsigned Offset = ShiftAmt / 8;
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
- TypeSize::Fixed(Offset), DL);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
+ TypeSize::Fixed(Offset), DL);
SDValue Ops[] = { LN->getChain(), Ptr };
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
@@ -37166,16 +37166,16 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
// vbroadcast(vector load X) -> vbroadcast_load
- if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
- SrcVT == MVT::v4i32) &&
- Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
+ if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
+ SrcVT == MVT::v4i32) &&
+ Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BcastLd = DAG.getMemIntrinsicNode(
- X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
LN->getPointerInfo(), LN->getOriginalAlign(),
LN->getMemOperand()->getFlags());
DCI.CombineTo(N.getNode(), BcastLd);
@@ -37262,27 +37262,27 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
}
- // Pull subvector inserts into undef through VZEXT_MOVL by making it an
- // insert into a zero vector. This helps get VZEXT_MOVL closer to
- // scalar_to_vectors where 256/512 are canonicalized to an insert and a
- // 128-bit scalar_to_vector. This reduces the number of isel patterns.
- if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
- SDValue V = peekThroughOneUseBitcasts(N0);
-
- if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
- isNullConstant(V.getOperand(2))) {
- SDValue In = V.getOperand(1);
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- In.getValueSizeInBits() /
- VT.getScalarSizeInBits());
- In = DAG.getBitcast(SubVT, In);
- SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- getZeroVector(VT, Subtarget, DAG, DL), Movl,
- V.getOperand(2));
- }
- }
-
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors where 256/512 are canonicalized to an insert and a
+ // 128-bit scalar_to_vector. This reduces the number of isel patterns.
+ if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N0);
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
+ isNullConstant(V.getOperand(2))) {
+ SDValue In = V.getOperand(1);
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ In.getValueSizeInBits() /
+ VT.getScalarSizeInBits());
+ In = DAG.getBitcast(SubVT, In);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Movl,
+ V.getOperand(2));
+ }
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -37324,51 +37324,51 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
case X86ISD::VPERM2X128: {
- // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- if (LHS.getOpcode() == ISD::BITCAST &&
- (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
- EVT SrcVT = LHS.getOperand(0).getValueType();
- if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
- DAG.getBitcast(SrcVT, LHS),
- DAG.getBitcast(SrcVT, RHS),
- N->getOperand(2)));
- }
- }
-
- // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
- if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
- return Res;
-
- // Fold vperm2x128 subvector shuffle with an inner concat pattern.
- // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
- auto FindSubVector128 = [&](unsigned Idx) {
- if (Idx > 3)
- return SDValue();
- SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
- SmallVector<SDValue> SubOps;
- if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
- return SubOps[Idx & 1];
- unsigned NumElts = Src.getValueType().getVectorNumElements();
- if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(1).getValueSizeInBits() == 128 &&
- Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
- return Src.getOperand(1);
- }
+ // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getOpcode() == ISD::BITCAST &&
+ (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
+ EVT SrcVT = LHS.getOperand(0).getValueType();
+ if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
+ DAG.getBitcast(SrcVT, LHS),
+ DAG.getBitcast(SrcVT, RHS),
+ N->getOperand(2)));
+ }
+ }
+
+ // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
+ if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
+ return Res;
+
+ // Fold vperm2x128 subvector shuffle with an inner concat pattern.
+ // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
+ auto FindSubVector128 = [&](unsigned Idx) {
+ if (Idx > 3)
+ return SDValue();
+ SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
+ SmallVector<SDValue> SubOps;
+ if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+ return SubOps[Idx & 1];
+ unsigned NumElts = Src.getValueType().getVectorNumElements();
+ if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueSizeInBits() == 128 &&
+ Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
+ return Src.getOperand(1);
+ }
return SDValue();
- };
- unsigned Imm = N.getConstantOperandVal(2);
- if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
- if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
- MVT SubVT = VT.getHalfNumVectorElementsVT();
- SubLo = DAG.getBitcast(SubVT, SubLo);
- SubHi = DAG.getBitcast(SubVT, SubHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
- }
- }
- return SDValue();
+ };
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
+ if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
+ MVT SubVT = VT.getHalfNumVectorElementsVT();
+ SubLo = DAG.getBitcast(SubVT, SubLo);
+ SubHi = DAG.getBitcast(SubVT, SubHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
+ }
+ }
+ return SDValue();
}
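For reference, the two immediate nibbles decoded by FindSubVector128 above follow the usual VPERM2X128 lane encoding. The sketch below is an illustration with made-up names (LaneSel, decodeLane), not LLVM code; it assumes the standard imm8 layout where bits [1:0]/[5:4] pick a source half and bits 3/7 zero the corresponding result lane.

#include <cstdio>

// Each nibble of the immediate picks one 128-bit result lane:
//   0 -> low half of src0,  1 -> high half of src0,
//   2 -> low half of src1,  3 -> high half of src1,
//   nibble bit 3 set (value >= 8) -> the lane is zeroed instead.
struct LaneSel { int SrcOp; int Half; bool Zero; };

LaneSel decodeLane(unsigned Nibble) {
  if (Nibble & 0x8)
    return {-1, -1, true};
  return {int(Nibble >> 1), int(Nibble & 1), false};
}

int main() {
  unsigned Imm = 0x31; // classic "take the high halves of both sources"
  LaneSel Lo = decodeLane(Imm & 0x0F);        // result lane 0
  LaneSel Hi = decodeLane((Imm & 0xF0) >> 4); // result lane 1
  std::printf("lo lane: src%d half%d zero=%d\n", Lo.SrcOp, Lo.Half, Lo.Zero);
  std::printf("hi lane: src%d half%d zero=%d\n", Hi.SrcOp, Hi.Half, Hi.Zero);
  // The FindSubVector128 logic above only fires when both nibbles are <= 3,
  // i.e. neither lane is zeroed and each maps directly onto a source half.
}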
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
@@ -37811,12 +37811,12 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
/// Eliminate a redundant shuffle of a horizontal math op.
static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
- // TODO: Can we use getTargetShuffleInputs instead?
+ // TODO: Can we use getTargetShuffleInputs instead?
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
- if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH)
- if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
- return SDValue();
+ if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH)
+ if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
// For a broadcast, peek through an extract element of index 0 to find the
// horizontal op: broadcast (ext_vec_elt HOp, 0)
@@ -37835,28 +37835,28 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
return SDValue();
- // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
- // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
- // Don't fold if hop(x,y) == hop(z,w).
- if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) {
- SDValue HOp2 = N->getOperand(1);
- if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32)
- return SDValue();
- if (HOp == HOp2)
- return SDValue();
- SDLoc DL(HOp);
- unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
- SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi),
- HOp2.getOperand(LoHi));
-    // Use SHUFPS for the permute so this will work on SSE3 targets; shuffle
- // combining and domain handling will simplify this later on.
- EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
- Res = DAG.getBitcast(ShuffleVT, Res);
- Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
- getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
- return DAG.getBitcast(VT, Res);
- }
-
+ // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)).
+ // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)).
+ // Don't fold if hop(x,y) == hop(z,w).
+ if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) {
+ SDValue HOp2 = N->getOperand(1);
+ if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32)
+ return SDValue();
+ if (HOp == HOp2)
+ return SDValue();
+ SDLoc DL(HOp);
+ unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1;
+ SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi),
+ HOp2.getOperand(LoHi));
+    // Use SHUFPS for the permute so this will work on SSE3 targets; shuffle
+ // combining and domain handling will simplify this later on.
+ EVT ShuffleVT = VT.changeVectorElementType(MVT::f32);
+ Res = DAG.getBitcast(ShuffleVT, Res);
+ Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
+ getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG));
+ return DAG.getBitcast(VT, Res);
+ }
+
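The {0, 2, 1, 3} permute above is emitted through getV4X86ShuffleImm8ForMask; the small sketch below shows the assumed 2-bits-per-element immediate encoding (v4ShuffleImm8 is an invented stand-in, not the LLVM helper).

#include <cstdio>

unsigned v4ShuffleImm8(const int (&Mask)[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] & 0x3) << (2 * i); // element i -> bits [2i+1:2i]
  return Imm;
}

int main() {
  int Mask[4] = {0, 2, 1, 3};
  std::printf("imm8 = 0x%02X\n", v4ShuffleImm8(Mask)); // prints: imm8 = 0xD8
  // With SHUFP both inputs are the same value here, so 0xD8 simply picks the
  // four 32-bit lanes of the HADD/HSUB result in the order {0, 2, 1, 3}.
}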
// 128-bit horizontal math instructions are defined to operate on adjacent
// lanes of each operand as:
// v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
@@ -37889,8 +37889,8 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
// replicating low and high halves (and without changing the type/length of
// the vector), we don't need the shuffle.
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
- if (Opcode == X86ISD::VBROADCAST && !VT.is128BitVector())
- return SDValue();
+ if (Opcode == X86ISD::VBROADCAST && !VT.is128BitVector())
+ return SDValue();
if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
@@ -37903,20 +37903,20 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
// shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
-
+
// TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
// but this should be tied to whatever horizontal op matching and shuffle
// canonicalization are producing.
if (HOp.getValueSizeInBits() == 128 &&
- (isShuffleEquivalent(Mask, {0, 0}) ||
- isShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
- isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
+ (isShuffleEquivalent(Mask, {0, 0}) ||
+ isShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
+ isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
return updateHOp(HOp, DAG);
if (HOp.getValueSizeInBits() == 256 &&
- (isShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
- isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
- isShuffleEquivalent(
+ (isShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
+ isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
+ isShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
return updateHOp(HOp, DAG);
@@ -37974,34 +37974,34 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
-
-  // Merge shuffles through binops if it's likely we'll be able to merge it
- // with other shuffles (as long as they aren't splats).
- // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
- // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE.
- if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
- unsigned SrcOpcode = N->getOperand(0).getOpcode();
- if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) &&
- N->isOnlyUserOf(N->getOperand(0).getNode()) &&
- N->isOnlyUserOf(N->getOperand(1).getNode())) {
- SDValue Op00 = N->getOperand(0).getOperand(0);
- SDValue Op10 = N->getOperand(1).getOperand(0);
- SDValue Op01 = N->getOperand(0).getOperand(1);
- SDValue Op11 = N->getOperand(1).getOperand(1);
- auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00);
- auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10);
- auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01);
- auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11);
- if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) &&
- ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) {
- SDLoc DL(N);
- ArrayRef<int> Mask = SVN->getMask();
- SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask);
- SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask);
- return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
- }
- }
- }
+
+  // Merge shuffles through binops if it's likely we'll be able to merge it
+ // with other shuffles (as long as they aren't splats).
+ // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
+ // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE.
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
+ unsigned SrcOpcode = N->getOperand(0).getOpcode();
+ if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) &&
+ N->isOnlyUserOf(N->getOperand(0).getNode()) &&
+ N->isOnlyUserOf(N->getOperand(1).getNode())) {
+ SDValue Op00 = N->getOperand(0).getOperand(0);
+ SDValue Op10 = N->getOperand(1).getOperand(0);
+ SDValue Op01 = N->getOperand(0).getOperand(1);
+ SDValue Op11 = N->getOperand(1).getOperand(1);
+ auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00);
+ auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10);
+ auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01);
+ auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11);
+ if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) &&
+ ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) {
+ SDLoc DL(N);
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask);
+ return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
+ }
+ }
+ }
}
// Attempt to combine into a vector load/broadcast.
@@ -38035,8 +38035,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// TODO - merge this into combineX86ShufflesRecursively.
APInt KnownUndef, KnownZero;
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
- if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
- DCI))
+ if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
+ DCI))
return SDValue(N, 0);
}
@@ -38170,13 +38170,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
Depth + 1))
return true;
-
- // Aggressively peek through ops to get at the demanded elts.
- if (!DemandedElts.isAllOnesValue())
- if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
- Src, DemandedElts, TLO.DAG, Depth + 1))
- return TLO.CombineTo(
- Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
+
+ // Aggressively peek through ops to get at the demanded elts.
+ if (!DemandedElts.isAllOnesValue())
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
break;
}
case X86ISD::KSHIFTL: {
@@ -38365,7 +38365,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (!SrcVT.isVector())
- break;
+ break;
// Don't bother broadcasting if we just need the 0'th element.
if (DemandedElts == 1) {
if (Src.getValueType() != VT)
@@ -38418,62 +38418,62 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
- // Scalar broadcast.
- case X86ISD::VBROADCAST: {
+ // Scalar broadcast.
+ case X86ISD::VBROADCAST: {
SDLoc DL(Op);
SDValue Src = Op.getOperand(0);
if (Src.getValueSizeInBits() > ExtSizeInBits)
Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
- EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
- ExtSizeInBits / VT.getScalarSizeInBits());
- SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
- TLO.DAG, DL, ExtSizeInBits));
- }
- case X86ISD::VBROADCAST_LOAD: {
- SDLoc DL(Op);
- auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
- EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
- ExtSizeInBits / VT.getScalarSizeInBits());
- SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
- SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
- SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
- X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
- MemIntr->getMemOperand());
- TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
- Bcst.getValue(1));
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
TLO.DAG, DL, ExtSizeInBits));
}
- // Subvector broadcast.
- case X86ISD::SUBV_BROADCAST_LOAD: {
- auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
- EVT MemVT = MemIntr->getMemoryVT();
- if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
- SDLoc DL(Op);
- SDValue Ld =
- TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
- MemIntr->getBasePtr(), MemIntr->getMemOperand());
- TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
- Ld.getValue(1));
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
- TLO.DAG, DL, ExtSizeInBits));
- } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
- SDLoc DL(Op);
- EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
- ExtSizeInBits / VT.getScalarSizeInBits());
- SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
- SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
- SDValue Bcst =
- TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
- Ops, MemVT, MemIntr->getMemOperand());
- TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
- Bcst.getValue(1));
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
- TLO.DAG, DL, ExtSizeInBits));
- }
- break;
- }
+ case X86ISD::VBROADCAST_LOAD: {
+ SDLoc DL(Op);
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ // Subvector broadcast.
+ case X86ISD::SUBV_BROADCAST_LOAD: {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
+ SDLoc DL(Op);
+ SDValue Ld =
+ TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(), MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Ld.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
+ SDLoc DL(Op);
+ EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+ ExtSizeInBits / VT.getScalarSizeInBits());
+ SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+ SDValue Bcst =
+ TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
+ Ops, MemVT, MemIntr->getMemOperand());
+ TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+ Bcst.getValue(1));
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
+ break;
+ }
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -38525,8 +38525,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
- // Integer ops.
- case X86ISD::AVG:
+ // Integer ops.
+ case X86ISD::AVG:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
// Horizontal Ops.
@@ -38619,22 +38619,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// If we don't demand all elements, then attempt to combine to a simpler
// shuffle.
- // We need to convert the depth to something combineX86ShufflesRecursively
- // can handle - so pretend its Depth == 0 again, and reduce the max depth
- // to match. This prevents combineX86ShuffleChain from returning a
- // combined shuffle that's the same as the original root, causing an
- // infinite loop.
- if (!DemandedElts.isAllOnesValue()) {
- assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
-
+ // We need to convert the depth to something combineX86ShufflesRecursively
+ // can handle - so pretend its Depth == 0 again, and reduce the max depth
+ // to match. This prevents combineX86ShuffleChain from returning a
+ // combined shuffle that's the same as the original root, causing an
+ // infinite loop.
+ if (!DemandedElts.isAllOnesValue()) {
+ assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
+
SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
for (int i = 0; i != NumElts; ++i)
if (DemandedElts[i])
DemandedMask[i] = i;
SDValue NewShuffle = combineX86ShufflesRecursively(
- {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
- /*HasVarMask*/ false,
+ {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
+ /*HasVarMask*/ false,
/*AllowVarMask*/ true, TLO.DAG, Subtarget);
if (NewShuffle)
return TLO.CombineTo(Op, NewShuffle);
@@ -38737,7 +38737,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
- return false;
+ return false;
}
case X86ISD::VSRLI: {
unsigned ShAmt = Op.getConstantOperandVal(1);
@@ -38756,7 +38756,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
- return false;
+ return false;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
@@ -38805,7 +38805,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// High bits are known one.
if (Known.One[BitWidth - ShAmt - 1])
Known.One.setHighBits(ShAmt);
- return false;
+ return false;
}
case X86ISD::PEXTRB:
case X86ISD::PEXTRW: {
@@ -38871,7 +38871,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return true;
KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
- Known = KnownBits::commonBits(KnownVec, KnownScl);
+ Known = KnownBits::commonBits(KnownVec, KnownScl);
return false;
}
break;
@@ -38951,83 +38951,83 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
- case X86ISD::BEXTR:
- case X86ISD::BEXTRI: {
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// Only bottom 16-bits of the control bits are required.
if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
// NOTE: SimplifyDemandedBits won't do this for constants.
- uint64_t Val1 = Cst1->getZExtValue();
- uint64_t MaskedVal1 = Val1 & 0xFFFF;
- if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
+ uint64_t Val1 = Cst1->getZExtValue();
+ uint64_t MaskedVal1 = Val1 & 0xFFFF;
+ if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
SDLoc DL(Op);
return TLO.CombineTo(
Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
TLO.DAG.getConstant(MaskedVal1, DL, VT)));
}
-
- unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
- unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
-
- // If the length is 0, the result is 0.
- if (Length == 0) {
- Known.setAllZero();
- return false;
- }
-
- if ((Shift + Length) <= BitWidth) {
- APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
- if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
- return true;
-
- Known = Known.extractBits(Length, Shift);
- Known = Known.zextOrTrunc(BitWidth);
- return false;
- }
- } else {
- assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
- KnownBits Known1;
- APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
- if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
- return true;
-
- // If the length is 0, replace with 0.
- KnownBits LengthBits = Known1.extractBits(8, 8);
- if (LengthBits.isZero())
- return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
- }
-
- break;
- }
- case X86ISD::PDEP: {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
-
- unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
- APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
-
-    // If the demanded bits have leading zeroes, we don't demand those from the
- // mask.
- if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ return false;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
+ return true;
+
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ return false;
+ }
+ } else {
+ assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+ }
+
+ break;
+ }
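The constant path above reads the BEXTR control word as an 8-bit start and an 8-bit length; a hedged reference-style sketch of that layout follows (bextrRef is an invented name and only models the constant-control case).

#include <cstdint>
#include <cstdio>

std::uint64_t bextrRef(std::uint64_t Src, std::uint64_t Control) {
  unsigned Shift = Control & 0xFF;         // start bit, control bits [7:0]
  unsigned Length = (Control >> 8) & 0xFF; // field length, control bits [15:8]
  if (Length == 0)
    return 0;          // zero-length field -> result is 0, as noted above
  if (Shift >= 64)
    return 0;          // field starts past the operand -> nothing extracted
  if (Length >= 64)
    return Src >> Shift; // field covers everything above Shift
  return (Src >> Shift) & ((1ULL << Length) - 1);
}

int main() {
  // Extract 8 bits starting at bit 4: control = (length << 8) | shift.
  std::uint64_t R = bextrRef(0xABCD, (8u << 8) | 4u);
  std::printf("0x%llX\n", (unsigned long long)R); // prints: 0xBC
}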
+ case X86ISD::PDEP: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
+ APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+    // If the demanded bits have leading zeroes, we don't demand those from the
+ // mask.
+ if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
return true;
- // The number of possible 1s in the mask determines the number of LSBs of
- // operand 0 used. Undemanded bits from the mask don't matter so filter
- // them before counting.
- KnownBits Known2;
- uint64_t Count = (~Known.Zero & LoMask).countPopulation();
- APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
- if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
- return true;
+ // The number of possible 1s in the mask determines the number of LSBs of
+ // operand 0 used. Undemanded bits from the mask don't matter so filter
+ // them before counting.
+ KnownBits Known2;
+ uint64_t Count = (~Known.Zero & LoMask).countPopulation();
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+ if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+ return true;
- // Zeroes are retained from the mask, but not ones.
- Known.One.clearAllBits();
- // The result will have at least as many trailing zeros as the non-mask
- // operand since bits can only map to the same or higher bit position.
- Known.Zero.setLowBits(Known2.countMinTrailingZeros());
- return false;
+ // Zeroes are retained from the mask, but not ones.
+ Known.One.clearAllBits();
+ // The result will have at least as many trailing zeros as the non-mask
+ // operand since bits can only map to the same or higher bit position.
+ Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+ return false;
}
}
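The PDEP known-bits reasoning above leans on the deposit semantics sketched below. pdepRef is an invented reference model, not a DAG helper; it shows why source bit K lands at the position of the K-th set mask bit (never lower), and why only popcount(mask) low source bits matter.

#include <cstdint>
#include <cstdio>

std::uint64_t pdepRef(std::uint64_t Src, std::uint64_t Mask) {
  std::uint64_t Result = 0;
  unsigned K = 0; // index of the next source bit to deposit
  for (unsigned i = 0; i != 64; ++i)
    if (Mask & (1ULL << i)) {
      if (Src & (1ULL << K))
        Result |= 1ULL << i; // bit K moves to position i, and i >= K
      ++K;
    }
  return Result;
}

int main() {
  // The mask has four set bits, so only the low four source bits matter --
  // the same observation used above to shrink the demanded bits of Op0.
  std::uint64_t R = pdepRef(/*Src=*/0b0101, /*Mask=*/0b11110000);
  std::printf("0x%llX\n", (unsigned long long)R); // prints: 0x50
}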
@@ -39438,8 +39438,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
// Convert build vector ops to MMX data in the bottom elements.
SmallVector<SDValue, 8> Ops;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
// Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
if (Splat) {
if (Splat.isUndef())
@@ -39452,16 +39452,16 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
if (NumElts == 8)
Splat = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
- DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
- TLI.getPointerTy(DAG.getDataLayout())),
- Splat, Splat);
+ DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ Splat, Splat);
// Use PSHUFW to repeat 16-bit elements.
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
- DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
- TLI.getPointerTy(DAG.getDataLayout())),
+ DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
+ TLI.getPointerTy(DAG.getDataLayout())),
Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
@@ -39477,8 +39477,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
(NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
: (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
: Intrinsic::x86_mmx_punpcklbw));
- SDValue Intrin = DAG.getTargetConstant(
- IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Intrin = DAG.getTargetConstant(
+ IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
for (unsigned i = 0; i != NumOps; i += 2)
Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
Ops[i], Ops[i + 1]);
@@ -39492,7 +39492,7 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
// a vector/float/double that got truncated/extended/bitcast to/from a scalar
// integer. If so, replace the scalar ops with bool vector equivalents back down
// the chain.
-static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
+static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -39545,10 +39545,10 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
case ISD::SHL: {
// If we find a suitable source, a SHL becomes a KSHIFTL.
SDValue Src0 = V.getOperand(0);
- if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
- ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
- break;
-
+ if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
+ ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
+ break;
+
if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
return DAG.getNode(
@@ -39891,8 +39891,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
-static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();
@@ -39965,8 +39965,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
-static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// Bail without SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
@@ -40080,8 +40080,8 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
if (BinOp == ISD::XOR) {
- // parity -> (PARITY(MOVMSK X))
- SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
+ // parity -> (PARITY(MOVMSK X))
+ SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
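A plain-intrinsics illustration of the MOVMSK reductions handled above is sketched below; it uses SSE2 intrinsics rather than the ISD/X86ISD nodes, and __builtin_parity assumes a GCC/Clang-style compiler.

#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
  __m128i B = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0);
  __m128i Eq = _mm_cmpeq_epi8(A, B);    // 0xFF per equal lane, 0 otherwise
  unsigned Msk = _mm_movemask_epi8(Eq); // one bit per lane

  bool AllOf = (Msk == 0xFFFF);         // every lane compared equal
  bool AnyOf = (Msk != 0);              // at least one lane compared equal
  bool Parity = __builtin_parity(Msk);  // the ISD::PARITY case above

  std::printf("all=%d any=%d parity=%d\n", AllOf, AnyOf, Parity);
  // prints: all=0 any=1 parity=1 (15 of the 16 lanes match)
}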
@@ -40269,12 +40269,12 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
- if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
- (SrcVT.getSizeInBits() % 128) == 0) {
+ if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
+ (SrcVT.getSizeInBits() % 128) == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
- MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
- return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
- Idx);
+ MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
+ return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
+ Idx);
}
// Resolve the target shuffle inputs and mask.
@@ -40350,7 +40350,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
- DAG.getTargetConstant(SrcIdx, dl, MVT::i8));
+ DAG.getTargetConstant(SrcIdx, dl, MVT::i8));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}
@@ -40457,8 +40457,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
-static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
// We need at least SSE2 to anything here.
@@ -40466,8 +40466,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
return SDValue();
ISD::NodeType Opc;
- SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
- {ISD::ADD, ISD::MUL, ISD::FADD}, true);
+ SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
+ {ISD::ADD, ISD::MUL, ISD::FADD}, true);
if (!Rdx)
return SDValue();
@@ -40482,46 +40482,46 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
SDLoc DL(ExtElt);
- // vXi8 mul reduction - promote to vXi16 mul reduction.
- if (Opc == ISD::MUL) {
- unsigned NumElts = VecVT.getVectorNumElements();
- if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
- return SDValue();
- if (VecVT.getSizeInBits() >= 128) {
- EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
- SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
- SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
- Lo = DAG.getBitcast(WideVT, Lo);
- Hi = DAG.getBitcast(WideVT, Hi);
- Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
- while (Rdx.getValueSizeInBits() > 128) {
- std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
- Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
- }
- } else {
- if (VecVT == MVT::v4i8)
- Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
- DAG.getUNDEF(MVT::v4i8));
- Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
- DAG.getUNDEF(MVT::v8i8));
- Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
- Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
- }
- if (NumElts >= 8)
- Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
- DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
- {4, 5, 6, 7, -1, -1, -1, -1}));
- Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
- DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
- {2, 3, -1, -1, -1, -1, -1, -1}));
- Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
- DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
- {1, -1, -1, -1, -1, -1, -1, -1}));
- Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
- }
-
- // vXi8 add reduction - sub 128-bit vector.
+ // vXi8 mul reduction - promote to vXi16 mul reduction.
+ if (Opc == ISD::MUL) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
+ return SDValue();
+ if (VecVT.getSizeInBits() >= 128) {
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
+ SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
+ SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
+ Lo = DAG.getBitcast(WideVT, Lo);
+ Hi = DAG.getBitcast(WideVT, Hi);
+ Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
+ while (Rdx.getValueSizeInBits() > 128) {
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
+ }
+ } else {
+ if (VecVT == MVT::v4i8)
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getUNDEF(MVT::v4i8));
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
+ }
+ if (NumElts >= 8)
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {4, 5, 6, 7, -1, -1, -1, -1}));
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {2, 3, -1, -1, -1, -1, -1, -1}));
+ Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
+ DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
+ {1, -1, -1, -1, -1, -1, -1, -1}));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // vXi8 add reduction - sub 128-bit vector.
if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
if (VecVT == MVT::v4i8) {
// Pad with zero.
@@ -40552,7 +40552,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
!isPowerOf2_32(VecVT.getVectorNumElements()))
return SDValue();
- // vXi8 add reduction - sum lo/hi halves then use PSADBW.
+ // vXi8 add reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
SDValue Lo, Hi;
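The PSADBW trick mentioned in the comment above can be shown with plain SSE2 intrinsics; the sketch below is illustrative only and sums sixteen bytes with one psadbw plus one 64-bit add.

#include <emmintrin.h>
#include <cstdio>

int main() {
  unsigned char Bytes[16];
  for (int i = 0; i != 16; ++i)
    Bytes[i] = (unsigned char)(i + 1);                // 1..16, sum = 136

  __m128i V = _mm_loadu_si128((const __m128i *)Bytes);
  __m128i Sad = _mm_sad_epu8(V, _mm_setzero_si128()); // two 8-byte partial sums
  __m128i Hi = _mm_srli_si128(Sad, 8);                // move the high sum down
  __m128i Sum = _mm_add_epi64(Sad, Hi);

  std::printf("%d\n", _mm_cvtsi128_si32(Sum));        // prints: 136
}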
@@ -40658,7 +40658,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
// TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
// combineBasicSADPattern.
return SDValue();
}
@@ -40690,15 +40690,15 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SAD;
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
- if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
+ if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
- if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
+ if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
return MinMax;
- // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
- if (SDValue V = combineArithReduction(N, DAG, Subtarget))
+ // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
+ if (SDValue V = combineArithReduction(N, DAG, Subtarget))
return V;
if (SDValue V = scalarizeExtEltFP(N, DAG))
@@ -40822,7 +40822,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
- if (!TLI.isTypeLegal(CondVT))
+ if (!TLI.isTypeLegal(CondVT))
return SDValue();
// vselect Cond, 111..., X -> or Cond, X
@@ -41145,36 +41145,36 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
}
- // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
- // by forcing the unselected elements to zero.
- // TODO: Can we handle more shuffles with this?
- if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
- LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
- LHS.hasOneUse() && RHS.hasOneUse()) {
- MVT SimpleVT = VT.getSimpleVT();
- bool LHSUnary, RHSUnary;
- SmallVector<SDValue, 1> LHSOps, RHSOps;
- SmallVector<int, 64> LHSMask, RHSMask, CondMask;
- if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
- getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,
- LHSUnary) &&
- getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,
- RHSUnary)) {
- int NumElts = VT.getVectorNumElements();
- for (int i = 0; i != NumElts; ++i) {
- if (CondMask[i] < NumElts)
- RHSMask[i] = 0x80;
- else
- LHSMask[i] = 0x80;
- }
- LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
- getConstVector(LHSMask, SimpleVT, DAG, DL, true));
- RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
- getConstVector(RHSMask, SimpleVT, DAG, DL, true));
- return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
- }
- }
-
+ // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
+ // by forcing the unselected elements to zero.
+ // TODO: Can we handle more shuffles with this?
+ if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
+ LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
+ LHS.hasOneUse() && RHS.hasOneUse()) {
+ MVT SimpleVT = VT.getSimpleVT();
+ bool LHSUnary, RHSUnary;
+ SmallVector<SDValue, 1> LHSOps, RHSOps;
+ SmallVector<int, 64> LHSMask, RHSMask, CondMask;
+ if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
+ getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask,
+ LHSUnary) &&
+ getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask,
+ RHSUnary)) {
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i != NumElts; ++i) {
+ if (CondMask[i] < NumElts)
+ RHSMask[i] = 0x80;
+ else
+ LHSMask[i] = 0x80;
+ }
+ LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
+ getConstVector(LHSMask, SimpleVT, DAG, DL, true));
+ RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
+ getConstVector(RHSMask, SimpleVT, DAG, DL, true));
+ return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+ }
+ }
+
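The fold above relies on PSHUFB's zeroing behaviour: any index byte with the top bit set writes zero, so two complementary PSHUFBs can be merged with a plain OR. The SSSE3 sketch below demonstrates that property with intrinsics; the 0x80 index bytes mirror the values forced into LHSMask/RHSMask.

#include <tmmintrin.h>
#include <cstdio>

int main() {
  __m128i X = _mm_set1_epi8(0x11);
  __m128i Y = _mm_set1_epi8(0x22);
  // Select even bytes from X and odd bytes from Y by zeroing the other half
  // of each shuffle with 0x80 indices, then OR the two results together.
  __m128i MX = _mm_setr_epi8(0, -128, 2, -128, 4, -128, 6, -128,
                             8, -128, 10, -128, 12, -128, 14, -128);
  __m128i MY = _mm_setr_epi8(-128, 1, -128, 3, -128, 5, -128, 7,
                             -128, 9, -128, 11, -128, 13, -128, 15);
  __m128i R = _mm_or_si128(_mm_shuffle_epi8(X, MX), _mm_shuffle_epi8(Y, MY));

  unsigned char Out[16];
  _mm_storeu_si128((__m128i *)Out, R);
  for (int i = 0; i != 16; ++i)
    std::printf("%02X ", Out[i]); // prints alternating: 11 22 11 22 ...
  std::printf("\n");
}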
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
@@ -41401,12 +41401,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineSelectOfTwoConstants(N, DAG))
return V;
- // Canonicalize min/max:
- // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
- // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
+ // Canonicalize min/max:
+ // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
+ // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
- // the need for an extra compare against zero. e.g.
- // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
+ // the need for an extra compare against zero. e.g.
+ // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
// subl %esi, %edi
// testl %edi, %edi
// movl $0, %eax
@@ -41415,27 +41415,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// xorl %eax, %eax
// subl %esi, %edi
// cmovsl %eax, %edi
- //
- // We can also canonicalize
- // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
- // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
- // This allows the use of a test instruction for the compare.
+ //
+ // We can also canonicalize
+ // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
+ // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
+ // This allows the use of a test instruction for the compare.
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
Cond.hasOneUse() &&
- LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
- (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
- ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
+ if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
+ (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
+ ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
- if (CC == ISD::SETUGT && isOneConstant(RHS)) {
- ISD::CondCode NewCC = ISD::SETUGE;
- Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
- Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getSelect(DL, VT, Cond, LHS, RHS);
+ if (CC == ISD::SETUGT && isOneConstant(RHS)) {
+ ISD::CondCode NewCC = ISD::SETUGE;
+ Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
+ Cond.getOperand(0), Cond.getOperand(1), NewCC);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
@@ -41466,18 +41466,18 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return V;
// select(~Cond, X, Y) -> select(Cond, Y, X)
- if (CondVT.getScalarType() != MVT::i1) {
+ if (CondVT.getScalarType() != MVT::i1) {
if (SDValue CondNot = IsNOT(Cond, DAG))
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
- // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
- if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
- ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
- Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
- DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
- return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
- }
- }
+ // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
+ if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
+ ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
+ Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
+ DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
+ }
+ }
// Try to optimize vXi1 selects if both operands are either all constants or
// bitcasts from scalar integer type. In that case we can convert the operands
@@ -43094,116 +43094,116 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
- assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
- X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
- X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
- "Unexpected hadd/hsub/pack opcode");
+ assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode ||
+ X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode ||
+ X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected hadd/hsub/pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- EVT SrcVT = N0.getValueType();
+ EVT SrcVT = N0.getValueType();
- // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
-  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
+ // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+  // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
- // TODO: We don't handle vXf64 shuffles yet.
+ // TODO: We don't handle vXf64 shuffles yet.
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N0.getConstantOperandAPInt(1) == 0 &&
- N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
+ N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
- N0.getOperand(0).getValueType().is256BitVector() &&
- SrcVT.getScalarSizeInBits() <= 32) {
+ N0.getOperand(0).getValueType().is256BitVector() &&
+ SrcVT.getScalarSizeInBits() <= 32) {
// TODO - support target/faux shuffles.
SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
- // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
+ // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
// shuffle to a vXi64 width - we can probably relax this in the future.
SmallVector<int, 4> ShuffleMask;
if (SVN->getOperand(1).isUndef() &&
scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
SDLoc DL(N);
SDValue Lo, Hi;
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
Lo = DAG.getBitcast(N0.getValueType(), Lo);
Hi = DAG.getBitcast(N1.getValueType(), Hi);
SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
- Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
return DAG.getBitcast(VT, Res);
}
}
}
- // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
- // TODO: Merge with binary shuffle folds below.
- if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
- int PostShuffle[4] = {0, 1, 2, 3};
-
-    // If the op is a unary shuffle that can scale to v2x64,
- // then we can perform this as a v4x32 post shuffle.
- auto AdjustOp = [&](SDValue V, int Offset) {
- auto *SVN = dyn_cast<ShuffleVectorSDNode>(V);
- SmallVector<int, 2> ScaledMask;
- if (!SVN || !SVN->getOperand(1).isUndef() ||
- !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) ||
- !N->isOnlyUserOf(V.getNode()))
- return SDValue();
- PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0];
- PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1];
- return SVN->getOperand(0);
- };
-
- SDValue Src0 = AdjustOp(N0, 0);
- SDValue Src1 = AdjustOp(N1, 2);
- if (Src0 || Src1) {
- Src0 = Src0 ? Src0 : N0;
- Src1 = Src1 ? Src1 : N1;
- SDLoc DL(N);
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
- SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1);
- Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
- return DAG.getBitcast(VT, Res);
- }
- }
-
- // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
+ // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)).
+ // TODO: Merge with binary shuffle folds below.
+ if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
+ int PostShuffle[4] = {0, 1, 2, 3};
+
+    // If the op is a unary shuffle that can scale to v2x64,
+ // then we can perform this as a v4x32 post shuffle.
+ auto AdjustOp = [&](SDValue V, int Offset) {
+ auto *SVN = dyn_cast<ShuffleVectorSDNode>(V);
+ SmallVector<int, 2> ScaledMask;
+ if (!SVN || !SVN->getOperand(1).isUndef() ||
+ !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) ||
+ !N->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0];
+ PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1];
+ return SVN->getOperand(0);
+ };
+
+ SDValue Src0 = AdjustOp(N0, 0);
+ SDValue Src1 = AdjustOp(N1, 2);
+ if (Src0 || Src1) {
+ Src0 = Src0 ? Src0 : N0;
+ Src1 = Src1 ? Src1 : N1;
+ SDLoc DL(N);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+
+ // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
// TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
- if (VT.is256BitVector() && Subtarget.hasInt256()) {
- SmallVector<int> Mask0, Mask1;
- SmallVector<SDValue> Ops0, Ops1;
- if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
- getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
- !Ops0.empty() && !Ops1.empty()) {
- SDValue Op00 = Ops0.front(), Op01 = Ops0.back();
- SDValue Op10 = Ops1.front(), Op11 = Ops1.back();
- SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
- if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT &&
-          Op10.getValueType() == SrcVT && Op11.getValueType() == SrcVT &&
- scaleShuffleElements(Mask0, 2, ShuffleMask0) &&
- scaleShuffleElements(Mask1, 2, ShuffleMask1)) {
- if ((Op00 == Op11) && (Op01 == Op10)) {
- std::swap(Op10, Op11);
- ShuffleVectorSDNode::commuteMask(ShuffleMask1);
- }
- if ((Op00 == Op10) && (Op01 == Op11)) {
- SmallVector<int, 4> ShuffleMask;
- ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
- ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
- SDLoc DL(N);
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
- SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
- Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
- return DAG.getBitcast(VT, Res);
+ if (VT.is256BitVector() && Subtarget.hasInt256()) {
+ SmallVector<int> Mask0, Mask1;
+ SmallVector<SDValue> Ops0, Ops1;
+ if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
+ getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
+ !Ops0.empty() && !Ops1.empty()) {
+ SDValue Op00 = Ops0.front(), Op01 = Ops0.back();
+ SDValue Op10 = Ops1.front(), Op11 = Ops1.back();
+ SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
+ if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT &&
+          Op10.getValueType() == SrcVT && Op11.getValueType() == SrcVT &&
+ scaleShuffleElements(Mask0, 2, ShuffleMask0) &&
+ scaleShuffleElements(Mask1, 2, ShuffleMask1)) {
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ShuffleMask1);
}
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ SmallVector<int, 4> ShuffleMask;
+ ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
+ ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
+ SDLoc DL(N);
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
}
}
}
@@ -43285,7 +43285,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
}
// Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
- if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
return V;
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
@@ -43307,28 +43307,28 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
}
}
- // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
- if (VT.is128BitVector()) {
- unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Src0, Src1;
- if (N0.getOpcode() == ExtOpc &&
- N0.getOperand(0).getValueType().is64BitVector() &&
- N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
- Src0 = N0.getOperand(0);
- }
- if (N1.getOpcode() == ExtOpc &&
- N1.getOperand(0).getValueType().is64BitVector() &&
- N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
- Src1 = N1.getOperand(0);
- }
- if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
- assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
- Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
- Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
- }
- }
-
+ // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
+ if (VT.is128BitVector()) {
+ unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Src0, Src1;
+ if (N0.getOpcode() == ExtOpc &&
+ N0.getOperand(0).getValueType().is64BitVector() &&
+ N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
+ Src0 = N0.getOperand(0);
+ }
+ if (N1.getOpcode() == ExtOpc &&
+ N1.getOperand(0).getValueType().is64BitVector() &&
+ N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
+ Src1 = N1.getOperand(0);
+ }
+ if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
+ assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
+ Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
+ Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
+ }
+ }
+
// Attempt to combine as shuffle.
SDValue Op(N, 0);
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
@@ -43337,20 +43337,20 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
- X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
- "Unexpected horizontal add/sub opcode");
-
- // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
- if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
- return V;
-
- return SDValue();
-}
-
+static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
+ X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
+ "Unexpected horizontal add/sub opcode");
+
+ // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
+ return V;
+
+ return SDValue();
+}
+
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -44139,7 +44139,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (VT == SrcVecVT.getScalarType() &&
N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
- llvm::all_of(EltBits, [](const APInt &M) {
+ llvm::all_of(EltBits, [](const APInt &M) {
return M.isNullValue() || M.isAllOnesValue();
})) {
unsigned NumElts = SrcVecVT.getVectorNumElements();
@@ -44158,7 +44158,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue Shuffle = combineX86ShufflesRecursively(
{SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
- X86::MaxShuffleCombineDepth,
+ X86::MaxShuffleCombineDepth,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
@@ -44826,13 +44826,13 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
unsigned NumElems = VT.getVectorNumElements();
EVT ScalarVT = VT.getVectorElementType();
- if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
+ if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
return SDValue();
   // InScalarVT is the intermediate type in the AVG pattern and it should be greater
// than the original input type (i8/i16).
EVT InScalarVT = InVT.getVectorElementType();
- if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
+ if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
return SDValue();
if (!Subtarget.hasSSE2())
@@ -44860,8 +44860,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
};
// Check if each element of the vector is right-shifted by one.
- SDValue LHS = In.getOperand(0);
- SDValue RHS = In.getOperand(1);
+ SDValue LHS = In.getOperand(0);
+ SDValue RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
return SDValue();
if (LHS.getOpcode() != ISD::ADD)
@@ -44877,29 +44877,29 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
};
- auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
- // Pad to a power-of-2 vector, split+apply and extract the original vector.
- unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
- EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
- if (NumElemsPow2 != NumElems) {
- SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
- SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
- for (unsigned i = 0; i != NumElems; ++i) {
- SDValue Idx = DAG.getIntPtrConstant(i, DL);
- Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
- Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
- }
- Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
- Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
- }
- SDValue Res =
- SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
- if (NumElemsPow2 == NumElems)
- return Res;
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getIntPtrConstant(0, DL));
- };
-
+ auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
+ // Pad to a power-of-2 vector, split+apply and extract the original vector.
+ unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
+ EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
+ if (NumElemsPow2 != NumElems) {
+ SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Idx = DAG.getIntPtrConstant(i, DL);
+ Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
+ Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
+ }
+ Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
+ Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
+ }
+ SDValue Res =
+ SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
+ if (NumElemsPow2 == NumElems)
+ return Res;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ };
+
// Take care of the case when one of the operands is a constant vector whose
// element is in the range [1, 256].
if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
@@ -44910,7 +44910,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
SDValue VecOnes = DAG.getConstant(1, DL, InVT);
Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
- return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
+ return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
}
// Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
@@ -44957,7 +44957,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
}
// The pattern is detected, emit X86ISD::AVG instruction(s).
- return AVGSplitter(Operands[0], Operands[1]);
+ return AVGSplitter(Operands[0], Operands[1]);
}
return SDValue();
@@ -44990,8 +44990,8 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
- SDValue Ptr2 =
- DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
+ SDValue Ptr2 =
+ DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
@@ -45025,29 +45025,29 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
}
}
- // If we also broadcast this as a subvector to a wider type, then just extract
- // the lowest subvector.
- if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
- (RegVT.is128BitVector() || RegVT.is256BitVector())) {
- SDValue Ptr = Ld->getBasePtr();
- SDValue Chain = Ld->getChain();
- for (SDNode *User : Ptr->uses()) {
- if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
- cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
- cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
- cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
- MemVT.getSizeInBits() &&
- !User->hasAnyUseOfValue(1) &&
- User->getValueSizeInBits(0).getFixedSize() >
- RegVT.getFixedSizeInBits()) {
- SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
- RegVT.getSizeInBits());
- Extract = DAG.getBitcast(RegVT, Extract);
- return DCI.CombineTo(N, Extract, SDValue(User, 1));
- }
- }
- }
-
+ // If we also broadcast this as a subvector to a wider type, then just extract
+ // the lowest subvector.
+ if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
+ (RegVT.is128BitVector() || RegVT.is256BitVector())) {
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Chain = Ld->getChain();
+ for (SDNode *User : Ptr->uses()) {
+ if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0).getFixedSize() >
+ RegVT.getFixedSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ RegVT.getSizeInBits());
+ Extract = DAG.getBitcast(RegVT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+ }
+ }
+
// Cast ptr32 and ptr64 pointers to the default address space before a load.
unsigned AddrSpace = Ld->getAddressSpace();
if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
@@ -45089,7 +45089,7 @@ static int getOneTrueElt(SDValue V) {
auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
if (!ConstNode)
return -1;
- if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
+ if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
// If we already found a one, this is too many.
if (TrueIndex >= 0)
return -1;
@@ -45105,8 +45105,8 @@ static int getOneTrueElt(SDValue V) {
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
SelectionDAG &DAG, SDValue &Addr,
- SDValue &Index, Align &Alignment,
- unsigned &Offset) {
+ SDValue &Index, Align &Alignment,
+ unsigned &Offset) {
int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
if (TrueMaskElt < 0)
return false;
@@ -45114,17 +45114,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
// Get the address of the one scalar element that is specified by the mask
// using the appropriate offset from the base pointer.
EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
- Offset = 0;
+ Offset = 0;
Addr = MaskedOp->getBasePtr();
if (TrueMaskElt != 0) {
- Offset = TrueMaskElt * EltVT.getStoreSize();
- Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
- SDLoc(MaskedOp));
+ Offset = TrueMaskElt * EltVT.getStoreSize();
+ Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
+ SDLoc(MaskedOp));
}
Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
- Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
- EltVT.getStoreSize());
+ Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
+ EltVT.getStoreSize());
return true;
}
@@ -45134,17 +45134,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
- Align Alignment;
- unsigned Offset;
- if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
+ Align Alignment;
+ unsigned Offset;
+ if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
return SDValue();
// Load the one scalar element that is specified by the mask using the
@@ -45152,25 +45152,25 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
EVT EltVT = VT.getVectorElementType();
-
- EVT CastVT = VT;
- if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
- EltVT = MVT::f64;
- CastVT =
- EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
- }
-
+
+ EVT CastVT = VT;
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ }
+
SDValue Load =
- DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
- ML->getPointerInfo().getWithOffset(Offset),
+ DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+ ML->getPointerInfo().getWithOffset(Offset),
Alignment, ML->getMemOperand()->getFlags());
- SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
-
+ SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
+
// Insert the loaded element into the appropriate place in the vector.
- SDValue Insert =
- DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
- Insert = DAG.getBitcast(VT, Insert);
+ SDValue Insert =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
+ Insert = DAG.getBitcast(VT, Insert);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
@@ -45233,8 +45233,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
- if (SDValue ScalarLoad =
- reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
+ if (SDValue ScalarLoad =
+ reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
return ScalarLoad;
// TODO: Do some AVX512 subsets benefit from this transform?
@@ -45271,35 +45271,35 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
SDValue Addr, VecIndex;
- Align Alignment;
- unsigned Offset;
- if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
+ Align Alignment;
+ unsigned Offset;
+ if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
return SDValue();
// Extract the one scalar element that is actually being stored.
SDLoc DL(MS);
- SDValue Value = MS->getValue();
- EVT VT = Value.getValueType();
+ SDValue Value = MS->getValue();
+ EVT VT = Value.getValueType();
EVT EltVT = VT.getVectorElementType();
- if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
- EltVT = MVT::f64;
- EVT CastVT =
- EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
- Value = DAG.getBitcast(CastVT, Value);
- }
- SDValue Extract =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
+ if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+ EltVT = MVT::f64;
+ EVT CastVT =
+ EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+ Value = DAG.getBitcast(CastVT, Value);
+ }
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
// Store that element at the appropriate offset from the base pointer.
- return DAG.getStore(MS->getChain(), DL, Extract, Addr,
- MS->getPointerInfo().getWithOffset(Offset),
+ return DAG.getStore(MS->getChain(), DL, Extract, Addr,
+ MS->getPointerInfo().getWithOffset(Offset),
Alignment, MS->getMemOperand()->getFlags());
}
@@ -45317,7 +45317,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isTruncatingStore())
return SDValue();
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
return ScalarStore;
// If the mask value has been legalized to a non-boolean vector, try to
@@ -45378,21 +45378,21 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
StoredVal.getOperand(0).getValueType() == MVT::i8) {
- SDValue Val = StoredVal.getOperand(0);
- // We must store zeros to the unused bits.
- Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
- return DAG.getStore(St->getChain(), dl, Val,
+ SDValue Val = StoredVal.getOperand(0);
+ // We must store zeros to the unused bits.
+ Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
+ return DAG.getStore(St->getChain(), dl, Val,
St->getBasePtr(), St->getPointerInfo(),
St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
- if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
+ if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
Subtarget.hasAVX512()) {
unsigned NumConcats = 8 / VT.getVectorNumElements();
- // We must store zeros to the unused bits.
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
+ // We must store zeros to the unused bits.
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
@@ -45414,7 +45414,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Hi = combinevXi1ConstantToInteger(Hi, DAG);
SDValue Ptr0 = St->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
@@ -45493,36 +45493,36 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
VT, St->getMemOperand(), DAG);
}
-  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
- if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
- auto IsExtractedElement = [](SDValue V) {
- if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
- V = V.getOperand(0);
- unsigned Opc = V.getOpcode();
- if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
- if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
- return V.getOperand(0);
- }
- return SDValue();
- };
- if (SDValue Extract = IsExtractedElement(StoredVal)) {
- SDValue Trunc = peekThroughOneUseBitcasts(Extract);
- if (Trunc.getOpcode() == X86ISD::VTRUNC) {
- SDValue Src = Trunc.getOperand(0);
- MVT DstVT = Trunc.getSimpleValueType();
- MVT SrcVT = Src.getSimpleValueType();
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
- MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
- if (NumTruncBits == VT.getSizeInBits() &&
- TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
- return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
- TruncVT, St->getMemOperand());
- }
- }
- }
- }
-
+  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
+ auto IsExtractedElement = [](SDValue V) {
+ if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
+ V = V.getOperand(0);
+ unsigned Opc = V.getOpcode();
+ if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
+ if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
+ return V.getOperand(0);
+ }
+ return SDValue();
+ };
+ if (SDValue Extract = IsExtractedElement(StoredVal)) {
+ SDValue Trunc = peekThroughOneUseBitcasts(Extract);
+ if (Trunc.getOpcode() == X86ISD::VTRUNC) {
+ SDValue Src = Trunc.getOperand(0);
+ MVT DstVT = Trunc.getSimpleValueType();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
+ MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
+ if (NumTruncBits == VT.getSizeInBits() &&
+ TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
+ return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
+ TruncVT, St->getMemOperand());
+ }
+ }
+ }
+ }
+
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
@@ -45665,9 +45665,9 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
- SelectionDAG &DAG, const X86Subtarget &Subtarget,
- bool IsCommutative,
+static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ bool IsCommutative,
SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
@@ -45815,39 +45815,39 @@ static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
}
}
- SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
- SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
bool IsIdentityPostShuffle =
isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
if (IsIdentityPostShuffle)
PostShuffleMask.clear();
- // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
- if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
- isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
- return false;
-
- // If the source nodes are already used in HorizOps then always accept this.
- // Shuffle folding should merge these back together.
- bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
- return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
- });
- bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
- return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
- });
- bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
-
+ // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
+ if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+ isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
+ return false;
+
+ // If the source nodes are already used in HorizOps then always accept this.
+ // Shuffle folding should merge these back together.
+ bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
+
// Assume a SingleSource HOP if we only shuffle one input and don't need to
// shuffle the result.
- if (!ForceHorizOp &&
- !shouldUseHorizontalOp(NewLHS == NewRHS &&
+ if (!ForceHorizOp &&
+ !shouldUseHorizontalOp(NewLHS == NewRHS &&
(NumShuffles < 2 || !IsIdentityPostShuffle),
DAG, Subtarget))
return false;
- LHS = DAG.getBitcast(VT, NewLHS);
- RHS = DAG.getBitcast(VT, NewRHS);
+ LHS = DAG.getBitcast(VT, NewLHS);
+ RHS = DAG.getBitcast(VT, NewRHS);
return true;
}
@@ -45865,8 +45865,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
- PostShuffleMask)) {
+ isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
+ PostShuffleMask)) {
SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
if (!PostShuffleMask.empty())
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
@@ -46011,7 +46011,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
EVT OutSVT = OutVT.getVectorElementType();
EVT InSVT = InVT.getVectorElementType();
- if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
+ if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
NumElems >= 8))
return SDValue();
@@ -46073,13 +46073,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// there's no harm in trying pack.
if (Subtarget.hasAVX512() &&
!(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
- InVT.is512BitVector())) {
- // PACK should still be worth it for 128-bit vectors if the sources were
- // originally concatenated from subvectors.
- SmallVector<SDValue> ConcatOps;
- if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
+ InVT.is512BitVector())) {
+ // PACK should still be worth it for 128-bit vectors if the sources were
+ // originally concatenated from subvectors.
+ SmallVector<SDValue> ConcatOps;
+ if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
return SDValue();
- }
+ }
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
@@ -46101,23 +46101,23 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
return SDValue();
- unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
- if (NumSignBits > MinSignBits)
+ unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
+ if (NumSignBits > MinSignBits)
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
- // If we have a srl that only generates signbits that we will discard in
- // the truncation then we can use PACKSS by converting the srl to a sra.
- // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
- if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
- if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
- In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
- if (*ShAmt == MinSignBits) {
- SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
- Subtarget);
- }
- }
-
+ // If we have a srl that only generates signbits that we will discard in
+ // the truncation then we can use PACKSS by converting the srl to a sra.
+ // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
+ if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
+ if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
+ In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
+ if (*ShAmt == MinSignBits) {
+ SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
+ Subtarget);
+ }
+ }
+
return SDValue();
}
@@ -47466,14 +47466,14 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
- if (VT.isVector()) {
+ if (VT.isVector()) {
if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
return R;
- if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
- return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
- }
-
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
+ }
+
if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
return NewAdd;
@@ -47492,19 +47492,19 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (!TLI.isTypeLegal(VT))
return SDValue();
- SDValue A = N->getOperand(IsStrict ? 1 : 0);
- SDValue B = N->getOperand(IsStrict ? 2 : 1);
- SDValue C = N->getOperand(IsStrict ? 3 : 2);
-
- // If the operation allows fast-math and the target does not support FMA,
- // split this into mul+add to avoid libcall(s).
- SDNodeFlags Flags = N->getFlags();
- if (!IsStrict && Flags.hasAllowReassociation() &&
- TLI.isOperationExpand(ISD::FMA, VT)) {
- SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
- return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
- }
-
+ SDValue A = N->getOperand(IsStrict ? 1 : 0);
+ SDValue B = N->getOperand(IsStrict ? 2 : 1);
+ SDValue C = N->getOperand(IsStrict ? 3 : 2);
+
+ // If the operation allows fast-math and the target does not support FMA,
+ // split this into mul+add to avoid libcall(s).
+ SDNodeFlags Flags = N->getFlags();
+ if (!IsStrict && Flags.hasAllowReassociation() &&
+ TLI.isOperationExpand(ISD::FMA, VT)) {
+ SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
+ return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
+ }
+
EVT ScalarVT = VT.getScalarType();
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
@@ -47931,7 +47931,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
- // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
+ // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
// with scalar comparisons.
if (SDValue NotSrc = IsNOT(Src, DAG)) {
SDLoc DL(N);
@@ -47942,17 +47942,17 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(NotMask, DL, VT));
}
- // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
- // results with scalar comparisons.
- if (Src.getOpcode() == X86ISD::PCMPGT &&
- ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
- SDLoc DL(N);
- APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
- return DAG.getNode(ISD::XOR, DL, VT,
- DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
- DAG.getConstant(NotMask, DL, VT));
- }
-
+ // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
+ // results with scalar comparisons.
+ if (Src.getOpcode() == X86ISD::PCMPGT &&
+ ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
+ SDLoc DL(N);
+ APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
+ DAG.getConstant(NotMask, DL, VT));
+ }
+
// Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getAllOnesValue(NumBits));
@@ -47990,8 +47990,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
return DAG.getMaskedGather(Gather->getVTList(),
Gather->getMemoryVT(), DL, Ops,
Gather->getMemOperand(),
- Gather->getIndexType(),
- Gather->getExtensionType());
+ Gather->getIndexType(),
+ Gather->getExtensionType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
@@ -47999,8 +47999,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
return DAG.getMaskedScatter(Scatter->getVTList(),
Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
- Scatter->getIndexType(),
- Scatter->isTruncatingStore());
+ Scatter->getIndexType(),
+ Scatter->isTruncatingStore());
}
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
@@ -48995,18 +48995,18 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
bool IsAdd = N->getOpcode() == ISD::ADD;
- auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
+ auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
- isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
- PostShuffleMask)) {
- auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
+ isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
+ PostShuffleMask)) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
};
SDValue HorizBinOp =
SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
@@ -49071,11 +49071,11 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector())
return SDValue();
- // PSUBUS is supported, starting from SSE2.
+ // PSUBUS is supported, starting from SSE2.
EVT EltVT = VT.getVectorElementType();
- if (!(Subtarget.hasSSE2() &&
- (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 ||
- VT == MVT::v8i64 || VT == MVT::v16i32)))
+ if (!(Subtarget.hasSSE2() &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 ||
+ VT == MVT::v8i64 || VT == MVT::v16i32)))
return SDValue();
SDValue SubusLHS, SubusRHS;
@@ -49111,9 +49111,9 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue MinLHS = Op1.getOperand(0).getOperand(0);
SDValue MinRHS = Op1.getOperand(0).getOperand(1);
EVT TruncVT = Op1.getOperand(0).getValueType();
- if (!(Subtarget.hasSSE2() &&
- (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 ||
- TruncVT == MVT::v16i32)))
+ if (!(Subtarget.hasSSE2() &&
+ (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 ||
+ TruncVT == MVT::v16i32)))
return SDValue();
SDValue OpToSaturate;
if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
@@ -49151,7 +49151,7 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
// values, or first 48 bits for 64 bit values.
KnownBits Known = DAG.computeKnownBits(SubusLHS);
unsigned NumZeros = Known.countMinLeadingZeros();
- if (NumZeros < (VT.getScalarSizeInBits() - 16))
+ if (NumZeros < (VT.getScalarSizeInBits() - 16))
return SDValue();
EVT ExtType = SubusLHS.getValueType();
@@ -49252,46 +49252,46 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Repeated subvectors.
- if (IsSplat &&
- (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
- // If this broadcast is inserted into both halves, use a larger broadcast.
- if (Op0.getOpcode() == X86ISD::VBROADCAST)
+ if (IsSplat &&
+ (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
+ // If this broadcast is inserted into both halves, use a larger broadcast.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
- // If this scalar/subvector broadcast_load is inserted into both halves, use
- // a larger broadcast_load. Update other uses to use an extracted subvector.
- if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
- Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ // If this scalar/subvector broadcast_load is inserted into both halves, use
+ // a larger broadcast_load. Update other uses to use an extracted subvector.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
- SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
- MemIntr->getMemoryVT(),
- MemIntr->getMemOperand());
+ SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(
Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
return BcastLd;
}
- // If this is a simple subvector load repeated across multiple lanes, then
- // broadcast the load. Update other uses to use an extracted subvector.
- if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
- if (Ld->isSimple() && !Ld->isNonTemporal() &&
- Ld->getExtensionType() == ISD::NON_EXTLOAD) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
- SDValue BcastLd =
- DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
- Ld->getMemoryVT(), Ld->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(
- Op0,
- extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
- return BcastLd;
- }
- }
-
+ // If this is a simple subvector load repeated across multiple lanes, then
+ // broadcast the load. Update other uses to use an extracted subvector.
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
+ if (Ld->isSimple() && !Ld->isNonTemporal() &&
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
+ Ld->getMemoryVT(), Ld->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0,
+ extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
@@ -49369,38 +49369,38 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return DAG.getBitcast(VT, Res);
}
break;
- case X86ISD::VPERMV3:
- if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
- MVT OpVT = Op0.getSimpleValueType();
- int NumSrcElts = OpVT.getVectorNumElements();
- SmallVector<int, 64> ConcatMask;
- for (unsigned i = 0; i != NumOps; ++i) {
- bool IsUnary;
- SmallVector<int, 64> SubMask;
- SmallVector<SDValue, 2> SubOps;
- if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
- SubMask, IsUnary))
- break;
- for (int M : SubMask) {
- if (0 <= M) {
- M += M < NumSrcElts ? 0 : NumSrcElts;
- M += i * NumSrcElts;
- }
- ConcatMask.push_back(M);
- }
- }
- if (ConcatMask.size() == (NumOps * NumSrcElts)) {
- SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
- Ops[1].getOperand(0), DAG, DL);
- SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
- Ops[1].getOperand(2), DAG, DL);
- MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
- SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
- return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
- }
- }
- break;
+ case X86ISD::VPERMV3:
+ if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
+ MVT OpVT = Op0.getSimpleValueType();
+ int NumSrcElts = OpVT.getVectorNumElements();
+ SmallVector<int, 64> ConcatMask;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ bool IsUnary;
+ SmallVector<int, 64> SubMask;
+ SmallVector<SDValue, 2> SubOps;
+ if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
+ SubMask, IsUnary))
+ break;
+ for (int M : SubMask) {
+ if (0 <= M) {
+ M += M < NumSrcElts ? 0 : NumSrcElts;
+ M += i * NumSrcElts;
+ }
+ ConcatMask.push_back(M);
+ }
+ }
+ if (ConcatMask.size() == (NumOps * NumSrcElts)) {
+ SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
+ Ops[1].getOperand(0), DAG, DL);
+ SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
+ Ops[1].getOperand(2), DAG, DL);
+ MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
+ SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
+ }
+ }
+ break;
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
@@ -49433,33 +49433,33 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
Op0.getOperand(1));
}
break;
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- case X86ISD::ANDNP:
- // TODO: Add 256-bit support.
- if (!IsSplat && VT.is512BitVector()) {
- SmallVector<SDValue, 2> LHS, RHS;
- for (unsigned i = 0; i != NumOps; ++i) {
- LHS.push_back(Ops[i].getOperand(0));
- RHS.push_back(Ops[i].getOperand(1));
- }
- MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
- SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
- NumOps * SrcVT.getVectorNumElements());
- return DAG.getNode(Op0.getOpcode(), DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
- DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
- }
- break;
- case X86ISD::HADD:
- case X86ISD::HSUB:
- case X86ISD::FHADD:
- case X86ISD::FHSUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case X86ISD::ANDNP:
+ // TODO: Add 256-bit support.
+ if (!IsSplat && VT.is512BitVector()) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
+ SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ NumOps * SrcVT.getVectorNumElements());
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
+ }
+ break;
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
- if (!IsSplat && VT.is256BitVector() &&
- (VT.isFloatingPoint() || Subtarget.hasInt256())) {
+ if (!IsSplat && VT.is256BitVector() &&
+ (VT.isFloatingPoint() || Subtarget.hasInt256())) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
@@ -49494,20 +49494,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
- // Fold subvector loads into one.
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
- bool Fast;
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *FirstLd->getMemOperand(), &Fast) &&
- Fast) {
- if (SDValue Ld =
- EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
- return Ld;
- }
- }
-
+ // Fold subvector loads into one.
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
+ bool Fast;
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *FirstLd->getMemOperand(), &Fast) &&
+ Fast) {
+ if (SDValue Ld =
+ EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
+ return Ld;
+ }
+ }
+
return SDValue();
}
@@ -49579,8 +49579,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Ins = SubVec.getOperand(0);
if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
- Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
- SubVecVT.getFixedSizeInBits())
+ Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
+ SubVecVT.getFixedSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
getZeroVector(OpVT, Subtarget, DAG, dl),
Ins.getOperand(1), N->getOperand(2));
@@ -49733,14 +49733,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
- unsigned SizeInBits = VT.getSizeInBits();
- unsigned InSizeInBits = InVecVT.getSizeInBits();
+ unsigned SizeInBits = VT.getSizeInBits();
+ unsigned InSizeInBits = InVecVT.getSizeInBits();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
TLI.isTypeLegal(InVecVT) &&
- InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
- auto isConcatenatedNot = [](SDValue V) {
+ InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
+ auto isConcatenatedNot = [](SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
return false;
@@ -49783,7 +49783,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
- InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
+ InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
SDLoc DL(N);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
getZeroVector(VT, Subtarget, DAG, DL),
@@ -49795,20 +49795,20 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// SimplifyDemandedVectorElts do more simplifications.
if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
- return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
- // If we're extracting a broadcasted subvector, just use the lowest subvector.
- if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
- cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
- return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
+ // If we're extracting a broadcasted subvector, just use the lowest subvector.
+ if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
// Attempt to extract from the source of a shuffle vector.
- if ((InSizeInBits % SizeInBits) == 0 &&
+ if ((InSizeInBits % SizeInBits) == 0 &&
(IdxVal % VT.getVectorNumElements()) == 0) {
SmallVector<int, 32> ShuffleMask;
SmallVector<int, 32> ScaledMask;
SmallVector<SDValue, 2> ShuffleInputs;
- unsigned NumSubVecs = InSizeInBits / SizeInBits;
+ unsigned NumSubVecs = InSizeInBits / SizeInBits;
     // Decode the shuffle mask and scale it so it's shuffling subvectors.
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
@@ -49818,18 +49818,18 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (ScaledMask[SubVecIdx] == SM_SentinelZero)
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
- if (Src.getValueSizeInBits() == InSizeInBits) {
+ if (Src.getValueSizeInBits() == InSizeInBits) {
unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
- SDLoc(N), SizeInBits);
+ SDLoc(N), SizeInBits);
}
}
}
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
- unsigned InOpcode = InVec.getOpcode();
+ unsigned InOpcode = InVec.getOpcode();
if (IdxVal == 0 && InVec.hasOneUse()) {
if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
@@ -49854,14 +49854,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
InOpcode == ISD::SIGN_EXTEND ||
InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
- (SizeInBits == 128 || SizeInBits == 256) &&
- InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
- SDLoc DL(N);
- SDValue Ext = InVec.getOperand(0);
- if (Ext.getValueSizeInBits() > SizeInBits)
- Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
+ (SizeInBits == 128 || SizeInBits == 256) &&
+ InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
+ SDLoc DL(N);
+ SDValue Ext = InVec.getOperand(0);
+ if (Ext.getValueSizeInBits() > SizeInBits)
+ Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
- return DAG.getNode(ExtOp, DL, VT, Ext);
+ return DAG.getNode(ExtOp, DL, VT, Ext);
}
if (InOpcode == ISD::VSELECT &&
InVec.getOperand(0).getValueType().is256BitVector() &&
@@ -49873,27 +49873,27 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
}
- if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
- (VT.is128BitVector() || VT.is256BitVector())) {
- SDLoc DL(N);
- SDValue InVecSrc = InVec.getOperand(0);
- unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
- SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
- return DAG.getNode(InOpcode, DL, VT, Ext);
- }
- }
-
- // Always split vXi64 logical shifts where we're extracting the upper 32-bits
- // as this is very likely to fold into a shuffle/truncation.
- if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
- InVecVT.getScalarSizeInBits() == 64 &&
- InVec.getConstantOperandAPInt(1) == 32) {
- SDLoc DL(N);
- SDValue Ext =
- extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
- return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
- }
-
+ if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
+ (VT.is128BitVector() || VT.is256BitVector())) {
+ SDLoc DL(N);
+ SDValue InVecSrc = InVec.getOperand(0);
+ unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
+ SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext);
+ }
+ }
+
+ // Always split vXi64 logical shifts where we're extracting the upper 32-bits
+ // as this is very likely to fold into a shuffle/truncation.
+ if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
+ InVecVT.getScalarSizeInBits() == 64 &&
+ InVec.getConstantOperandAPInt(1) == 32) {
+ SDLoc DL(N);
+ SDValue Ext =
+ extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+ return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
+ }
+
return SDValue();
}
@@ -49975,7 +49975,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
// If the input is an extend_invec and the SimplifyDemandedBits call didn't
// convert it to any_extend_invec, due to the LegalOperations check, do the
// conversion directly to a vector shuffle manually. This exposes combine
- // opportunities missed by combineEXTEND_VECTOR_INREG not calling
+ // opportunities missed by combineEXTEND_VECTOR_INREG not calling
// combineX86ShufflesRecursively on SSE4.1 targets.
// FIXME: This is basically a hack around several other issues related to
// ANY_EXTEND_VECTOR_INREG.
@@ -50003,13 +50003,13 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
- unsigned Opcode = N->getOpcode();
- unsigned InOpcode = In.getOpcode();
+ unsigned Opcode = N->getOpcode();
+ unsigned InOpcode = In.getOpcode();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Try to merge vector loads and extend_inreg to an extload.
@@ -50018,7 +50018,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
- ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
+ ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
? ISD::SEXTLOAD
: ISD::ZEXTLOAD;
EVT MemVT =
@@ -50026,7 +50026,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
+ Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
@@ -50034,23 +50034,23 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
}
}
- // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
- if (Opcode == InOpcode)
- return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
-
- // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
- // -> EXTEND_VECTOR_INREG(X).
- // TODO: Handle non-zero subvector indices.
- if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
- In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
- In.getOperand(0).getOperand(0).getValueSizeInBits() ==
- In.getValueSizeInBits())
- return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
-
+ // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
+ if (Opcode == InOpcode)
+ return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
+
+ // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
+ // -> EXTEND_VECTOR_INREG(X).
+ // TODO: Handle non-zero subvector indices.
+ if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
+ In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
+ In.getOperand(0).getOperand(0).getValueSizeInBits() ==
+ In.getValueSizeInBits())
+ return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
+
// Attempt to combine as a shuffle.
- // TODO: General ZERO_EXTEND_VECTOR_INREG support.
- if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
- (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
+ // TODO: General ZERO_EXTEND_VECTOR_INREG support.
+ if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
+ (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
SDValue Op(N, 0);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
@@ -50171,15 +50171,15 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
}
-// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
-// from. Limit this to cases where the loads have the same input chain and the
-// output chains are unused. This avoids any memory ordering issues.
-static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
- N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
- "Unknown broadcast load type");
-
+// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
+// from. Limit this to cases where the loads have the same input chain and the
+// output chains are unused. This avoids any memory ordering issues.
+static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
+ "Unknown broadcast load type");
+
// Only do this if the chain result is unused.
if (N->hasAnyUseOfValue(1))
return SDValue();
@@ -50194,13 +50194,13 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
// Look at other users of our base pointer and try to find a wider broadcast.
// The input chain and the size of the memory VT must match.
for (SDNode *User : Ptr->uses())
- if (User != N && User->getOpcode() == N->getOpcode() &&
+ if (User != N && User->getOpcode() == N->getOpcode() &&
cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
MemVT.getSizeInBits() &&
!User->hasAnyUseOfValue(1) &&
- User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
+ User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
VT.getSizeInBits());
Extract = DAG.getBitcast(VT, Extract);
@@ -50271,17 +50271,17 @@ static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBits), DCI))
- return SDValue(N, 0);
-
- return SDValue();
-}
-
+static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBits), DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -50318,8 +50318,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
- case X86ISD::BEXTR:
- case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
+ case X86ISD::BEXTR:
+ case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
@@ -50364,17 +50364,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::ANY_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
- case ISD::ZERO_EXTEND_VECTOR_INREG:
- return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::PACKSS:
case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
- case X86ISD::HADD:
- case X86ISD::HSUB:
- case X86ISD::FHADD:
- case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
+ case X86ISD::HADD:
+ case X86ISD::HSUB:
+ case X86ISD::FHADD:
+ case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
case X86ISD::VSHL:
case X86ISD::VSRA:
case X86ISD::VSRL:
@@ -50451,10 +50451,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
- case X86ISD::VBROADCAST_LOAD:
- case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::VBROADCAST_LOAD:
+ case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
- case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
+ case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
}
return SDValue();
@@ -50743,7 +50743,7 @@ static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
.Case("{@ccnl}", X86::COND_GE)
.Case("{@ccnle}", X86::COND_G)
.Case("{@ccno}", X86::COND_NO)
- .Case("{@ccnp}", X86::COND_NP)
+ .Case("{@ccnp}", X86::COND_NP)
.Case("{@ccns}", X86::COND_NS)
.Case("{@cco}", X86::COND_O)
.Case("{@ccp}", X86::COND_P)
@@ -50979,8 +50979,8 @@ LowerXConstraint(EVT ConstraintVT) const {
// Lower @cc targets via setcc.
SDValue X86TargetLowering::LowerAsmOutputForConstraint(
- SDValue &Chain, SDValue &Flag, const SDLoc &DL,
- const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
+ SDValue &Chain, SDValue &Flag, const SDLoc &DL,
+ const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
if (Cond == X86::COND_INVALID)
return SDValue();
@@ -51416,26 +51416,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Not found as a standard register?
if (!Res.second) {
- // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
- // to/from f80.
- if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
- // Map st(0) -> st(7) -> ST0
- if (Constraint.size() == 7 && Constraint[0] == '{' &&
- tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
- Constraint[3] == '(' &&
- (Constraint[4] >= '0' && Constraint[4] <= '7') &&
- Constraint[5] == ')' && Constraint[6] == '}') {
- // st(7) is not allocatable and thus not a member of RFP80. Return
- // singleton class in cases where we have a reference to it.
- if (Constraint[4] == '7')
- return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
- return std::make_pair(X86::FP0 + Constraint[4] - '0',
- &X86::RFP80RegClass);
- }
-
- // GCC allows "st(0)" to be called just plain "st".
- if (StringRef("{st}").equals_lower(Constraint))
- return std::make_pair(X86::FP0, &X86::RFP80RegClass);
+ // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
+ // to/from f80.
+ if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
+ // Map st(0) -> st(7) -> ST0
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' && Constraint[6] == '}') {
+ // st(7) is not allocatable and thus not a member of RFP80. Return
+ // singleton class in cases where we have a reference to it.
+ if (Constraint[4] == '7')
+ return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
+ return std::make_pair(X86::FP0 + Constraint[4] - '0',
+ &X86::RFP80RegClass);
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
+ if (StringRef("{st}").equals_lower(Constraint))
+ return std::make_pair(X86::FP0, &X86::RFP80RegClass);
}
// flags -> EFLAGS
@@ -51443,8 +51443,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
// dirflag -> DF
- // Only allow for clobber.
- if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
+ // Only allow for clobber.
+ if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other)
return std::make_pair(X86::DF, &X86::DFCCRRegClass);
// fpsr -> FPSW
@@ -51718,10 +51718,10 @@ X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
.getAsInteger(0, StackProbeSize);
return StackProbeSize;
}
-
-Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
- if (ML->isInnermost() &&
- ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
- return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
- return TargetLowering::getPrefLoopAlignment();
-}
+
+Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ if (ML->isInnermost() &&
+ ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
+ return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
+ return TargetLowering::getPrefLoopAlignment();
+}
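
As a user-level illustration of the "@cc" flag-output constraints that parseConstraintCode and LowerAsmOutputForConstraint handle in the hunks above, here is a minimal sketch, assuming a GCC/Clang-compatible compiler targeting x86-64. The variable names are invented for the example; only the "=@ccb" constraint spelling is taken from the compiler documentation.

#include <cstdio>

int main() {
  unsigned A = 1, B = 2;
  int Below; // receives the CF ("below") condition produced by the cmp
  __asm__("cmp %2, %1" : "=@ccb"(Below) : "r"(A), "r"(B));
  std::printf("A < B (unsigned): %d\n", Below);
  return 0;
}
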
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h
index 76c83b7df9..8c2249a18f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h
@@ -384,10 +384,10 @@ namespace llvm {
/// Vector comparison generating mask bits for fp and
/// integer signed and unsigned data types.
CMPM,
- // Vector mask comparison generating mask bits for FP values.
- CMPMM,
- // Vector mask comparison with SAE for FP values.
- CMPMM_SAE,
+ // Vector mask comparison generating mask bits for FP values.
+ CMPMM,
+ // Vector mask comparison with SAE for FP values.
+ CMPMM_SAE,
// Arithmetic operations with FLAGS results.
ADD,
@@ -402,7 +402,7 @@ namespace llvm {
// Bit field extract.
BEXTR,
- BEXTRI,
+ BEXTRI,
// Zero High Bits Starting with Specified Bit Position.
BZHI,
@@ -709,9 +709,9 @@ namespace llvm {
// For avx512-vp2intersect
VP2INTERSECT,
- // User level interrupts - testui
- TESTUI,
-
+ // User level interrupts - testui
+ TESTUI,
+
/// X86 strict FP compare instructions.
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPS,
@@ -751,9 +751,9 @@ namespace llvm {
STRICT_CVTPS2PH,
STRICT_CVTPH2PS,
-      // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
- // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
-
+      // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
+ // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -774,12 +774,12 @@ namespace llvm {
// extract_vector_elt, store.
VEXTRACT_STORE,
- // scalar broadcast from memory.
+ // scalar broadcast from memory.
VBROADCAST_LOAD,
- // subvector broadcast from memory.
- SUBV_BROADCAST_LOAD,
-
+ // subvector broadcast from memory.
+ SUBV_BROADCAST_LOAD,
+
// Store FP control world into i16 memory.
FNSTCW16m,
@@ -815,10 +815,10 @@ namespace llvm {
/// specifies the type to store as.
FST,
- /// These instructions grab the address of the next argument
+ /// These instructions grab the address of the next argument
/// from a va_list. (reads and modifies the va_list in memory)
VAARG_64,
- VAARG_X32,
+ VAARG_X32,
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS,
@@ -831,16 +831,16 @@ namespace llvm {
MGATHER,
MSCATTER,
- // Key locker nodes that produce flags.
- AESENC128KL,
- AESDEC128KL,
- AESENC256KL,
- AESDEC256KL,
- AESENCWIDE128KL,
- AESDECWIDE128KL,
- AESENCWIDE256KL,
- AESDECWIDE256KL,
-
+ // Key locker nodes that produce flags.
+ AESENC128KL,
+ AESDEC128KL,
+ AESENC256KL,
+ AESDEC256KL,
+ AESENCWIDE128KL,
+ AESDECWIDE128KL,
+ AESENCWIDE256KL,
+ AESDECWIDE256KL,
+
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
       // opcodes will be thought of as target memory ops!
@@ -855,7 +855,7 @@ namespace llvm {
   /// Returns true if the given offset can be
/// fit into displacement field of the instruction.
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
- bool hasSymbolicDisplacement);
+ bool hasSymbolicDisplacement);
/// Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
@@ -1128,8 +1128,8 @@ namespace llvm {
}
/// Handle Lowering flag assembly outputs.
- SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
- const SDLoc &DL,
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
+ const SDLoc &DL,
const AsmOperandInfo &Constraint,
SelectionDAG &DAG) const override;
@@ -1408,8 +1408,8 @@ namespace llvm {
SDValue Addr, SelectionDAG &DAG)
const override;
- Align getPrefLoopAlignment(MachineLoop *ML) const override;
-
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -1501,7 +1501,7 @@ namespace llvm {
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
@@ -1583,7 +1583,7 @@ namespace llvm {
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *
- EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
+ EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
/// Utility function to emit the xmm reg save portion of va_start.
MachineBasicBlock *
@@ -1699,7 +1699,7 @@ namespace llvm {
};
/// Generate unpacklo/unpackhi shuffle mask.
- void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary);
/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
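
The createUnpackShuffleMask declaration above produces the interleaving masks behind unpacklo/unpackhi. The following is a hedged standalone sketch of that mask pattern, not the LLVM implementation: the helper name and signature are invented, taking an element count and 128-bit lane width instead of an EVT and filling a std::vector instead of a SmallVectorImpl.

#include <cstdio>
#include <vector>

// Lo selects the low or high half of each 128-bit lane; Unary means both
// shuffle operands are the same vector, so the interleaved partner indexes
// operand 0 instead of operand 1 (whose elements start at offset NumElts).
static std::vector<int> makeUnpackMask(unsigned NumElts, unsigned EltsPerLane,
                                       bool Lo, bool Unary) {
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane < NumElts; Lane += EltsPerLane) {
    unsigned Base = Lane + (Lo ? 0 : EltsPerLane / 2);
    for (unsigned I = 0; I != EltsPerLane / 2; ++I) {
      Mask.push_back(Base + I);                         // element from op0
      Mask.push_back(Base + I + (Unary ? 0 : NumElts)); // partner from op1
    }
  }
  return Mask;
}

int main() {
  // v8i16 unpacklo prints: 0 8 1 9 2 10 3 11
  for (int M : makeUnpackMask(8, 8, /*Lo=*/true, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
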
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp
index 85410c54a4..bba0e8d30d 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
#define DEBUG_TYPE "x86-indirect-branch-tracking"
-cl::opt<bool> IndirectBranchTracking(
+cl::opt<bool> IndirectBranchTracking(
"x86-indirect-branch-tracking", cl::init(false), cl::Hidden,
cl::desc("Enable X86 indirect branch tracking pass."));
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp
index 004e6fa5eb..1c25934ded 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp
@@ -214,10 +214,10 @@ bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true);
MachineInstrBuilder MIB(MF, PFetch);
- static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
- X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
- X86::AddrSegmentReg == 4,
- "Unexpected change in X86 operand offset order.");
+ static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 &&
+ X86::AddrIndexReg == 2 && X86::AddrDisp == 3 &&
+ X86::AddrSegmentReg == 4,
+ "Unexpected change in X86 operand offset order.");
       // This assumes X86::AddrBaseReg = 0, {...}ScaleAmt = 1, etc.
// FIXME(mtrofin): consider adding a:
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp
index 56d2709f59..fe9bcfd0fd 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp
@@ -115,7 +115,7 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
return false;
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- const X86InstrInfo *TII = ST.getInstrInfo();
+ const X86InstrInfo *TII = ST.getInstrInfo();
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
index c4150ed528..c6388617c6 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -1,2017 +1,2017 @@
-//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// X86 target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
-
-#include "X86TargetTransformInfo.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86tti"
-
-/// Return a constant boolean vector that has true elements in all positions
-/// where the input constant data vector has an element with the sign bit set.
-static Constant *getNegativeIsTrueBoolVec(Constant *V) {
- VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
- V = ConstantExpr::getBitCast(V, IntTy);
- V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
- V);
- return V;
-}
-
-/// Convert the x86 XMM integer vector mask to a vector of bools based on
-/// each element's most significant bit (the sign bit).
-static Value *getBoolVecFromMask(Value *Mask) {
- // Fold Constant Mask.
- if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
- return getNegativeIsTrueBoolVec(ConstantMask);
-
- // Mask was extended from a boolean vector.
- Value *ExtMask;
- if (PatternMatch::match(
- Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
- ExtMask->getType()->isIntOrIntVectorTy(1))
- return ExtMask;
-
- return nullptr;
-}
-
-// TODO: If the x86 backend knew how to convert a bool vector mask back to an
-// XMM register mask efficiently, we could transform all x86 masked intrinsics
-// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
-static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
- Value *Ptr = II.getOperand(0);
- Value *Mask = II.getOperand(1);
- Constant *ZeroVec = Constant::getNullValue(II.getType());
-
- // Zero Mask - masked load instruction creates a zero vector.
- if (isa<ConstantAggregateZero>(Mask))
- return IC.replaceInstUsesWith(II, ZeroVec);
-
- // The mask is constant or extended from a bool vector. Convert this x86
- // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
- if (Value *BoolMask = getBoolVecFromMask(Mask)) {
- // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
- // the LLVM intrinsic definition for the pointer argument.
- unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
- PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
- Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
-
- // The pass-through vector for an x86 masked load is a zero vector.
- CallInst *NewMaskedLoad =
- IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
- return IC.replaceInstUsesWith(II, NewMaskedLoad);
- }
-
- return nullptr;
-}
-
-// TODO: If the x86 backend knew how to convert a bool vector mask back to an
-// XMM register mask efficiently, we could transform all x86 masked intrinsics
-// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
-static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
- Value *Ptr = II.getOperand(0);
- Value *Mask = II.getOperand(1);
- Value *Vec = II.getOperand(2);
-
- // Zero Mask - this masked store instruction does nothing.
- if (isa<ConstantAggregateZero>(Mask)) {
- IC.eraseInstFromFunction(II);
- return true;
- }
-
- // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
- // anything else at this level.
- if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
- return false;
-
- // The mask is constant or extended from a bool vector. Convert this x86
- // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
- if (Value *BoolMask = getBoolVecFromMask(Mask)) {
- unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
- PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
- Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
-
- IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
-
- // 'Replace uses' doesn't work for stores. Erase the original masked store.
- IC.eraseInstFromFunction(II);
- return true;
- }
-
- return false;
-}
-
-static Value *simplifyX86immShift(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- bool LogicalShift = false;
- bool ShiftLeft = false;
- bool IsImm = false;
-
- switch (II.getIntrinsicID()) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psrai_d_512:
- case Intrinsic::x86_avx512_psrai_q_512:
- case Intrinsic::x86_avx512_psrai_w_512:
- IsImm = true;
- LLVM_FALLTHROUGH;
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx512_psra_q_128:
- case Intrinsic::x86_avx512_psra_q_256:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
- LogicalShift = false;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_sse2_psrli_d:
- case Intrinsic::x86_sse2_psrli_q:
- case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx512_psrli_d_512:
- case Intrinsic::x86_avx512_psrli_q_512:
- case Intrinsic::x86_avx512_psrli_w_512:
- IsImm = true;
- LLVM_FALLTHROUGH;
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
- LogicalShift = true;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx512_pslli_d_512:
- case Intrinsic::x86_avx512_pslli_q_512:
- case Intrinsic::x86_avx512_pslli_w_512:
- IsImm = true;
- LLVM_FALLTHROUGH;
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_psll_w_512:
- LogicalShift = true;
- ShiftLeft = true;
- break;
- }
- assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
-
- auto Vec = II.getArgOperand(0);
- auto Amt = II.getArgOperand(1);
- auto VT = cast<FixedVectorType>(Vec->getType());
- auto SVT = VT->getElementType();
- auto AmtVT = Amt->getType();
- unsigned VWidth = VT->getNumElements();
- unsigned BitWidth = SVT->getPrimitiveSizeInBits();
-
- // If the shift amount is guaranteed to be in-range we can replace it with a
-  // generic shift. If it's guaranteed to be out of range, logical shifts combine
- // to zero and arithmetic shifts are clamped to (BitWidth - 1).
- if (IsImm) {
- assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
- KnownBits KnownAmtBits =
- llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
- if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
- Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
- Amt = Builder.CreateVectorSplat(VWidth, Amt);
- return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
- : Builder.CreateLShr(Vec, Amt))
- : Builder.CreateAShr(Vec, Amt));
- }
- if (KnownAmtBits.getMinValue().uge(BitWidth)) {
- if (LogicalShift)
- return ConstantAggregateZero::get(VT);
- Amt = ConstantInt::get(SVT, BitWidth - 1);
- return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
- }
- } else {
- // Ensure the first element has an in-range value and the rest of the
- // elements in the bottom 64 bits are zero.
- assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
- cast<VectorType>(AmtVT)->getElementType() == SVT &&
- "Unexpected shift-by-scalar type");
- unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
- APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
- APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
- KnownBits KnownLowerBits = llvm::computeKnownBits(
- Amt, DemandedLower, II.getModule()->getDataLayout());
- KnownBits KnownUpperBits = llvm::computeKnownBits(
- Amt, DemandedUpper, II.getModule()->getDataLayout());
- if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
- (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
- SmallVector<int, 16> ZeroSplat(VWidth, 0);
- Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
- return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
- : Builder.CreateLShr(Vec, Amt))
- : Builder.CreateAShr(Vec, Amt));
- }
- }
-
- // Simplify if count is constant vector.
- auto CDV = dyn_cast<ConstantDataVector>(Amt);
- if (!CDV)
- return nullptr;
-
- // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
- // operand to compute the shift amount.
- assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
- cast<VectorType>(AmtVT)->getElementType() == SVT &&
- "Unexpected shift-by-scalar type");
-
- // Concatenate the sub-elements to create the 64-bit value.
- APInt Count(64, 0);
- for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
- unsigned SubEltIdx = (NumSubElts - 1) - i;
- auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
- Count <<= BitWidth;
- Count |= SubElt->getValue().zextOrTrunc(64);
- }
-
- // If shift-by-zero then just return the original value.
- if (Count.isNullValue())
- return Vec;
-
- // Handle cases when Shift >= BitWidth.
- if (Count.uge(BitWidth)) {
- // If LogicalShift - just return zero.
- if (LogicalShift)
- return ConstantAggregateZero::get(VT);
-
- // If ArithmeticShift - clamp Shift to (BitWidth - 1).
- Count = APInt(64, BitWidth - 1);
- }
-
- // Get a constant vector of the same type as the first operand.
- auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
- auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
-
- if (ShiftLeft)
- return Builder.CreateShl(Vec, ShiftVec);
-
- if (LogicalShift)
- return Builder.CreateLShr(Vec, ShiftVec);
-
- return Builder.CreateAShr(Vec, ShiftVec);
-}
-
-// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
-// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
-// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
-static Value *simplifyX86varShift(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- bool LogicalShift = false;
- bool ShiftLeft = false;
-
- switch (II.getIntrinsicID()) {
- default:
- llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- case Intrinsic::x86_avx512_psrav_q_128:
- case Intrinsic::x86_avx512_psrav_q_256:
- case Intrinsic::x86_avx512_psrav_d_512:
- case Intrinsic::x86_avx512_psrav_q_512:
- case Intrinsic::x86_avx512_psrav_w_128:
- case Intrinsic::x86_avx512_psrav_w_256:
- case Intrinsic::x86_avx512_psrav_w_512:
- LogicalShift = false;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx512_psrlv_d_512:
- case Intrinsic::x86_avx512_psrlv_q_512:
- case Intrinsic::x86_avx512_psrlv_w_128:
- case Intrinsic::x86_avx512_psrlv_w_256:
- case Intrinsic::x86_avx512_psrlv_w_512:
- LogicalShift = true;
- ShiftLeft = false;
- break;
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx512_psllv_d_512:
- case Intrinsic::x86_avx512_psllv_q_512:
- case Intrinsic::x86_avx512_psllv_w_128:
- case Intrinsic::x86_avx512_psllv_w_256:
- case Intrinsic::x86_avx512_psllv_w_512:
- LogicalShift = true;
- ShiftLeft = true;
- break;
- }
- assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
-
- auto Vec = II.getArgOperand(0);
- auto Amt = II.getArgOperand(1);
- auto VT = cast<FixedVectorType>(II.getType());
- auto SVT = VT->getElementType();
- int NumElts = VT->getNumElements();
- int BitWidth = SVT->getIntegerBitWidth();
-
- // If the shift amount is guaranteed to be in-range we can replace it with a
- // generic shift.
- APInt UpperBits =
- APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
- if (llvm::MaskedValueIsZero(Amt, UpperBits,
- II.getModule()->getDataLayout())) {
- return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
- : Builder.CreateLShr(Vec, Amt))
- : Builder.CreateAShr(Vec, Amt));
- }
-
- // Simplify if all shift amounts are constant/undef.
- auto *CShift = dyn_cast<Constant>(Amt);
- if (!CShift)
- return nullptr;
-
- // Collect each element's shift amount.
- // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
- bool AnyOutOfRange = false;
- SmallVector<int, 8> ShiftAmts;
- for (int I = 0; I < NumElts; ++I) {
- auto *CElt = CShift->getAggregateElement(I);
- if (isa_and_nonnull<UndefValue>(CElt)) {
- ShiftAmts.push_back(-1);
- continue;
- }
-
- auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
- if (!COp)
- return nullptr;
-
- // Handle out of range shifts.
- // If LogicalShift - set to BitWidth (special case).
- // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
- APInt ShiftVal = COp->getValue();
- if (ShiftVal.uge(BitWidth)) {
- AnyOutOfRange = LogicalShift;
- ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
- continue;
- }
-
- ShiftAmts.push_back((int)ShiftVal.getZExtValue());
- }
-
- // If all elements out of range or UNDEF, return vector of zeros/undefs.
- // ArithmeticShift should only hit this if they are all UNDEF.
- auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
- if (llvm::all_of(ShiftAmts, OutOfRange)) {
- SmallVector<Constant *, 8> ConstantVec;
- for (int Idx : ShiftAmts) {
- if (Idx < 0) {
- ConstantVec.push_back(UndefValue::get(SVT));
- } else {
- assert(LogicalShift && "Logical shift expected");
- ConstantVec.push_back(ConstantInt::getNullValue(SVT));
- }
- }
- return ConstantVector::get(ConstantVec);
- }
-
- // We can't handle only some out of range values with generic logical shifts.
- if (AnyOutOfRange)
- return nullptr;
-
- // Build the shift amount constant vector.
- SmallVector<Constant *, 8> ShiftVecAmts;
- for (int Idx : ShiftAmts) {
- if (Idx < 0)
- ShiftVecAmts.push_back(UndefValue::get(SVT));
- else
- ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
- }
- auto ShiftVec = ConstantVector::get(ShiftVecAmts);
-
- if (ShiftLeft)
- return Builder.CreateShl(Vec, ShiftVec);
-
- if (LogicalShift)
- return Builder.CreateLShr(Vec, ShiftVec);
-
- return Builder.CreateAShr(Vec, ShiftVec);
-}
-
-static Value *simplifyX86pack(IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder, bool IsSigned) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Type *ResTy = II.getType();
-
- // Fast all undef handling.
- if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
- return UndefValue::get(ResTy);
-
- auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
- unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
- unsigned NumSrcElts = ArgTy->getNumElements();
- assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
- "Unexpected packing types");
-
- unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
- unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
- unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
- assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
- "Unexpected packing types");
-
- // Constant folding.
- if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
- return nullptr;
-
- // Clamp Values - signed/unsigned both use signed clamp values, but they
- // differ on the min/max values.
- APInt MinValue, MaxValue;
- if (IsSigned) {
- // PACKSS: Truncate signed value with signed saturation.
- // Source values less than dst minint are saturated to minint.
- // Source values greater than dst maxint are saturated to maxint.
- MinValue =
- APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
- MaxValue =
- APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
- } else {
- // PACKUS: Truncate signed value with unsigned saturation.
- // Source values less than zero are saturated to zero.
- // Source values greater than dst maxuint are saturated to maxuint.
- MinValue = APInt::getNullValue(SrcScalarSizeInBits);
- MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
- }
-
- auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
- auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
- Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
- Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
- Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
- Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
-
- // Shuffle clamped args together at the lane level.
- SmallVector<int, 32> PackMask;
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
- PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
- for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
- PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
- }
- auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
-
- // Truncate to dst size.
- return Builder.CreateTrunc(Shuffle, ResTy);
-}
-
-static Value *simplifyX86movmsk(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *Arg = II.getArgOperand(0);
- Type *ResTy = II.getType();
-
- // movmsk(undef) -> zero as we must ensure the upper bits are zero.
- if (isa<UndefValue>(Arg))
- return Constant::getNullValue(ResTy);
-
- auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
- // We can't easily peek through x86_mmx types.
- if (!ArgTy)
- return nullptr;
-
- // Expand MOVMSK to compare/bitcast/zext:
- // e.g. PMOVMSKB(v16i8 x):
- // %cmp = icmp slt <16 x i8> %x, zeroinitializer
- // %int = bitcast <16 x i1> %cmp to i16
- // %res = zext i16 %int to i32
- unsigned NumElts = ArgTy->getNumElements();
- Type *IntegerVecTy = VectorType::getInteger(ArgTy);
- Type *IntegerTy = Builder.getIntNTy(NumElts);
-
- Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
- Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
- Res = Builder.CreateBitCast(Res, IntegerTy);
- Res = Builder.CreateZExtOrTrunc(Res, ResTy);
- return Res;
-}
-
-static Value *simplifyX86addcarry(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *CarryIn = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- Value *Op2 = II.getArgOperand(2);
- Type *RetTy = II.getType();
- Type *OpTy = Op1->getType();
- assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
- RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
- "Unexpected types for x86 addcarry");
-
- // If carry-in is zero, this is just an unsigned add with overflow.
- if (match(CarryIn, PatternMatch::m_ZeroInt())) {
- Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
- {Op1, Op2});
- // The types have to be adjusted to match the x86 call types.
- Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
- Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
- Builder.getInt8Ty());
- Value *Res = UndefValue::get(RetTy);
- Res = Builder.CreateInsertValue(Res, UAddOV, 0);
- return Builder.CreateInsertValue(Res, UAddResult, 1);
- }
-
- return nullptr;
-}
-
-static Value *simplifyX86insertps(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
- if (!CInt)
- return nullptr;
-
- auto *VecTy = cast<FixedVectorType>(II.getType());
- assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
-
- // The immediate permute control byte looks like this:
- // [3:0] - zero mask for each 32-bit lane
- // [5:4] - select one 32-bit destination lane
- // [7:6] - select one 32-bit source lane
-
- uint8_t Imm = CInt->getZExtValue();
- uint8_t ZMask = Imm & 0xf;
- uint8_t DestLane = (Imm >> 4) & 0x3;
- uint8_t SourceLane = (Imm >> 6) & 0x3;
-
- ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
-
- // If all zero mask bits are set, this was just a weird way to
- // generate a zero vector.
- if (ZMask == 0xf)
- return ZeroVector;
-
- // Initialize by passing all of the first source bits through.
- int ShuffleMask[4] = {0, 1, 2, 3};
-
- // We may replace the second operand with the zero vector.
- Value *V1 = II.getArgOperand(1);
-
- if (ZMask) {
- // If the zero mask is being used with a single input or the zero mask
- // overrides the destination lane, this is a shuffle with the zero vector.
- if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
- (ZMask & (1 << DestLane))) {
- V1 = ZeroVector;
- // We may still move 32-bits of the first source vector from one lane
- // to another.
- ShuffleMask[DestLane] = SourceLane;
- // The zero mask may override the previous insert operation.
- for (unsigned i = 0; i < 4; ++i)
- if ((ZMask >> i) & 0x1)
- ShuffleMask[i] = i + 4;
- } else {
- // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
- return nullptr;
- }
- } else {
- // Replace the selected destination lane with the selected source lane.
- ShuffleMask[DestLane] = SourceLane + 4;
- }
-
- return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
-}
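
// A hedged sketch rather than LLVM code: decoding an INSERTPS immediate
// exactly as the "[3:0] / [5:4] / [7:6]" comment above describes. The value
// 0x9A is an arbitrary example (source lane 2, dest lane 1, zero-mask 0b1010).
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t Imm = 0x9A;
  unsigned ZMask = Imm & 0xf;             // [3:0] result lanes forced to zero
  unsigned DestLane = (Imm >> 4) & 0x3;   // [5:4] destination lane
  unsigned SourceLane = (Imm >> 6) & 0x3; // [7:6] source lane
  std::printf("src=%u dst=%u zmask=0x%x\n", SourceLane, DestLane, ZMask);
  return 0;
}
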
-
-/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
-/// or conversion to a shuffle vector.
-static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
- ConstantInt *CILength, ConstantInt *CIIndex,
- InstCombiner::BuilderTy &Builder) {
- auto LowConstantHighUndef = [&](uint64_t Val) {
- Type *IntTy64 = Type::getInt64Ty(II.getContext());
- Constant *Args[] = {ConstantInt::get(IntTy64, Val),
- UndefValue::get(IntTy64)};
- return ConstantVector::get(Args);
- };
-
- // See if we're dealing with constant values.
- Constant *C0 = dyn_cast<Constant>(Op0);
- ConstantInt *CI0 =
- C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
- : nullptr;
-
- // Attempt to constant fold.
- if (CILength && CIIndex) {
- // From AMD documentation: "The bit index and field length are each six
- // bits in length other bits of the field are ignored."
- APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
- APInt APLength = CILength->getValue().zextOrTrunc(6);
-
- unsigned Index = APIndex.getZExtValue();
-
- // From AMD documentation: "a value of zero in the field length is
- // defined as length of 64".
- unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
-
- // From AMD documentation: "If the sum of the bit index + length field
- // is greater than 64, the results are undefined".
- unsigned End = Index + Length;
-
- // Note that both field index and field length are 8-bit quantities.
- // Since variables 'Index' and 'Length' are unsigned values
- // obtained from zero-extending field index and field length
- // respectively, their sum should never wrap around.
- if (End > 64)
- return UndefValue::get(II.getType());
-
- // If we are inserting whole bytes, we can convert this to a shuffle.
- // Lowering can recognize EXTRQI shuffle masks.
- if ((Length % 8) == 0 && (Index % 8) == 0) {
- // Convert bit indices to byte indices.
- Length /= 8;
- Index /= 8;
-
- Type *IntTy8 = Type::getInt8Ty(II.getContext());
- auto *ShufTy = FixedVectorType::get(IntTy8, 16);
-
- SmallVector<int, 16> ShuffleMask;
- for (int i = 0; i != (int)Length; ++i)
- ShuffleMask.push_back(i + Index);
- for (int i = Length; i != 8; ++i)
- ShuffleMask.push_back(i + 16);
- for (int i = 8; i != 16; ++i)
- ShuffleMask.push_back(-1);
-
- Value *SV = Builder.CreateShuffleVector(
- Builder.CreateBitCast(Op0, ShufTy),
- ConstantAggregateZero::get(ShufTy), ShuffleMask);
- return Builder.CreateBitCast(SV, II.getType());
- }
-
- // Constant Fold - shift Index'th bit to lowest position and mask off
- // Length bits.
- if (CI0) {
- APInt Elt = CI0->getValue();
- Elt.lshrInPlace(Index);
- Elt = Elt.zextOrTrunc(Length);
- return LowConstantHighUndef(Elt.getZExtValue());
- }
-
- // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
- if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
- Value *Args[] = {Op0, CILength, CIIndex};
- Module *M = II.getModule();
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
- return Builder.CreateCall(F, Args);
- }
- }
-
- // Constant Fold - extraction from zero is always {zero, undef}.
- if (CI0 && CI0->isZero())
- return LowConstantHighUndef(0);
-
- return nullptr;
-}
-
-/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
-/// folding or conversion to a shuffle vector.
-static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
- APInt APLength, APInt APIndex,
- InstCombiner::BuilderTy &Builder) {
- // From AMD documentation: "The bit index and field length are each six bits
- // in length other bits of the field are ignored."
- APIndex = APIndex.zextOrTrunc(6);
- APLength = APLength.zextOrTrunc(6);
-
- // Attempt to constant fold.
- unsigned Index = APIndex.getZExtValue();
-
- // From AMD documentation: "a value of zero in the field length is
- // defined as length of 64".
- unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
-
- // From AMD documentation: "If the sum of the bit index + length field
- // is greater than 64, the results are undefined".
- unsigned End = Index + Length;
-
- // Note that both field index and field length are 8-bit quantities.
- // Since variables 'Index' and 'Length' are unsigned values
- // obtained from zero-extending field index and field length
- // respectively, their sum should never wrap around.
- if (End > 64)
- return UndefValue::get(II.getType());
-
- // If we are inserting whole bytes, we can convert this to a shuffle.
- // Lowering can recognize INSERTQI shuffle masks.
- if ((Length % 8) == 0 && (Index % 8) == 0) {
- // Convert bit indices to byte indices.
- Length /= 8;
- Index /= 8;
-
- Type *IntTy8 = Type::getInt8Ty(II.getContext());
- auto *ShufTy = FixedVectorType::get(IntTy8, 16);
-
- SmallVector<int, 16> ShuffleMask;
- for (int i = 0; i != (int)Index; ++i)
- ShuffleMask.push_back(i);
- for (int i = 0; i != (int)Length; ++i)
- ShuffleMask.push_back(i + 16);
- for (int i = Index + Length; i != 8; ++i)
- ShuffleMask.push_back(i);
- for (int i = 8; i != 16; ++i)
- ShuffleMask.push_back(-1);
-
- Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
- Builder.CreateBitCast(Op1, ShufTy),
- ShuffleMask);
- return Builder.CreateBitCast(SV, II.getType());
- }
-
- // See if we're dealing with constant values.
- Constant *C0 = dyn_cast<Constant>(Op0);
- Constant *C1 = dyn_cast<Constant>(Op1);
- ConstantInt *CI00 =
- C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
- : nullptr;
- ConstantInt *CI10 =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
- : nullptr;
-
- // Constant Fold - insert bottom Length bits starting at the Index'th bit.
- if (CI00 && CI10) {
- APInt V00 = CI00->getValue();
- APInt V10 = CI10->getValue();
- APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
- V00 = V00 & ~Mask;
- V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
- APInt Val = V00 | V10;
- Type *IntTy64 = Type::getInt64Ty(II.getContext());
- Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
- UndefValue::get(IntTy64)};
- return ConstantVector::get(Args);
- }
-
- // If we were an INSERTQ call, we'll save demanded elements if we convert to
- // INSERTQI.
- if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
- Type *IntTy8 = Type::getInt8Ty(II.getContext());
- Constant *CILength = ConstantInt::get(IntTy8, Length, false);
- Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
-
- Value *Args[] = {Op0, Op1, CILength, CIIndex};
- Module *M = II.getModule();
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
- return Builder.CreateCall(F, Args);
- }
-
- return nullptr;
-}
-
-/// Attempt to convert pshufb* to shufflevector if the mask is constant.
-static Value *simplifyX86pshufb(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
- if (!V)
- return nullptr;
-
- auto *VecTy = cast<FixedVectorType>(II.getType());
- unsigned NumElts = VecTy->getNumElements();
- assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
- "Unexpected number of elements in shuffle mask!");
-
- // Construct a shuffle mask from constant integers or UNDEFs.
- int Indexes[64];
-
- // Each byte in the shuffle control mask forms an index to permute the
- // corresponding byte in the destination operand.
- for (unsigned I = 0; I < NumElts; ++I) {
- Constant *COp = V->getAggregateElement(I);
- if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
- return nullptr;
-
- if (isa<UndefValue>(COp)) {
- Indexes[I] = -1;
- continue;
- }
-
- int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
-
- // If the most significant bit (bit[7]) of each byte of the shuffle
- // control mask is set, then zero is written in the result byte.
- // The zero vector is in the right-hand side of the resulting
- // shufflevector.
-
- // The value of each index for the high 128-bit lane is the least
- // significant 4 bits of the respective shuffle control byte.
- Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
- Indexes[I] = Index;
- }
-
- auto V1 = II.getArgOperand(0);
- auto V2 = Constant::getNullValue(VecTy);
- return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
-}
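
// A hedged standalone sketch (not LLVM code) of the PSHUFB byte semantics that
// the index computation above models: bit 7 of a control byte zeroes the
// result byte, otherwise its low 4 bits pick a byte from the same 128-bit lane.
#include <cstdint>
#include <cstdio>

static void pshufb128(const uint8_t Src[16], const uint8_t Ctl[16],
                      uint8_t Dst[16]) {
  for (int I = 0; I != 16; ++I)
    Dst[I] = (Ctl[I] & 0x80) ? 0 : Src[Ctl[I] & 0x0F];
}

int main() {
  uint8_t Src[16], Ctl[16], Dst[16];
  for (int I = 0; I != 16; ++I) {
    Src[I] = (uint8_t)(0xA0 + I);
    Ctl[I] = (uint8_t)(15 - I); // reverse the bytes...
  }
  Ctl[0] = 0x80;                // ...and zero the first result byte
  pshufb128(Src, Ctl, Dst);
  for (int I = 0; I != 16; ++I)
    std::printf("%02x ", (unsigned)Dst[I]);
  std::printf("\n");
  return 0;
}
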
-
-/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
-static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
- if (!V)
- return nullptr;
-
- auto *VecTy = cast<FixedVectorType>(II.getType());
- unsigned NumElts = VecTy->getNumElements();
- bool IsPD = VecTy->getScalarType()->isDoubleTy();
- unsigned NumLaneElts = IsPD ? 2 : 4;
- assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
-
- // Construct a shuffle mask from constant integers or UNDEFs.
- int Indexes[16];
-
- // The intrinsics only read one or two bits, clear the rest.
- for (unsigned I = 0; I < NumElts; ++I) {
- Constant *COp = V->getAggregateElement(I);
- if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
- return nullptr;
-
- if (isa<UndefValue>(COp)) {
- Indexes[I] = -1;
- continue;
- }
-
- APInt Index = cast<ConstantInt>(COp)->getValue();
- Index = Index.zextOrTrunc(32).getLoBits(2);
-
-    // The PD variants use bit 1 to select the per-lane element index, so
- // shift down to convert to generic shuffle mask index.
- if (IsPD)
- Index.lshrInPlace(1);
-
- // The _256 variants are a bit trickier since the mask bits always index
-    // into the corresponding 128-bit half. In order to convert to a generic
- // shuffle, we have to make that explicit.
- Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
-
- Indexes[I] = Index.getZExtValue();
- }
-
- auto V1 = II.getArgOperand(0);
- return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
-}
-
-/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
-static Value *simplifyX86vpermv(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- auto *V = dyn_cast<Constant>(II.getArgOperand(1));
- if (!V)
- return nullptr;
-
- auto *VecTy = cast<FixedVectorType>(II.getType());
- unsigned Size = VecTy->getNumElements();
- assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
- "Unexpected shuffle mask size");
-
- // Construct a shuffle mask from constant integers or UNDEFs.
- int Indexes[64];
-
- for (unsigned I = 0; I < Size; ++I) {
- Constant *COp = V->getAggregateElement(I);
- if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
- return nullptr;
-
- if (isa<UndefValue>(COp)) {
- Indexes[I] = -1;
- continue;
- }
-
- uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
- Index &= Size - 1;
- Indexes[I] = Index;
- }
-
- auto V1 = II.getArgOperand(0);
- return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
-}
-
-Optional<Instruction *>
-X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
- auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
- unsigned DemandedWidth) {
- APInt UndefElts(Width, 0);
- APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
- return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
- };
-
- Intrinsic::ID IID = II.getIntrinsicID();
- switch (IID) {
- case Intrinsic::x86_bmi_bextr_32:
- case Intrinsic::x86_bmi_bextr_64:
- case Intrinsic::x86_tbm_bextri_u32:
- case Intrinsic::x86_tbm_bextri_u64:
- // If the RHS is a constant we can try some simplifications.
- if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- uint64_t Shift = C->getZExtValue();
- uint64_t Length = (Shift >> 8) & 0xff;
- Shift &= 0xff;
- unsigned BitWidth = II.getType()->getIntegerBitWidth();
- // If the length is 0 or the shift is out of range, replace with zero.
- if (Length == 0 || Shift >= BitWidth) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- // If the LHS is also a constant, we can completely constant fold this.
- if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Result = InC->getZExtValue() >> Shift;
- if (Length > BitWidth)
- Length = BitWidth;
- Result &= maskTrailingOnes<uint64_t>(Length);
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
- // are only masking bits that a shift already cleared?
- }
- break;
-
- case Intrinsic::x86_bmi_bzhi_32:
- case Intrinsic::x86_bmi_bzhi_64:
- // If the RHS is a constant we can try some simplifications.
- if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- uint64_t Index = C->getZExtValue() & 0xff;
- unsigned BitWidth = II.getType()->getIntegerBitWidth();
- if (Index >= BitWidth) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
- if (Index == 0) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- // If the LHS is also a constant, we can completely constant fold this.
- if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Result = InC->getZExtValue();
- Result &= maskTrailingOnes<uint64_t>(Index);
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- // TODO should we convert this to an AND if the RHS is constant?
- }
- break;
- case Intrinsic::x86_bmi_pext_32:
- case Intrinsic::x86_bmi_pext_64:
- if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- if (MaskC->isNullValue()) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- if (MaskC->isAllOnesValue()) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
-
- if (MaskC->getValue().isShiftedMask()) {
- // Any single contiguous sequence of 1s anywhere in the mask simply
- // describes a subset of the input bits shifted to the appropriate
- // position. Replace with the straightforward IR.
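- // For example, pext(x, 0x00f0) == (x & 0x00f0) >> 4.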
- unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
- Value *Input = II.getArgOperand(0);
- Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
- Value *Shifted = IC.Builder.CreateLShr(Masked,
- ConstantInt::get(II.getType(),
- ShiftAmount));
- return IC.replaceInstUsesWith(II, Shifted);
- }
-
- if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Src = SrcC->getZExtValue();
- uint64_t Mask = MaskC->getZExtValue();
- uint64_t Result = 0;
- uint64_t BitToSet = 1;
-
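- // Walk the mask from LSB to MSB, gathering the selected source bits,
- // e.g. pext(0xaa, 0xcc) == 0xa.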
- while (Mask) {
- // Isolate lowest set bit.
- uint64_t BitToTest = Mask & -Mask;
- if (BitToTest & Src)
- Result |= BitToSet;
-
- BitToSet <<= 1;
- // Clear lowest set bit.
- Mask &= Mask - 1;
- }
-
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- }
- break;
- case Intrinsic::x86_bmi_pdep_32:
- case Intrinsic::x86_bmi_pdep_64:
- if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
- if (MaskC->isNullValue()) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
- }
- if (MaskC->isAllOnesValue()) {
- return IC.replaceInstUsesWith(II, II.getArgOperand(0));
- }
- if (MaskC->getValue().isShiftedMask()) {
- // Any single contiguous sequence of 1s anywhere in the mask simply
- // describes a subset of the input bits shifted to the appropriate
- // position. Replace with the straightforward IR.
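- // For example, pdep(x, 0x00f0) == (x << 4) & 0x00f0.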
- unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
- Value *Input = II.getArgOperand(0);
- Value *Shifted = IC.Builder.CreateShl(Input,
- ConstantInt::get(II.getType(),
- ShiftAmount));
- Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
- return IC.replaceInstUsesWith(II, Masked);
- }
-
- if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
- uint64_t Src = SrcC->getZExtValue();
- uint64_t Mask = MaskC->getZExtValue();
- uint64_t Result = 0;
- uint64_t BitToTest = 1;
-
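- // Walk the mask from LSB to MSB, scattering the low source bits into the
- // set mask positions, e.g. pdep(0x0a, 0xcc) == 0x88.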
- while (Mask) {
- // Isolate lowest set bit.
- uint64_t BitToSet = Mask & -Mask;
- if (BitToTest & Src)
- Result |= BitToSet;
-
- BitToTest <<= 1;
- // Clear lowest set bit.
- Mask &= Mask - 1;
- }
-
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), Result));
- }
- }
- break;
-
- case Intrinsic::x86_sse_cvtss2si:
- case Intrinsic::x86_sse_cvtss2si64:
- case Intrinsic::x86_sse_cvttss2si:
- case Intrinsic::x86_sse_cvttss2si64:
- case Intrinsic::x86_sse2_cvtsd2si:
- case Intrinsic::x86_sse2_cvtsd2si64:
- case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse2_cvttsd2si64:
- case Intrinsic::x86_avx512_vcvtss2si32:
- case Intrinsic::x86_avx512_vcvtss2si64:
- case Intrinsic::x86_avx512_vcvtss2usi32:
- case Intrinsic::x86_avx512_vcvtss2usi64:
- case Intrinsic::x86_avx512_vcvtsd2si32:
- case Intrinsic::x86_avx512_vcvtsd2si64:
- case Intrinsic::x86_avx512_vcvtsd2usi32:
- case Intrinsic::x86_avx512_vcvtsd2usi64:
- case Intrinsic::x86_avx512_cvttss2si:
- case Intrinsic::x86_avx512_cvttss2si64:
- case Intrinsic::x86_avx512_cvttss2usi:
- case Intrinsic::x86_avx512_cvttss2usi64:
- case Intrinsic::x86_avx512_cvttsd2si:
- case Intrinsic::x86_avx512_cvttsd2si64:
- case Intrinsic::x86_avx512_cvttsd2usi:
- case Intrinsic::x86_avx512_cvttsd2usi64: {
- // These intrinsics only demand the 0th element of their input vectors. If
- // we can simplify the input based on that, do so now.
- Value *Arg = II.getArgOperand(0);
- unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
- return IC.replaceOperand(II, 0, V);
- }
- break;
- }
-
- case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse_movmsk_ps:
- case Intrinsic::x86_sse2_movmsk_pd:
- case Intrinsic::x86_sse2_pmovmskb_128:
- case Intrinsic::x86_avx_movmsk_pd_256:
- case Intrinsic::x86_avx_movmsk_ps_256:
- case Intrinsic::x86_avx2_pmovmskb:
- if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_sse_comieq_ss:
- case Intrinsic::x86_sse_comige_ss:
- case Intrinsic::x86_sse_comigt_ss:
- case Intrinsic::x86_sse_comile_ss:
- case Intrinsic::x86_sse_comilt_ss:
- case Intrinsic::x86_sse_comineq_ss:
- case Intrinsic::x86_sse_ucomieq_ss:
- case Intrinsic::x86_sse_ucomige_ss:
- case Intrinsic::x86_sse_ucomigt_ss:
- case Intrinsic::x86_sse_ucomile_ss:
- case Intrinsic::x86_sse_ucomilt_ss:
- case Intrinsic::x86_sse_ucomineq_ss:
- case Intrinsic::x86_sse2_comieq_sd:
- case Intrinsic::x86_sse2_comige_sd:
- case Intrinsic::x86_sse2_comigt_sd:
- case Intrinsic::x86_sse2_comile_sd:
- case Intrinsic::x86_sse2_comilt_sd:
- case Intrinsic::x86_sse2_comineq_sd:
- case Intrinsic::x86_sse2_ucomieq_sd:
- case Intrinsic::x86_sse2_ucomige_sd:
- case Intrinsic::x86_sse2_ucomigt_sd:
- case Intrinsic::x86_sse2_ucomile_sd:
- case Intrinsic::x86_sse2_ucomilt_sd:
- case Intrinsic::x86_sse2_ucomineq_sd:
- case Intrinsic::x86_avx512_vcomi_ss:
- case Intrinsic::x86_avx512_vcomi_sd:
- case Intrinsic::x86_avx512_mask_cmp_ss:
- case Intrinsic::x86_avx512_mask_cmp_sd: {
- // These intrinsics only demand the 0th element of their input vectors. If
- // we can simplify the input based on that, do so now.
- bool MadeChange = false;
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- if (MadeChange) {
- return &II;
- }
- break;
- }
-
- case Intrinsic::x86_avx512_add_ps_512:
- case Intrinsic::x86_avx512_div_ps_512:
- case Intrinsic::x86_avx512_mul_ps_512:
- case Intrinsic::x86_avx512_sub_ps_512:
- case Intrinsic::x86_avx512_add_pd_512:
- case Intrinsic::x86_avx512_div_pd_512:
- case Intrinsic::x86_avx512_mul_pd_512:
- case Intrinsic::x86_avx512_sub_pd_512:
- // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
- // IR operations.
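- // For example, llvm.x86.avx512.add.ps.512(a, b, 4) becomes a plain 'fadd' on <16 x float>.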
- if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
- if (R->getValue() == 4) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
-
- Value *V;
- switch (IID) {
- default:
- llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_add_ps_512:
- case Intrinsic::x86_avx512_add_pd_512:
- V = IC.Builder.CreateFAdd(Arg0, Arg1);
- break;
- case Intrinsic::x86_avx512_sub_ps_512:
- case Intrinsic::x86_avx512_sub_pd_512:
- V = IC.Builder.CreateFSub(Arg0, Arg1);
- break;
- case Intrinsic::x86_avx512_mul_ps_512:
- case Intrinsic::x86_avx512_mul_pd_512:
- V = IC.Builder.CreateFMul(Arg0, Arg1);
- break;
- case Intrinsic::x86_avx512_div_ps_512:
- case Intrinsic::x86_avx512_div_pd_512:
- V = IC.Builder.CreateFDiv(Arg0, Arg1);
- break;
- }
-
- return IC.replaceInstUsesWith(II, V);
- }
- }
- break;
-
- case Intrinsic::x86_avx512_mask_add_ss_round:
- case Intrinsic::x86_avx512_mask_div_ss_round:
- case Intrinsic::x86_avx512_mask_mul_ss_round:
- case Intrinsic::x86_avx512_mask_sub_ss_round:
- case Intrinsic::x86_avx512_mask_add_sd_round:
- case Intrinsic::x86_avx512_mask_div_sd_round:
- case Intrinsic::x86_avx512_mask_mul_sd_round:
- case Intrinsic::x86_avx512_mask_sub_sd_round:
- // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
- // IR operations.
- if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
- if (R->getValue() == 4) {
- // Extract the element as scalars.
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
- Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
-
- Value *V;
- switch (IID) {
- default:
- llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_mask_add_ss_round:
- case Intrinsic::x86_avx512_mask_add_sd_round:
- V = IC.Builder.CreateFAdd(LHS, RHS);
- break;
- case Intrinsic::x86_avx512_mask_sub_ss_round:
- case Intrinsic::x86_avx512_mask_sub_sd_round:
- V = IC.Builder.CreateFSub(LHS, RHS);
- break;
- case Intrinsic::x86_avx512_mask_mul_ss_round:
- case Intrinsic::x86_avx512_mask_mul_sd_round:
- V = IC.Builder.CreateFMul(LHS, RHS);
- break;
- case Intrinsic::x86_avx512_mask_div_ss_round:
- case Intrinsic::x86_avx512_mask_div_sd_round:
- V = IC.Builder.CreateFDiv(LHS, RHS);
- break;
- }
-
- // Handle the masking aspect of the intrinsic.
- Value *Mask = II.getArgOperand(3);
- auto *C = dyn_cast<ConstantInt>(Mask);
- // We don't need a select if we know the mask bit is a 1.
- if (!C || !C->getValue()[0]) {
- // Cast the mask to an i1 vector and then extract the lowest element.
- auto *MaskTy = FixedVectorType::get(
- IC.Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
- Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
- Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
- // Extract the lowest element from the passthru operand.
- Value *Passthru =
- IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
- V = IC.Builder.CreateSelect(Mask, V, Passthru);
- }
-
- // Insert the result back into the original argument 0.
- V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
-
- return IC.replaceInstUsesWith(II, V);
- }
- }
- break;
-
- // Constant fold ashr( <A x Bi>, Ci ).
- // Constant fold lshr( <A x Bi>, Ci ).
- // Constant fold shl( <A x Bi>, Ci ).
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psrai_d_512:
- case Intrinsic::x86_avx512_psrai_q_512:
- case Intrinsic::x86_avx512_psrai_w_512:
- case Intrinsic::x86_sse2_psrli_d:
- case Intrinsic::x86_sse2_psrli_q:
- case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx512_psrli_d_512:
- case Intrinsic::x86_avx512_psrli_q_512:
- case Intrinsic::x86_avx512_psrli_w_512:
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx512_pslli_d_512:
- case Intrinsic::x86_avx512_pslli_q_512:
- case Intrinsic::x86_avx512_pslli_w_512:
- if (Value *V = simplifyX86immShift(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx512_psra_q_128:
- case Intrinsic::x86_avx512_psra_q_256:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_psll_w_512: {
- if (Value *V = simplifyX86immShift(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
-
- // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
- // operand to compute the shift amount.
- Value *Arg1 = II.getArgOperand(1);
- assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
- "Unexpected packed shift size");
- unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
-
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
- return IC.replaceOperand(II, 1, V);
- }
- break;
- }
-
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx512_psllv_d_512:
- case Intrinsic::x86_avx512_psllv_q_512:
- case Intrinsic::x86_avx512_psllv_w_128:
- case Intrinsic::x86_avx512_psllv_w_256:
- case Intrinsic::x86_avx512_psllv_w_512:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- case Intrinsic::x86_avx512_psrav_q_128:
- case Intrinsic::x86_avx512_psrav_q_256:
- case Intrinsic::x86_avx512_psrav_d_512:
- case Intrinsic::x86_avx512_psrav_q_512:
- case Intrinsic::x86_avx512_psrav_w_128:
- case Intrinsic::x86_avx512_psrav_w_256:
- case Intrinsic::x86_avx512_psrav_w_512:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx512_psrlv_d_512:
- case Intrinsic::x86_avx512_psrlv_q_512:
- case Intrinsic::x86_avx512_psrlv_w_128:
- case Intrinsic::x86_avx512_psrlv_w_256:
- case Intrinsic::x86_avx512_psrlv_w_512:
- if (Value *V = simplifyX86varShift(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx512_packssdw_512:
- case Intrinsic::x86_avx512_packsswb_512:
- if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packusdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx512_packusdw_512:
- case Intrinsic::x86_avx512_packuswb_512:
- if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_pclmulqdq:
- case Intrinsic::x86_pclmulqdq_256:
- case Intrinsic::x86_pclmulqdq_512: {
- if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
- unsigned Imm = C->getZExtValue();
-
- bool MadeChange = false;
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- unsigned VWidth =
- cast<FixedVectorType>(Arg0->getType())->getNumElements();
-
- APInt UndefElts1(VWidth, 0);
- APInt DemandedElts1 =
- APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
- if (Value *V =
- IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
-
- APInt UndefElts2(VWidth, 0);
- APInt DemandedElts2 =
- APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
- if (Value *V =
- IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
-
- // If all demanded elements of either input are undef, the result is zero.
- if (DemandedElts1.isSubsetOf(UndefElts1) ||
- DemandedElts2.isSubsetOf(UndefElts2)) {
- return IC.replaceInstUsesWith(II,
- ConstantAggregateZero::get(II.getType()));
- }
-
- if (MadeChange) {
- return &II;
- }
- }
- break;
- }
-
- case Intrinsic::x86_sse41_insertps:
- if (Value *V = simplifyX86insertps(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_sse4a_extrq: {
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
- unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
- Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
- VWidth1 == 16 && "Unexpected operand sizes");
-
- // See if we're dealing with constant values.
- Constant *C1 = dyn_cast<Constant>(Op1);
- ConstantInt *CILength =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
- : nullptr;
- ConstantInt *CIIndex =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
- : nullptr;
-
- // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
- if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
-
- // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
- // operands and the lowest 16-bits of the second.
- bool MadeChange = false;
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- if (MadeChange) {
- return &II;
- }
- break;
- }
-
- case Intrinsic::x86_sse4a_extrqi: {
- // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
- // bits of the lower 64-bits. The upper 64-bits are undefined.
- Value *Op0 = II.getArgOperand(0);
- unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
- "Unexpected operand size");
-
- // See if we're dealing with constant values.
- ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
- ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
-
- // Attempt to simplify to a constant or shuffle vector.
- if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
-
- // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
- // operand.
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
- return IC.replaceOperand(II, 0, V);
- }
- break;
- }
-
- case Intrinsic::x86_sse4a_insertq: {
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
- Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
- cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
- "Unexpected operand size");
-
- // See if we're dealing with constant values.
- Constant *C1 = dyn_cast<Constant>(Op1);
- ConstantInt *CI11 =
- C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
- : nullptr;
-
- // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
- if (CI11) {
- const APInt &V11 = CI11->getValue();
- APInt Len = V11.zextOrTrunc(6);
- APInt Idx = V11.lshr(8).zextOrTrunc(6);
- if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- }
-
- // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
- // operand.
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
- return IC.replaceOperand(II, 0, V);
- }
- break;
- }
-
- case Intrinsic::x86_sse4a_insertqi: {
- // INSERTQI: Extract lowest Length bits from lower half of second source and
- // insert over first source starting at Index bit. The upper 64-bits are
- // undefined.
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
- unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
- assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
- Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
- VWidth1 == 2 && "Unexpected operand sizes");
-
- // See if we're dealing with constant values.
- ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
- ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
-
- // Attempt to simplify to a constant or shuffle vector.
- if (CILength && CIIndex) {
- APInt Len = CILength->getValue().zextOrTrunc(6);
- APInt Idx = CIIndex->getValue().zextOrTrunc(6);
- if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- }
-
- // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
- // operands.
- bool MadeChange = false;
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
- IC.replaceOperand(II, 0, V);
- MadeChange = true;
- }
- if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
- IC.replaceOperand(II, 1, V);
- MadeChange = true;
- }
- if (MadeChange) {
- return &II;
- }
- break;
- }
-
- case Intrinsic::x86_sse41_pblendvb:
- case Intrinsic::x86_sse41_blendvps:
- case Intrinsic::x86_sse41_blendvpd:
- case Intrinsic::x86_avx_blendv_ps_256:
- case Intrinsic::x86_avx_blendv_pd_256:
- case Intrinsic::x86_avx2_pblendvb: {
- // fold (blend A, A, Mask) -> A
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
- Value *Mask = II.getArgOperand(2);
- if (Op0 == Op1) {
- return IC.replaceInstUsesWith(II, Op0);
- }
-
- // Zero Mask - select 1st argument.
- if (isa<ConstantAggregateZero>(Mask)) {
- return IC.replaceInstUsesWith(II, Op0);
- }
-
- // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
- if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
- Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
- return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
- }
-
- // Convert to a vector select if we can bypass casts and find a boolean
- // vector condition value.
- Value *BoolVec;
- Mask = InstCombiner::peekThroughBitcast(Mask);
- if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
- BoolVec->getType()->isVectorTy() &&
- BoolVec->getType()->getScalarSizeInBits() == 1) {
- assert(Mask->getType()->getPrimitiveSizeInBits() ==
- II.getType()->getPrimitiveSizeInBits() &&
- "Not expecting mask and operands with different sizes");
-
- unsigned NumMaskElts =
- cast<FixedVectorType>(Mask->getType())->getNumElements();
- unsigned NumOperandElts =
- cast<FixedVectorType>(II.getType())->getNumElements();
- if (NumMaskElts == NumOperandElts) {
- return SelectInst::Create(BoolVec, Op1, Op0);
- }
-
- // If the mask has fewer elements than the operands, each mask bit maps to
- // multiple elements of the operands. Bitcast back and forth.
- if (NumMaskElts < NumOperandElts) {
- Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
- Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
- Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
- return new BitCastInst(Sel, II.getType());
- }
- }
-
- break;
- }
-
- case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b:
- case Intrinsic::x86_avx512_pshuf_b_512:
- if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_avx_vpermilvar_ps:
- case Intrinsic::x86_avx_vpermilvar_ps_256:
- case Intrinsic::x86_avx512_vpermilvar_ps_512:
- case Intrinsic::x86_avx_vpermilvar_pd:
- case Intrinsic::x86_avx_vpermilvar_pd_256:
- case Intrinsic::x86_avx512_vpermilvar_pd_512:
- if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps:
- case Intrinsic::x86_avx512_permvar_df_256:
- case Intrinsic::x86_avx512_permvar_df_512:
- case Intrinsic::x86_avx512_permvar_di_256:
- case Intrinsic::x86_avx512_permvar_di_512:
- case Intrinsic::x86_avx512_permvar_hi_128:
- case Intrinsic::x86_avx512_permvar_hi_256:
- case Intrinsic::x86_avx512_permvar_hi_512:
- case Intrinsic::x86_avx512_permvar_qi_128:
- case Intrinsic::x86_avx512_permvar_qi_256:
- case Intrinsic::x86_avx512_permvar_qi_512:
- case Intrinsic::x86_avx512_permvar_sf_512:
- case Intrinsic::x86_avx512_permvar_si_512:
- if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- case Intrinsic::x86_avx_maskload_ps:
- case Intrinsic::x86_avx_maskload_pd:
- case Intrinsic::x86_avx_maskload_ps_256:
- case Intrinsic::x86_avx_maskload_pd_256:
- case Intrinsic::x86_avx2_maskload_d:
- case Intrinsic::x86_avx2_maskload_q:
- case Intrinsic::x86_avx2_maskload_d_256:
- case Intrinsic::x86_avx2_maskload_q_256:
- if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
- return I;
- }
- break;
-
- case Intrinsic::x86_sse2_maskmov_dqu:
- case Intrinsic::x86_avx_maskstore_ps:
- case Intrinsic::x86_avx_maskstore_pd:
- case Intrinsic::x86_avx_maskstore_ps_256:
- case Intrinsic::x86_avx_maskstore_pd_256:
- case Intrinsic::x86_avx2_maskstore_d:
- case Intrinsic::x86_avx2_maskstore_q:
- case Intrinsic::x86_avx2_maskstore_d_256:
- case Intrinsic::x86_avx2_maskstore_q_256:
- if (simplifyX86MaskedStore(II, IC)) {
- return nullptr;
- }
- break;
-
- case Intrinsic::x86_addcarry_32:
- case Intrinsic::x86_addcarry_64:
- if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
- return IC.replaceInstUsesWith(II, V);
- }
- break;
-
- default:
- break;
- }
- return None;
-}
-
-Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
- InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
- bool &KnownBitsComputed) const {
- switch (II.getIntrinsicID()) {
- default:
- break;
- case Intrinsic::x86_mmx_pmovmskb:
- case Intrinsic::x86_sse_movmsk_ps:
- case Intrinsic::x86_sse2_movmsk_pd:
- case Intrinsic::x86_sse2_pmovmskb_128:
- case Intrinsic::x86_avx_movmsk_ps_256:
- case Intrinsic::x86_avx_movmsk_pd_256:
- case Intrinsic::x86_avx2_pmovmskb: {
- // MOVMSK copies the vector elements' sign bits to the low bits
- // and zeros the high bits.
- unsigned ArgWidth;
- if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
- ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
- } else {
- auto Arg = II.getArgOperand(0);
- auto ArgType = cast<FixedVectorType>(Arg->getType());
- ArgWidth = ArgType->getNumElements();
- }
-
- // If we don't need any of the low bits then return zero;
- // we know that DemandedMask is non-zero already.
- APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
- Type *VTy = II.getType();
- if (DemandedElts.isNullValue()) {
- return ConstantInt::getNullValue(VTy);
- }
-
- // We know that the upper bits are set to zero.
- Known.Zero.setBitsFrom(ArgWidth);
- KnownBitsComputed = true;
- break;
- }
- }
- return None;
-}
-
-Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
- InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
- APInt &UndefElts2, APInt &UndefElts3,
- std::function<void(Instruction *, unsigned, APInt, APInt &)>
- simplifyAndSetOp) const {
- unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
- switch (II.getIntrinsicID()) {
- default:
- break;
- case Intrinsic::x86_xop_vfrcz_ss:
- case Intrinsic::x86_xop_vfrcz_sd:
- // The instructions for these intrinsics are specified to zero the upper bits
- // rather than pass them through like other scalar intrinsics. So we shouldn't just
- // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
- // Instead we should return a zero vector.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return ConstantAggregateZero::get(II.getType());
- }
-
- // Only the lower element is used.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
-
- // Only the lower element is undefined. The high elements are zero.
- UndefElts = UndefElts[0];
- break;
-
- // Unary scalar-as-vector operations that work column-wise.
- case Intrinsic::x86_sse_rcp_ss:
- case Intrinsic::x86_sse_rsqrt_ss:
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
-
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
- // TODO: If only the low element is used, lower SQRT to FSQRT (with
- // rounding/exception checks).
- break;
-
- // Binary scalar-as-vector operations that work column-wise. The high
- // elements come from operand 0. The low element is a function of both
- // operands.
- case Intrinsic::x86_sse_min_ss:
- case Intrinsic::x86_sse_max_ss:
- case Intrinsic::x86_sse_cmp_ss:
- case Intrinsic::x86_sse2_min_sd:
- case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse2_cmp_sd: {
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
-
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
-
- // Only lower element is used for operand 1.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
-
- // Lower element is undefined if both lower elements are undefined.
- // Consider things like undef&0. The result is known zero, not undef.
- if (!UndefElts2[0])
- UndefElts.clearBit(0);
-
- break;
- }
-
- // Binary scalar-as-vector operations that work column-wise. The high
- // elements come from operand 0 and the low element comes from operand 1.
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd: {
- // Don't use the low element of operand 0.
- APInt DemandedElts2 = DemandedElts;
- DemandedElts2.clearBit(0);
- simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
-
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
-
- // Only lower element is used for operand 1.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
-
- // Take the high undef elements from operand 0 and take the lower element
- // from operand 1.
- UndefElts.clearBit(0);
- UndefElts |= UndefElts2[0];
- break;
- }
-
- // Three input scalar-as-vector operations that work column-wise. The high
- // elements come from operand 0 and the low element is a function of all
- // three inputs.
- case Intrinsic::x86_avx512_mask_add_ss_round:
- case Intrinsic::x86_avx512_mask_div_ss_round:
- case Intrinsic::x86_avx512_mask_mul_ss_round:
- case Intrinsic::x86_avx512_mask_sub_ss_round:
- case Intrinsic::x86_avx512_mask_max_ss_round:
- case Intrinsic::x86_avx512_mask_min_ss_round:
- case Intrinsic::x86_avx512_mask_add_sd_round:
- case Intrinsic::x86_avx512_mask_div_sd_round:
- case Intrinsic::x86_avx512_mask_mul_sd_round:
- case Intrinsic::x86_avx512_mask_sub_sd_round:
- case Intrinsic::x86_avx512_mask_max_sd_round:
- case Intrinsic::x86_avx512_mask_min_sd_round:
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
-
- // If lowest element of a scalar op isn't used then use Arg0.
- if (!DemandedElts[0]) {
- IC.addToWorklist(&II);
- return II.getArgOperand(0);
- }
-
- // Only the lower element is used for operands 1 and 2.
- DemandedElts = 1;
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
-
- // Lower element is undefined if all three lower elements are undefined.
- // Consider things like undef&0. The result is known zero, not undef.
- if (!UndefElts2[0] || !UndefElts3[0])
- UndefElts.clearBit(0);
- break;
-
- // TODO: Add fmaddsub support?
- case Intrinsic::x86_sse3_addsub_pd:
- case Intrinsic::x86_sse3_addsub_ps:
- case Intrinsic::x86_avx_addsub_pd_256:
- case Intrinsic::x86_avx_addsub_ps_256: {
- // If none of the even or none of the odd lanes are required, turn this
- // into a generic FP math instruction.
- APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
- APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
- bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
- bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
- if (IsSubOnly || IsAddOnly) {
- assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
- IRBuilderBase::InsertPointGuard Guard(IC.Builder);
- IC.Builder.SetInsertPoint(&II);
- Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
- return IC.Builder.CreateBinOp(
- IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
- }
-
- simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
- UndefElts &= UndefElts2;
- break;
- }
-
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx2_packusdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx512_packssdw_512:
- case Intrinsic::x86_avx512_packsswb_512:
- case Intrinsic::x86_avx512_packusdw_512:
- case Intrinsic::x86_avx512_packuswb_512: {
- auto *Ty0 = II.getArgOperand(0)->getType();
- unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
- assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
-
- unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
- unsigned VWidthPerLane = VWidth / NumLanes;
- unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
-
- // Per lane, pack the elements of the first input and then the second.
- // e.g.
- // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
- // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
- for (int OpNum = 0; OpNum != 2; ++OpNum) {
- APInt OpDemandedElts(InnerVWidth, 0);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- unsigned LaneIdx = Lane * VWidthPerLane;
- for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
- unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
- if (DemandedElts[Idx])
- OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
- }
- }
-
- // Demand elements from the operand.
- APInt OpUndefElts(InnerVWidth, 0);
- simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
-
- // Pack the operand's UNDEF elements, one lane at a time.
- OpUndefElts = OpUndefElts.zext(VWidth);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
- LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
- LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
- UndefElts |= LaneElts;
- }
- }
- break;
- }
-
- // PSHUFB
- case Intrinsic::x86_ssse3_pshuf_b_128:
- case Intrinsic::x86_avx2_pshuf_b:
- case Intrinsic::x86_avx512_pshuf_b_512:
- // PERMILVAR
- case Intrinsic::x86_avx_vpermilvar_ps:
- case Intrinsic::x86_avx_vpermilvar_ps_256:
- case Intrinsic::x86_avx512_vpermilvar_ps_512:
- case Intrinsic::x86_avx_vpermilvar_pd:
- case Intrinsic::x86_avx_vpermilvar_pd_256:
- case Intrinsic::x86_avx512_vpermilvar_pd_512:
- // PERMV
- case Intrinsic::x86_avx2_permd:
- case Intrinsic::x86_avx2_permps: {
- simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
- break;
- }
-
- // SSE4A instructions leave the upper 64-bits of the 128-bit result
- // in an undefined state.
- case Intrinsic::x86_sse4a_extrq:
- case Intrinsic::x86_sse4a_extrqi:
- case Intrinsic::x86_sse4a_insertq:
- case Intrinsic::x86_sse4a_insertqi:
- UndefElts.setHighBits(VWidth / 2);
- break;
- }
- return None;
-}
+//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86tti"
+
+/// Return a constant boolean vector that has true elements in all positions
+/// where the input constant data vector has an element with the sign bit set.
+static Constant *getNegativeIsTrueBoolVec(Constant *V) {
+ VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
+ V = ConstantExpr::getBitCast(V, IntTy);
+ V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
+ V);
+ return V;
+}
+
+/// Convert the x86 XMM integer vector mask to a vector of bools based on
+/// each element's most significant bit (the sign bit).
+static Value *getBoolVecFromMask(Value *Mask) {
+ // Fold Constant Mask.
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
+ return getNegativeIsTrueBoolVec(ConstantMask);
+
+ // Mask was extended from a boolean vector.
+ Value *ExtMask;
+ if (PatternMatch::match(
+ Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
+ ExtMask->getType()->isIntOrIntVectorTy(1))
+ return ExtMask;
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Constant *ZeroVec = Constant::getNullValue(II.getType());
+
+ // Zero Mask - masked load instruction creates a zero vector.
+ if (isa<ConstantAggregateZero>(Mask))
+ return IC.replaceInstUsesWith(II, ZeroVec);
+
+ // The mask is constant or extended from a bool vector. Convert this x86
+ // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
+ if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+ // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
+ // the LLVM intrinsic definition for the pointer argument.
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ // The pass-through vector for an x86 masked load is a zero vector.
+ CallInst *NewMaskedLoad =
+ IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
+ return IC.replaceInstUsesWith(II, NewMaskedLoad);
+ }
+
+ return nullptr;
+}
+
+// TODO: If the x86 backend knew how to convert a bool vector mask back to an
+// XMM register mask efficiently, we could transform all x86 masked intrinsics
+// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
+static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
+ Value *Ptr = II.getOperand(0);
+ Value *Mask = II.getOperand(1);
+ Value *Vec = II.getOperand(2);
+
+ // Zero Mask - this masked store instruction does nothing.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
+ // anything else at this level.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
+ return false;
+
+ // The mask is constant or extended from a bool vector. Convert this x86
+ // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
+ if (Value *BoolMask = getBoolVecFromMask(Mask)) {
+ unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
+
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
+
+ // 'Replace uses' doesn't work for stores. Erase the original masked store.
+ IC.eraseInstFromFunction(II);
+ return true;
+ }
+
+ return false;
+}
+
+static Value *simplifyX86immShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+ bool IsImm = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<FixedVectorType>(Vec->getType());
+ auto SVT = VT->getElementType();
+ auto AmtVT = Amt->getType();
+ unsigned VWidth = VT->getNumElements();
+ unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift. If it's guaranteed to be out of range, logical shifts combine
+ // to zero and arithmetic shifts are clamped to (BitWidth - 1).
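+ // E.g. a psrli.w by a known in-range amount of 3 becomes an 'lshr' of the vector by a splat of 3.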
+ if (IsImm) {
+ assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
+ KnownBits KnownAmtBits =
+ llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
+ if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
+ Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
+ Amt = Builder.CreateVectorSplat(VWidth, Amt);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ if (KnownAmtBits.getMinValue().uge(BitWidth)) {
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+ Amt = ConstantInt::get(SVT, BitWidth - 1);
+ return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
+ }
+ } else {
+ // Ensure the first element has an in-range value and the rest of the
+ // elements in the bottom 64 bits are zero.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+ unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
+ APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
+ APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
+ KnownBits KnownLowerBits = llvm::computeKnownBits(
+ Amt, DemandedLower, II.getModule()->getDataLayout());
+ KnownBits KnownUpperBits = llvm::computeKnownBits(
+ Amt, DemandedUpper, II.getModule()->getDataLayout());
+ if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
+ (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
+ SmallVector<int, 16> ZeroSplat(VWidth, 0);
+ Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ }
+
+ // Simplify if the count is a constant vector.
+ auto CDV = dyn_cast<ConstantDataVector>(Amt);
+ if (!CDV)
+ return nullptr;
+
+ // SSE2/AVX2 uses the entire low 64-bits of the 128-bit vector
+ // operand to compute the shift amount.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+
+ // Concatenate the sub-elements to create the 64-bit value.
+ APInt Count(64, 0);
+ for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
+ unsigned SubEltIdx = (NumSubElts - 1) - i;
+ auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
+ Count <<= BitWidth;
+ Count |= SubElt->getValue().zextOrTrunc(64);
+ }
+
+ // If shift-by-zero then just return the original value.
+ if (Count.isNullValue())
+ return Vec;
+
+ // Handle cases when Shift >= BitWidth.
+ if (Count.uge(BitWidth)) {
+ // If LogicalShift - just return zero.
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+
+ // If ArithmeticShift - clamp Shift to (BitWidth - 1).
+ Count = APInt(64, BitWidth - 1);
+ }
+
+ // Get a constant vector of the same type as the first operand.
+ auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
+ auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
+// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
+// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
+// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
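+// E.g. a psrlv.d lane shifting by 40 produces 0 for that lane, while psrav.d clamps the shift to 31.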
+static Value *simplifyX86varShift(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ bool LogicalShift = false;
+ bool ShiftLeft = false;
+
+ switch (II.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
+ break;
+ }
+ assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
+
+ auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
+ auto VT = cast<FixedVectorType>(II.getType());
+ auto SVT = VT->getElementType();
+ int NumElts = VT->getNumElements();
+ int BitWidth = SVT->getIntegerBitWidth();
+
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift.
+ APInt UpperBits =
+ APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
+ if (llvm::MaskedValueIsZero(Amt, UpperBits,
+ II.getModule()->getDataLayout())) {
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+
+ // Simplify if all shift amounts are constant/undef.
+ auto *CShift = dyn_cast<Constant>(Amt);
+ if (!CShift)
+ return nullptr;
+
+ // Collect each element's shift amount.
+ // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
+ bool AnyOutOfRange = false;
+ SmallVector<int, 8> ShiftAmts;
+ for (int I = 0; I < NumElts; ++I) {
+ auto *CElt = CShift->getAggregateElement(I);
+ if (isa_and_nonnull<UndefValue>(CElt)) {
+ ShiftAmts.push_back(-1);
+ continue;
+ }
+
+ auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
+ if (!COp)
+ return nullptr;
+
+ // Handle out of range shifts.
+ // If LogicalShift - set to BitWidth (special case).
+ // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
+ APInt ShiftVal = COp->getValue();
+ if (ShiftVal.uge(BitWidth)) {
+ AnyOutOfRange = LogicalShift;
+ ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
+ continue;
+ }
+
+ ShiftAmts.push_back((int)ShiftVal.getZExtValue());
+ }
+
+ // If all elements are out of range or UNDEF, return a vector of zeros/undefs.
+ // ArithmeticShift should only hit this if they are all UNDEF.
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
+ if (llvm::all_of(ShiftAmts, OutOfRange)) {
+ SmallVector<Constant *, 8> ConstantVec;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0) {
+ ConstantVec.push_back(UndefValue::get(SVT));
+ } else {
+ assert(LogicalShift && "Logical shift expected");
+ ConstantVec.push_back(ConstantInt::getNullValue(SVT));
+ }
+ }
+ return ConstantVector::get(ConstantVec);
+ }
+
+ // We can't handle only some out of range values with generic logical shifts.
+ if (AnyOutOfRange)
+ return nullptr;
+
+ // Build the shift amount constant vector.
+ SmallVector<Constant *, 8> ShiftVecAmts;
+ for (int Idx : ShiftAmts) {
+ if (Idx < 0)
+ ShiftVecAmts.push_back(UndefValue::get(SVT));
+ else
+ ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
+ }
+ auto ShiftVec = ConstantVector::get(ShiftVecAmts);
+
+ if (ShiftLeft)
+ return Builder.CreateShl(Vec, ShiftVec);
+
+ if (LogicalShift)
+ return Builder.CreateLShr(Vec, ShiftVec);
+
+ return Builder.CreateAShr(Vec, ShiftVec);
+}
+
+static Value *simplifyX86pack(IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder, bool IsSigned) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+
+ // Fast all undef handling.
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+ return UndefValue::get(ResTy);
+
+ auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
+ unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+ unsigned NumSrcElts = ArgTy->getNumElements();
+ assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
+ "Unexpected packing types");
+
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+ unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
+ assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
+ "Unexpected packing types");
+
+ // Constant folding.
+ if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+ return nullptr;
+
+ // Clamp Values - signed/unsigned both use signed clamp values, but they
+ // differ on the min/max values.
+ APInt MinValue, MaxValue;
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ MinValue =
+ APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
+ MaxValue =
+ APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ MinValue = APInt::getNullValue(SrcScalarSizeInBits);
+ MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
+ }
+
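+ // E.g. PACKSSWB clamps each i16 to [-128, 127] and PACKUSWB clamps to [0, 255]
+ // before the truncation below.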
+ auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
+ auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
+ Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
+ Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
+ Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
+ Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
+
+ // Shuffle clamped args together at the lane level.
+ SmallVector<int, 32> PackMask;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
+ PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
+ PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
+ }
+ auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
+
+ // Truncate to dst size.
+ return Builder.CreateTrunc(Shuffle, ResTy);
+}
+
+static Value *simplifyX86movmsk(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg = II.getArgOperand(0);
+ Type *ResTy = II.getType();
+
+ // movmsk(undef) -> zero as we must ensure the upper bits are zero.
+ if (isa<UndefValue>(Arg))
+ return Constant::getNullValue(ResTy);
+
+ auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
+ // We can't easily peek through x86_mmx types.
+ if (!ArgTy)
+ return nullptr;
+
+ // Expand MOVMSK to compare/bitcast/zext:
+ // e.g. PMOVMSKB(v16i8 x):
+ // %cmp = icmp slt <16 x i8> %x, zeroinitializer
+ // %int = bitcast <16 x i1> %cmp to i16
+ // %res = zext i16 %int to i32
+ unsigned NumElts = ArgTy->getNumElements();
+ Type *IntegerVecTy = VectorType::getInteger(ArgTy);
+ Type *IntegerTy = Builder.getIntNTy(NumElts);
+
+ Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
+ Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
+ Res = Builder.CreateBitCast(Res, IntegerTy);
+ Res = Builder.CreateZExtOrTrunc(Res, ResTy);
+ return Res;
+}
+
+static Value *simplifyX86addcarry(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CarryIn = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Op2 = II.getArgOperand(2);
+ Type *RetTy = II.getType();
+ Type *OpTy = Op1->getType();
+ assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
+ RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
+ "Unexpected types for x86 addcarry");
+
+ // If carry-in is zero, this is just an unsigned add with overflow.
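+ // E.g. llvm.x86.addcarry.32(0, a, b) is rewritten in terms of llvm.uadd.with.overflow.i32(a, b).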
+ if (match(CarryIn, PatternMatch::m_ZeroInt())) {
+ Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
+ {Op1, Op2});
+ // The types have to be adjusted to match the x86 call types.
+ Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
+ Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
+ Builder.getInt8Ty());
+ Value *Res = UndefValue::get(RetTy);
+ Res = Builder.CreateInsertValue(Res, UAddOV, 0);
+ return Builder.CreateInsertValue(Res, UAddResult, 1);
+ }
+
+ return nullptr;
+}
+
+static Value *simplifyX86insertps(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (!CInt)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
+
+ // The immediate permute control byte looks like this:
+ // [3:0] - zero mask for each 32-bit lane
+ // [5:4] - select one 32-bit destination lane
+ // [7:6] - select one 32-bit source lane
+
+ uint8_t Imm = CInt->getZExtValue();
+ uint8_t ZMask = Imm & 0xf;
+ uint8_t DestLane = (Imm >> 4) & 0x3;
+ uint8_t SourceLane = (Imm >> 6) & 0x3;
+
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // If all zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (ZMask == 0xf)
+ return ZeroVector;
+
+ // Initialize by passing all of the first source bits through.
+ int ShuffleMask[4] = {0, 1, 2, 3};
+
+ // We may replace the second operand with the zero vector.
+ Value *V1 = II.getArgOperand(1);
+
+ if (ZMask) {
+ // If the zero mask is being used with a single input or the zero mask
+ // overrides the destination lane, this is a shuffle with the zero vector.
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
+ (ZMask & (1 << DestLane))) {
+ V1 = ZeroVector;
+ // We may still move 32-bits of the first source vector from one lane
+ // to another.
+ ShuffleMask[DestLane] = SourceLane;
+ // The zero mask may override the previous insert operation.
+ for (unsigned i = 0; i < 4; ++i)
+ if ((ZMask >> i) & 0x1)
+ ShuffleMask[i] = i + 4;
+ } else {
+ // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
+ return nullptr;
+ }
+ } else {
+ // Replace the selected destination lane with the selected source lane.
+ ShuffleMask[DestLane] = SourceLane + 4;
+ }
+
+ return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
+}
+
+/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
+/// or conversion to a shuffle vector.
+static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
+ ConstantInt *CILength, ConstantInt *CIIndex,
+ InstCombiner::BuilderTy &Builder) {
+ auto LowConstantHighUndef = [&](uint64_t Val) {
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ };
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ ConstantInt *CI0 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Attempt to constant fold.
+ if (CILength && CIIndex) {
+ // From AMD documentation: "The bit index and field length are each six
+ // bits in length; other bits of the field are ignored."
+ APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
+ APInt APLength = CILength->getValue().zextOrTrunc(6);
+
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are extracting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize EXTRQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
+
+ SmallVector<int, 16> ShuffleMask;
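+ // Low Length bytes come from Op0 starting at byte Index, the remaining
+ // low bytes are zero, and the upper 8 bytes are left undef.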
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(i + Index);
+ for (int i = Length; i != 8; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(-1);
+
+ Value *SV = Builder.CreateShuffleVector(
+ Builder.CreateBitCast(Op0, ShufTy),
+ ConstantAggregateZero::get(ShufTy), ShuffleMask);
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // Constant Fold - shift Index'th bit to lowest position and mask off
+ // Length bits.
+ if (CI0) {
+ APInt Elt = CI0->getValue();
+ Elt.lshrInPlace(Index);
+ Elt = Elt.zextOrTrunc(Length);
+ return LowConstantHighUndef(Elt.getZExtValue());
+ }
+
+ // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
+ Value *Args[] = {Op0, CILength, CIIndex};
+ Module *M = II.getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
+ return Builder.CreateCall(F, Args);
+ }
+ }
+
+ // Constant Fold - extraction from zero is always {zero, undef}.
+ if (CI0 && CI0->isZero())
+ return LowConstantHighUndef(0);
+
+ return nullptr;
+}
+
+/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
+/// folding or conversion to a shuffle vector.
+static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
+ APInt APLength, APInt APIndex,
+ InstCombiner::BuilderTy &Builder) {
+ // From AMD documentation: "The bit index and field length are each six bits
+ // in length; other bits of the field are ignored."
+ APIndex = APIndex.zextOrTrunc(6);
+ APLength = APLength.zextOrTrunc(6);
+
+ // Attempt to constant fold.
+ unsigned Index = APIndex.getZExtValue();
+
+ // From AMD documentation: "a value of zero in the field length is
+ // defined as length of 64".
+ unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
+
+ // From AMD documentation: "If the sum of the bit index + length field
+ // is greater than 64, the results are undefined".
+ unsigned End = Index + Length;
+
+ // Note that both field index and field length are 8-bit quantities.
+ // Since variables 'Index' and 'Length' are unsigned values
+ // obtained from zero-extending field index and field length
+ // respectively, their sum should never wrap around.
+ if (End > 64)
+ return UndefValue::get(II.getType());
+
+ // If we are inserting whole bytes, we can convert this to a shuffle.
+ // Lowering can recognize INSERTQI shuffle masks.
+ if ((Length % 8) == 0 && (Index % 8) == 0) {
+ // Convert bit indices to byte indices.
+ Length /= 8;
+ Index /= 8;
+
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
+
+ SmallVector<int, 16> ShuffleMask;
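+ // Bytes [0,Index) and [Index+Length,8) come from Op0, bytes
+ // [Index,Index+Length) come from the low bytes of Op1, and the upper
+ // 8 bytes are left undef.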
+ for (int i = 0; i != (int)Index; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 0; i != (int)Length; ++i)
+ ShuffleMask.push_back(i + 16);
+ for (int i = Index + Length; i != 8; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = 8; i != 16; ++i)
+ ShuffleMask.push_back(-1);
+
+ Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
+ Builder.CreateBitCast(Op1, ShufTy),
+ ShuffleMask);
+ return Builder.CreateBitCast(SV, II.getType());
+ }
+
+ // See if we're dealing with constant values.
+ Constant *C0 = dyn_cast<Constant>(Op0);
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI00 =
+ C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CI10 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+
+ // Constant Fold - insert bottom Length bits starting at the Index'th bit.
+ if (CI00 && CI10) {
+ APInt V00 = CI00->getValue();
+ APInt V10 = CI10->getValue();
+ APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
+ V00 = V00 & ~Mask;
+ V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
+ APInt Val = V00 | V10;
+ Type *IntTy64 = Type::getInt64Ty(II.getContext());
+ Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
+ UndefValue::get(IntTy64)};
+ return ConstantVector::get(Args);
+ }
+
+ // If we were an INSERTQ call, we'll save demanded elements if we convert to
+ // INSERTQI.
+ if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
+ Type *IntTy8 = Type::getInt8Ty(II.getContext());
+ Constant *CILength = ConstantInt::get(IntTy8, Length, false);
+ Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
+
+ Value *Args[] = {Op0, Op1, CILength, CIIndex};
+ Module *M = II.getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
+ return Builder.CreateCall(F, Args);
+ }
+
+ return nullptr;
+}
+
+/// Attempt to convert pshufb* to shufflevector if the mask is constant.
+static Value *simplifyX86pshufb(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+ assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
+ "Unexpected number of elements in shuffle mask!");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[64];
+
+ // Each byte in the shuffle control mask forms an index to permute the
+ // corresponding byte in the destination operand.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
+
+ // If the most significant bit (bit[7]) of each byte of the shuffle
+ // control mask is set, then zero is written in the result byte.
+ // The zero vector is in the right-hand side of the resulting
+ // shufflevector.
+
+ // The value of each index for the high 128-bit lane is the least
+ // significant 4 bits of the respective shuffle control byte.
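+ // Adding (I & 0xF0) keeps each index within its own 128-bit lane,
+ // matching PSHUFB's per-lane behaviour.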
+ Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
+ Indexes[I] = Index;
+ }
+
+ auto V1 = II.getArgOperand(0);
+ auto V2 = Constant::getNullValue(VecTy);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
+}
+
+/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
+static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+ bool IsPD = VecTy->getScalarType()->isDoubleTy();
+ unsigned NumLaneElts = IsPD ? 2 : 4;
+ assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[16];
+
+ // The intrinsics only read one or two bits, clear the rest.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ APInt Index = cast<ConstantInt>(COp)->getValue();
+ Index = Index.zextOrTrunc(32).getLoBits(2);
+
+ // The PD variants use bit 1 to select the per-lane element index, so
+ // shift down to convert to a generic shuffle mask index.
+ if (IsPD)
+ Index.lshrInPlace(1);
+
+ // The _256 variants are a bit trickier since the mask bits always index
+ // into the corresponding 128-bit half. In order to convert to a generic
+ // shuffle, we have to make that explicit.
+ Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
+
+ Indexes[I] = Index.getZExtValue();
+ }
+
+ auto V1 = II.getArgOperand(0);
+ return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
+}
+
+/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
+static Value *simplifyX86vpermv(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ auto *V = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!V)
+ return nullptr;
+
+ auto *VecTy = cast<FixedVectorType>(II.getType());
+ unsigned Size = VecTy->getNumElements();
+ assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
+ "Unexpected shuffle mask size");
+
+ // Construct a shuffle mask from constant integers or UNDEFs.
+ int Indexes[64];
+
+ for (unsigned I = 0; I < Size; ++I) {
+ Constant *COp = V->getAggregateElement(I);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return nullptr;
+
+ if (isa<UndefValue>(COp)) {
+ Indexes[I] = -1;
+ continue;
+ }
+
+ uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
+ Index &= Size - 1;
+ Indexes[I] = Index;
+ }
+
+ auto V1 = II.getArgOperand(0);
+ return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
+}
+
+Optional<Instruction *>
+X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
+ unsigned DemandedWidth) {
+ APInt UndefElts(Width, 0);
+ APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
+ return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
+ };
+
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::x86_bmi_bextr_32:
+ case Intrinsic::x86_bmi_bextr_64:
+ case Intrinsic::x86_tbm_bextri_u32:
+ case Intrinsic::x86_tbm_bextri_u64:
+ // If the RHS is a constant we can try some simplifications.
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
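+ // BEXTR control: bits[7:0] hold the start bit and bits[15:8] hold the
+ // field length.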
+ uint64_t Shift = C->getZExtValue();
+ uint64_t Length = (Shift >> 8) & 0xff;
+ Shift &= 0xff;
+ unsigned BitWidth = II.getType()->getIntegerBitWidth();
+ // If the length is 0 or the shift is out of range, replace with zero.
+ if (Length == 0 || Shift >= BitWidth) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ // If the LHS is also a constant, we can completely constant fold this.
+ if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Result = InC->getZExtValue() >> Shift;
+ if (Length > BitWidth)
+ Length = BitWidth;
+ Result &= maskTrailingOnes<uint64_t>(Length);
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
+ // are only masking bits that a shift already cleared?
+ }
+ break;
+
+ case Intrinsic::x86_bmi_bzhi_32:
+ case Intrinsic::x86_bmi_bzhi_64:
+ // If the RHS is a constant we can try some simplifications.
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
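+ // BZHI copies bits [Index-1:0] and zeroes the rest; the index comes from
+ // bits[7:0] of the control operand.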
+ uint64_t Index = C->getZExtValue() & 0xff;
+ unsigned BitWidth = II.getType()->getIntegerBitWidth();
+ if (Index >= BitWidth) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ if (Index == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ // If the LHS is also a constant, we can completely constant fold this.
+ if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Result = InC->getZExtValue();
+ Result &= maskTrailingOnes<uint64_t>(Index);
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ // TODO should we convert this to an AND if the RHS is constant?
+ }
+ break;
+ case Intrinsic::x86_bmi_pext_32:
+ case Intrinsic::x86_bmi_pext_64:
+ if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ if (MaskC->isNullValue()) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ if (MaskC->isAllOnesValue()) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+ Value *Shifted = IC.Builder.CreateLShr(Masked,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ return IC.replaceInstUsesWith(II, Shifted);
+ }
+
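+ // Constant fold, e.g. pext(Src=0b1011, Mask=0b0110) gathers Src bits 1-2
+ // into the low bits and yields 0b01.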
+ if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Src = SrcC->getZExtValue();
+ uint64_t Mask = MaskC->getZExtValue();
+ uint64_t Result = 0;
+ uint64_t BitToSet = 1;
+
+ while (Mask) {
+ // Isolate lowest set bit.
+ uint64_t BitToTest = Mask & -Mask;
+ if (BitToTest & Src)
+ Result |= BitToSet;
+
+ BitToSet <<= 1;
+ // Clear lowest set bit.
+ Mask &= Mask - 1;
+ }
+
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ }
+ break;
+ case Intrinsic::x86_bmi_pdep_32:
+ case Intrinsic::x86_bmi_pdep_64:
+ if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
+ if (MaskC->isNullValue()) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
+ }
+ if (MaskC->isAllOnesValue()) {
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Shifted = IC.Builder.CreateShl(Input,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+ return IC.replaceInstUsesWith(II, Masked);
+ }
+
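+ // Constant fold, e.g. pdep(Src=0b10, Mask=0b0110) scatters the low Src
+ // bits into mask positions 1-2 and yields 0b0100.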
+ if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ uint64_t Src = SrcC->getZExtValue();
+ uint64_t Mask = MaskC->getZExtValue();
+ uint64_t Result = 0;
+ uint64_t BitToTest = 1;
+
+ while (Mask) {
+ // Isolate lowest set bit.
+ uint64_t BitToSet = Mask & -Mask;
+ if (BitToTest & Src)
+ Result |= BitToSet;
+
+ BitToTest <<= 1;
+ // Clear lowest set bit.
+ Mask &= Mask - 1;
+ }
+
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), Result));
+ }
+ }
+ break;
+
+ case Intrinsic::x86_sse_cvtss2si:
+ case Intrinsic::x86_sse_cvtss2si64:
+ case Intrinsic::x86_sse_cvttss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse2_cvtsd2si:
+ case Intrinsic::x86_sse2_cvtsd2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_avx512_vcvtss2si32:
+ case Intrinsic::x86_avx512_vcvtss2si64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2si32:
+ case Intrinsic::x86_avx512_vcvtsd2si64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_cvttss2si:
+ case Intrinsic::x86_avx512_cvttss2si64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttsd2si:
+ case Intrinsic::x86_avx512_cvttsd2si64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ Value *Arg = II.getArgOperand(0);
+ unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx2_pmovmskb:
+ if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ case Intrinsic::x86_avx512_vcomi_ss:
+ case Intrinsic::x86_avx512_vcomi_sd:
+ case Intrinsic::x86_avx512_mask_cmp_ss:
+ case Intrinsic::x86_avx512_mask_cmp_sd: {
+ // These intrinsics only demand the 0th element of their input vectors. If
+ // we can simplify the input based on that, do so now.
+ bool MadeChange = false;
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ if (R->getValue() == 4) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+
+ Value *V;
+ switch (IID) {
+ default:
+ llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ V = IC.Builder.CreateFAdd(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
+ V = IC.Builder.CreateFSub(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ V = IC.Builder.CreateFMul(Arg0, Arg1);
+ break;
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ V = IC.Builder.CreateFDiv(Arg0, Arg1);
+ break;
+ }
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
+ // IR operations.
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
+ if (R->getValue() == 4) {
+ // Extract the element as scalars.
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
+ Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
+
+ Value *V;
+ switch (IID) {
+ default:
+ llvm_unreachable("Case stmts out of sync!");
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ V = IC.Builder.CreateFAdd(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ V = IC.Builder.CreateFSub(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ V = IC.Builder.CreateFMul(LHS, RHS);
+ break;
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ V = IC.Builder.CreateFDiv(LHS, RHS);
+ break;
+ }
+
+ // Handle the masking aspect of the intrinsic.
+ Value *Mask = II.getArgOperand(3);
+ auto *C = dyn_cast<ConstantInt>(Mask);
+ // We don't need a select if we know the mask bit is a 1.
+ if (!C || !C->getValue()[0]) {
+ // Cast the mask to an i1 vector and then extract the lowest element.
+ auto *MaskTy = FixedVectorType::get(
+ IC.Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
+ Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru =
+ IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
+ V = IC.Builder.CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ // Constant fold ashr( <A x Bi>, Ci ).
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
+ // operand to compute the shift amount.
+ Value *Arg1 = II.getArgOperand(1);
+ assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
+ "Unexpected packed shift size");
+ unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
+
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
+ return IC.replaceOperand(II, 1, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx512_psllv_w_128:
+ case Intrinsic::x86_avx512_psllv_w_256:
+ case Intrinsic::x86_avx512_psllv_w_512:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ case Intrinsic::x86_avx512_psrav_w_128:
+ case Intrinsic::x86_avx512_psrav_w_256:
+ case Intrinsic::x86_avx512_psrav_w_512:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx512_psrlv_w_128:
+ case Intrinsic::x86_avx512_psrlv_w_256:
+ case Intrinsic::x86_avx512_psrlv_w_512:
+ if (Value *V = simplifyX86varShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512:
+ if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512: {
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ unsigned Imm = C->getZExtValue();
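+ // Imm bit 0 selects the low/high 64-bit half of Arg0 and bit 4 the
+ // low/high half of Arg1; only the selected half of each operand is demanded.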
+
+ bool MadeChange = false;
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ unsigned VWidth =
+ cast<FixedVectorType>(Arg0->getType())->getNumElements();
+
+ APInt UndefElts1(VWidth, 0);
+ APInt DemandedElts1 =
+ APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
+ if (Value *V =
+ IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+
+ APInt UndefElts2(VWidth, 0);
+ APInt DemandedElts2 =
+ APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
+ if (Value *V =
+ IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+
+ // If all demanded elements of either input are undef, the result is zero.
+ if (DemandedElts1.isSubsetOf(UndefElts1) ||
+ DemandedElts2.isSubsetOf(UndefElts2)) {
+ return IC.replaceInstUsesWith(II,
+ ConstantAggregateZero::get(II.getType()));
+ }
+
+ if (MadeChange) {
+ return &II;
+ }
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_insertps:
+ if (Value *V = simplifyX86insertps(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse4a_extrq: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 16 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CILength =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
+ : nullptr;
+ ConstantInt *CIIndex =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
+ if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
+ // operand and the lowest 16-bits of the second.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_extrqi: {
+ // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
+ // bits of the lower 64-bits. The upper 64-bits are undefined.
+ Value *Op0 = II.getArgOperand(0);
+ unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertq: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
+ cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
+ "Unexpected operand size");
+
+ // See if we're dealing with constant values.
+ Constant *C1 = dyn_cast<Constant>(Op1);
+ ConstantInt *CI11 =
+ C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
+ : nullptr;
+
+ // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
+ if (CI11) {
+ const APInt &V11 = CI11->getValue();
+ APInt Len = V11.zextOrTrunc(6);
+ APInt Idx = V11.lshr(8).zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+
+ // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
+ // operand.
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
+ return IC.replaceOperand(II, 0, V);
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse4a_insertqi: {
+ // INSERTQI: Extract lowest Length bits from lower half of second source and
+ // insert over first source starting at Index bit. The upper 64-bits are
+ // undefined.
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
+ assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
+ Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
+ VWidth1 == 2 && "Unexpected operand sizes");
+
+ // See if we're dealing with constant values.
+ ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
+
+ // Attempt to simplify to a constant or shuffle vector.
+ if (CILength && CIIndex) {
+ APInt Len = CILength->getValue().zextOrTrunc(6);
+ APInt Idx = CIIndex->getValue().zextOrTrunc(6);
+ if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+
+ // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
+ // operands.
+ bool MadeChange = false;
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
+ IC.replaceOperand(II, 0, V);
+ MadeChange = true;
+ }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
+ IC.replaceOperand(II, 1, V);
+ MadeChange = true;
+ }
+ if (MadeChange) {
+ return &II;
+ }
+ break;
+ }
+
+ case Intrinsic::x86_sse41_pblendvb:
+ case Intrinsic::x86_sse41_blendvps:
+ case Intrinsic::x86_sse41_blendvpd:
+ case Intrinsic::x86_avx_blendv_ps_256:
+ case Intrinsic::x86_avx_blendv_pd_256:
+ case Intrinsic::x86_avx2_pblendvb: {
+ // fold (blend A, A, Mask) -> A
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Mask = II.getArgOperand(2);
+ if (Op0 == Op1) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // Zero Mask - select 1st argument.
+ if (isa<ConstantAggregateZero>(Mask)) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
+ if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
+ Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
+ return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
+ }
+
+ // Convert to a vector select if we can bypass casts and find a boolean
+ // vector condition value.
+ Value *BoolVec;
+ Mask = InstCombiner::peekThroughBitcast(Mask);
+ if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
+ BoolVec->getType()->isVectorTy() &&
+ BoolVec->getType()->getScalarSizeInBits() == 1) {
+ assert(Mask->getType()->getPrimitiveSizeInBits() ==
+ II.getType()->getPrimitiveSizeInBits() &&
+ "Not expecting mask and operands with different sizes");
+
+ unsigned NumMaskElts =
+ cast<FixedVectorType>(Mask->getType())->getNumElements();
+ unsigned NumOperandElts =
+ cast<FixedVectorType>(II.getType())->getNumElements();
+ if (NumMaskElts == NumOperandElts) {
+ return SelectInst::Create(BoolVec, Op1, Op0);
+ }
+
+ // If the mask has fewer elements than the operands, each mask bit maps to
+ // multiple elements of the operands. Bitcast back and forth.
+ if (NumMaskElts < NumOperandElts) {
+ Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
+ Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
+ Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
+ return new BitCastInst(Sel, II.getType());
+ }
+ }
+
+ break;
+ }
+
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps:
+ case Intrinsic::x86_avx512_permvar_df_256:
+ case Intrinsic::x86_avx512_permvar_df_512:
+ case Intrinsic::x86_avx512_permvar_di_256:
+ case Intrinsic::x86_avx512_permvar_di_512:
+ case Intrinsic::x86_avx512_permvar_hi_128:
+ case Intrinsic::x86_avx512_permvar_hi_256:
+ case Intrinsic::x86_avx512_permvar_hi_512:
+ case Intrinsic::x86_avx512_permvar_qi_128:
+ case Intrinsic::x86_avx512_permvar_qi_256:
+ case Intrinsic::x86_avx512_permvar_qi_512:
+ case Intrinsic::x86_avx512_permvar_sf_512:
+ case Intrinsic::x86_avx512_permvar_si_512:
+ if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_avx_maskload_ps:
+ case Intrinsic::x86_avx_maskload_pd:
+ case Intrinsic::x86_avx_maskload_ps_256:
+ case Intrinsic::x86_avx_maskload_pd_256:
+ case Intrinsic::x86_avx2_maskload_d:
+ case Intrinsic::x86_avx2_maskload_q:
+ case Intrinsic::x86_avx2_maskload_d_256:
+ case Intrinsic::x86_avx2_maskload_q_256:
+ if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
+ return I;
+ }
+ break;
+
+ case Intrinsic::x86_sse2_maskmov_dqu:
+ case Intrinsic::x86_avx_maskstore_ps:
+ case Intrinsic::x86_avx_maskstore_pd:
+ case Intrinsic::x86_avx_maskstore_ps_256:
+ case Intrinsic::x86_avx_maskstore_pd_256:
+ case Intrinsic::x86_avx2_maskstore_d:
+ case Intrinsic::x86_avx2_maskstore_q:
+ case Intrinsic::x86_avx2_maskstore_d_256:
+ case Intrinsic::x86_avx2_maskstore_q_256:
+ if (simplifyX86MaskedStore(II, IC)) {
+ return nullptr;
+ }
+ break;
+
+ case Intrinsic::x86_addcarry_32:
+ case Intrinsic::x86_addcarry_64:
+ if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ default:
+ break;
+ }
+ return None;
+}
+
+Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
+ bool &KnownBitsComputed) const {
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_sse2_pmovmskb_128:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_avx2_pmovmskb: {
+ // MOVMSK copies the vector elements' sign bits to the low bits
+ // and zeros the high bits.
+ unsigned ArgWidth;
+ if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
+ ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
+ } else {
+ auto Arg = II.getArgOperand(0);
+ auto ArgType = cast<FixedVectorType>(Arg->getType());
+ ArgWidth = ArgType->getNumElements();
+ }
+
+ // If we don't need any of low bits then return zero,
+ // we know that DemandedMask is non-zero already.
+ APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
+ Type *VTy = II.getType();
+ if (DemandedElts.isNullValue()) {
+ return ConstantInt::getNullValue(VTy);
+ }
+
+ // We know that the upper bits are set to zero.
+ Known.Zero.setBitsFrom(ArgWidth);
+ KnownBitsComputed = true;
+ break;
+ }
+ }
+ return None;
+}
+
+Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ simplifyAndSetOp) const {
+ unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::x86_xop_vfrcz_ss:
+ case Intrinsic::x86_xop_vfrcz_sd:
+ // The instructions for these intrinsics are specified to zero the upper
+ // bits, not pass them through like other scalar intrinsics. So we shouldn't just
+ // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
+ // Instead we should return a zero vector.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return ConstantAggregateZero::get(II.getType());
+ }
+
+ // Only the lower element is used.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // Only the lower element is undefined. The high elements are zero.
+ UndefElts = UndefElts[0];
+ break;
+
+ // Unary scalar-as-vector operations that work column-wise.
+ case Intrinsic::x86_sse_rcp_ss:
+ case Intrinsic::x86_sse_rsqrt_ss:
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+ // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
+ // checks).
+ break;
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0. The low element is a function of both
+ // operands.
+ case Intrinsic::x86_sse_min_ss:
+ case Intrinsic::x86_sse_max_ss:
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_min_sd:
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_cmp_sd: {
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+
+ // Lower element is undefined if both lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0])
+ UndefElts.clearBit(0);
+
+ break;
+ }
+
+ // Binary scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element comes from operand 1.
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ // Don't use the low element of operand 0.
+ APInt DemandedElts2 = DemandedElts;
+ DemandedElts2.clearBit(0);
+ simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+
+ // Take the high undef elements from operand 0 and take the lower element
+ // from operand 1.
+ UndefElts.clearBit(0);
+ UndefElts |= UndefElts2[0];
+ break;
+ }
+
+ // Three input scalar-as-vector operations that work column-wise. The high
+ // elements come from operand 0 and the low element is a function of all
+ // three inputs.
+ case Intrinsic::x86_avx512_mask_add_ss_round:
+ case Intrinsic::x86_avx512_mask_div_ss_round:
+ case Intrinsic::x86_avx512_mask_mul_ss_round:
+ case Intrinsic::x86_avx512_mask_sub_ss_round:
+ case Intrinsic::x86_avx512_mask_max_ss_round:
+ case Intrinsic::x86_avx512_mask_min_ss_round:
+ case Intrinsic::x86_avx512_mask_add_sd_round:
+ case Intrinsic::x86_avx512_mask_div_sd_round:
+ case Intrinsic::x86_avx512_mask_mul_sd_round:
+ case Intrinsic::x86_avx512_mask_sub_sd_round:
+ case Intrinsic::x86_avx512_mask_max_sd_round:
+ case Intrinsic::x86_avx512_mask_min_sd_round:
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+
+ // If lowest element of a scalar op isn't used then use Arg0.
+ if (!DemandedElts[0]) {
+ IC.addToWorklist(&II);
+ return II.getArgOperand(0);
+ }
+
+ // Only lower element is used for operand 1 and 2.
+ DemandedElts = 1;
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
+
+ // Lower element is undefined if all three lower elements are undefined.
+ // Consider things like undef&0. The result is known zero, not undef.
+ if (!UndefElts2[0] || !UndefElts3[0])
+ UndefElts.clearBit(0);
+ break;
+
+ // TODO: Add fmaddsub support?
+ case Intrinsic::x86_sse3_addsub_pd:
+ case Intrinsic::x86_sse3_addsub_ps:
+ case Intrinsic::x86_avx_addsub_pd_256:
+ case Intrinsic::x86_avx_addsub_ps_256: {
+ // If none of the even or none of the odd lanes are required, turn this
+ // into a generic FP math instruction.
+ APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
+ APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
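+ // SubMask covers the even elements (which take the FSUB result) and
+ // AddMask the odd elements (which take the FADD result).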
+ bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
+ bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
+ if (IsSubOnly || IsAddOnly) {
+ assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&II);
+ Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
+ return IC.Builder.CreateBinOp(
+ IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
+ }
+
+ simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
+ UndefElts &= UndefElts2;
+ break;
+ }
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512: {
+ auto *Ty0 = II.getArgOperand(0)->getType();
+ unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
+ assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+ unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+ unsigned VWidthPerLane = VWidth / NumLanes;
+ unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+ // Per lane, pack the elements of the first input and then the second.
+ // e.g.
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+ // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+ for (int OpNum = 0; OpNum != 2; ++OpNum) {
+ APInt OpDemandedElts(InnerVWidth, 0);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LaneIdx = Lane * VWidthPerLane;
+ for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+ unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+ if (DemandedElts[Idx])
+ OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+ }
+ }
+
+ // Demand elements from the operand.
+ APInt OpUndefElts(InnerVWidth, 0);
+ simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
+
+ // Pack the operand's UNDEF elements, one lane at a time.
+ OpUndefElts = OpUndefElts.zext(VWidth);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+ LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+ LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
+ UndefElts |= LaneElts;
+ }
+ }
+ break;
+ }
+
+ // PSHUFB
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ // PERMILVAR
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ // PERMV
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps: {
+ simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
+ break;
+ }
+
+ // SSE4A instructions leave the upper 64-bits of the 128-bit result
+ // in an undefined state.
+ case Intrinsic::x86_sse4a_extrq:
+ case Intrinsic::x86_sse4a_extrqi:
+ case Intrinsic::x86_sse4a_insertq:
+ case Intrinsic::x86_sse4a_insertqi:
+ UndefElts.setHighBits(VWidth / 2);
+ break;
+ }
+ return None;
+}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td
index e4f3290cab..e9ec53476c 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td
@@ -16,21 +16,21 @@
let Predicates = [HasAMXTILE, In64BitMode] in {
let SchedRW = [WriteSystem] in {
- let hasSideEffects = 1,
- Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ let hasSideEffects = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"ldtilecfg\t$src",
[(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
- let hasSideEffects = 1 in
+ let hasSideEffects = 1 in
def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
"sttilecfg\t$src",
[(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
- let mayLoad = 1 in
+ let mayLoad = 1 in
def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloadd\t{$src, $dst|$dst, $src}", []>,
VEX, T8XD;
- let mayLoad = 1 in
+ let mayLoad = 1 in
def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
(ins sibmem:$src),
"tileloaddt1\t{$src, $dst|$dst, $src}", []>,
@@ -38,7 +38,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
"tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
- let mayStore = 1 in
+ let mayStore = 1 in
def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
(ins sibmem:$dst, TILE:$src),
"tilestored\t{$src, $dst|$dst, $src}", []>,
@@ -47,25 +47,25 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
"tilezero\t$dst", []>,
VEX, T8XD;
- // Pseduo instruction for RA.
- let hasSideEffects = 1, mayLoad = 1,
- Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
- def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
-
- let hasSideEffects = 1, mayStore = 1 in
- def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
-
- def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2,
- opaquemem:$src3,
- TILECFG:$cfg), []>;
- def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
- GR16:$src2, opaquemem:$src3,
- TILE:$src4, TILECFG:$cfg), []>;
- def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2,
- TILECFG:$cfg), []>;
-
+ // Pseudo instruction for RA.
+ let hasSideEffects = 1, mayLoad = 1,
+ Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
+
+ let hasSideEffects = 1, mayStore = 1 in
+ def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
+
+ def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3,
+ TILECFG:$cfg), []>;
+ def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
+ GR16:$src2, opaquemem:$src3,
+ TILE:$src4, TILECFG:$cfg), []>;
+ def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2,
+ TILECFG:$cfg), []>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
@@ -74,7 +74,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in {
sibmem:$src2), []>;
def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
- [(int_x86_tilezero timm:$src)]>;
+ [(int_x86_tilezero timm:$src)]>;
}
} // SchedRW
} // HasAMXTILE
@@ -100,31 +100,31 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
VEX_4V, T8PS;
}
- // Pseduo instruction for RA.
- let Constraints = "$src4 = $dst" in
- def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
-
+ // Pseudo instruction for RA.
+ let Constraints = "$src4 = $dst" in
+ def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
// To be translated to the actual instructions in X86ISelLowering.cpp
def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbssd timm:$src1,
- timm:$src2, timm:$src3)]>;
+ [(int_x86_tdpbssd timm:$src1,
+ timm:$src2, timm:$src3)]>;
def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbsud timm:$src1,
- timm:$src2, timm:$src3)]>;
+ [(int_x86_tdpbsud timm:$src1,
+ timm:$src2, timm:$src3)]>;
def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbusd timm:$src1,
- timm:$src2, timm:$src3)]>;
+ [(int_x86_tdpbusd timm:$src1,
+ timm:$src2, timm:$src3)]>;
def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbuud timm:$src1,
- timm:$src2, timm:$src3)]>;
+ [(int_x86_tdpbuud timm:$src1,
+ timm:$src2, timm:$src3)]>;
}
}
} // HasAMXTILE
@@ -142,8 +142,8 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
// To be translated to the actual instructions in X86ISelLowering.cpp
def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
u8imm:$src2, u8imm:$src3),
- [(int_x86_tdpbf16ps timm:$src1,
- timm:$src2, timm:$src3)]>;
+ [(int_x86_tdpbf16ps timm:$src1,
+ timm:$src2, timm:$src3)]>;
}
}
} // HasAMXTILE, HasAMXBF16
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td
index 19012797ae..654dcc1b39 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td
@@ -1123,10 +1123,10 @@ defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
EXTRACT_get_vextract256_imm, [HasAVX512]>;
// vextractps - extract 32 bits from XMM
-def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32orGR64:$dst),
+def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32orGR64:$dst),
(ins VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+ [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
EVEX, VEX_WIG, Sched<[WriteVecExtract]>;
def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
@@ -1414,12 +1414,12 @@ defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
- X86VectorVTInfo _Dst,
- X86VectorVTInfo _Src> {
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _Dst,
+ X86VectorVTInfo _Src> {
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
- (_Dst.VT (OpNode addr:$src))>,
+ (_Dst.VT (OpNode addr:$src))>,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
@@ -1428,14 +1428,14 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
// the unmasked patterns so that we only use the DQ instructions when masking
// is requested.
multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
- X86VectorVTInfo _Dst,
- X86VectorVTInfo _Src> {
+ SDPatternOperator OpNode,
+ X86VectorVTInfo _Dst,
+ X86VectorVTInfo _Src> {
let hasSideEffects = 0, mayLoad = 1 in
defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(null_frag),
- (_Dst.VT (OpNode addr:$src))>,
+ (_Dst.VT (OpNode addr:$src))>,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
@@ -1445,194 +1445,194 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
//
defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
- X86SubVBroadcastld128, v16i32_info, v4i32x_info>,
+ X86SubVBroadcastld128, v16i32_info, v4i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
- X86SubVBroadcastld128, v16f32_info, v4f32x_info>,
+ X86SubVBroadcastld128, v16f32_info, v4f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4",
- X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W,
+ X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
- X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W,
+ X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasAVX512] in {
-def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
+def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
- (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
- (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
+def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)),
+def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)),
+def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)),
(VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)),
+ (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4rm addr:$src)>;
-def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
- (VBROADCASTF32X4rm addr:$src)>;
-def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
- (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4rm addr:$src)>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
(v16f32 immAllZerosV)),
(VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
(v16i32 immAllZerosV)),
(VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
(v8f64 immAllZerosV)),
(VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
(v8i64 immAllZerosV)),
(VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
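The broadcast patterns above fold a 128- or 256-bit load straight into VBROADCASTF32X4/VBROADCASTI64X4, and the vselect_mask patterns pick the masked rmk/rmkz forms. A hedged illustration of the intrinsic-level shape that exercises them (function and variable names are illustrative):

```cpp
#include <immintrin.h>

// Broadcast a 128-bit block of floats to all four lanes of a zmm register,
// zeroing elements whose mask bit is clear. With the load feeding the
// broadcast directly, this is expected to fold into the vbroadcastf32x4
// rmkz form matched above rather than a separate load plus broadcast.
__m512 broadcast128(const float *p, __mmask16 k) {
  __m128 lane = _mm_loadu_ps(p);
  return _mm512_maskz_broadcast_f32x4(k, lane);
}
```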
let Predicates = [HasVLX] in {
defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
- X86SubVBroadcastld128, v8i32x_info, v4i32x_info>,
+ X86SubVBroadcastld128, v8i32x_info, v4i32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
- X86SubVBroadcastld128, v8f32x_info, v4f32x_info>,
+ X86SubVBroadcastld128, v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
-def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF32X4Z256rm addr:$src)>;
-def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
- (VBROADCASTF32X4Z256rm addr:$src)>;
-def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
- (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
(v8f32 immAllZerosV)),
(VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
(v8i32 immAllZerosV)),
(VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
}
let Predicates = [HasVLX, HasDQI] in {
defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X,
+ X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X,
+ X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
(v4f64 immAllZerosV)),
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
(v4i64 immAllZerosV)),
(VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))),
VR256X:$src0),
(VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
}
let Predicates = [HasDQI] in {
defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
- X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W,
+ X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8",
- X86SubVBroadcastld256, v16i32_info, v8i32x_info>,
+ X86SubVBroadcastld256, v16i32_info, v8i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
- X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W,
+ X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
- X86SubVBroadcastld256, v16f32_info, v8f32x_info>,
+ X86SubVBroadcastld256, v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
// Patterns for selects of bitcasted operations.
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
(v16f32 immAllZerosV)),
(VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
(v16i32 immAllZerosV)),
(VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
+ (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))),
VR512:$src0),
(VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
(v8f64 immAllZerosV)),
(VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
(v8i64 immAllZerosV)),
(VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
def : Pat<(vselect_mask VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
+ (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))),
VR512:$src0),
(VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
@@ -2531,71 +2531,71 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
(!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
(X86cmpm_imm_commute timm:$cc))>;
-
- // Patterns for mask intrinsics.
- def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc,
- (_.KVT immAllOnesV)),
- (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>;
-
- def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1,
- _.RC:$src2, timm:$cc)>;
-
- def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
- (_.KVT immAllOnesV)),
- (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>;
-
- def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
- _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1,
- addr:$src2, timm:$cc)>;
-
- def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
- (_.KVT immAllOnesV)),
- (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>;
-
- def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
- _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1,
- addr:$src2, timm:$cc)>;
-
- // Patterns for mask intrinsics with loads in other operand.
- def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
- (_.KVT immAllOnesV)),
- (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
- (X86cmpm_imm_commute timm:$cc))>;
-
- def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
- _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
- _.RC:$src1, addr:$src2,
- (X86cmpm_imm_commute timm:$cc))>;
-
- def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
- (_.KVT immAllOnesV)),
- (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
- (X86cmpm_imm_commute timm:$cc))>;
-
- def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
- _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
- _.RC:$src1, addr:$src2,
- (X86cmpm_imm_commute timm:$cc))>;
+
+ // Patterns for mask intrinsics.
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1,
+ _.RC:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>;
+
+ def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1,
+ addr:$src2, timm:$cc)>;
+
+ // Patterns for mask intrinsics with loads in other operand.
+ def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ (_.KVT immAllOnesV)),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
+
+ def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc,
+ _.KRCWM:$mask),
+ (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (X86cmpm_imm_commute timm:$cc))>;
}
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...]
let Uses = [MXCSR] in
- defm rrib : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
- (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ defm rrib : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst),
+ (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $cc",
- [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
- (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))],
- [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
- (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>,
+ [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))],
+ [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>,
EVEX_B, Sched<[sched]>;
}
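The X86cmpmm patterns back the masked vector-compare intrinsics, including the commuted forms where the load sits in the first operand, and the SAE multiclass above covers the `{sae}` variant. A small sketch of the intrinsic usage they lower from, assuming AVX-512F and `<immintrin.h>` (function names illustrative):

```cpp
#include <immintrin.h>

// Compare under an existing mask; lanes with a clear bit in k stay 0.
__mmask16 masked_less(__mmask16 k, __m512 a, __m512 b) {
  return _mm512_mask_cmp_ps_mask(k, a, b, _CMP_LT_OS);   // vcmpps k{k1}
}

// Same compare with exceptions suppressed, the {sae} form.
__mmask16 masked_less_sae(__mmask16 k, __m512 a, __m512 b) {
  return _mm512_mask_cmp_round_ps_mask(k, a, b, _CMP_LT_OS,
                                       _MM_FROUND_NO_EXC);
}
```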
@@ -2855,8 +2855,8 @@ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
-def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>;
+def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>;
def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
(COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
@@ -2937,9 +2937,9 @@ let Predicates = [HasAVX512] in {
def : Pat<(insert_subvector (v16i1 immAllZerosV),
(v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)),
- (KMOVWkr (AND32ri8
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
- (i32 1)))>;
+ (KMOVWkr (AND32ri8
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit),
+ (i32 1)))>;
}
// Mask unary operation
@@ -6504,8 +6504,8 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512vl_f64_info, "PD">, VEX_W;
}
-defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
- fma, X86FmaddRnd>;
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma,
+ fma, X86FmaddRnd>;
defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub,
X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub,
@@ -6595,8 +6595,8 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512vl_f64_info, "PD">, VEX_W;
}
-defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
- fma, X86FmaddRnd>;
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma,
+ fma, X86FmaddRnd>;
defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub,
X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub,
@@ -6687,8 +6687,8 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512vl_f64_info, "PD">, VEX_W;
}
-defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
- fma, X86FmaddRnd>;
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma,
+ fma, X86FmaddRnd>;
defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub,
X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub,
@@ -6790,7 +6790,7 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", any_fma, X86FmaddRnd>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", any_fma, X86FmaddRnd>;
defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
@@ -6998,7 +6998,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp,
}
}
-defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
"SS", X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
"SS", X86Movss, v4f32x_info, fp32imm0>;
@@ -7007,7 +7007,7 @@ defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMA
defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
"SS", X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
+defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD",
"SD", X86Movsd, v2f64x_info, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
"SD", X86Movsd, v2f64x_info, fp64imm0>;
@@ -7540,7 +7540,7 @@ multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd,
X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
- let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
+ let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
@@ -7551,7 +7551,7 @@ multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeSAE,
X86FoldableSchedWrite sched,
X86VectorVTInfo _src, X86VectorVTInfo _dst> {
- let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
+ let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in {
defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>,
avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>,
EVEX_CD8<32, CD8VT1>, XS;
@@ -10879,7 +10879,7 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
def mr : AVX512Ii8<opc, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))),
+ [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))),
addr:$dst)]>,
EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>;
}
@@ -10890,7 +10890,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>,
+ (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>,
EVEX, TAPD, Sched<[WriteVecExtract]>;
defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
@@ -10903,7 +10903,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>,
+ (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>,
EVEX, PD, Sched<[WriteVecExtract]>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in
@@ -10943,13 +10943,13 @@ defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, PatFrag LdFrag,
- SDPatternOperator immoperator> {
+ X86VectorVTInfo _, PatFrag LdFrag,
+ SDPatternOperator immoperator> {
def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>,
+ (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
@@ -10960,10 +10960,10 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V,
Sched<[WriteVecInsert]>;
- defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>;
+ defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>;
}
}
@@ -10978,7 +10978,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
- _.ScalarLdFrag, imm>, TAPD;
+ _.ScalarLdFrag, imm>, TAPD;
}
}
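The extract/insert element classes above provide the register and memory forms of VPEXTRB/W and VPINSRB/W under EVEX. The same source-level intrinsics as the legacy SSE forms may be selected to them when AVX-512BW is available; a hedged sketch (SSE4.1-level intrinsics, names illustrative):

```cpp
#include <immintrin.h>
#include <cstdint>

int byte_at(__m128i v) {
  return _mm_extract_epi8(v, 3);            // vpextrb $3
}

__m128i set_word(__m128i v, int x) {
  return _mm_insert_epi16(v, x, 5);         // vpinsrw $5
}

void store_byte(uint8_t *p, __m128i v) {
  *p = (uint8_t)_mm_extract_epi8(v, 0);     // may be selected as the store (mr) form
}
```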
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td
index e83e1e74ff..37768af345 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td
@@ -1182,15 +1182,15 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
X86sub_flag, sub, 0, 1, 0>;
}
-// Version of XOR8rr_NOREX that use GR8_NOREX. This is used by the handling of
-// __builtin_parity where the last step xors an h-register with an l-register.
-let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst",
- Defs = [EFLAGS], isCommutable = 1 in
-def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
- (ins GR8_NOREX:$src1, GR8_NOREX:$src2),
- "xor{b}\t{$src2, $dst|$dst, $src2}", []>,
- Sched<[WriteALU]>;
-
+// Version of XOR8rr_NOREX that uses GR8_NOREX. This is used by the handling of
+// __builtin_parity where the last step xors an h-register with an l-register.
+let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst",
+ Defs = [EFLAGS], isCommutable = 1 in
+def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst),
+ (ins GR8_NOREX:$src1, GR8_NOREX:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}", []>,
+ Sched<[WriteALU]>;
+
// Arithmetic.
defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
1, 0>;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td
index dc6361aecc..49ab46522c 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td
@@ -73,32 +73,32 @@ let usesCustomInserter = 1, Defs = [EFLAGS] in {
def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
(outs),
(ins GR8:$al,
- i32imm:$regsavefi, i32imm:$offset,
+ i32imm:$regsavefi, i32imm:$offset,
variable_ops),
"#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
[(X86vastart_save_xmm_regs GR8:$al,
- timm:$regsavefi,
- timm:$offset),
+ timm:$regsavefi,
+ timm:$offset),
(implicit EFLAGS)]>;
-// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the
-// va_list, and place the address of the next argument into a register.
-let Defs = [EFLAGS] in {
+// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the
+// va_list, and place the address of the next argument into a register.
+let Defs = [EFLAGS] in {
def VAARG_64 : I<0, Pseudo,
(outs GR64:$dst),
(ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
"#VAARG_64 $dst, $ap, $size, $mode, $align",
[(set GR64:$dst,
- (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)),
- (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>;
-def VAARG_X32 : I<0, Pseudo,
- (outs GR32:$dst),
- (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
- "#VAARG_X32 $dst, $ap, $size, $mode, $align",
- [(set GR32:$dst,
- (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)),
- (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>;
-}
+ (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)),
+ (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>;
+def VAARG_X32 : I<0, Pseudo,
+ (outs GR32:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_X32 $dst, $ap, $size, $mode, $align",
+ [(set GR32:$dst,
+ (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)),
+ (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>;
+}
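VAARG_64 and VAARG_X32 implement the `va_arg` IR instruction for the System V x86-64 ABI, where the va_list is a structure and the pseudo yields the address of the next argument; whether a given front end emits that IR instruction or open-codes the logic itself varies. A sketch of the source shape involved (function name illustrative):

```cpp
#include <cstdarg>

// Sum `count` ints passed variadically. When the va_arg reaches instruction
// selection as an IR va_arg, it becomes the VAARG pseudo
// (LP64 -> VAARG_64, x32 -> VAARG_X32) and is expanded by the custom inserter.
int sum_ints(int count, ...) {
  va_list ap;
  va_start(ap, count);
  int total = 0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}
```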
// When using segmented stacks these are lowered into instructions which first
// check if the current stacklet has enough free memory. If it does, memory is
@@ -474,19 +474,19 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
- Requires<[In64BitMode, IsLP64]>;
+ Requires<[In64BitMode, IsLP64]>;
def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_base_addr64",
[(X86tlsbaseaddr tls64baseaddr:$sym)]>,
- Requires<[In64BitMode, IsLP64]>;
-def TLS_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
- "# TLS_addrX32",
- [(X86tlsaddr tls32addr:$sym)]>,
- Requires<[In64BitMode, NotLP64]>;
-def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
- "# TLS_base_addrX32",
- [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
- Requires<[In64BitMode, NotLP64]>;
+ Requires<[In64BitMode, IsLP64]>;
+def TLS_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_addrX32",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[In64BitMode, NotLP64]>;
+def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+ "# TLS_base_addrX32",
+ [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+ Requires<[In64BitMode, NotLP64]>;
}
// Darwin TLS Support
@@ -847,21 +847,21 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
}
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
- isCodeGenOnly = 1, usesCustomInserter = 1 in {
-def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
- "cmpxchg8b\t$ptr",
- [(X86cas8 addr:$ptr)]>, TB, LOCK;
-}
-
-let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
- isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
-def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
- "cmpxchg16b\t$ptr",
- []>, TB, LOCK;
+ Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, usesCustomInserter = 1 in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
+ "cmpxchg8b\t$ptr",
+ [(X86cas8 addr:$ptr)]>, TB, LOCK;
}
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
+def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
+ "cmpxchg16b\t$ptr",
+ []>, TB, LOCK;
+}
+
// This pseudo must be used when the frame uses RBX as
// the base pointer. Indeed, in such situation RBX is a reserved
// register and the register allocator will ignore any use/def of
@@ -869,64 +869,64 @@ def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
// RBX that will happen when setting the arguments for the instruction.
//
// Unlike the actual related instruction, we mark that this one
-// defines RBX (instead of using RBX).
+// defines RBX (instead of using RBX).
// The rationale is that we will define RBX during the expansion of
-// the pseudo. The argument feeding RBX is rbx_input.
+// the pseudo. The argument feeding RBX is rbx_input.
//
-// The additional argument, $rbx_save, is a temporary register used to
+// The additional argument, $rbx_save, is a temporary register used to
// save the value of RBX across the actual instruction.
//
-// To make sure the register assigned to $rbx_save does not interfere with
+// To make sure the register assigned to $rbx_save does not interfere with
// the definition of the actual instruction, we use a definition $dst which
// is tied to $rbx_save. That way, the live-range of $rbx_save spans across
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
-let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
- isCodeGenOnly = 1, isPseudo = 1,
- mayLoad = 1, mayStore = 1, hasSideEffects = 0,
- Constraints = "$rbx_save = $dst" in {
-def LCMPXCHG16B_SAVE_RBX :
- I<0, Pseudo, (outs GR64:$dst),
- (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>;
-}
-
-// Pseudo instruction that doesn't read/write RBX. Will be turned into either
-// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter.
-let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX],
- Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
- isCodeGenOnly = 1, isPseudo = 1,
- mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1,
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0,
+ Constraints = "$rbx_save = $dst" in {
+def LCMPXCHG16B_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>;
+}
+
+// Pseudo instruction that doesn't read/write RBX. Will be turned into either
+// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter.
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX],
+ Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
+ isCodeGenOnly = 1, isPseudo = 1,
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0,
usesCustomInserter = 1 in {
-def LCMPXCHG16B_NO_RBX :
- I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "",
- [(X86cas16 addr:$ptr, GR64:$rbx_input)]>;
+def LCMPXCHG16B_NO_RBX :
+ I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "",
+ [(X86cas16 addr:$ptr, GR64:$rbx_input)]>;
}
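LCMPXCHG16B hard-codes RBX, which can also be the frame's base pointer; the SAVE_RBX/NO_RBX pair lets the custom inserter either use RBX directly or shuffle it through `$rbx_save`. The source construct behind them is a 16-byte atomic compare-and-swap; a sketch assuming GCC/Clang `__atomic` builtins and `-mcx16` (helper name illustrative):

```cpp
#include <cstdint>

using u128 = unsigned __int128;

// Single 16-byte CAS; with -mcx16 this is selected through the LCMPXCHG16B
// pseudos (cmpxchg16b) instead of a libatomic call.
bool cas16(u128 *p, u128 &expected, u128 desired) {
  return __atomic_compare_exchange_n(p, &expected, desired,
                                     /*weak=*/false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
```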
-// This pseudo must be used when the frame uses RBX/EBX as
-// the base pointer.
-// cf comment for LCMPXCHG16B_SAVE_RBX.
-let Defs = [EBX], Uses = [ECX, EAX],
- Predicates = [HasMWAITX], SchedRW = [WriteSystem],
- isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst" in {
-def MWAITX_SAVE_RBX :
- I<0, Pseudo, (outs GR64:$dst),
- (ins GR32:$ebx_input, GR64:$rbx_save),
- "mwaitx",
- []>;
+// This pseudo must be used when the frame uses RBX/EBX as
+// the base pointer.
+// cf comment for LCMPXCHG16B_SAVE_RBX.
+let Defs = [EBX], Uses = [ECX, EAX],
+ Predicates = [HasMWAITX], SchedRW = [WriteSystem],
+ isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst" in {
+def MWAITX_SAVE_RBX :
+ I<0, Pseudo, (outs GR64:$dst),
+ (ins GR32:$ebx_input, GR64:$rbx_save),
+ "mwaitx",
+ []>;
}
-// Pseudo mwaitx instruction to use for custom insertion.
-let Predicates = [HasMWAITX], SchedRW = [WriteSystem],
- isCodeGenOnly = 1, isPseudo = 1,
+// Pseudo mwaitx instruction to use for custom insertion.
+let Predicates = [HasMWAITX], SchedRW = [WriteSystem],
+ isCodeGenOnly = 1, isPseudo = 1,
usesCustomInserter = 1 in {
-def MWAITX :
- I<0, Pseudo, (outs), (ins GR32:$ecx, GR32:$eax, GR32:$ebx),
- "mwaitx",
- [(int_x86_mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>;
+def MWAITX :
+ I<0, Pseudo, (outs), (ins GR32:$ecx, GR32:$eax, GR32:$ebx),
+ "mwaitx",
+ [(int_x86_mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>;
}
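The MWAITX pseudos mirror the same base-pointer concern for `mwaitx`, whose timeout value lives in EBX. A sketch of the intrinsic-level usage, assuming `-mmwaitx` and the `_mm_monitorx`/`_mm_mwaitx` intrinsics (function name and constant values illustrative):

```cpp
#include <x86intrin.h>

// Arm a monitor on *addr and wait, with a TSC-based timeout in `cycles`.
// The int_x86_mwaitx intrinsic here is what the MWAITX pseudo above carries.
void wait_on(volatile int *addr, unsigned cycles) {
  _mm_monitorx((void *)addr, /*extensions=*/0, /*hints=*/0);
  _mm_mwaitx(/*extensions=*/0x2, /*hints=*/0, cycles);  // bit 1 enables the timer
}
```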
-
+
defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>;
// Atomic exchange and add
@@ -1213,49 +1213,49 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
return true;
}]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
- (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
-def : Pat<(X86tcret (load addr:$dst), timm:$off),
- (TCRETURNmi addr:$dst, timm:$off)>,
+def : Pat<(X86tcret (load addr:$dst), timm:$off),
+ (TCRETURNmi addr:$dst, timm:$off)>,
Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;
-def : Pat<(X86tcret (i32 tglobaladdr:$dst), timm:$off),
- (TCRETURNdi tglobaladdr:$dst, timm:$off)>,
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), timm:$off),
+ (TCRETURNdi tglobaladdr:$dst, timm:$off)>,
Requires<[NotLP64]>;
-def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
- (TCRETURNdi texternalsym:$dst, timm:$off)>,
+def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
+ (TCRETURNdi texternalsym:$dst, timm:$off)>,
Requires<[NotLP64]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
- (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
-def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
- (TCRETURNmi64 addr:$dst, timm:$off)>,
+def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
+ (TCRETURNmi64 addr:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
- (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, UseIndirectThunkCalls]>;
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
- (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>,
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
+ (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[Not64BitMode, UseIndirectThunkCalls]>;
-def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off),
- (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>,
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>,
Requires<[IsLP64]>;
-def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off),
- (TCRETURNdi64 texternalsym:$dst, timm:$off)>,
+def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off),
+ (TCRETURNdi64 texternalsym:$dst, timm:$off)>,
Requires<[IsLP64]>;
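The X86tcret patterns select the TCRETURN family for calls in tail position, with the 6-register guard keeping enough scratch registers when a base+index load is folded, and the indirect-thunk variants covering retpoline-style configurations. Source that typically reaches them, as a sketch (subject to the usual tail-call legality rules; names illustrative):

```cpp
// With optimization enabled, a call in tail position is emitted as a jmp
// (TCRETURNdi64 / TCRETURNri64) instead of call+ret.
int callee(int);

int tail_direct(int x) {
  return callee(x + 1);          // direct tail call -> TCRETURNdi64
}

int tail_indirect(int (*fn)(int), int x) {
  return fn(x);                  // indirect tail call -> TCRETURNri64
}
```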
// Normal calls, with various flavors of addresses.
@@ -1344,18 +1344,18 @@ def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>;
// Any instruction that defines a 32-bit result leaves the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying
-// anything about the upper 32 bits, they're probably just qualifying a
-// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit
-// operation will zero-extend up to 64 bits.
+// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying
+// anything about the upper 32 bits, they're probably just qualifying a
+// CopyFromReg. FREEZE may be coming from a truncate. Any other 32-bit
+// operation will zero-extend up to 64 bits.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
N->getOpcode() != ISD::CopyFromReg &&
N->getOpcode() != ISD::AssertSext &&
- N->getOpcode() != ISD::AssertZext &&
- N->getOpcode() != ISD::AssertAlign &&
- N->getOpcode() != ISD::FREEZE;
+ N->getOpcode() != ISD::AssertZext &&
+ N->getOpcode() != ISD::AssertAlign &&
+ N->getOpcode() != ISD::FREEZE;
}]>;
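def32 captures the architectural rule that writing a 32-bit GPR zeroes bits 63:32, excluding the listed opcodes that merely forward a value rather than produce one. That is what makes a later zero-extension free; a hedged sketch (names illustrative):

```cpp
#include <cstdint>

// The 32-bit add already clears the upper half of the destination, so the
// widening to 64 bits needs no extra movzx: def32 lets instruction selection
// use the SUBREG_TO_REG form instead of an explicit zero-extend.
uint64_t widen_add(uint32_t a, uint32_t b) {
  uint32_t sum = a + b;          // 32-bit def, upper 32 bits are zeroed
  return sum;                    // the zext is folded away
}
```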
// In the case of a 32-bit def that is known to implicitly zero-extend,
@@ -1732,16 +1732,16 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
(EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>,
Requires<[In64BitMode]>;
-// Special pattern to catch the last step of __builtin_parity handling. Our
-// goal is to use an xor of an h-register with the corresponding l-register.
-// The above patterns would handle this on non 64-bit targets, but for 64-bit
-// we need to be more careful. We're using a NOREX instruction here in case
-// register allocation fails to keep the two registers together. So we need to
-// make sure we can't accidentally mix R8-R15 with an h-register.
-def : Pat<(X86xor_flag (i8 (trunc GR32:$src)),
- (i8 (trunc (srl_su GR32:$src, (i8 8))))),
- (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit),
- (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
+// Special pattern to catch the last step of __builtin_parity handling. Our
+// goal is to use an xor of an h-register with the corresponding l-register.
+// The above patterns would handle this on non 64-bit targets, but for 64-bit
+// we need to be more careful. We're using a NOREX instruction here in case
+// register allocation fails to keep the two registers together. So we need to
+// make sure we can't accidentally mix R8-R15 with an h-register.
+def : Pat<(X86xor_flag (i8 (trunc GR32:$src)),
+ (i8 (trunc (srl_su GR32:$src, (i8 8))))),
+ (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
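This is the tail of `__builtin_parity` lowering: after the value is xor-reduced to 16 bits, the final step xors the low byte with the byte above it, which needs an h-register and therefore the XOR8rr_NOREX defined in X86InstrArithmetic.td above. The corresponding source, as a sketch:

```cpp
#include <cstdint>

// The final step of the xor-reduction matches the
// (trunc x) ^ (trunc (srl x, 8)) shape above and selects XOR8rr_NOREX,
// keeping the h-register/l-register pairing legal under REX constraints.
// The exact sequence depends on the subtarget (POPCNT changes it entirely).
int parity32(uint32_t x) {
  return __builtin_parity(x);
}
```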
// (shl x, 1) ==> (add x, x)
// Note that if x is undef (immediate or otherwise), we could theoretically
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td
index f9be3a7832..bfaa75c0ce 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td
@@ -123,7 +123,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
- loadv4f32, loadv8f32, any_fma, v4f32, v8f32,
+ loadv4f32, loadv8f32, any_fma, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
@@ -138,7 +138,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
- loadv2f64, loadv4f64, any_fma, v2f64,
+ loadv2f64, loadv4f64, any_fma, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
@@ -319,7 +319,7 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
VR128, sdmem, sched>, VEX_W;
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma,
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma,
SchedWriteFMA.Scl>, VEX_LIG;
defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
@@ -372,12 +372,12 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}
}
-defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
@@ -538,7 +538,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
- defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32,
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
@@ -555,7 +555,7 @@ let ExeDomain = SSEPackedSingle in {
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
- defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32,
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -571,7 +571,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
- defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64,
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
@@ -588,7 +588,7 @@ let ExeDomain = SSEPackedDouble in {
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
- defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64,
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -629,12 +629,12 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name,
}
}
-defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td
index 961b4e5903..cfbfe39f88 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td
@@ -392,13 +392,13 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW, FPCW], mayLoad = 1 in {
-def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins anymem:$src), "fldenv\t$src">;
-def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins anymem:$src), "frstor\t$src">;
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins anymem:$src), "fldenv\t$src">;
+def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins anymem:$src), "frstor\t$src">;
}
let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in {
-def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins anymem:$dst), "fnstenv\t$dst">;
-def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins anymem:$dst), "fnsave\t$dst">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins anymem:$dst), "fnstenv\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins anymem:$dst), "fnsave\t$dst">;
}
let Uses = [FPSW], mayStore = 1 in
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp
index 17fe7f0bd3..1e3fb7f227 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp
@@ -300,13 +300,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
{ X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
{ X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
- { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
{ X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
{ X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
@@ -359,8 +359,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE },
- { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
- { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
@@ -371,8 +371,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
- { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
- { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
@@ -3748,26 +3748,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
- { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
+ { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
{ X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
{ X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
{ X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
- { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
- { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
+ { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
+ { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
{ X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
{ X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
{ X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
- { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
- { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
+ { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
+ { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
{ X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
{ X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
{ X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
- { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
- { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
+ { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
+ { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
{ X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
{ X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
{ X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
- { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
+ { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td
index 686b19fc0a..23cec4e363 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td
@@ -216,7 +216,7 @@ class T8XS : T8 { Prefix OpPrefix = XS; }
class TAPS : TA { Prefix OpPrefix = PS; }
class TAPD : TA { Prefix OpPrefix = PD; }
class TAXD : TA { Prefix OpPrefix = XD; }
-class TAXS : TA { Prefix OpPrefix = XS; }
+class TAXS : TA { Prefix OpPrefix = XS; }
class VEX { Encoding OpEnc = EncVEX; }
class VEX_W { bit HasVEX_W = 1; }
class VEX_WIG { bit IgnoresVEX_W = 1; }
@@ -264,9 +264,9 @@ class NotMemoryFoldable { bit isMemoryFoldable = 0; }
// Prevent EVEX->VEX conversion from considering this instruction.
class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
-// Force the instruction to use VEX encoding.
-class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
-
+// Force the instruction to use VEX encoding.
+class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr, Domain d = GenericDomain>
: Instruction {
@@ -351,7 +351,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
- bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
+ bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
@@ -380,7 +380,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{51-45} = CD8_Scale;
let TSFlags{52} = hasEVEX_RC;
let TSFlags{53} = hasNoTrackPrefix;
- let TSFlags{54} = ExplicitVEXPrefix;
+ let TSFlags{54} = ExplicitVEXPrefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td
index 777c5a158b..0d9595128f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -87,16 +87,16 @@ def X86multishift : SDNode<"X86ISD::MULTISHIFT",
SDTCisSameAs<1,2>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>,
- SDTCisVT<2, i8>]>>;
+ SDTCisVT<2, i8>]>>;
def X86pextrw : SDNode<"X86ISD::PEXTRW",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>,
- SDTCisVT<2, i8>]>>;
+ SDTCisVT<2, i8>]>>;
def X86pinsrb : SDNode<"X86ISD::PINSRB",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
+ SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
def X86pinsrw : SDNode<"X86ISD::PINSRW",
SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
+ SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>;
def X86insertps : SDNode<"X86ISD::INSERTPS",
SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
@@ -109,8 +109,8 @@ def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -209,21 +209,21 @@ def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
-def X86MaskCmpMaskCC :
- SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisSameAs<2, 1>,
- SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>;
+def X86MaskCmpMaskCC :
+ SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
+ SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>;
def X86CmpMaskCCScalar :
SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
-def X86cmpmm : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>;
+def X86cmpmm : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>;
def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>;
def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3),
[(X86strict_cmpm node:$src1, node:$src2, node:$src3),
(X86cmpm node:$src1, node:$src2, node:$src3)]>;
-def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>;
+def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
@@ -961,16 +961,16 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
-def X86SubVBroadcastld128 : PatFrag<(ops node:$src),
- (X86SubVBroadcastld node:$src), [{
- return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16;
-}]>;
-
-def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
- (X86SubVBroadcastld node:$src), [{
- return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32;
-}]>;
-
+def X86SubVBroadcastld128 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16;
+}]>;
+
+def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
+ (X86SubVBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32;
+}]>;
+
// Scalar SSE intrinsic fragments to match several different types of loads.
// Used by scalar SSE intrinsic instructions which have 128 bit types, but
// only load a single element.
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp
index d9bab14f0c..283ca4164f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp
@@ -28,7 +28,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -947,9 +947,9 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
}
/// Return true if register is PIC base; e.g. defined by X86::MOVPC32r.
-static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
+static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
// Don't waste compile time scanning use-def chains of physregs.
- if (!BaseReg.isVirtual())
+ if (!BaseReg.isVirtual())
return false;
bool isPICBase = false;
for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
@@ -1127,8 +1127,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
- if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
- MachineBasicBlock::LQR_Dead) {
+ if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
+ MachineBasicBlock::LQR_Dead) {
// The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
// effects.
int Value;
@@ -1206,7 +1206,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
isKill = Src.isKill();
assert(!Src.isUndef() && "Undef op doesn't need optimization");
- if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+ if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
return false;
return true;
@@ -1214,7 +1214,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// This is for an LEA64_32r and incoming registers are 32-bit. One way or
// another we need to add 64-bit registers to the final MI.
- if (SrcReg.isPhysical()) {
+ if (SrcReg.isPhysical()) {
ImplicitOp = Src;
ImplicitOp.setImplicit();
@@ -1409,8 +1409,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
// LEA can't handle RSP.
- if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
- Src.getReg(), &X86::GR64_NOSPRegClass))
+ if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
+ Src.getReg(), &X86::GR64_NOSPRegClass))
return nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
@@ -2566,10 +2566,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
- case X86::VPDPWSSDYrr:
- case X86::VPDPWSSDrr:
- case X86::VPDPWSSDSYrr:
- case X86::VPDPWSSDSrr:
+ case X86::VPDPWSSDYrr:
+ case X86::VPDPWSSDrr:
+ case X86::VPDPWSSDSYrr:
+ case X86::VPDPWSSDSrr:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:
@@ -3530,10 +3530,10 @@ X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
return None;
}
-static unsigned getLoadStoreRegOpcode(Register Reg,
+static unsigned getLoadStoreRegOpcode(Register Reg,
const TargetRegisterClass *RC,
- bool IsStackAligned,
- const X86Subtarget &STI, bool load) {
+ bool IsStackAligned,
+ const X86Subtarget &STI, bool load) {
bool HasAVX = STI.hasAVX();
bool HasAVX512 = STI.hasAVX512();
bool HasVLX = STI.hasVLX();
@@ -3606,7 +3606,7 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 16: {
if (X86::VR128XRegClass.hasSubClassEq(RC)) {
// If stack is realigned we can use aligned stores.
- if (IsStackAligned)
+ if (IsStackAligned)
return load ?
(HasVLX ? X86::VMOVAPSZ128rm :
HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
@@ -3638,7 +3638,7 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 32:
assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
- if (IsStackAligned)
+ if (IsStackAligned)
return load ?
(HasVLX ? X86::VMOVAPSZ256rm :
HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
@@ -3657,80 +3657,80 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
- if (IsStackAligned)
+ if (IsStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
}
-Optional<ExtAddrMode>
-X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
- const TargetRegisterInfo *TRI) const {
- const MCInstrDesc &Desc = MemI.getDesc();
- int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
- if (MemRefBegin < 0)
- return None;
-
- MemRefBegin += X86II::getOperandBias(Desc);
-
- auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
- if (!BaseOp.isReg()) // Can be an MO_FrameIndex
- return None;
-
- const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
- // Displacement can be symbolic
- if (!DispMO.isImm())
- return None;
-
- ExtAddrMode AM;
- AM.BaseReg = BaseOp.getReg();
- AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
- AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
- AM.Displacement = DispMO.getImm();
- return AM;
-}
-
-bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
- const Register Reg,
- int64_t &ImmVal) const {
- if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
- return false;
- // Mov Src can be a global address.
- if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
- return false;
- ImmVal = MI.getOperand(1).getImm();
- return true;
-}
-
-bool X86InstrInfo::preservesZeroValueInReg(
- const MachineInstr *MI, const Register NullValueReg,
- const TargetRegisterInfo *TRI) const {
- if (!MI->modifiesRegister(NullValueReg, TRI))
- return true;
- switch (MI->getOpcode()) {
- // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax
- // X.
- case X86::SHR64ri:
- case X86::SHR32ri:
- case X86::SHL64ri:
- case X86::SHL32ri:
- assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
- "expected for shift opcode!");
- return MI->getOperand(0).getReg() == NullValueReg &&
- MI->getOperand(1).getReg() == NullValueReg;
- // Zero extend of a sub-reg of NullValueReg into itself does not change the
- // null value.
- case X86::MOV32rr:
- return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
- return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
- });
- default:
- return false;
- }
- llvm_unreachable("Should be handled above!");
-}
-
+Optional<ExtAddrMode>
+X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const {
+ const MCInstrDesc &Desc = MemI.getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBegin < 0)
+ return None;
+
+ MemRefBegin += X86II::getOperandBias(Desc);
+
+ auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
+ if (!BaseOp.isReg()) // Can be an MO_FrameIndex
+ return None;
+
+ const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
+ // Displacement can be symbolic
+ if (!DispMO.isImm())
+ return None;
+
+ ExtAddrMode AM;
+ AM.BaseReg = BaseOp.getReg();
+ AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
+ AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
+ AM.Displacement = DispMO.getImm();
+ return AM;
+}
+
+bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
+ const Register Reg,
+ int64_t &ImmVal) const {
+ if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
+ return false;
+ // Mov Src can be a global address.
+ if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
+ return false;
+ ImmVal = MI.getOperand(1).getImm();
+ return true;
+}
+
+bool X86InstrInfo::preservesZeroValueInReg(
+ const MachineInstr *MI, const Register NullValueReg,
+ const TargetRegisterInfo *TRI) const {
+ if (!MI->modifiesRegister(NullValueReg, TRI))
+ return true;
+ switch (MI->getOpcode()) {
+ // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax
+ // X.
+ case X86::SHR64ri:
+ case X86::SHR32ri:
+ case X86::SHL64ri:
+ case X86::SHL32ri:
+ assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
+ "expected for shift opcode!");
+ return MI->getOperand(0).getReg() == NullValueReg &&
+ MI->getOperand(1).getReg() == NullValueReg;
+ // Zero extend of a sub-reg of NullValueReg into itself does not change the
+ // null value.
+ case X86::MOV32rr:
+ return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
+ return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
+ });
+ default:
+ return false;
+ }
+ llvm_unreachable("Should be handled above!");
+}
+
bool X86InstrInfo::getMemOperandsWithOffsetWidth(
const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
@@ -3775,17 +3775,17 @@ bool X86InstrInfo::getMemOperandsWithOffsetWidth(
return true;
}
-static unsigned getStoreRegOpcode(Register SrcReg,
+static unsigned getStoreRegOpcode(Register SrcReg,
const TargetRegisterClass *RC,
- bool IsStackAligned,
+ bool IsStackAligned,
const X86Subtarget &STI) {
- return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
+ return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
}
-static unsigned getLoadRegOpcode(Register DestReg,
+static unsigned getLoadRegOpcode(Register DestReg,
const TargetRegisterClass *RC,
- bool IsStackAligned, const X86Subtarget &STI) {
- return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
+ bool IsStackAligned, const X86Subtarget &STI) {
+ return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
}
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -3796,31 +3796,31 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
- if (RC->getID() == X86::TILERegClassID) {
- unsigned Opc = X86::TILESTORED;
- // tilestored %tmm, (%sp, %idx)
- MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
- Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
- BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
- MachineInstr *NewMI =
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill));
- MachineOperand &MO = NewMI->getOperand(2);
- MO.setReg(VirtReg);
- MO.setIsKill(true);
- } else if (RC->getID() == X86::TILECFGRegClassID) {
- unsigned Opc = X86::PSTTILECFG;
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill));
- } else {
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
- bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
- .addReg(SrcReg, getKillRegState(isKill));
- }
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILESTORED;
+ // tilestored %tmm, (%sp, %idx)
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ MachineOperand &MO = NewMI->getOperand(2);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PSTTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+ }
}
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -3828,32 +3828,32 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- if (RC->getID() == X86::TILERegClassID) {
- unsigned Opc = X86::TILELOADD;
- // tileloadd (%sp, %idx), %tmm
- MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
- Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
- MachineInstr *NewMI =
- BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
- NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
- FrameIdx);
- MachineOperand &MO = NewMI->getOperand(3);
- MO.setReg(VirtReg);
- MO.setIsKill(true);
- } else if (RC->getID() == X86::TILECFGRegClassID) {
- unsigned Opc = X86::PLDTILECFG;
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
- FrameIdx);
- } else {
- const MachineFunction &MF = *MBB.getParent();
- unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
- bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
- RI.canRealignStack(MF);
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
- FrameIdx);
- }
+ if (RC->getID() == X86::TILERegClassID) {
+ unsigned Opc = X86::TILELOADD;
+ // tileloadd (%sp, %idx), %tmm
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ MachineInstr *NewMI =
+ BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
+ NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ MachineOperand &MO = NewMI->getOperand(3);
+ MO.setReg(VirtReg);
+ MO.setIsKill(true);
+ } else if (RC->getID() == X86::TILECFGRegClassID) {
+ unsigned Opc = X86::PLDTILECFG;
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ } else {
+ const MachineFunction &MF = *MBB.getParent();
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
+ bool isAligned =
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
+ RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
+ FrameIdx);
+ }
}
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
@@ -4416,7 +4416,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
/// instructions in-between do not load or store, and have no side effects.
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
- Register &FoldAsLoadDefReg,
+ Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move DefMI here.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
@@ -4479,8 +4479,8 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
/// %k4 = K_SET1
/// to:
/// %k4 = KXNORrr %k0, %k0
-static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
- Register Reg) {
+static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
+ Register Reg) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
MIB->setDesc(Desc);
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
@@ -4926,7 +4926,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI.getOperand(0);
Register Reg = MO.getReg();
- if (Reg.isVirtual()) {
+ if (Reg.isVirtual()) {
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
return 0;
} else {
@@ -5224,12 +5224,12 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
/// high bits that are passed-through are not live.
unsigned
-X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
- const MachineOperand &MO = MI.getOperand(OpNum);
- if (Register::isPhysicalRegister(MO.getReg()) &&
- hasUndefRegUpdate(MI.getOpcode(), OpNum))
- return UndefRegClearance;
+ const MachineOperand &MO = MI.getOperand(OpNum);
+ if (Register::isPhysicalRegister(MO.getReg()) &&
+ hasUndefRegUpdate(MI.getOpcode(), OpNum))
+ return UndefRegClearance;
return 0;
}
@@ -5311,7 +5311,7 @@ static void updateOperandRegConstraints(MachineFunction &MF,
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
- if (!Reg.isVirtual())
+ if (!Reg.isVirtual())
continue;
auto *NewRC = MRI.constrainRegClass(
@@ -5562,10 +5562,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) {
unsigned Opcode = I->DstOp;
- bool FoldedLoad =
- isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
- bool FoldedStore =
- isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
+ bool FoldedLoad =
+ isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
+ bool FoldedStore =
+ isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
MaybeAlign MinAlign =
decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
if (MinAlign && Alignment < *MinAlign)
@@ -5576,25 +5576,25 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
&RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- // Check if it's safe to fold the load. If the size of the object is
- // narrower than the load width, then it's not.
- // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
- if (FoldedLoad && Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
+ if (FoldedLoad && Size < RCSize) {
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
- if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
- return nullptr;
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return nullptr;
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
return nullptr;
Opcode = X86::MOV32rm;
NarrowToMOV32rm = true;
}
- // For stores, make sure the size of the object is equal to the size of
- // the store. If the object is larger, the extra bits would be garbage. If
- // the object is smaller we might overwrite another object or fault.
- if (FoldedStore && Size != RCSize)
- return nullptr;
+ // For stores, make sure the size of the object is equal to the size of
+ // the store. If the object is larger, the extra bits would be garbage. If
+ // the object is smaller we might overwrite another object or fault.
+ if (FoldedStore && Size != RCSize)
+ return nullptr;
}
if (isTwoAddrFold)
@@ -5607,7 +5607,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// value and zero-extend the top bits. Change the destination register
// to a 32-bit one.
Register DstReg = NewMI->getOperand(0).getReg();
- if (DstReg.isPhysical())
+ if (DstReg.isPhysical())
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
@@ -6464,7 +6464,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
}
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
- llvm::append_range(BeforeOps, AfterOps);
+ llvm::append_range(BeforeOps, AfterOps);
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
switch (Opc) {
default: break;
@@ -6782,18 +6782,18 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
- const MachineBasicBlock *MBB,
- const MachineFunction &MF) const {
-
- // ENDBR instructions should not be scheduled around.
- unsigned Opcode = MI.getOpcode();
- if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32)
- return true;
-
- return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
-}
-
+bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+
+ // ENDBR instructions should not be scheduled around.
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32)
+ return true;
+
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
bool X86InstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
@@ -6824,7 +6824,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
- Register GlobalBaseReg = X86FI->getGlobalBaseReg();
+ Register GlobalBaseReg = X86FI->getGlobalBaseReg();
if (GlobalBaseReg != 0)
return GlobalBaseReg;
@@ -8380,7 +8380,7 @@ describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
// If the described register is a sub-register of the destination register,
// then pick out the source register's corresponding sub-register.
if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
- Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
+ Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
}
@@ -8644,7 +8644,7 @@ namespace {
return false;
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- Register GlobalBaseReg = X86FI->getGlobalBaseReg();
+ Register GlobalBaseReg = X86FI->getGlobalBaseReg();
// If we didn't need a GlobalBaseReg, don't insert code.
if (GlobalBaseReg == 0)
@@ -8657,7 +8657,7 @@ namespace {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const X86InstrInfo *TII = STI.getInstrInfo();
- Register PC;
+ Register PC;
if (STI.isPICStyleGOT())
PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
else
@@ -8727,7 +8727,7 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
};
-} // namespace
+} // namespace
char CGBR::ID = 0;
FunctionPass*
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h
index d7d2370c6f..6b0f1a9f14 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h
@@ -317,17 +317,17 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- Optional<ExtAddrMode>
- getAddrModeFromMemoryOp(const MachineInstr &MemI,
- const TargetRegisterInfo *TRI) const override;
-
- bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
- int64_t &ImmVal) const override;
-
- bool preservesZeroValueInReg(const MachineInstr *MI,
- const Register NullValueReg,
- const TargetRegisterInfo *TRI) const override;
-
+ Optional<ExtAddrMode>
+ getAddrModeFromMemoryOp(const MachineInstr &MemI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg,
+ int64_t &ImmVal) const override;
+
+ bool preservesZeroValueInReg(const MachineInstr *MI,
+ const Register NullValueReg,
+ const TargetRegisterInfo *TRI) const override;
+
bool getMemOperandsWithOffsetWidth(
const MachineInstr &LdSt,
SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
@@ -420,13 +420,13 @@ public:
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
int64_t &Offset2) const override;
- /// isSchedulingBoundary - Overrides the isSchedulingBoundary from
- /// Codegen/TargetInstrInfo.cpp to make it capable of identifying ENDBR
- /// instructions and prevent it from being re-scheduled.
- bool isSchedulingBoundary(const MachineInstr &MI,
- const MachineBasicBlock *MBB,
- const MachineFunction &MF) const override;
-
+ /// isSchedulingBoundary - Overrides the isSchedulingBoundary from
+ /// Codegen/TargetInstrInfo.cpp to make it capable of identifying ENDBR
+ /// instructions and prevent it from being re-scheduled.
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
/// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
/// should be scheduled together. On some targets if two loads are loading from
@@ -470,7 +470,7 @@ public:
unsigned
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
- unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
+ unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
@@ -525,7 +525,7 @@ public:
/// the machine instruction generated due to folding.
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
- Register &FoldAsLoadDefReg,
+ Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
std::pair<unsigned, unsigned>
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td
index b006d1d9aa..94e85df086 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td
@@ -69,8 +69,8 @@ def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
-def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
-def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>;
+def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>;
def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
@@ -94,11 +94,11 @@ def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
SDTCisVT<1, iPTR>,
SDTCisVT<2, iPTR>]>;
-def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
- SDTCisPtrTy<1>,
- SDTCisVT<2, i32>,
- SDTCisVT<3, i8>,
- SDTCisVT<4, i32>]>;
+def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i8>,
+ SDTCisVT<4, i32>]>;
def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
@@ -127,11 +127,11 @@ def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>;
-def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>,
- SDTCisVT<1, i32>,
- SDTCisVT<2, v2i64>,
- SDTCisPtrTy<3>]>;
-
+def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, v2i64>,
+ SDTCisPtrTy<3>]>;
+
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
[SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
@@ -169,10 +169,10 @@ def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair,
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair,
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -186,13 +186,13 @@ def X86vastart_save_xmm_regs :
SDT_X86VASTART_SAVE_XMM_REGS,
[SDNPHasChain, SDNPVariadic]>;
def X86vaarg64 :
- SDNode<"X86ISD::VAARG_64", SDT_X86VAARG,
- [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
- SDNPMemOperand]>;
-def X86vaargx32 :
- SDNode<"X86ISD::VAARG_X32", SDT_X86VAARG,
+ SDNode<"X86ISD::VAARG_64", SDT_X86VAARG,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore,
SDNPMemOperand]>;
+def X86vaargx32 :
+ SDNode<"X86ISD::VAARG_X32", SDT_X86VAARG,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
def X86callseq_start :
SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -280,7 +280,7 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
SDNPMemOperand]>;
def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
-def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
+def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
@@ -320,23 +320,23 @@ def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD,
[SDNPHasChain, SDNPSideEffect]>;
def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD,
[SDNPHasChain, SDNPSideEffect]>;
-def X86testui : SDNode<"X86ISD::TESTUI",
- SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
- [SDNPHasChain, SDNPSideEffect]>;
-
-def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL,
- [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
- SDNPMemOperand]>;
-def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL,
- [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
- SDNPMemOperand]>;
-def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL,
- [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
- SDNPMemOperand]>;
-def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL,
- [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
- SDNPMemOperand]>;
-
+def X86testui : SDNode<"X86ISD::TESTUI",
+ SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL,
+ [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+ SDNPMemOperand]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -914,8 +914,8 @@ def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
-def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
-def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
+def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
+def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
@@ -979,15 +979,15 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
-def HasKL : Predicate<"Subtarget->hasKL()">;
-def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">;
-def HasHRESET : Predicate<"Subtarget->hasHRESET()">;
+def HasKL : Predicate<"Subtarget->hasKL()">;
+def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">;
+def HasHRESET : Predicate<"Subtarget->hasHRESET()">;
def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">;
def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">;
def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
-def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
+def HasUINTR : Predicate<"Subtarget->hasUINTR()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
@@ -1035,7 +1035,7 @@ def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
-def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
+def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
@@ -1073,7 +1073,7 @@ def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>;
def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
-def i64timmSExt32 : TImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+def i64timmSExt32 : TImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{
return isSExtAbsoluteSymbolRef(8, N);
@@ -2679,11 +2679,11 @@ let Predicates = [HasBMI2] in {
//
let Predicates = [HasTBM], Defs = [EFLAGS] in {
-multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
- SDNode OpNode, Operand immtype,
- SDPatternOperator immoperator,
- X86FoldableSchedWrite Sched> {
+multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ SDNode OpNode, Operand immtype,
+ SDPatternOperator immoperator,
+ X86FoldableSchedWrite Sched> {
def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
@@ -2697,12 +2697,12 @@ multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr,
XOP, XOPA, Sched<[Sched.Folded]>;
}
-defm BEXTRI32 : tbm_bextri<0x10, GR32, "bextr{l}", i32mem, loadi32,
- X86bextri, i32imm, timm, WriteBEXTR>;
+defm BEXTRI32 : tbm_bextri<0x10, GR32, "bextr{l}", i32mem, loadi32,
+ X86bextri, i32imm, timm, WriteBEXTR>;
let ImmT = Imm32S in
-defm BEXTRI64 : tbm_bextri<0x10, GR64, "bextr{q}", i64mem, loadi64,
- X86bextri, i64i32imm,
- i64timmSExt32, WriteBEXTR>, VEX_W;
+defm BEXTRI64 : tbm_bextri<0x10, GR64, "bextr{q}", i64mem, loadi64,
+ X86bextri, i64i32imm,
+ i64timmSExt32, WriteBEXTR>, VEX_W;
multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
RegisterClass RC, string OpcodeStr,
@@ -2808,7 +2808,7 @@ let SchedRW = [ WriteSystem ] in {
let Uses = [ ECX, EAX, EBX ] in {
def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
- []>, TB, Requires<[ HasMWAITX ]>;
+ []>, TB, Requires<[ HasMWAITX ]>;
}
} // SchedRW
@@ -2925,41 +2925,41 @@ def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
-// INVLPGB Instruction
-// OPCODE 0F 01 FE
-//
-let SchedRW = [WriteSystem] in {
- let Uses = [EAX, EDX] in
- def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins),
- "invlpgb}", []>,
- PS, Requires<[Not64BitMode]>;
- let Uses = [RAX, EDX] in
- def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins),
- "invlpgb", []>,
- PS, Requires<[In64BitMode]>;
-} // SchedRW
-
-def : InstAlias<"invlpgb\t{%eax, %edx|eax, edx}", (INVLPGB32)>, Requires<[Not64BitMode]>;
-def : InstAlias<"invlpgb\t{%rax, %edx|rax, edx}", (INVLPGB64)>, Requires<[In64BitMode]>;
-
-//===----------------------------------------------------------------------===//
-// TLBSYNC Instruction
-// OPCODE 0F 01 FF
-//
-let SchedRW = [WriteSystem] in {
- def TLBSYNC : I<0x01, MRM_FF, (outs), (ins),
- "tlbsync", []>,
- PS, Requires<[]>;
-} // SchedRW
-
-//===----------------------------------------------------------------------===//
-// HRESET Instruction
-//
-let Uses = [EAX], SchedRW = [WriteSystem] in
- def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>,
- Requires<[HasHRESET]>, TAXS;
-
-//===----------------------------------------------------------------------===//
+// INVLPGB Instruction
+// OPCODE 0F 01 FE
+//
+let SchedRW = [WriteSystem] in {
+ let Uses = [EAX, EDX] in
+ def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins),
+ "invlpgb}", []>,
+ PS, Requires<[Not64BitMode]>;
+ let Uses = [RAX, EDX] in
+ def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins),
+ "invlpgb", []>,
+ PS, Requires<[In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"invlpgb\t{%eax, %edx|eax, edx}", (INVLPGB32)>, Requires<[Not64BitMode]>;
+def : InstAlias<"invlpgb\t{%rax, %edx|rax, edx}", (INVLPGB64)>, Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// TLBSYNC Instruction
+// OPCODE 0F 01 FF
+//
+let SchedRW = [WriteSystem] in {
+ def TLBSYNC : I<0x01, MRM_FF, (outs), (ins),
+ "tlbsync", []>,
+ PS, Requires<[]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// HRESET Instruction
+//
+let Uses = [EAX], SchedRW = [WriteSystem] in
+ def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>,
+ Requires<[HasHRESET]>, TAXS;
+
+//===----------------------------------------------------------------------===//
// SERIALIZE Instruction
//
def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
@@ -2977,25 +2977,25 @@ let Predicates = [HasTSXLDTRK] in {
}
//===----------------------------------------------------------------------===//
-// UINTR Instructions
-//
-let Predicates = [HasUINTR, In64BitMode] in {
- def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret",
- []>, XS;
- def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui",
- [(int_x86_clui)]>, XS;
- def STUI : I<0x01, MRM_EF, (outs), (ins), "stui",
- [(int_x86_stui)]>, XS;
-
- def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg",
- [(int_x86_senduipi GR64:$arg)]>, XS;
-
- let Defs = [EFLAGS] in
- def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui",
- [(set EFLAGS, (X86testui))]>, XS;
-}
-
-//===----------------------------------------------------------------------===//
+// UINTR Instructions
+//
+let Predicates = [HasUINTR, In64BitMode] in {
+ def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret",
+ []>, XS;
+ def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui",
+ [(int_x86_clui)]>, XS;
+ def STUI : I<0x01, MRM_EF, (outs), (ins), "stui",
+ [(int_x86_stui)]>, XS;
+
+ def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg",
+ [(int_x86_senduipi GR64:$arg)]>, XS;
+
+ let Defs = [EFLAGS] in
+ def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui",
+ [(set EFLAGS, (X86testui))]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
//===----------------------------------------------------------------------===//
@@ -3154,16 +3154,16 @@ include "X86InstrMPX.td"
include "X86InstrVMX.td"
include "X86InstrSVM.td"
-include "X86InstrSNP.td"
+include "X86InstrSNP.td"
include "X86InstrTSX.td"
include "X86InstrSGX.td"
-include "X86InstrTDX.td"
-
-// Key Locker instructions
-include "X86InstrKL.td"
-
+include "X86InstrTDX.td"
+
+// Key Locker instructions
+include "X86InstrKL.td"
+
// AMX instructions
include "X86InstrAMX.td"
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td
index b91e563a15..6f2fbb56e1 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td
@@ -1,86 +1,86 @@
-//===---------------------------*-tablegen-*-------------------------------===//
-//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the instructions that make up the Intel key locker
-// instruction set.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Key Locker instructions
-
-let SchedRW = [WriteSystem], Predicates = [HasKL] in {
- let Uses = [XMM0, EAX], Defs = [EFLAGS] in {
- def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
- "loadiwkey\t{$src2, $src1|$src1, $src2}",
- [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS,
- NotMemoryFoldable;
- }
-
- let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in {
- def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS,
- NotMemoryFoldable;
- }
-
- let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in {
- def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS,
- NotMemoryFoldable;
- }
-
- let Constraints = "$src1 = $dst",
- Defs = [EFLAGS] in {
- def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
- "aesenc128kl\t{$src2, $src1|$src1, $src2}",
- [(set VR128:$dst, EFLAGS,
- (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS,
- NotMemoryFoldable;
-
- def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
- "aesdec128kl\t{$src2, $src1|$src1, $src2}",
- [(set VR128:$dst, EFLAGS,
- (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS,
- NotMemoryFoldable;
-
- def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
- "aesenc256kl\t{$src2, $src1|$src1, $src2}",
- [(set VR128:$dst, EFLAGS,
- (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS,
- NotMemoryFoldable;
-
- def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
- "aesdec256kl\t{$src2, $src1|$src1, $src2}",
- [(set VR128:$dst, EFLAGS,
- (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS,
- NotMemoryFoldable;
- }
-
-} // SchedRW, Predicates
-
-let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in {
- let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
- Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
- mayLoad = 1 in {
- def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src),
- "aesencwide128kl\t$src", []>, T8XS,
- NotMemoryFoldable;
- def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src),
- "aesdecwide128kl\t$src", []>, T8XS,
- NotMemoryFoldable;
- def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src),
- "aesencwide256kl\t$src", []>, T8XS,
- NotMemoryFoldable;
- def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src),
- "aesdecwide256kl\t$src", []>, T8XS,
- NotMemoryFoldable;
- }
-
-} // SchedRW, Predicates
+//===---------------------------*-tablegen-*-------------------------------===//
+//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel key locker
+// instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Key Locker instructions
+
+let SchedRW = [WriteSystem], Predicates = [HasKL] in {
+ let Uses = [XMM0, EAX], Defs = [EFLAGS] in {
+ def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "loadiwkey\t{$src2, $src1|$src1, $src2}",
+ [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in {
+ def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in {
+ def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+ let Constraints = "$src1 = $dst",
+ Defs = [EFLAGS] in {
+ def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesenc128kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesdec128kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesenc256kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+
+ def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2),
+ "aesdec256kl\t{$src2, $src1|$src1, $src2}",
+ [(set VR128:$dst, EFLAGS,
+ (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS,
+ NotMemoryFoldable;
+ }
+
+} // SchedRW, Predicates
+
+let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in {
+ let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
+ Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7],
+ mayLoad = 1 in {
+ def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src),
+ "aesencwide128kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src),
+ "aesdecwide128kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src),
+ "aesencwide256kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src),
+ "aesdecwide256kl\t$src", []>, T8XS,
+ NotMemoryFoldable;
+ }
+
+} // SchedRW, Predicates
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td
index bb3e6df3bf..cb18a9b59a 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td
@@ -472,7 +472,7 @@ defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
SchedWriteVarShuffle.MMX>;
-let Predicates = [HasMMX, HasSSE1] in {
+let Predicates = [HasMMX, HasSSE1] in {
def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -486,7 +486,7 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
timm:$src2))]>,
Sched<[SchedWriteShuffle.MMX.Folded]>;
-}
+}
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td
index de59f3fe27..be95d70282 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td
@@ -1,47 +1,47 @@
-//===-- X86InstrSNP.td - SNP Instruction Set Extension -----*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the instructions that make up the AMD Secure Nested
-// Paging (SNP) instruction set.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SNP instructions
-
-let SchedRW = [WriteSystem] in {
-// F3 0F 01 FF
-let Uses = [RAX] in
-def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS,
- Requires<[In64BitMode]>;
-
-// F2 0F 01 FF
-let Uses = [RAX] in
-def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
- XD, Requires<[In64BitMode]>;
-
-let Uses = [EAX] in
-def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
- XD, Requires<[Not64BitMode]>;
-
-// F2 0F 01 FE
-let Uses = [RAX] in
-def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD,
- Requires<[In64BitMode]>;
-
-// F3 0F 01 FE
-let Uses = [RAX] in
-def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS,
- Requires<[In64BitMode]>;
-} // SchedRW
-
-def : InstAlias<"psmash\t{%rax|rax}", (PSMASH)>, Requires<[In64BitMode]>;
-def : InstAlias<"pvalidate\t{%rax|rax}", (PVALIDATE64)>, Requires<[In64BitMode]>;
-def : InstAlias<"pvalidate\t{%eax|eax}", (PVALIDATE32)>, Requires<[Not64BitMode]>;
-def : InstAlias<"rmpupdate\t{%rax|rax}", (RMPUPDATE)>, Requires<[In64BitMode]>;
-def : InstAlias<"rmpadjust\t{%rax|rax}", (RMPADJUST)>, Requires<[In64BitMode]>;
+//===-- X86InstrSNP.td - SNP Instruction Set Extension -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD Secure Nested
+// Paging (SNP) instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SNP instructions
+
+let SchedRW = [WriteSystem] in {
+// F3 0F 01 FF
+let Uses = [RAX] in
+def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS,
+ Requires<[In64BitMode]>;
+
+// F2 0F 01 FF
+let Uses = [RAX] in
+def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
+ XD, Requires<[In64BitMode]>;
+
+let Uses = [EAX] in
+def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>,
+ XD, Requires<[Not64BitMode]>;
+
+// F2 0F 01 FE
+let Uses = [RAX] in
+def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD,
+ Requires<[In64BitMode]>;
+
+// F3 0F 01 FE
+let Uses = [RAX] in
+def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+def : InstAlias<"psmash\t{%rax|rax}", (PSMASH)>, Requires<[In64BitMode]>;
+def : InstAlias<"pvalidate\t{%rax|rax}", (PVALIDATE64)>, Requires<[In64BitMode]>;
+def : InstAlias<"pvalidate\t{%eax|eax}", (PVALIDATE32)>, Requires<[Not64BitMode]>;
+def : InstAlias<"rmpupdate\t{%rax|rax}", (RMPUPDATE)>, Requires<[In64BitMode]>;
+def : InstAlias<"rmpadjust\t{%rax|rax}", (RMPADJUST)>, Requires<[In64BitMode]>;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td
index a185a2007b..29ac01b143 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td
@@ -1242,8 +1242,8 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
/// SSE 2 Only
// Convert scalar double to scalar single
-let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
- ExeDomain = SSEPackedSingle in {
+let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
+ ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1261,7 +1261,7 @@ def : Pat<(f32 (any_fpround FR64:$src)),
(VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
-let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
+let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (any_fpround FR64:$src))]>,
@@ -1273,7 +1273,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
}
-let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
+let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1307,7 +1307,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
+let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -1327,7 +1327,7 @@ def : Pat<(f64 (any_fpextend FR32:$src)),
def : Pat<(any_fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
-let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
+let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (any_fpextend FR32:$src))]>,
@@ -1339,8 +1339,8 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
} // isCodeGenOnly = 1
-let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
- ExeDomain = SSEPackedSingle in {
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
+ ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -3778,7 +3778,7 @@ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
VEX_4V, VEX_WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, VEX_WIG;
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
@@ -3794,7 +3794,7 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
VEX_4V, VEX_L, VEX_WIG;
defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, VEX_WIG;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3930,7 +3930,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
@@ -3940,7 +3940,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
- timm:$src3))]>,
+ timm:$src3))]>,
Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
@@ -3950,13 +3950,13 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- timm:$src2))]>,
+ timm:$src2))]>,
PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
- timm:$src2))]>,
+ timm:$src2))]>,
Sched<[WriteVecExtract]>;
// Insert
@@ -4756,7 +4756,7 @@ let isCommutable = 0 in {
SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
+ SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
@@ -4802,7 +4802,7 @@ let isCommutable = 0 in {
SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
+ SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
@@ -5153,14 +5153,14 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
- timm:$src2))]>,
+ timm:$src2))]>,
Sched<[WriteVecExtract]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
+ [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
@@ -5184,7 +5184,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
(ins i16mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
+ [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}
@@ -5274,7 +5274,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
@@ -5283,7 +5283,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
@@ -6503,7 +6503,7 @@ multiclass pcmpistrm_SS42AI<string asm> {
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
+ defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
@@ -6521,7 +6521,7 @@ multiclass SS42AI_pcmpestrm<string asm> {
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
+ defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}
@@ -6539,7 +6539,7 @@ multiclass SS42AI_pcmpistri<string asm> {
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
@@ -6557,7 +6557,7 @@ multiclass SS42AI_pcmpestri<string asm> {
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
let Predicates = [HasAVX] in
- defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}
@@ -7016,19 +7016,19 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
-def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
}
@@ -7164,68 +7164,68 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
-// AVX_VNNI
-//===----------------------------------------------------------------------===//
-let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
-multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- bit IsCommutable> {
- let isCommutable = IsCommutable in
- def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
- VR128:$src2, VR128:$src3)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
-
- def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
- (loadv4i32 addr:$src3))))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
-
- let isCommutable = IsCommutable in
- def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
- VR256:$src2, VR256:$src3)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
-
- def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
- (loadv8i32 addr:$src3))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
-}
-
-defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
-defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
-defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
-defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
-
-def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
- (X86vpmaddwd node:$lhs, node:$rhs), [{
- return N->hasOneUse();
-}]>;
-
-let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
- def : Pat<(v8i32 (add VR256:$src1,
- (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
- (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(v8i32 (add VR256:$src1,
- (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
- (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(v4i32 (add VR128:$src1,
- (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
- (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(v4i32 (add VR128:$src1,
- (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
- (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
-}
-
-//===----------------------------------------------------------------------===//
+// AVX_VNNI
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
+multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit IsCommutable> {
+ let isCommutable = IsCommutable in
+ def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
+ VR128:$src2, VR128:$src3)))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
+ (loadv4i32 addr:$src3))))]>,
+ VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+
+ let isCommutable = IsCommutable in
+ def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
+ VR256:$src2, VR256:$src3)))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
+
+ def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
+ (loadv8i32 addr:$src3))))]>,
+ VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
+}
+
+defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
+defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
+defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
+defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
+
+def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86vpmaddwd node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
+ def : Pat<(v8i32 (add VR256:$src1,
+ (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
+ (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+ def : Pat<(v8i32 (add VR256:$src1,
+ (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
+ (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
+ def : Pat<(v4i32 (add VR128:$src1,
+ (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
+ (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add VR128:$src1,
+ (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
+ (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
+}
+
+//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
@@ -7287,12 +7287,12 @@ let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
- "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
- "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+ "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
// Immediate transform to help with commuting.
@@ -7300,27 +7300,27 @@ def Perm2XCommuteImm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
-multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
- def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
- (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
- def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
- (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
- // Pattern with load in other operand.
- def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
- (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
- (Perm2XCommuteImm timm:$imm))>;
-}
-
+multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
+ def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
+ // Pattern with load in other operand.
+ def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
+ (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
+ (Perm2XCommuteImm timm:$imm))>;
+}
+
let Predicates = [HasAVX] in {
- defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
- defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
+ defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}
let Predicates = [HasAVX1Only] in {
- defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
- defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
- defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
- defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
+ defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
//===----------------------------------------------------------------------===//
@@ -7689,24 +7689,24 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
WriteFShuffle256, f256mem>, VEX_W;
//===----------------------------------------------------------------------===//
-// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
+// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
- "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
- "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
+ "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
-let Predicates = [HasAVX2] in {
- defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
- defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
- defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
- defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
-}
+let Predicates = [HasAVX2] in {
+ defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
+ defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
+ defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
+}
//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td
index d8f70b016c..2bc32910a6 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td
@@ -26,47 +26,47 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit", []>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit", []>, TB;
// 0F 01 D8
let Uses = [EAX] in
-def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB,
Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
-def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB,
Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
-def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
Requires<[Not64BitMode]>;
let Uses = [RAX] in
-def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB,
Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga", []>, TB, Requires<[Not64BitMode]>;
+ "invlpga", []>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga", []>, TB, Requires<[In64BitMode]>;
+ "invlpga", []>, TB, Requires<[In64BitMode]>;
} // SchedRW
-
-def : InstAlias<"skinit\t{%eax|eax}", (SKINIT), 0>;
-def : InstAlias<"vmrun\t{%eax|eax}", (VMRUN32), 0>, Requires<[Not64BitMode]>;
-def : InstAlias<"vmrun\t{%rax|rax}", (VMRUN64), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"vmload\t{%eax|eax}", (VMLOAD32), 0>, Requires<[Not64BitMode]>;
-def : InstAlias<"vmload\t{%rax|rax}", (VMLOAD64), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"vmsave\t{%eax|eax}", (VMSAVE32), 0>, Requires<[Not64BitMode]>;
-def : InstAlias<"vmsave\t{%rax|rax}", (VMSAVE64), 0>, Requires<[In64BitMode]>;
-def : InstAlias<"invlpga\t{%eax, %ecx|eax, ecx}", (INVLPGA32), 0>, Requires<[Not64BitMode]>;
-def : InstAlias<"invlpga\t{%rax, %ecx|rax, ecx}", (INVLPGA64), 0>, Requires<[In64BitMode]>;
+
+def : InstAlias<"skinit\t{%eax|eax}", (SKINIT), 0>;
+def : InstAlias<"vmrun\t{%eax|eax}", (VMRUN32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmrun\t{%rax|rax}", (VMRUN64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"vmload\t{%eax|eax}", (VMLOAD32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmload\t{%rax|rax}", (VMLOAD64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"vmsave\t{%eax|eax}", (VMSAVE32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"vmsave\t{%rax|rax}", (VMSAVE64), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"invlpga\t{%eax, %ecx|eax, ecx}", (INVLPGA32), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"invlpga\t{%rax, %ecx|rax, ecx}", (INVLPGA64), 0>, Requires<[In64BitMode]>;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td
index eb8740896e..3b105cedb3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td
@@ -49,7 +49,7 @@ let Uses = [EFLAGS] in
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
} // SchedRW
-def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>;
+def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>;
// The long form of "int $3" turns into int3 as a size optimization.
// FIXME: This doesn't work because InstAlias can't match immediate constants.
//def : InstAlias<"int\t$3", (INT3)>;
@@ -172,17 +172,17 @@ def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>;
} // SchedRW
//===----------------------------------------------------------------------===//
-// Address-size override prefixes.
-//
-
-let SchedRW = [WriteNop] in {
-def ADDR16_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr16", []>,
- Requires<[In32BitMode]>;
-def ADDR32_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr32", []>,
- Requires<[In64BitMode]>;
-} // SchedRW
-
-//===----------------------------------------------------------------------===//
+// Address-size override prefixes.
+//
+
+let SchedRW = [WriteNop] in {
+def ADDR16_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr16", []>,
+ Requires<[In32BitMode]>;
+def ADDR32_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr32", []>,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
// Moves to and from segment registers.
//
@@ -459,7 +459,7 @@ let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
// Cache instructions
let SchedRW = [WriteSystem] in {
def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
-def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS;
// wbnoinvd is like wbinvd, except without invalidation
// encoding: like wbinvd + an 0xF3 prefix
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td
index 8d7cd60820..e21028c8a3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td
@@ -1,39 +1,39 @@
-//===- X86InstrTDX.td - TDX Instruction Set Extension -*- tablegen -*===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the instructions that make up the Intel TDX instruction
-// set.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// TDX instructions
-
-// 64-bit only instructions
-let SchedRW = [WriteSystem], Predicates = [In64BitMode] in {
-// SEAMCALL - Call to SEAM VMX-root Operation Module
-def SEAMCALL : I<0x01, MRM_CF, (outs), (ins),
- "seamcall", []>, PD;
-
-// SEAMRET - Return to Legacy VMX-root Operation
-def SEAMRET : I<0x01, MRM_CD, (outs), (ins),
- "seamret", []>, PD;
-
-// SEAMOPS - SEAM Operations
-def SEAMOPS : I<0x01, MRM_CE, (outs), (ins),
- "seamops", []>, PD;
-
-} // SchedRW
-
-// common instructions
-let SchedRW = [WriteSystem] in {
-// TDCALL - Call SEAM Module Functions
-def TDCALL : I<0x01, MRM_CC, (outs), (ins),
- "tdcall", []>, PD;
-
-} // SchedRW
+//===- X86InstrTDX.td - TDX Instruction Set Extension -*- tablegen -*===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TDX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TDX instructions
+
+// 64-bit only instructions
+let SchedRW = [WriteSystem], Predicates = [In64BitMode] in {
+// SEAMCALL - Call to SEAM VMX-root Operation Module
+def SEAMCALL : I<0x01, MRM_CF, (outs), (ins),
+ "seamcall", []>, PD;
+
+// SEAMRET - Return to Legacy VMX-root Operation
+def SEAMRET : I<0x01, MRM_CD, (outs), (ins),
+ "seamret", []>, PD;
+
+// SEAMOPS - SEAM Operations
+def SEAMOPS : I<0x01, MRM_CE, (outs), (ins),
+ "seamops", []>, PD;
+
+} // SchedRW
+
+// common instructions
+let SchedRW = [WriteSystem] in {
+// TDCALL - Call SEAM Module Functions
+def TDCALL : I<0x01, MRM_CC, (outs), (ins),
+ "tdcall", []>, PD;
+
+} // SchedRW
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp
index ff53171303..8a3091af28 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp
@@ -214,8 +214,8 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) {
return SubIdx;
}
-static const TargetRegisterClass *getRegClassFromGRPhysReg(Register Reg) {
- assert(Reg.isPhysical());
+static const TargetRegisterClass *getRegClassFromGRPhysReg(Register Reg) {
+ assert(Reg.isPhysical());
if (X86::GR64RegClass.contains(Reg))
return &X86::GR64RegClass;
if (X86::GR32RegClass.contains(Reg))
@@ -239,7 +239,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- if (DstReg.isPhysical()) {
+ if (DstReg.isPhysical()) {
assert(I.isCopy() && "Generic operators do not allow physical registers");
if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID &&
@@ -266,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
- assert((!SrcReg.isPhysical() || I.isCopy()) &&
+ assert((!SrcReg.isPhysical() || I.isCopy()) &&
"No phys reg on generic operators");
assert((DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
- (SrcReg.isPhysical() &&
+ (SrcReg.isPhysical() &&
DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
"Copy with different width?!");
@@ -280,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
if (SrcRegBank.getID() == X86::GPRRegBankID &&
DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize &&
- SrcReg.isPhysical()) {
+ SrcReg.isPhysical()) {
// Change the physical register to performe truncate.
const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg);
@@ -479,7 +479,7 @@ static void X86SelectAddress(const MachineInstr &I,
"unsupported type.");
if (I.getOpcode() == TargetOpcode::G_PTR_ADD) {
- if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) {
+ if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
if (isInt<32>(Imm)) { // Check for displacement overflow.
AM.Disp = static_cast<int32_t>(Imm);
@@ -780,18 +780,18 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
- assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(16)) &&
- "8=>16 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(16)) &&
+ "8=>16 Zext is handled by tablegen");
assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(32)) &&
"8=>32 Zext is handled by tablegen");
assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(32)) &&
"16=>32 Zext is handled by tablegen");
- assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(64)) &&
- "8=>64 Zext is handled by tablegen");
- assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(64)) &&
- "16=>64 Zext is handled by tablegen");
- assert(!(SrcTy == LLT::scalar(32) && DstTy == LLT::scalar(64)) &&
- "32=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(64)) &&
+ "8=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(64)) &&
+ "16=>64 Zext is handled by tablegen");
+ assert(!(SrcTy == LLT::scalar(32) && DstTy == LLT::scalar(64)) &&
+ "32=>64 Zext is handled by tablegen");
if (SrcTy != LLT::scalar(1))
return false;
@@ -808,17 +808,17 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
else
return false;
- Register DefReg = SrcReg;
+ Register DefReg = SrcReg;
if (DstTy != LLT::scalar(8)) {
- Register ImpDefReg =
- MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
- BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg);
-
+ Register ImpDefReg =
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg);
+
DefReg = MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(TargetOpcode::INSERT_SUBREG), DefReg)
- .addReg(ImpDefReg)
+ TII.get(TargetOpcode::INSERT_SUBREG), DefReg)
+ .addReg(ImpDefReg)
.addReg(SrcReg)
.addImm(X86::sub_8bit);
}
@@ -1559,9 +1559,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
}}, // i64
};
- auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) {
- return El.SizeInBits == RegTy.getSizeInBits();
- });
+ auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) {
+ return El.SizeInBits == RegTy.getSizeInBits();
+ });
if (OpEntryIt == std::end(OpTable))
return false;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp
index 95655dd472..83829e0b82 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp
@@ -44,8 +44,8 @@ namespace {
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
-/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
-/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
+/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
+/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
/// Reference to the wide-load instruction of an interleaved access
/// group.
@@ -211,7 +211,7 @@ void X86InterleavedAccessGroup::decompose(
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
}
// Generate N loads of T type.
- assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
+ assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
"VecBaseTy's size must be a multiple of 8");
const Align FirstAlignment = LI->getAlign();
const Align SubsequentAlignment = commonAlignment(
@@ -295,7 +295,7 @@ static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
if (VecElems == 16) {
for (unsigned i = 0; i < Stride; i++)
- TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
+ TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
return;
}
@@ -576,7 +576,7 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
for (int i = 0; i < 3; i++)
- Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
+ Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
@@ -598,8 +598,8 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
- Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
- TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
+ Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
+ TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}
@@ -656,8 +656,8 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
- Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
- Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
+ Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
+ Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
Vec[2] = InVec[2];
// Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h
index 72ab3e9cf7..233ac1a3e3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h
@@ -22,7 +22,7 @@ namespace llvm {
enum IntrinsicType : uint16_t {
CVTNEPS2BF16_MASK,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
- INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8,
+ INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8,
INTR_TYPE_3OP_IMM8,
CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
CVTPD2PS_MASK,
@@ -458,12 +458,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FADDS, X86ISD::FADDS_RND),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FADDS, X86ISD::FADDS_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0),
+ X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_SAE),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
@@ -882,12 +882,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
+ X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
@@ -1098,7 +1098,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
- X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
+ X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -1108,8 +1108,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
- X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
- X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTRI, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTRI, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
@@ -1132,10 +1132,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
X86ISD::GF2P8MULB, 0),
- X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
- X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
- X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
- X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
+ X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp
index 1b371ac2a1..9f51fa4f4c 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp
@@ -70,11 +70,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizerInfoAVX512DQ();
setLegalizerInfoAVX512BW();
- getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
- .scalarize(0)
- .minScalar(0, LLT::scalar(32))
- .libcall();
-
+ getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
+ .scalarize(0)
+ .minScalar(0, LLT::scalar(32))
+ .libcall();
+
setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
@@ -86,8 +86,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizeScalarToDifferentSizeStrategy(
G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
- getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
-
+ getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
+
computeTables();
verify(*STI.getInstrInfo());
}
@@ -155,11 +155,11 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
.legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
.clampScalar(0, s8, s32)
.clampScalar(1, s8, s8);
-
- // Comparison
- getActionDefinitionsBuilder(G_ICMP)
- .legalForCartesianProduct({s8}, {s8, s16, s32, p0})
- .clampScalar(0, s8, s8);
+
+ // Comparison
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s8}, {s8, s16, s32, p0})
+ .clampScalar(0, s8, s8);
}
// Control-flow
@@ -246,9 +246,9 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
.widenScalarToNextPow2(1);
// Comparison
- getActionDefinitionsBuilder(G_ICMP)
- .legalForCartesianProduct({s8}, {s8, s16, s32, s64, p0})
- .clampScalar(0, s8, s8);
+ getActionDefinitionsBuilder(G_ICMP)
+ .legalForCartesianProduct({s8}, {s8, s16, s32, s64, p0})
+ .clampScalar(0, s8, s8);
getActionDefinitionsBuilder(G_FCMP)
.legalForCartesianProduct({s8}, {s32, s64})
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 810fee052b..e419f45405 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -42,7 +42,7 @@
#include "X86TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -105,9 +105,9 @@ static cl::opt<bool> EmitDotVerify(
cl::init(false), cl::Hidden);
static llvm::sys::DynamicLibrary OptimizeDL;
-typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize,
- unsigned int *Edges, int *EdgeValues,
- int *CutEdges /* out */, unsigned int EdgesSize);
+typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize,
+ unsigned int *Edges, int *EdgeValues,
+ int *CutEdges /* out */, unsigned int EdgesSize);
static OptimizeCutT OptimizeCut = nullptr;
namespace {
@@ -149,8 +149,8 @@ public:
private:
using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>;
- using Edge = MachineGadgetGraph::Edge;
- using Node = MachineGadgetGraph::Node;
+ using Edge = MachineGadgetGraph::Edge;
+ using Node = MachineGadgetGraph::Node;
using EdgeSet = MachineGadgetGraph::EdgeSet;
using NodeSet = MachineGadgetGraph::NodeSet;
@@ -164,8 +164,8 @@ private:
const MachineDominanceFrontier &MDF) const;
int hardenLoadsWithPlugin(MachineFunction &MF,
std::unique_ptr<MachineGadgetGraph> Graph) const;
- int hardenLoadsWithHeuristic(MachineFunction &MF,
- std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int hardenLoadsWithHeuristic(MachineFunction &MF,
+ std::unique_ptr<MachineGadgetGraph> Graph) const;
int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G,
EdgeSet &ElimEdges /* in, out */,
NodeSet &ElimNodes /* in, out */) const;
@@ -198,7 +198,7 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
using ChildIteratorType = typename Traits::ChildIteratorType;
using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
- DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
+ DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
std::string getNodeLabel(NodeRef Node, GraphType *) {
if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel)
@@ -243,7 +243,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
AU.setPreservesCFG();
}
-static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF,
+static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF,
MachineGadgetGraph *G) {
WriteGraph(OS, G, /*ShortNames*/ false,
"Speculative gadgets for \"" + MF.getName() + "\" function");
@@ -279,7 +279,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
return false; // didn't find any gadgets
if (EmitDotVerify) {
- writeGadgetGraph(outs(), MF, Graph.get());
+ writeGadgetGraph(outs(), MF, Graph.get());
return false;
}
@@ -292,7 +292,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
raw_fd_ostream FileOut(FileName, FileError);
if (FileError)
errs() << FileError.message();
- writeGadgetGraph(FileOut, MF, Graph.get());
+ writeGadgetGraph(FileOut, MF, Graph.get());
FileOut.close();
LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n");
if (EmitDotOnly)
@@ -313,7 +313,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
}
FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph));
} else { // Use the default greedy heuristic
- FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph));
+ FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph));
}
if (FencesInserted > 0)
@@ -367,7 +367,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
// Use RDF to find all the uses of `Def`
rdf::NodeSet Uses;
- RegisterRef DefReg = Def.Addr->getRegRef(DFG);
+ RegisterRef DefReg = Def.Addr->getRegRef(DFG);
for (auto UseID : L.getAllReachedUses(DefReg, Def)) {
auto Use = DFG.addr<UseNode *>(UseID);
if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node
@@ -540,17 +540,17 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
// Returns the number of remaining gadget edges that could not be eliminated
int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
- MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */,
- NodeSet &ElimNodes /* in, out */) const {
+ MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */,
+ NodeSet &ElimNodes /* in, out */) const {
if (G.NumFences > 0) {
// Eliminate fences and CFG edges that ingress and egress the fence, as
// they are trivially mitigated.
- for (const Edge &E : G.edges()) {
- const Node *Dest = E.getDest();
+ for (const Edge &E : G.edges()) {
+ const Node *Dest = E.getDest();
if (isFence(Dest->getValue())) {
ElimNodes.insert(*Dest);
ElimEdges.insert(E);
- for (const Edge &DE : Dest->edges())
+ for (const Edge &DE : Dest->edges())
ElimEdges.insert(DE);
}
}
@@ -558,28 +558,28 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
// Find and eliminate gadget edges that have been mitigated.
int MitigatedGadgets = 0, RemainingGadgets = 0;
- NodeSet ReachableNodes{G};
- for (const Node &RootN : G.nodes()) {
+ NodeSet ReachableNodes{G};
+ for (const Node &RootN : G.nodes()) {
if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge))
continue; // skip this node if it isn't a gadget source
// Find all of the nodes that are CFG-reachable from RootN using DFS
ReachableNodes.clear();
- std::function<void(const Node *, bool)> FindReachableNodes =
- [&](const Node *N, bool FirstNode) {
- if (!FirstNode)
- ReachableNodes.insert(*N);
- for (const Edge &E : N->edges()) {
- const Node *Dest = E.getDest();
- if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) &&
- !ReachableNodes.contains(*Dest))
- FindReachableNodes(Dest, false);
- }
- };
+ std::function<void(const Node *, bool)> FindReachableNodes =
+ [&](const Node *N, bool FirstNode) {
+ if (!FirstNode)
+ ReachableNodes.insert(*N);
+ for (const Edge &E : N->edges()) {
+ const Node *Dest = E.getDest();
+ if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) &&
+ !ReachableNodes.contains(*Dest))
+ FindReachableNodes(Dest, false);
+ }
+ };
FindReachableNodes(&RootN, true);
// Any gadget whose sink is unreachable has been mitigated
- for (const Edge &E : RootN.edges()) {
+ for (const Edge &E : RootN.edges()) {
if (MachineGadgetGraph::isGadgetEdge(E)) {
if (ReachableNodes.contains(*E.getDest())) {
// This gadget's sink is reachable
@@ -597,8 +597,8 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
std::unique_ptr<MachineGadgetGraph>
X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges(
std::unique_ptr<MachineGadgetGraph> Graph) const {
- NodeSet ElimNodes{*Graph};
- EdgeSet ElimEdges{*Graph};
+ NodeSet ElimNodes{*Graph};
+ EdgeSet ElimEdges{*Graph};
int RemainingGadgets =
elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes);
if (ElimEdges.empty() && ElimNodes.empty()) {
@@ -629,11 +629,11 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size());
auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size());
auto EdgeValues = std::make_unique<int[]>(Graph->edges_size());
- for (const Node &N : Graph->nodes()) {
+ for (const Node &N : Graph->nodes()) {
Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin());
}
Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node
- for (const Edge &E : Graph->edges()) {
+ for (const Edge &E : Graph->edges()) {
Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest());
EdgeValues[Graph->getEdgeIndex(E)] = E.getValue();
}
@@ -650,67 +650,67 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
- Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges);
+ Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges);
} while (true);
return FencesInserted;
}
-int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic(
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic(
MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
- // If `MF` does not have any fences, then no gadgets would have been
- // mitigated at this point.
- if (Graph->NumFences > 0) {
- LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
- Graph = trimMitigatedEdges(std::move(Graph));
- LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
- }
-
+ // If `MF` does not have any fences, then no gadgets would have been
+ // mitigated at this point.
+ if (Graph->NumFences > 0) {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ }
+
if (Graph->NumGadgets == 0)
return 0;
LLVM_DEBUG(dbgs() << "Cutting edges...\n");
- EdgeSet CutEdges{*Graph};
-
- // Begin by collecting all ingress CFG edges for each node
- DenseMap<const Node *, SmallVector<const Edge *, 2>> IngressEdgeMap;
- for (const Edge &E : Graph->edges())
- if (MachineGadgetGraph::isCFGEdge(E))
- IngressEdgeMap[E.getDest()].push_back(&E);
-
- // For each gadget edge, make cuts that guarantee the gadget will be
- // mitigated. A computationally efficient way to achieve this is to either:
- // (a) cut all egress CFG edges from the gadget source, or
- // (b) cut all ingress CFG edges to the gadget sink.
- //
- // Moreover, the algorithm tries not to make a cut into a loop by preferring
- // to make a (b)-type cut if the gadget source resides at a greater loop depth
- // than the gadget sink, or an (a)-type cut otherwise.
- for (const Node &N : Graph->nodes()) {
- for (const Edge &E : N.edges()) {
- if (!MachineGadgetGraph::isGadgetEdge(E))
+ EdgeSet CutEdges{*Graph};
+
+ // Begin by collecting all ingress CFG edges for each node
+ DenseMap<const Node *, SmallVector<const Edge *, 2>> IngressEdgeMap;
+ for (const Edge &E : Graph->edges())
+ if (MachineGadgetGraph::isCFGEdge(E))
+ IngressEdgeMap[E.getDest()].push_back(&E);
+
+ // For each gadget edge, make cuts that guarantee the gadget will be
+ // mitigated. A computationally efficient way to achieve this is to either:
+ // (a) cut all egress CFG edges from the gadget source, or
+ // (b) cut all ingress CFG edges to the gadget sink.
+ //
+ // Moreover, the algorithm tries not to make a cut into a loop by preferring
+ // to make a (b)-type cut if the gadget source resides at a greater loop depth
+ // than the gadget sink, or an (a)-type cut otherwise.
+ for (const Node &N : Graph->nodes()) {
+ for (const Edge &E : N.edges()) {
+ if (!MachineGadgetGraph::isGadgetEdge(E))
continue;
- SmallVector<const Edge *, 2> EgressEdges;
- SmallVector<const Edge *, 2> &IngressEdges = IngressEdgeMap[E.getDest()];
- for (const Edge &EgressEdge : N.edges())
- if (MachineGadgetGraph::isCFGEdge(EgressEdge))
- EgressEdges.push_back(&EgressEdge);
-
- int EgressCutCost = 0, IngressCutCost = 0;
- for (const Edge *EgressEdge : EgressEdges)
- if (!CutEdges.contains(*EgressEdge))
- EgressCutCost += EgressEdge->getValue();
- for (const Edge *IngressEdge : IngressEdges)
- if (!CutEdges.contains(*IngressEdge))
- IngressCutCost += IngressEdge->getValue();
-
- auto &EdgesToCut =
- IngressCutCost < EgressCutCost ? IngressEdges : EgressEdges;
- for (const Edge *E : EdgesToCut)
- CutEdges.insert(*E);
+ SmallVector<const Edge *, 2> EgressEdges;
+ SmallVector<const Edge *, 2> &IngressEdges = IngressEdgeMap[E.getDest()];
+ for (const Edge &EgressEdge : N.edges())
+ if (MachineGadgetGraph::isCFGEdge(EgressEdge))
+ EgressEdges.push_back(&EgressEdge);
+
+ int EgressCutCost = 0, IngressCutCost = 0;
+ for (const Edge *EgressEdge : EgressEdges)
+ if (!CutEdges.contains(*EgressEdge))
+ EgressCutCost += EgressEdge->getValue();
+ for (const Edge *IngressEdge : IngressEdges)
+ if (!CutEdges.contains(*IngressEdge))
+ IngressCutCost += IngressEdge->getValue();
+
+ auto &EdgesToCut =
+ IngressCutCost < EgressCutCost ? IngressEdges : EgressEdges;
+ for (const Edge *E : EdgesToCut)
+ CutEdges.insert(*E);
}
- }
+ }
LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
@@ -726,8 +726,8 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences(
MachineFunction &MF, MachineGadgetGraph &G,
EdgeSet &CutEdges /* in, out */) const {
int FencesInserted = 0;
- for (const Node &N : G.nodes()) {
- for (const Edge &E : N.edges()) {
+ for (const Node &N : G.nodes()) {
+ for (const Edge &E : N.edges()) {
if (CutEdges.contains(E)) {
MachineInstr *MI = N.getValue(), *Prev;
MachineBasicBlock *MBB; // Insert an LFENCE in this MBB
@@ -743,7 +743,7 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences(
Prev = MI->getPrevNode();
// Remove all egress CFG edges from this branch because the inserted
// LFENCE prevents gadgets from crossing the branch.
- for (const Edge &E : N.edges()) {
+ for (const Edge &E : N.edges()) {
if (MachineGadgetGraph::isCFGEdge(E))
CutEdges.insert(E);
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
index 7b6276c1d8..4562c1aee2 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -75,35 +75,35 @@ bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
bool Modified = false;
for (auto &MBB : MF) {
- for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) {
- if (MBBI->getOpcode() != X86::RETQ)
- continue;
-
- unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI);
- if (ClobberReg != X86::NoRegister) {
- BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::POP64r))
- .addReg(ClobberReg, RegState::Define)
- .setMIFlag(MachineInstr::FrameDestroy);
- BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
- BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::JMP64r))
- .addReg(ClobberReg);
- MBB.erase(MBBI);
- } else {
- // In case there is no available scratch register, we can still read
- // from RSP to assert that RSP points to a valid page. The write to RSP
- // is also helpful because it verifies that the stack's write
- // permissions are intact.
- MachineInstr *Fence =
- BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
- addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
- X86::RSP, false, 0)
- .addImm(0)
- ->addRegisterDead(X86::EFLAGS, TRI);
- }
-
- ++NumFences;
- Modified = true;
- break;
+ for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) {
+ if (MBBI->getOpcode() != X86::RETQ)
+ continue;
+
+ unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI);
+ if (ClobberReg != X86::NoRegister) {
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::POP64r))
+ .addReg(ClobberReg, RegState::Define)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::JMP64r))
+ .addReg(ClobberReg);
+ MBB.erase(MBBI);
+ } else {
+ // In case there is no available scratch register, we can still read
+ // from RSP to assert that RSP points to a valid page. The write to RSP
+ // is also helpful because it verifies that the stack's write
+ // permissions are intact.
+ MachineInstr *Fence =
+ BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE));
+ addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
+ X86::RSP, false, 0)
+ .addImm(0)
+ ->addRegisterDead(X86::EFLAGS, TRI);
+ }
+
+ ++NumFences;
+ Modified = true;
+ break;
}
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp
index 85166decd8..4c7eb5862f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp
@@ -1,351 +1,351 @@
-//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Pass to transform <256 x i32> load/store
-/// <256 x i32> is bitcasted to x86_amx on X86, and AMX instruction set only
-/// provides simple operation on x86_amx. The basic elementwise operation
-/// is not supported by AMX. Since x86_amx is bitcasted from vector <256 x i32>
-/// and only AMX intrinsics can operate on the type, we need transform
-/// load/store <256 x i32> instruction to AMX load/store. If the bitcast can
-/// not be combined with load/store, we transform the bitcast to amx load/store
-/// and <256 x i32> store/load.
-//
-//===----------------------------------------------------------------------===//
-//
-#include "X86.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "lower-amx-type"
-
-static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
- Function &F = *BB->getParent();
- Module *M = BB->getModule();
- const DataLayout &DL = M->getDataLayout();
-
- Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
- LLVMContext &Ctx = Builder.getContext();
- auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
- unsigned AllocaAS = DL.getAllocaAddrSpace();
- AllocaInst *AllocaRes =
- new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front());
- AllocaRes->setAlignment(AllocaAlignment);
- return AllocaRes;
-}
-
-static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
- Value *Row = nullptr, *Col = nullptr;
- switch (II->getIntrinsicID()) {
- default:
- llvm_unreachable("Expect amx intrinsics");
- case Intrinsic::x86_tileloadd64_internal:
- case Intrinsic::x86_tilestored64_internal: {
- Row = II->getArgOperand(0);
- Col = II->getArgOperand(1);
- break;
- }
- // a * b + c
- // The shape depends on which operand.
- case Intrinsic::x86_tdpbssd_internal: {
- switch (OpNo) {
- case 3:
- Row = II->getArgOperand(0);
- Col = II->getArgOperand(1);
- break;
- case 4:
- Row = II->getArgOperand(0);
- Col = II->getArgOperand(2);
- break;
- case 5:
- Row = II->getArgOperand(2);
- Col = II->getArgOperand(1);
- break;
- }
- break;
- }
- }
-
- return std::make_pair(Row, Col);
-}
-
-// %src = load <256 x i32>, <256 x i32>* %addr, align 64
-// %2 = bitcast <256 x i32> %src to x86_amx
-// -->
-// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
-// i8* %addr, i64 %stride64)
-static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
- Value *Row = nullptr, *Col = nullptr;
- Use &U = *(Bitcast->use_begin());
- unsigned OpNo = U.getOperandNo();
- auto *II = cast<IntrinsicInst>(U.getUser());
- std::tie(Row, Col) = getShape(II, OpNo);
- IRBuilder<> Builder(Bitcast);
- // Use the maximun column as stride.
- Value *Stride = Builder.getInt64(64);
- Value *I8Ptr =
- Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
- std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
-
- Value *NewInst =
- Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
- Bitcast->replaceAllUsesWith(NewInst);
-}
-
-// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
-// %stride);
-// %13 = bitcast x86_amx %src to <256 x i32>
-// store <256 x i32> %13, <256 x i32>* %addr, align 64
-// -->
-// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
-// %stride64, %13)
-static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
-
- Value *Tile = Bitcast->getOperand(0);
- auto *II = cast<IntrinsicInst>(Tile);
- // Tile is output from AMX intrinsic. The first operand of the
- // intrinsic is row, the second operand of the intrinsic is column.
- Value *Row = II->getOperand(0);
- Value *Col = II->getOperand(1);
- IRBuilder<> Builder(ST);
- // Use the maximum column as stride. It must be the same with load
- // stride.
- Value *Stride = Builder.getInt64(64);
- Value *I8Ptr =
- Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
- std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
- Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
- if (Bitcast->hasOneUse())
- return;
- // %13 = bitcast x86_amx %src to <256 x i32>
- // store <256 x i32> %13, <256 x i32>* %addr, align 64
- // %add = <256 x i32> %13, <256 x i32> %src2
- // -->
- // %13 = bitcast x86_amx %src to <256 x i32>
- // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
- // %stride64, %13)
- // %14 = load <256 x i32>, %addr
- // %add = <256 x i32> %14, <256 x i32> %src2
- Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
- Bitcast->replaceAllUsesWith(Vec);
-}
-
-// transform bitcast to <store, load> instructions.
-static bool transformBitcast(BitCastInst *Bitcast) {
- IRBuilder<> Builder(Bitcast);
- AllocaInst *AllocaAddr;
- Value *I8Ptr, *Stride;
- auto *Src = Bitcast->getOperand(0);
-
- auto Prepare = [&]() {
- AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
- I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
- Stride = Builder.getInt64(64);
- };
-
- if (Bitcast->getType()->isX86_AMXTy()) {
- // %2 = bitcast <256 x i32> %src to x86_amx
- // -->
- // %addr = alloca <256 x i32>, align 64
- // store <256 x i32> %src, <256 x i32>* %addr, align 64
- // %addr2 = bitcast <256 x i32>* to i8*
- // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
- // i8* %addr2,
- // i64 64)
- Use &U = *(Bitcast->use_begin());
- unsigned OpNo = U.getOperandNo();
- auto *II = dyn_cast<IntrinsicInst>(U.getUser());
- if (!II)
- return false; // May be bitcast from x86amx to <256 x i32>.
- Prepare();
- Builder.CreateStore(Src, AllocaAddr);
- // TODO we can pick an constant operand for the shape.
- Value *Row = nullptr, *Col = nullptr;
- std::tie(Row, Col) = getShape(II, OpNo);
- std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
- Value *NewInst = Builder.CreateIntrinsic(
- Intrinsic::x86_tileloadd64_internal, None, Args);
- Bitcast->replaceAllUsesWith(NewInst);
- } else {
- // %2 = bitcast x86_amx %src to <256 x i32>
- // -->
- // %addr = alloca <256 x i32>, align 64
- // %addr2 = bitcast <256 x i32>* to i8*
- // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
- // i8* %addr2, i64 %stride)
- // %2 = load <256 x i32>, <256 x i32>* %addr, align 64
- auto *II = dyn_cast<IntrinsicInst>(Src);
- if (!II)
- return false; // May be bitcast from <256 x i32> to x86amx.
- Prepare();
- Value *Row = II->getOperand(0);
- Value *Col = II->getOperand(1);
- std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
- Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
- Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
- Bitcast->replaceAllUsesWith(NewInst);
- }
-
- return true;
-}
-
-namespace {
-class X86LowerAMXType {
- Function &Func;
-
-public:
- X86LowerAMXType(Function &F) : Func(F) {}
- bool visit();
-};
-
-bool X86LowerAMXType::visit() {
- SmallVector<Instruction *, 8> DeadInsts;
-
- for (BasicBlock *BB : post_order(&Func)) {
- for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
- II != IE;) {
- Instruction &Inst = *II++;
- auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
- if (!Bitcast)
- continue;
-
- Value *Src = Bitcast->getOperand(0);
- if (Bitcast->getType()->isX86_AMXTy()) {
- if (Bitcast->user_empty()) {
- DeadInsts.push_back(Bitcast);
- continue;
- }
- LoadInst *LD = dyn_cast<LoadInst>(Src);
- if (!LD) {
- if (transformBitcast(Bitcast))
- DeadInsts.push_back(Bitcast);
- continue;
- }
- // If load has mutli-user, duplicate a vector load.
- // %src = load <256 x i32>, <256 x i32>* %addr, align 64
- // %2 = bitcast <256 x i32> %src to x86_amx
- // %add = add <256 x i32> %src, <256 x i32> %src2
- // -->
- // %src = load <256 x i32>, <256 x i32>* %addr, align 64
- // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
- // i8* %addr, i64 %stride64)
- // %add = add <256 x i32> %src, <256 x i32> %src2
-
- // If load has one user, the load will be eliminated in DAG ISel.
- // %src = load <256 x i32>, <256 x i32>* %addr, align 64
- // %2 = bitcast <256 x i32> %src to x86_amx
- // -->
- // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
- // i8* %addr, i64 %stride64)
- combineLoadBitcast(LD, Bitcast);
- DeadInsts.push_back(Bitcast);
- if (LD->hasOneUse())
- DeadInsts.push_back(LD);
- } else if (Src->getType()->isX86_AMXTy()) {
- if (Bitcast->user_empty()) {
- DeadInsts.push_back(Bitcast);
- continue;
- }
- StoreInst *ST = nullptr;
- for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
- UI != UE;) {
- Value *I = (UI++)->getUser();
- ST = dyn_cast<StoreInst>(I);
- if (ST)
- break;
- }
- if (!ST) {
- if (transformBitcast(Bitcast))
- DeadInsts.push_back(Bitcast);
- continue;
- }
- // If bitcast (%13) has one use, combine bitcast and store to amx store.
- // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
- // %stride);
- // %13 = bitcast x86_amx %src to <256 x i32>
- // store <256 x i32> %13, <256 x i32>* %addr, align 64
- // -->
- // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
- // %stride64, %13)
- //
- // If bitcast (%13) has multi-use, transform as below.
- // %13 = bitcast x86_amx %src to <256 x i32>
- // store <256 x i32> %13, <256 x i32>* %addr, align 64
- // %add = <256 x i32> %13, <256 x i32> %src2
- // -->
- // %13 = bitcast x86_amx %src to <256 x i32>
- // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
- // %stride64, %13)
- // %14 = load <256 x i32>, %addr
- // %add = <256 x i32> %14, <256 x i32> %src2
- //
- combineBitcastStore(Bitcast, ST);
- // Delete user first.
- DeadInsts.push_back(ST);
- DeadInsts.push_back(Bitcast);
- }
- }
- }
-
- bool C = !DeadInsts.empty();
-
- for (auto *Inst : DeadInsts)
- Inst->eraseFromParent();
-
- return C;
-}
-} // anonymous namespace
-
-namespace {
-
-class X86LowerAMXTypeLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
- initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- X86LowerAMXType LAT(F);
- bool C = LAT.visit();
- return C;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- }
-};
-
-} // anonymous namespace
-
-static const char PassName[] = "Lower AMX type for load/store";
-char X86LowerAMXTypeLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
- false)
-INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
- false)
-
-FunctionPass *llvm::createX86LowerAMXTypePass() {
- return new X86LowerAMXTypeLegacyPass();
-}
+//===- X86LowerAMXType.cpp - Lower AMX type for load/store ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to transform <256 x i32> load/store
+/// <256 x i32> is bitcasted to x86_amx on X86, and the AMX instruction set
+/// only provides simple operations on x86_amx. Basic elementwise operations
+/// are not supported by AMX. Since x86_amx is bitcasted from vector
+/// <256 x i32> and only AMX intrinsics can operate on the type, we need to
+/// transform load/store <256 x i32> instructions to AMX load/store. If the
+/// bitcast cannot be combined with the load/store, we transform the bitcast
+/// to an amx load/store plus a <256 x i32> store/load.
+//
+//===----------------------------------------------------------------------===//
+//
+#include "X86.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-amx-type"
+
+static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) {
+ Function &F = *BB->getParent();
+ Module *M = BB->getModule();
+ const DataLayout &DL = M->getDataLayout();
+
+ Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
+ LLVMContext &Ctx = Builder.getContext();
+ auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
+ unsigned AllocaAS = DL.getAllocaAddrSpace();
+ AllocaInst *AllocaRes =
+ new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front());
+ AllocaRes->setAlignment(AllocaAlignment);
+ return AllocaRes;
+}
+
+static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) {
+ Value *Row = nullptr, *Col = nullptr;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Expect amx intrinsics");
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tilestored64_internal: {
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ // a * b + c
+ // The shape depends on which operand.
+ case Intrinsic::x86_tdpbssd_internal: {
+ switch (OpNo) {
+ case 3:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(1);
+ break;
+ case 4:
+ Row = II->getArgOperand(0);
+ Col = II->getArgOperand(2);
+ break;
+ case 5:
+ Row = II->getArgOperand(2);
+ Col = II->getArgOperand(1);
+ break;
+ }
+ break;
+ }
+ }
+
+ return std::make_pair(Row, Col);
+}
+
+// %src = load <256 x i32>, <256 x i32>* %addr, align 64
+// %2 = bitcast <256 x i32> %src to x86_amx
+// -->
+// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+// i8* %addr, i64 %stride64)
+static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+ Value *Row = nullptr, *Col = nullptr;
+ Use &U = *(Bitcast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = cast<IntrinsicInst>(U.getUser());
+ std::tie(Row, Col) = getShape(II, OpNo);
+ IRBuilder<> Builder(Bitcast);
+  // Use the maximum column as stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy());
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+
+ Value *NewInst =
+ Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args);
+ Bitcast->replaceAllUsesWith(NewInst);
+}
+
+// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
+// %stride);
+// %13 = bitcast x86_amx %src to <256 x i32>
+// store <256 x i32> %13, <256 x i32>* %addr, align 64
+// -->
+// call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+// %stride64, %13)
+static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+
+ Value *Tile = Bitcast->getOperand(0);
+ auto *II = cast<IntrinsicInst>(Tile);
+ // Tile is output from AMX intrinsic. The first operand of the
+ // intrinsic is row, the second operand of the intrinsic is column.
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ IRBuilder<> Builder(ST);
+  // Use the maximum column as stride. It must be the same as the load
+  // stride.
+ Value *Stride = Builder.getInt64(64);
+ Value *I8Ptr =
+ Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy());
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ if (Bitcast->hasOneUse())
+ return;
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // %add = <256 x i32> %13, <256 x i32> %src2
+ // -->
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ // %14 = load <256 x i32>, %addr
+ // %add = <256 x i32> %14, <256 x i32> %src2
+ Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1));
+ Bitcast->replaceAllUsesWith(Vec);
+}
+
+// transform bitcast to <store, load> instructions.
+static bool transformBitcast(BitCastInst *Bitcast) {
+ IRBuilder<> Builder(Bitcast);
+ AllocaInst *AllocaAddr;
+ Value *I8Ptr, *Stride;
+ auto *Src = Bitcast->getOperand(0);
+
+ auto Prepare = [&]() {
+ AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent());
+ I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy());
+ Stride = Builder.getInt64(64);
+ };
+
+ if (Bitcast->getType()->isX86_AMXTy()) {
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // -->
+ // %addr = alloca <256 x i32>, align 64
+ // store <256 x i32> %src, <256 x i32>* %addr, align 64
+ // %addr2 = bitcast <256 x i32>* to i8*
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr2,
+ // i64 64)
+ Use &U = *(Bitcast->use_begin());
+ unsigned OpNo = U.getOperandNo();
+ auto *II = dyn_cast<IntrinsicInst>(U.getUser());
+ if (!II)
+ return false; // May be bitcast from x86amx to <256 x i32>.
+ Prepare();
+ Builder.CreateStore(Src, AllocaAddr);
+    // TODO: we can pick a constant operand for the shape.
+ Value *Row = nullptr, *Col = nullptr;
+ std::tie(Row, Col) = getShape(II, OpNo);
+ std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
+ Value *NewInst = Builder.CreateIntrinsic(
+ Intrinsic::x86_tileloadd64_internal, None, Args);
+ Bitcast->replaceAllUsesWith(NewInst);
+ } else {
+ // %2 = bitcast x86_amx %src to <256 x i32>
+ // -->
+ // %addr = alloca <256 x i32>, align 64
+ // %addr2 = bitcast <256 x i32>* to i8*
+ // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col,
+ // i8* %addr2, i64 %stride)
+ // %2 = load <256 x i32>, <256 x i32>* %addr, align 64
+ auto *II = dyn_cast<IntrinsicInst>(Src);
+ if (!II)
+ return false; // May be bitcast from <256 x i32> to x86amx.
+ Prepare();
+ Value *Row = II->getOperand(0);
+ Value *Col = II->getOperand(1);
+ std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src};
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args);
+ Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr);
+ Bitcast->replaceAllUsesWith(NewInst);
+ }
+
+ return true;
+}
+
+namespace {
+class X86LowerAMXType {
+ Function &Func;
+
+public:
+ X86LowerAMXType(Function &F) : Func(F) {}
+ bool visit();
+};
+
+bool X86LowerAMXType::visit() {
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ for (BasicBlock *BB : post_order(&Func)) {
+ for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend();
+ II != IE;) {
+ Instruction &Inst = *II++;
+ auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
+ if (!Bitcast)
+ continue;
+
+ Value *Src = Bitcast->getOperand(0);
+ if (Bitcast->getType()->isX86_AMXTy()) {
+ if (Bitcast->user_empty()) {
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ LoadInst *LD = dyn_cast<LoadInst>(Src);
+ if (!LD) {
+ if (transformBitcast(Bitcast))
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+        // If the load has multiple users, duplicate a vector load.
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // %add = add <256 x i32> %src, <256 x i32> %src2
+ // -->
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr, i64 %stride64)
+ // %add = add <256 x i32> %src, <256 x i32> %src2
+
+ // If load has one user, the load will be eliminated in DAG ISel.
+ // %src = load <256 x i32>, <256 x i32>* %addr, align 64
+ // %2 = bitcast <256 x i32> %src to x86_amx
+ // -->
+ // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
+ // i8* %addr, i64 %stride64)
+ combineLoadBitcast(LD, Bitcast);
+ DeadInsts.push_back(Bitcast);
+ if (LD->hasOneUse())
+ DeadInsts.push_back(LD);
+ } else if (Src->getType()->isX86_AMXTy()) {
+ if (Bitcast->user_empty()) {
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ StoreInst *ST = nullptr;
+ for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end();
+ UI != UE;) {
+ Value *I = (UI++)->getUser();
+ ST = dyn_cast<StoreInst>(I);
+ if (ST)
+ break;
+ }
+ if (!ST) {
+ if (transformBitcast(Bitcast))
+ DeadInsts.push_back(Bitcast);
+ continue;
+ }
+ // If bitcast (%13) has one use, combine bitcast and store to amx store.
+ // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr,
+ // %stride);
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // -->
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ //
+ // If bitcast (%13) has multi-use, transform as below.
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // store <256 x i32> %13, <256 x i32>* %addr, align 64
+ // %add = <256 x i32> %13, <256 x i32> %src2
+ // -->
+ // %13 = bitcast x86_amx %src to <256 x i32>
+ // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
+ // %stride64, %13)
+ // %14 = load <256 x i32>, %addr
+ // %add = <256 x i32> %14, <256 x i32> %src2
+ //
+ combineBitcastStore(Bitcast, ST);
+ // Delete user first.
+ DeadInsts.push_back(ST);
+ DeadInsts.push_back(Bitcast);
+ }
+ }
+ }
+
+ bool C = !DeadInsts.empty();
+
+ for (auto *Inst : DeadInsts)
+ Inst->eraseFromParent();
+
+ return C;
+}
+} // anonymous namespace
+
+namespace {
+
+class X86LowerAMXTypeLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ X86LowerAMXTypeLegacyPass() : FunctionPass(ID) {
+ initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ X86LowerAMXType LAT(F);
+ bool C = LAT.visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+
+} // anonymous namespace
+
+static const char PassName[] = "Lower AMX type for load/store";
+char X86LowerAMXTypeLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false,
+ false)
+
+FunctionPass *llvm::createX86LowerAMXTypePass() {
+ return new X86LowerAMXTypeLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp b/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp
index 89fa3ae3a3..6de5c84db3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp
@@ -977,24 +977,24 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
NoAutoPaddingScope NoPadScope(*OutStreamer);
- bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 &&
- MI.getOpcode() != X86::TLS_base_addr32;
- bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 ||
- MI.getOpcode() == X86::TLS_base_addr64;
+ bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 &&
+ MI.getOpcode() != X86::TLS_base_addr32;
+ bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 ||
+ MI.getOpcode() == X86::TLS_base_addr64;
MCContext &Ctx = OutStreamer->getContext();
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
case X86::TLS_addr32:
case X86::TLS_addr64:
- case X86::TLS_addrX32:
+ case X86::TLS_addrX32:
SRVK = MCSymbolRefExpr::VK_TLSGD;
break;
case X86::TLS_base_addr32:
SRVK = MCSymbolRefExpr::VK_TLSLDM;
break;
case X86::TLS_base_addr64:
- case X86::TLS_base_addrX32:
+ case X86::TLS_base_addrX32:
SRVK = MCSymbolRefExpr::VK_TLSLD;
break;
default:
@@ -1014,7 +1014,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
if (Is64Bits) {
bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
- if (NeedsPadding && Is64BitsLP64)
+ if (NeedsPadding && Is64BitsLP64)
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
.addReg(X86::RDI)
@@ -1087,26 +1087,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
/// bytes. Return the size of nop emitted.
static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
const X86Subtarget *Subtarget) {
- // Determine the longest nop which can be efficiently decoded for the given
- // target cpu. 15-bytes is the longest single NOP instruction, but some
- // platforms can't decode the longest forms efficiently.
- unsigned MaxNopLength = 1;
- if (Subtarget->is64Bit()) {
- // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the
- // IndexReg/BaseReg below need to be updated.
- if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP))
- MaxNopLength = 7;
- else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP))
- MaxNopLength = 15;
- else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP))
- MaxNopLength = 11;
- else
- MaxNopLength = 10;
- } if (Subtarget->is32Bit())
- MaxNopLength = 2;
-
+ // Determine the longest nop which can be efficiently decoded for the given
+ // target cpu. 15-bytes is the longest single NOP instruction, but some
+ // platforms can't decode the longest forms efficiently.
+ unsigned MaxNopLength = 1;
+ if (Subtarget->is64Bit()) {
+ // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the
+ // IndexReg/BaseReg below need to be updated.
+ if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP))
+ MaxNopLength = 7;
+ else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP))
+ MaxNopLength = 15;
+ else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP))
+ MaxNopLength = 11;
+ else
+ MaxNopLength = 10;
+ } if (Subtarget->is32Bit())
+ MaxNopLength = 2;
+
// Cap a single nop emission at the profitable value for the target
- NumBytes = std::min(NumBytes, MaxNopLength);
+ NumBytes = std::min(NumBytes, MaxNopLength);
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
@@ -1334,7 +1334,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
MCInst MCI;
MCI.setOpcode(Opcode);
- for (auto &MO : drop_begin(MI.operands(), 2))
+ for (auto &MO : drop_begin(MI.operands(), 2))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
MCI.addOperand(MaybeOperand.getValue());
@@ -1710,7 +1710,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
Ret.setOpcode(OpCode);
- for (auto &MO : drop_begin(MI.operands()))
+ for (auto &MO : drop_begin(MI.operands()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
Ret.addOperand(MaybeOperand.getValue());
OutStreamer->emitInstruction(Ret, getSubtargetInfo());
@@ -1749,7 +1749,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
// Before emitting the instruction, add a comment to indicate that this is
// indeed a tail call.
OutStreamer->AddComment("TAILCALL");
- for (auto &MO : drop_begin(MI.operands()))
+ for (auto &MO : drop_begin(MI.operands()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
TC.addOperand(MaybeOperand.getValue());
OutStreamer->emitInstruction(TC, getSubtargetInfo());
@@ -1784,7 +1784,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
if (ConstantEntry.isMachineConstantPoolEntry())
return nullptr;
- return ConstantEntry.Val.ConstVal;
+ return ConstantEntry.Val.ConstVal;
}
static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
@@ -2446,10 +2446,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
case X86::TLS_addr32:
case X86::TLS_addr64:
- case X86::TLS_addrX32:
+ case X86::TLS_addrX32:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
- case X86::TLS_base_addrX32:
+ case X86::TLS_base_addrX32:
return LowerTlsAddr(MCInstLowering, *MI);
case X86::MOVPC32r: {
@@ -2598,15 +2598,15 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
}
return;
}
- case X86::UBSAN_UD1:
- EmitAndCountInstruction(MCInstBuilder(X86::UD1Lm)
- .addReg(X86::EAX)
- .addReg(X86::EAX)
- .addImm(1)
- .addReg(X86::NoRegister)
- .addImm(MI->getOperand(0).getImm())
- .addReg(X86::NoRegister));
- return;
+ case X86::UBSAN_UD1:
+ EmitAndCountInstruction(MCInstBuilder(X86::UD1Lm)
+ .addReg(X86::EAX)
+ .addReg(X86::EAX)
+ .addImm(1)
+ .addReg(X86::NoRegister)
+ .addImm(MI->getOperand(0).getImm())
+ .addReg(X86::NoRegister));
+ return;
}
MCInst TmpInst;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp b/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp
index babd923e74..db4203d314 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp
@@ -392,7 +392,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
break;
// Push incoming values to the worklist.
- append_range(Worklist, PN->incoming_values());
+ append_range(Worklist, PN->incoming_values());
continue;
}
@@ -401,7 +401,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
if (BO->getOpcode() == Instruction::Add) {
// Simple case. Single use, just push its operands to the worklist.
if (BO->hasNUses(BO == Root ? 2 : 1)) {
- append_range(Worklist, BO->operands());
+ append_range(Worklist, BO->operands());
continue;
}
@@ -424,7 +424,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
continue;
// The phi forms a loop with this Add, push its operands.
- append_range(Worklist, BO->operands());
+ append_range(Worklist, BO->operands());
}
}
}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp b/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp
index 05ee6c6c83..b2f6d0604d 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp
@@ -1,265 +1,265 @@
-//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Pass to pre-config the shape of AMX register
-/// AMX register need to be configured before use. The shape of AMX register
-/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
-/// The pldtilecfg is to config tile registers. It should dominator all AMX
-/// instructions. The pldtilecfg produce a virtual cfg register and the cfg
-/// register is used by all AMX instructions.
-/// This pass is to find the common dominator of all AMX instructions and
-/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg
-/// produces is inserted as the last operand of each AMX instruction. We use
-/// this scheme to model the def-use relationship between AMX config instruction
-/// and other AMX instructions. Below is an example.
-///
-/// ----B1----
-/// / \
-/// / \
-/// B2 B3
-/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
-///
-/// is transformed to
-///
-/// B1
-/// %25:tilecfg = PLDTILECFG
-/// / \
-/// / \
-/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86RegisterInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TileShapeInfo.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "tile-pre-config"
-
-namespace {
-
-class X86PreTileConfig : public MachineFunctionPass {
- // context
- MachineFunction *MF = nullptr;
- const X86Subtarget *ST = nullptr;
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
- MachineDominatorTree *DomTree = nullptr;
- MachineRegisterInfo *MRI = nullptr;
-
- MachineInstr *getTileConfigPoint();
-
-public:
- X86PreTileConfig() : MachineFunctionPass(ID) {}
-
- /// Return the pass name.
- StringRef getPassName() const override {
- return "Tile Register Pre-configure";
- }
-
- /// X86PreTileConfig analysis usage.
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- /// Perform register allocation.
- bool runOnMachineFunction(MachineFunction &mf) override;
-
- static char ID;
-};
-
-} // end anonymous namespace
-
-char X86PreTileConfig::ID = 0;
-
-INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
- "Tile Register Configure", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
- "Tile Register Configure", false, false)
-
-void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequired<MachineDominatorTree>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
- const TargetInstrInfo *TII,
- MachineRegisterInfo *MRI,
- const X86Subtarget *ST) {
- auto *MBB = MI->getParent();
-
- // FIXME: AMX should assume AVX512 enabled.
- if (ST->hasAVX512()) {
- // Zero stack slot.
- Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
- BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
- .addReg(Zmm, RegState::Undef)
- .addReg(Zmm, RegState::Undef);
- addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
- FrameIdx)
- .addReg(Zmm);
- }
-
- // build psuedo ldtilecfg
- Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
-
- addFrameReference(
- BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
-
- return VReg;
-}
-
-static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- default:
- llvm_unreachable("Unexpected machine instruction on tile");
- case X86::PTILELOADDV:
- case X86::PTDPBSSDV:
- case X86::PTILEZEROV:
- MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
- MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
- ShapeT Shape(&MO1, &MO2, MRI);
- return Shape;
- }
-}
-
-MachineInstr *X86PreTileConfig::getTileConfigPoint() {
- DenseMap<Register, ShapeT> PhysShapeInfo;
- MachineBasicBlock *MBB = nullptr;
- DenseSet<const MachineInstr *> MIs;
- for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- Register VirtReg = Register::index2VirtReg(i);
- if (MRI->reg_nodbg_empty(VirtReg))
- continue;
- const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- if (RC.getID() != X86::TILERegClassID)
- continue;
-
- // Find the common dominator for all MI that define tile register.
- for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
- if (MO.isUndef())
- continue;
- const auto *MI = MO.getParent();
- // PHI or IMPLICIT_DEF instructiion.
- // There must be a input tile before PHI instruction.
- if (MI->isTransient())
- continue;
- if (!MBB)
- MBB = const_cast<MachineBasicBlock *>(MI->getParent());
- MBB = DomTree->findNearestCommonDominator(
- MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
-
- // Collect the instructions that define shape.
- ShapeT Shape = getShape(*MI, MRI);
- std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
- Shape.getCol()};
- for (auto *ShapeMO : ShapeMOs) {
- Register ShapeReg = ShapeMO->getReg();
- for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
- const auto *ShapeMI = MO.getParent();
- MIs.insert(ShapeMI);
- }
- }
- }
- }
- if (!MBB)
- return nullptr;
- // This pass is before the pass of eliminating PHI node, so it
- // is in SSA form.
- assert(MRI->isSSA() && "Not SSA form in pre-tile config");
- // Shape def should dominate tile config MBB.
- // def s s1 s2
- // / \ \ /
- // / \ \ /
- // conf s3=phi(s1,s2)
- // |
- // c
- //
- for (const auto *MI : MIs) {
- const MachineBasicBlock *ShapeMBB = MI->getParent();
- if (DomTree->dominates(ShapeMBB, MBB))
- continue;
- if (MI->isMoveImmediate())
- continue;
- report_fatal_error(MF->getName() + ": Failed to config tile register, "
- "please define the shape earlier");
- }
-
- // ldtilecfg should be inserted after the MI that define the shape.
- MachineBasicBlock::reverse_instr_iterator I, E;
- for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
- auto *MI = &*I;
- if (MIs.count(MI) && (!MI->isMoveImmediate()))
- break;
- }
- MachineBasicBlock::iterator MII;
- if (I == E)
- MII = MBB->getFirstNonPHI();
- else {
- MII = MachineBasicBlock::iterator(&*I);
- MII++;
- }
- return &*MII;
-}
-
-static void addTileCFGUse(MachineFunction &MF, Register CFG) {
- for (MachineBasicBlock &MBB : MF) {
-
- // Traverse the basic block.
- for (MachineInstr &MI : MBB) {
- unsigned Opcode = MI.getOpcode();
- switch (Opcode) {
- default:
- break;
- case X86::PTILELOADDV:
- case X86::PTILESTOREDV:
- case X86::PTDPBSSDV:
- case X86::PTILEZEROV:
- unsigned NumOperands = MI.getNumOperands();
- MI.RemoveOperand(NumOperands - 1);
- MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
- break;
- }
- }
- }
-}
-
-bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
- MF = &mf;
- MRI = &mf.getRegInfo();
- ST = &mf.getSubtarget<X86Subtarget>();
- TRI = ST->getRegisterInfo();
- TII = mf.getSubtarget().getInstrInfo();
- DomTree = &getAnalysis<MachineDominatorTree>();
-
- MachineInstr *MI = getTileConfigPoint();
- if (!MI)
- return false;
- unsigned Size = ST->getTileConfigSize();
- Align Alignment = ST->getTileConfigAlignment();
- int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
- Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
- addTileCFGUse(mf, CFG);
- return true;
-}
-
-FunctionPass *llvm::createX86PreTileConfigPass() {
- return new X86PreTileConfig();
-}
+//===-- X86PreTileConfig.cpp - Tile Register Configure --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to pre-config the shape of AMX register
+/// AMX registers need to be configured before use. The shape of an AMX
+/// register is encoded in the 1st and 2nd machine operands of AMX pseudo
+/// instructions. The pldtilecfg instruction configures the tile registers and
+/// must dominate all AMX instructions. It produces a virtual cfg register
+/// that is used by all AMX instructions.
+/// This pass finds the common dominator of all AMX instructions and inserts
+/// the pldtilecfg instruction there. In addition, the cfg register that
+/// pldtilecfg produces is inserted as the last operand of each AMX
+/// instruction. We use this scheme to model the def-use relationship between
+/// the AMX config instruction and other AMX instructions. Below is an example.
+///
+/// ----B1----
+/// / \
+/// / \
+/// B2 B3
+/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV
+///
+/// is transformed to
+///
+/// B1
+/// %25:tilecfg = PLDTILECFG
+/// / \
+/// / \
+/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-pre-config"
+
+namespace {
+
+class X86PreTileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+
+public:
+ X86PreTileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override {
+ return "Tile Register Pre-configure";
+ }
+
+ /// X86PreTileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86PreTileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
+ "Tile Register Configure", false, false)
+
+void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
+ const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI,
+ const X86Subtarget *ST) {
+ auto *MBB = MI->getParent();
+
+ // FIXME: AMX should assume AVX512 enabled.
+ if (ST->hasAVX512()) {
+ // Zero stack slot.
+ Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
+ .addReg(Zmm, RegState::Undef)
+ .addReg(Zmm, RegState::Undef);
+ addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
+ FrameIdx)
+ .addReg(Zmm);
+ }
+
+  // Build pseudo ldtilecfg.
+ Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
+
+ addFrameReference(
+ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
+
+ return VReg;
+}
+
+static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile");
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
+ MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
+ ShapeT Shape(&MO1, &MO2, MRI);
+ return Shape;
+ }
+}
+
+MachineInstr *X86PreTileConfig::getTileConfigPoint() {
+ DenseMap<Register, ShapeT> PhysShapeInfo;
+ MachineBasicBlock *MBB = nullptr;
+ DenseSet<const MachineInstr *> MIs;
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+
+    // Find the common dominator of all MIs that define a tile register.
+ for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
+ if (MO.isUndef())
+ continue;
+ const auto *MI = MO.getParent();
+      // PHI or IMPLICIT_DEF instruction.
+      // There must be an input tile before the PHI instruction.
+ if (MI->isTransient())
+ continue;
+ if (!MBB)
+ MBB = const_cast<MachineBasicBlock *>(MI->getParent());
+ MBB = DomTree->findNearestCommonDominator(
+ MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
+
+ // Collect the instructions that define shape.
+ ShapeT Shape = getShape(*MI, MRI);
+ std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
+ Shape.getCol()};
+ for (auto *ShapeMO : ShapeMOs) {
+ Register ShapeReg = ShapeMO->getReg();
+ for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
+ const auto *ShapeMI = MO.getParent();
+ MIs.insert(ShapeMI);
+ }
+ }
+ }
+ }
+ if (!MBB)
+ return nullptr;
+  // This pass runs before PHI elimination, so the function is still
+  // in SSA form.
+ assert(MRI->isSSA() && "Not SSA form in pre-tile config");
+ // Shape def should dominate tile config MBB.
+ // def s s1 s2
+ // / \ \ /
+ // / \ \ /
+ // conf s3=phi(s1,s2)
+ // |
+ // c
+ //
+ for (const auto *MI : MIs) {
+ const MachineBasicBlock *ShapeMBB = MI->getParent();
+ if (DomTree->dominates(ShapeMBB, MBB))
+ continue;
+ if (MI->isMoveImmediate())
+ continue;
+ report_fatal_error(MF->getName() + ": Failed to config tile register, "
+ "please define the shape earlier");
+ }
+
+  // ldtilecfg should be inserted after the MI that defines the shape.
+ MachineBasicBlock::reverse_instr_iterator I, E;
+ for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
+ auto *MI = &*I;
+ if (MIs.count(MI) && (!MI->isMoveImmediate()))
+ break;
+ }
+ MachineBasicBlock::iterator MII;
+ if (I == E)
+ MII = MBB->getFirstNonPHI();
+ else {
+ MII = MachineBasicBlock::iterator(&*I);
+ MII++;
+ }
+ return &*MII;
+}
+
+static void addTileCFGUse(MachineFunction &MF, Register CFG) {
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ break;
+ case X86::PTILELOADDV:
+ case X86::PTILESTOREDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ unsigned NumOperands = MI.getNumOperands();
+ MI.RemoveOperand(NumOperands - 1);
+ MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
+ break;
+ }
+ }
+ }
+}
+
+bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return false;
+ unsigned Size = ST->getTileConfigSize();
+ Align Alignment = ST->getTileConfigAlignment();
+ int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
+ addTileCFGUse(mf, CFG);
+ return true;
+}
+
+FunctionPass *llvm::createX86PreTileConfigPass() {
+ return new X86PreTileConfig();
+}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp
index d90b4e7bdc..eb919a0146 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp
@@ -18,8 +18,8 @@
#include "X86Subtarget.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -726,12 +726,12 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert((!needsStackRealignment(MF) ||
MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
"Return instruction can only reference SP relative frame objects");
- FIOffset =
- TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed();
+ FIOffset =
+ TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed();
} else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
} else {
- FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed();
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed();
}
// LOCAL_ESCAPE uses a single offset, with no register. It only works in the
@@ -786,55 +786,55 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned X86RegisterInfo::findDeadCallerSavedReg(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
- const MachineFunction *MF = MBB.getParent();
- if (MF->callsEHReturn())
- return 0;
-
- const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
-
- if (MBBI == MBB.end())
- return 0;
-
- switch (MBBI->getOpcode()) {
- default:
- return 0;
- case TargetOpcode::PATCHABLE_RET:
- case X86::RET:
- case X86::RETL:
- case X86::RETQ:
- case X86::RETIL:
- case X86::RETIQ:
- case X86::TCRETURNdi:
- case X86::TCRETURNri:
- case X86::TCRETURNmi:
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- case X86::EH_RETURN:
- case X86::EH_RETURN64: {
- SmallSet<uint16_t, 8> Uses;
- for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
- MachineOperand &MO = MBBI->getOperand(I);
- if (!MO.isReg() || MO.isDef())
- continue;
- Register Reg = MO.getReg();
- if (!Reg)
- continue;
- for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
- Uses.insert(*AI);
- }
-
- for (auto CS : AvailableRegs)
- if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
- return CS;
- }
- }
-
- return 0;
-}
-
+unsigned X86RegisterInfo::findDeadCallerSavedReg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
+ const MachineFunction *MF = MBB.getParent();
+ if (MF->callsEHReturn())
+ return 0;
+
+ const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
+
+ if (MBBI == MBB.end())
+ return 0;
+
+ switch (MBBI->getOpcode()) {
+ default:
+ return 0;
+ case TargetOpcode::PATCHABLE_RET:
+ case X86::RET:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ SmallSet<uint16_t, 8> Uses;
+ for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = MBBI->getOperand(I);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
+ Uses.insert(*AI);
+ }
+
+ for (auto CS : AvailableRegs)
+ if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
+ return CS;
+ }
+ }
+
+ return 0;
+}
+
Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const X86FrameLowering *TFI = getFrameLowering(MF);
return TFI->hasFP(MF) ? FramePtr : StackPtr;
@@ -857,79 +857,79 @@ X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
StackReg = getX86SubSuperRegister(StackReg, 32);
return StackReg;
}
-
-static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
- const MachineRegisterInfo *MRI) {
- if (VRM->hasShape(VirtReg))
- return VRM->getShape(VirtReg);
-
- const MachineOperand &Def = *MRI->def_begin(VirtReg);
- MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
- unsigned OpCode = MI->getOpcode();
- switch (OpCode) {
- default:
- llvm_unreachable("Unexpected machine instruction on tile register!");
- break;
- // We only collect the tile shape that is defined.
- case X86::PTILELOADDV:
- case X86::PTDPBSSDV:
- case X86::PTILEZEROV:
- MachineOperand &MO1 = MI->getOperand(1);
- MachineOperand &MO2 = MI->getOperand(2);
- ShapeT Shape(&MO1, &MO2, MRI);
- VRM->assignVirt2Shape(VirtReg, Shape);
- return Shape;
- }
-}
-
-bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
- ArrayRef<MCPhysReg> Order,
- SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
- const LiveRegMatrix *Matrix) const {
- const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
- VirtReg, Order, Hints, MF, VRM, Matrix);
-
- if (RC.getID() != X86::TILERegClassID)
- return BaseImplRetVal;
-
- ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
- auto AddHint = [&](MCPhysReg PhysReg) {
- Register VReg = Matrix->getOneVReg(PhysReg);
- if (VReg == MCRegister::NoRegister) { // Not allocated yet
- Hints.push_back(PhysReg);
- return;
- }
- ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
- if (PhysShape == VirtShape)
- Hints.push_back(PhysReg);
- };
-
- SmallSet<MCPhysReg, 4> CopyHints;
- CopyHints.insert(Hints.begin(), Hints.end());
- Hints.clear();
- for (auto Hint : CopyHints) {
- if (RC.contains(Hint) && !MRI->isReserved(Hint))
- AddHint(Hint);
- }
- for (MCPhysReg PhysReg : Order) {
- if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
- !MRI->isReserved(PhysReg))
- AddHint(PhysReg);
- }
-
-#define DEBUG_TYPE "tile-hint"
- LLVM_DEBUG({
- dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
- for (auto Hint : Hints) {
- dbgs() << "tmm" << Hint << ",";
- }
- dbgs() << "\n";
- });
-#undef DEBUG_TYPE
-
- return true;
-}
+
+static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
+ const MachineRegisterInfo *MRI) {
+ if (VRM->hasShape(VirtReg))
+ return VRM->getShape(VirtReg);
+
+ const MachineOperand &Def = *MRI->def_begin(VirtReg);
+ MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
+ unsigned OpCode = MI->getOpcode();
+ switch (OpCode) {
+ default:
+ llvm_unreachable("Unexpected machine instruction on tile register!");
+ break;
+ // We only collect the tile shape that is defined.
+ case X86::PTILELOADDV:
+ case X86::PTDPBSSDV:
+ case X86::PTILEZEROV:
+ MachineOperand &MO1 = MI->getOperand(1);
+ MachineOperand &MO2 = MI->getOperand(2);
+ ShapeT Shape(&MO1, &MO2, MRI);
+ VRM->assignVirt2Shape(VirtReg, Shape);
+ return Shape;
+ }
+}
+
+bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
+ VirtReg, Order, Hints, MF, VRM, Matrix);
+
+ if (RC.getID() != X86::TILERegClassID)
+ return BaseImplRetVal;
+
+ ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
+ auto AddHint = [&](MCPhysReg PhysReg) {
+ Register VReg = Matrix->getOneVReg(PhysReg);
+ if (VReg == MCRegister::NoRegister) { // Not allocated yet
+ Hints.push_back(PhysReg);
+ return;
+ }
+ ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
+ if (PhysShape == VirtShape)
+ Hints.push_back(PhysReg);
+ };
+
+ SmallSet<MCPhysReg, 4> CopyHints;
+ CopyHints.insert(Hints.begin(), Hints.end());
+ Hints.clear();
+ for (auto Hint : CopyHints) {
+ if (RC.contains(Hint) && !MRI->isReserved(Hint))
+ AddHint(Hint);
+ }
+ for (MCPhysReg PhysReg : Order) {
+ if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
+ !MRI->isReserved(PhysReg))
+ AddHint(PhysReg);
+ }
+
+#define DEBUG_TYPE "tile-hint"
+ LLVM_DEBUG({
+ dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
+ for (auto Hint : Hints) {
+ dbgs() << "tmm" << Hint << ",";
+ }
+ dbgs() << "\n";
+ });
+#undef DEBUG_TYPE
+
+ return true;
+}
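
The hunk above makes AMX tile allocation shape-aware: a physical TMM register is kept as a hint only if it is still unassigned or if the virtual register currently mapped to it carries the same (rows, columns) shape. A minimal standalone sketch of that filter follows; the types and names are illustrative stand-ins, not the LLVM API.

#include <cstdint>
#include <optional>
#include <vector>

// Illustrative stand-ins for ShapeT and the per-TMM assignment state.
struct TileShape {
  uint16_t Rows = 0, Cols = 0;
};
inline bool operator==(TileShape A, TileShape B) {
  return A.Rows == B.Rows && A.Cols == B.Cols;
}

// Keep a candidate TMM register only if it is free or already holds a
// virtual register with a matching shape (mirrors the AddHint lambda above).
std::vector<unsigned>
filterTileHints(const std::vector<unsigned> &AllocationOrder,
                const std::vector<std::optional<TileShape>> &AssignedShape,
                TileShape WantedShape) {
  std::vector<unsigned> Hints;
  for (unsigned Tmm : AllocationOrder) {
    const std::optional<TileShape> &Cur = AssignedShape[Tmm];
    if (!Cur || *Cur == WantedShape)
      Hints.push_back(Tmm);
  }
  return Hints;
}

The real hook additionally seeds the result with copy hints collected by the base implementation before walking the allocation order, so shape-compatible copy targets are tried first.
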
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h
index 7fd10ddd1a..4e4fb3f368 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h
@@ -125,12 +125,12 @@ public:
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- /// findDeadCallerSavedReg - Return a caller-saved register that isn't live
- /// when it reaches the "return" instruction. We can then pop a stack object
- /// to this register without worry about clobbering it.
- unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI) const;
-
+ /// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+ /// when it reaches the "return" instruction. We can then pop a stack object
+ /// to this register without worry about clobbering it.
+ unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI) const;
+
// Debug information queries.
Register getFrameRegister(const MachineFunction &MF) const override;
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
@@ -144,11 +144,11 @@ public:
Register getFramePtr() const { return FramePtr; }
// FIXME: Move to FrameInfok
unsigned getSlotSize() const { return SlotSize; }
-
- bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
- SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF, const VirtRegMap *VRM,
- const LiveRegMatrix *Matrix) const override;
+
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
};
} // End llvm namespace
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td
index 75cbd4e1cf..29aa2bc252 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td
+++ b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td
@@ -265,9 +265,9 @@ let SubRegIndices = [sub_ymm] in {
}
}
-// Tile config registers.
-def TMMCFG: X86Reg<"tmmcfg", 0>;
-
+// Tile config registers.
+def TMMCFG: X86Reg<"tmmcfg", 0>;
+
// Tile "registers".
def TMM0: X86Reg<"tmm0", 0>;
def TMM1: X86Reg<"tmm1", 1>;
@@ -636,11 +636,11 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
// Tiles
-let CopyCost = -1 in // Don't allow copying of tile registers
-def TILE : RegisterClass<"X86", [x86amx], 8192,
+let CopyCost = -1 in // Don't allow copying of tile registers
+def TILE : RegisterClass<"X86", [x86amx], 8192,
(sequence "TMM%u", 0, 7)> {let Size = 8192;}
-def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
- let CopyCost = -1; // Don't allow copying of tile config registers.
- let isAllocatable = 1;
- let Size = 512;
-}
+def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
+ let CopyCost = -1; // Don't allow copying of tile config registers.
+ let isAllocatable = 1;
+ let Size = 512;
+}
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp
index e76908ef4b..8cfdeea3d1 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -24,10 +24,10 @@ using namespace llvm;
#define DEBUG_TYPE "x86-selectiondag-info"
-static cl::opt<bool>
- UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
- cl::desc("Use fast short rep mov in memcpy lowering"));
-
+static cl::opt<bool>
+ UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
+ cl::desc("Use fast short rep mov in memcpy lowering"));
+
bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
// We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -310,10 +310,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
const X86Subtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<X86Subtarget>();
- // If enabled and available, use fast short rep mov.
- if (UseFSRMForMemcpy && Subtarget.hasFSRM())
- return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
-
+ // If enabled and available, use fast short rep mov.
+ if (UseFSRMForMemcpy && Subtarget.hasFSRM())
+ return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
+
/// Handle constant sizes,
if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
return emitConstantSizeRepmov(
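
The hunk above gates memcpy lowering on the new hidden flag plus the FSRM feature bit: when both hold, the copy is emitted as a byte-granular rep movs regardless of size; otherwise the pre-existing constant-size path (and, presumably, the generic fallback) runs. A hedged sketch of that decision order, with an illustrative enum rather than the LLVM API:

// Illustrative restatement of the selection order added above.
enum class MemcpyLowering { RepMovsByte, ConstantSizeRepMovs, Fallback };

MemcpyLowering pickMemcpyLowering(bool UseFSRMForMemcpy, bool SubtargetHasFSRM,
                                  bool SizeIsConstant) {
  // Fast short REP MOVSB: taken whenever the flag is set and the CPU
  // advertises FSRM, without looking at the copy size.
  if (UseFSRMForMemcpy && SubtargetHasFSRM)
    return MemcpyLowering::RepMovsByte;
  if (SizeIsConstant)
    return MemcpyLowering::ConstantSizeRepMovs;
  return MemcpyLowering::Fallback; // generic path (assumed, not shown in the hunk)
}
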
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 14a3fea240..c3ed77fcae 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -293,4 +293,4 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width,
}
}
-} // namespace llvm
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
index d57871130b..5746c91c0b 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
@@ -161,7 +161,7 @@ bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction(
// This branch requires adding an LFENCE.
if (!PrevInstIsLFENCE) {
- assert(FirstTerminator && "Unknown terminator instruction");
+ assert(FirstTerminator && "Unknown terminator instruction");
BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE));
NumLFENCEsInserted++;
Modified = true;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index aa73d4bce6..0edb63589a 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -184,7 +184,7 @@ private:
MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
void restoreEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- Register Reg);
+ Register Reg);
void mergePredStateIntoSP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
@@ -200,8 +200,8 @@ private:
MachineInstr *
sinkPostLoadHardenedInst(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
- bool canHardenRegister(Register Reg);
- unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
+ bool canHardenRegister(Register Reg);
+ unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
DebugLoc Loc);
unsigned hardenPostLoad(MachineInstr &MI);
@@ -1520,7 +1520,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
/// reliably lower.
void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
- Register Reg) {
+ Register Reg) {
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
++NumInstsInserted;
}
@@ -1842,7 +1842,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// just bail. Also check that its register class is one of the ones we
// can harden.
Register UseDefReg = UseMI.getOperand(0).getReg();
- if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg))
+ if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg))
return {};
SingleUseMI = &UseMI;
@@ -1864,7 +1864,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
return MI;
}
-bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
auto *RC = MRI->getRegClass(Reg);
int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
if (RegBytes > 8)
@@ -1908,10 +1908,10 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
/// The new, hardened virtual register is returned. It will have the same
/// register class as `Reg`.
unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
- Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
+ Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
DebugLoc Loc) {
assert(canHardenRegister(Reg) && "Cannot harden this register!");
- assert(Reg.isVirtual() && "Cannot harden a physical register!");
+ assert(Reg.isVirtual() && "Cannot harden a physical register!");
auto *RC = MRI->getRegClass(Reg);
int Bytes = TRI->getRegSizeInBits(*RC) / 8;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp
index c95213c353..e0a96938be 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp
@@ -166,10 +166,10 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
}
- // 32-bit ELF references GlobalAddress directly in static relocation model.
- // We cannot use MO_GOT because EBX may not be set up.
- if (TM.getRelocationModel() == Reloc::Static)
- return X86II::MO_NO_FLAG;
+ // 32-bit ELF references GlobalAddress directly in static relocation model.
+ // We cannot use MO_GOT because EBX may not be set up.
+ if (TM.getRelocationModel() == Reloc::Static)
+ return X86II::MO_NO_FLAG;
return X86II::MO_GOT;
}
@@ -206,9 +206,9 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
(!F && M.getRtLibUseGOT())) &&
is64Bit())
return X86II::MO_GOTPCREL;
- // Reference ExternalSymbol directly in static relocation model.
- if (!is64Bit() && !GV && TM.getRelocationModel() == Reloc::Static)
- return X86II::MO_NO_FLAG;
+ // Reference ExternalSymbol directly in static relocation model.
+ if (!is64Bit() && !GV && TM.getRelocationModel() == Reloc::Static)
+ return X86II::MO_NO_FLAG;
return X86II::MO_PLT;
}
@@ -234,22 +234,22 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const {
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
-void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
- StringRef FS) {
- if (CPU.empty())
- CPU = "generic";
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
+ StringRef FS) {
+ if (CPU.empty())
+ CPU = "generic";
- if (TuneCPU.empty())
- TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect.
+ if (TuneCPU.empty())
+ TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect.
- std::string FullFS = X86_MC::ParseX86Triple(TargetTriple);
- assert(!FullFS.empty() && "Failed to parse X86 triple");
+ std::string FullFS = X86_MC::ParseX86Triple(TargetTriple);
+ assert(!FullFS.empty() && "Failed to parse X86 triple");
- if (!FS.empty())
- FullFS = (Twine(FullFS) + "," + FS).str();
+ if (!FS.empty())
+ FullFS = (Twine(FullFS) + "," + FS).str();
// Parse features string and set the CPU.
- ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
+ ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
// All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of
// 16-bytes and under that are reasonably fast. These features were
@@ -265,13 +265,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
report_fatal_error("64-bit code requested on a subtarget that doesn't "
"support it!");
- // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all
- // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes
- // following the i386 psABI, while on Illumos it is always 16 bytes.
+ // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all
+ // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes
+ // following the i386 psABI, while on Illumos it is always 16 bytes.
if (StackAlignOverride)
stackAlignment = *StackAlignOverride;
- else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
- In64BitMode)
+ else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() ||
+ In64BitMode)
stackAlignment = Align(16);
// Consume the vector width attribute or apply any target specific limit.
@@ -284,24 +284,24 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
}
X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
- StringRef TuneCPU,
+ StringRef TuneCPU,
StringRef FS) {
- initSubtargetFeatures(CPU, TuneCPU, FS);
+ initSubtargetFeatures(CPU, TuneCPU, FS);
return *this;
}
-X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
- StringRef FS, const X86TargetMachine &TM,
+X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
+ StringRef FS, const X86TargetMachine &TM,
MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth)
- : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS),
- PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT),
- StackAlignOverride(StackAlignOverride),
+ : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS),
+ PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT),
+ StackAlignOverride(StackAlignOverride),
PreferVectorWidthOverride(PreferVectorWidthOverride),
RequiredVectorWidth(RequiredVectorWidth),
- InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
- TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
+ InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
+ TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
setPICStyle(PICStyles::Style::None);
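
The initSubtargetFeatures changes above preserve the documented stack-alignment policy: an explicit override wins, Darwin/Linux/kFreeBSD and all 64-bit targets get 16 bytes, and everything else keeps the 4-byte i386 default. A compact sketch of just that selection, using plain integers instead of llvm::Align/MaybeAlign:

#include <optional>

// Illustrative summary of the stack-alignment choice shown in the hunk.
unsigned pickStackAlignment(std::optional<unsigned> AlignOverride,
                            bool IsDarwin, bool IsLinux, bool IsKFreeBSD,
                            bool In64BitMode) {
  if (AlignOverride)
    return *AlignOverride;                     // explicit override wins
  if (IsDarwin || IsLinux || IsKFreeBSD || In64BitMode)
    return 16;                                 // 16-byte ABI alignment
  return 4;                                    // 32-bit i386 psABI default
}
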
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h
index fa2622333d..3bd6599215 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h
@@ -189,8 +189,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Processor has RDSEED instructions.
bool HasRDSEED = false;
- /// Processor has LAHF/SAHF instructions in 64-bit mode.
- bool HasLAHFSAHF64 = false;
+ /// Processor has LAHF/SAHF instructions in 64-bit mode.
+ bool HasLAHFSAHF64 = false;
/// Processor has MONITORX/MWAITX instructions.
bool HasMWAITX = false;
@@ -302,9 +302,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// True if the processor has enhanced REP MOVSB/STOSB.
bool HasERMSB = false;
- /// True if the processor has fast short REP MOV.
- bool HasFSRM = false;
-
+ /// True if the processor has fast short REP MOV.
+ bool HasFSRM = false;
+
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions = false;
@@ -355,9 +355,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
- /// Processor has AVX Vector Neural Network Instructions
- bool HasAVXVNNI = false;
-
+ /// Processor has AVX Vector Neural Network Instructions
+ bool HasAVXVNNI = false;
+
/// Processor has AVX-512 bfloat16 floating-point extensions
bool HasBF16 = false;
@@ -398,15 +398,15 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Processor supports PCONFIG instruction
bool HasPCONFIG = false;
- /// Processor support key locker instructions
- bool HasKL = false;
-
- /// Processor support key locker wide instructions
- bool HasWIDEKL = false;
-
- /// Processor supports HRESET instruction
- bool HasHRESET = false;
-
+ /// Processor support key locker instructions
+ bool HasKL = false;
+
+ /// Processor support key locker wide instructions
+ bool HasWIDEKL = false;
+
+ /// Processor supports HRESET instruction
+ bool HasHRESET = false;
+
/// Processor supports SERIALIZE instruction
bool HasSERIALIZE = false;
@@ -418,9 +418,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool HasAMXBF16 = false;
bool HasAMXINT8 = false;
- /// Processor supports User Level Interrupt instructions
- bool HasUINTR = false;
-
+ /// Processor supports User Level Interrupt instructions
+ bool HasUINTR = false;
+
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
@@ -472,8 +472,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// entry to the function and which must be maintained by every function.
Align stackAlignment = Align(4);
- Align TileConfigAlignment = Align(4);
-
+ Align TileConfigAlignment = Align(4);
+
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
@@ -515,13 +515,13 @@ private:
unsigned RequiredVectorWidth;
/// True if compiling for 64-bit, false for 16-bit or 32-bit.
- bool In64BitMode = false;
+ bool In64BitMode = false;
/// True if compiling for 32-bit, false for 16-bit or 64-bit.
- bool In32BitMode = false;
+ bool In32BitMode = false;
/// True if compiling for 16-bit, false for 32-bit or 64-bit.
- bool In16BitMode = false;
+ bool In16BitMode = false;
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
@@ -534,7 +534,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
- X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
+ X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth);
@@ -557,9 +557,9 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
- unsigned getTileConfigSize() const { return 64; }
- Align getTileConfigAlignment() const { return TileConfigAlignment; }
-
+ unsigned getTileConfigSize() const { return 64; }
+ Align getTileConfigAlignment() const { return TileConfigAlignment; }
+
/// Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
@@ -571,7 +571,7 @@ public:
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
- void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
/// Methods used by Global ISel
const CallLowering *getCallLowering() const override;
@@ -582,10 +582,10 @@ public:
private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
- X86Subtarget &initializeSubtargetDependencies(StringRef CPU,
- StringRef TuneCPU,
- StringRef FS);
- void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+ X86Subtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef TuneCPU,
+ StringRef FS);
+ void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
public:
/// Is this x86_64? (disregarding specific ABI / programming model)
@@ -684,7 +684,7 @@ public:
return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1();
}
bool hasRDSEED() const { return HasRDSEED; }
- bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); }
+ bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); }
bool hasMWAITX() const { return HasMWAITX; }
bool hasCLZERO() const { return HasCLZERO; }
bool hasCLDEMOTE() const { return HasCLDEMOTE; }
@@ -717,7 +717,7 @@ public:
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasBranchFusion() const { return HasBranchFusion; }
bool hasERMSB() const { return HasERMSB; }
- bool hasFSRM() const { return HasFSRM; }
+ bool hasFSRM() const { return HasFSRM; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
@@ -748,17 +748,17 @@ public:
bool hasSGX() const { return HasSGX; }
bool hasINVPCID() const { return HasINVPCID; }
bool hasENQCMD() const { return HasENQCMD; }
- bool hasKL() const { return HasKL; }
- bool hasWIDEKL() const { return HasWIDEKL; }
- bool hasHRESET() const { return HasHRESET; }
+ bool hasKL() const { return HasKL; }
+ bool hasWIDEKL() const { return HasWIDEKL; }
+ bool hasHRESET() const { return HasHRESET; }
bool hasSERIALIZE() const { return HasSERIALIZE; }
bool hasTSXLDTRK() const { return HasTSXLDTRK; }
- bool hasUINTR() const { return HasUINTR; }
+ bool hasUINTR() const { return HasUINTR; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
- bool hasAVXVNNI() const { return HasAVXVNNI; }
+ bool hasAVXVNNI() const { return HasAVXVNNI; }
bool hasAMXTILE() const { return HasAMXTILE; }
bool hasAMXBF16() const { return HasAMXBF16; }
bool hasAMXINT8() const { return HasAMXINT8; }
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp
index c8f76c210a..762ea5bc6e 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp
@@ -62,7 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
PassRegistry &PR = *PassRegistry::getPassRegistry();
- initializeX86LowerAMXTypeLegacyPassPass(PR);
+ initializeX86LowerAMXTypeLegacyPassPass(PR);
initializeGlobalISel(PR);
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
@@ -72,7 +72,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
- initializeX86TileConfigPass(PR);
+ initializeX86TileConfigPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
@@ -85,7 +85,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
initializeX86PartialReductionPass(PR);
- initializePseudoProbeInserterPass(PR);
+ initializePseudoProbeInserterPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -236,30 +236,30 @@ X86TargetMachine::~X86TargetMachine() = default;
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
- Attribute TuneAttr = F.getFnAttribute("tune-cpu");
+ Attribute TuneAttr = F.getFnAttribute("tune-cpu");
Attribute FSAttr = F.getFnAttribute("target-features");
- StringRef CPU =
- CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
- StringRef TuneCPU =
- TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
- StringRef FS =
- FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
+ StringRef CPU =
+ CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
+ StringRef TuneCPU =
+ TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+ StringRef FS =
+ FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
SmallString<512> Key;
- // The additions here are ordered so that the definitely short strings are
- // added first so we won't exceed the small size. We append the
- // much longer FS string at the end so that we only heap allocate at most
- // one time.
+ // The additions here are ordered so that the definitely short strings are
+ // added first so we won't exceed the small size. We append the
+ // much longer FS string at the end so that we only heap allocate at most
+ // one time.
// Extract prefer-vector-width attribute.
unsigned PreferVectorWidthOverride = 0;
- Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width");
- if (PreferVecWidthAttr.isValid()) {
- StringRef Val = PreferVecWidthAttr.getValueAsString();
+ Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width");
+ if (PreferVecWidthAttr.isValid()) {
+ StringRef Val = PreferVecWidthAttr.getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
- Key += "prefer-vector-width=";
+ Key += "prefer-vector-width=";
Key += Val;
PreferVectorWidthOverride = Width;
}
@@ -267,45 +267,45 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// Extract min-legal-vector-width attribute.
unsigned RequiredVectorWidth = UINT32_MAX;
- Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width");
- if (MinLegalVecWidthAttr.isValid()) {
- StringRef Val = MinLegalVecWidthAttr.getValueAsString();
+ Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width");
+ if (MinLegalVecWidthAttr.isValid()) {
+ StringRef Val = MinLegalVecWidthAttr.getValueAsString();
unsigned Width;
if (!Val.getAsInteger(0, Width)) {
- Key += "min-legal-vector-width=";
+ Key += "min-legal-vector-width=";
Key += Val;
RequiredVectorWidth = Width;
}
}
- // Add CPU to the Key.
- Key += CPU;
-
- // Add tune CPU to the Key.
- Key += "tune=";
- Key += TuneCPU;
-
- // Keep track of the start of the feature portion of the string.
- unsigned FSStart = Key.size();
-
- // FIXME: This is related to the code below to reset the target options,
- // we need to know whether or not the soft float flag is set on the
- // function before we can generate a subtarget. We also need to use
- // it as a key for the subtarget since that can be the only difference
- // between two functions.
- bool SoftFloat =
- F.getFnAttribute("use-soft-float").getValueAsString() == "true";
- // If the soft float attribute is set on the function turn on the soft float
- // subtarget feature.
- if (SoftFloat)
- Key += FS.empty() ? "+soft-float" : "+soft-float,";
-
- Key += FS;
-
- // We may have added +soft-float to the features so move the StringRef to
- // point to the full string in the Key.
- FS = Key.substr(FSStart);
-
+ // Add CPU to the Key.
+ Key += CPU;
+
+ // Add tune CPU to the Key.
+ Key += "tune=";
+ Key += TuneCPU;
+
+ // Keep track of the start of the feature portion of the string.
+ unsigned FSStart = Key.size();
+
+ // FIXME: This is related to the code below to reset the target options,
+ // we need to know whether or not the soft float flag is set on the
+ // function before we can generate a subtarget. We also need to use
+ // it as a key for the subtarget since that can be the only difference
+ // between two functions.
+ bool SoftFloat =
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+ // If the soft float attribute is set on the function turn on the soft float
+ // subtarget feature.
+ if (SoftFloat)
+ Key += FS.empty() ? "+soft-float" : "+soft-float,";
+
+ Key += FS;
+
+ // We may have added +soft-float to the features so move the StringRef to
+ // point to the full string in the Key.
+ FS = Key.substr(FSStart);
+
auto &I = SubtargetMap[Key];
if (!I) {
// This needs to be done before we create a new subtarget since any
@@ -313,21 +313,21 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = std::make_unique<X86Subtarget>(
- TargetTriple, CPU, TuneCPU, FS, *this,
+ TargetTriple, CPU, TuneCPU, FS, *this,
MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
RequiredVectorWidth);
}
return I.get();
}
-bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
- unsigned DestAS) const {
- assert(SrcAS != DestAS && "Expected different address spaces!");
- if (getPointerSize(SrcAS) != getPointerSize(DestAS))
- return false;
- return SrcAS < 256 && DestAS < 256;
-}
-
+bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ assert(SrcAS != DestAS && "Expected different address spaces!");
+ if (getPointerSize(SrcAS) != getPointerSize(DestAS))
+ return false;
+ return SrcAS < 256 && DestAS < 256;
+}
+
//===----------------------------------------------------------------------===//
// X86 TTI query.
//===----------------------------------------------------------------------===//
@@ -381,7 +381,7 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
- bool addPreRewrite() override;
+ bool addPreRewrite() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -410,7 +410,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
void X86PassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
- addPass(createX86LowerAMXTypePass());
+ addPass(createX86LowerAMXTypePass());
TargetPassConfig::addIRPasses();
@@ -449,7 +449,7 @@ bool X86PassConfig::addInstSelector() {
}
bool X86PassConfig::addIRTranslator() {
- addPass(new IRTranslator(getOptLevel()));
+ addPass(new IRTranslator(getOptLevel()));
return false;
}
@@ -496,12 +496,12 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86SpeculativeLoadHardeningPass());
addPass(createX86FlagsCopyLoweringPass());
addPass(createX86WinAllocaExpander());
-
- if (getOptLevel() != CodeGenOpt::None) {
- addPass(createX86PreTileConfigPass());
- }
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createX86PreTileConfigPass());
+ }
}
-
+
void X86PassConfig::addMachineSSAOptimization() {
addPass(createX86DomainReassignmentPass());
TargetPassConfig::addMachineSSAOptimization();
@@ -574,11 +574,11 @@ void X86PassConfig::addPreEmitPass2() {
addPass(createX86LoadValueInjectionRetHardeningPass());
}
-bool X86PassConfig::addPreRewrite() {
- addPass(createX86TileConfigPass());
- return true;
-}
-
+bool X86PassConfig::addPreRewrite() {
+ addPass(createX86TileConfigPass());
+ return true;
+}
+
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
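
getSubtargetImpl above caches one X86Subtarget per distinct combination of CPU, tune CPU, vector-width attributes, soft-float setting and feature string, deliberately appending the long feature string last so the SmallString key heap-allocates at most once. A rough sketch of that key layout, with std::string as a stand-in and illustrative parameter names:

#include <string>

// Illustrative key layout for the per-function subtarget cache above.
std::string buildSubtargetKey(const std::string &CPU, const std::string &TuneCPU,
                              const std::string &PreferVectorWidth,   // may be empty
                              const std::string &MinLegalVectorWidth, // may be empty
                              const std::string &FS, bool SoftFloat) {
  std::string Key;
  if (!PreferVectorWidth.empty())
    Key += "prefer-vector-width=" + PreferVectorWidth;
  if (!MinLegalVectorWidth.empty())
    Key += "min-legal-vector-width=" + MinLegalVectorWidth;
  Key += CPU;
  Key += "tune=" + TuneCPU;
  if (SoftFloat)
    Key += FS.empty() ? "+soft-float" : "+soft-float,";
  Key += FS; // long feature string last, so the short pieces stay inline
  return Key;
}
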
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h
index 69d7e48b89..3c77581146 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h
@@ -54,8 +54,8 @@ public:
}
bool isJIT() const { return IsJIT; }
-
- bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
};
} // end namespace llvm
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h b/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h
index f4bf52c837..dbed7df9c6 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h
@@ -36,7 +36,7 @@ namespace llvm {
MCStreamer &Streamer) const override;
};
- /// This implementation is used for X86 ELF targets that don't
+ /// This implementation is used for X86 ELF targets that don't
/// have a further specialization.
class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
public:
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp
index 71455237fb..0741fa9ad3 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -232,16 +232,16 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
bool Op2Signed = false;
unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
- bool SignedMode = Op1Signed || Op2Signed;
+ bool SignedMode = Op1Signed || Op2Signed;
unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
if (OpMinSize <= 7)
return LT.first * 3; // pmullw/sext
- if (!SignedMode && OpMinSize <= 8)
+ if (!SignedMode && OpMinSize <= 8)
return LT.first * 3; // pmullw/zext
if (OpMinSize <= 15)
return LT.first * 5; // pmullw/pmulhw/pshuf
- if (!SignedMode && OpMinSize <= 16)
+ if (!SignedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
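
The hunk above extends the PMULLW-based multiply costing to unsigned operands: values that fit in 7 bits (or 8 bits when both operands are unsigned) cost 3 per legalized vector, and values up to 15 bits (16 unsigned) cost 5. A small sketch of just that ladder, with illustrative names; the caller scales the result by the legalization factor (LT.first):

// Illustrative restatement of the widened multiply cost ladder above.
int mulWidenCostPerVector(unsigned OpMinSize, bool SignedMode) {
  if (OpMinSize <= 7)
    return 3;                       // pmullw + sign-extend
  if (!SignedMode && OpMinSize <= 8)
    return 3;                       // pmullw + zero-extend
  if (OpMinSize <= 15)
    return 5;                       // pmullw + pmulhw + pshuf
  if (!SignedMode && OpMinSize <= 16)
    return 5;                       // pmullw + pmulhw + pshuf
  return 0;                         // illustrative sentinel: generic costing applies
}
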
@@ -321,11 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
- { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
- { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
- { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
+
+ { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -341,11 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
-
- { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
- { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
- { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
- { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
+
+ { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -363,15 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
{ ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
-
- { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
- { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
- { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
- { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
- { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
- { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
- { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
- { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
+
+ { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
+ { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
+ { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
+ { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
+ { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
+ { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
+ { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
};
// XOP has faster vXi8 shifts.
@@ -1128,9 +1128,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
-
- {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
- {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
+
+ {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
+ {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
};
if (ST->hasBWI())
@@ -1184,13 +1184,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
-
- {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
- {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
- {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
- {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
- {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
- {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
+
+ {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
+ {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
+ {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
+ {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
};
if (ST->hasAVX512())
@@ -1396,7 +1396,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
}
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
+ TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -2018,7 +2018,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
MVT SimpleSrcTy = SrcTy.getSimpleVT();
MVT SimpleDstTy = DstTy.getSimpleVT();
@@ -2079,18 +2079,18 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return AdjustCost(Entry->Cost);
}
- return AdjustCost(
- BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+ return AdjustCost(
+ BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
- I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
+ I);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -2274,7 +2274,7 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
@@ -2288,9 +2288,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
-
- // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
- // specialized in these tables yet.
+
+ // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not
+ // specialized in these tables yet.
static const CostTblEntry AVX512CDCostTbl[] = {
{ ISD::CTLZ, MVT::v8i64, 1 },
{ ISD::CTLZ, MVT::v16i32, 1 },
@@ -2306,8 +2306,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTLZ, MVT::v16i8, 4 },
};
static const CostTblEntry AVX512BWCostTbl[] = {
- { ISD::ABS, MVT::v32i16, 1 },
- { ISD::ABS, MVT::v64i8, 1 },
+ { ISD::ABS, MVT::v32i16, 1 },
+ { ISD::ABS, MVT::v64i8, 1 },
{ ISD::BITREVERSE, MVT::v8i64, 5 },
{ ISD::BITREVERSE, MVT::v16i32, 5 },
{ ISD::BITREVERSE, MVT::v32i16, 5 },
@@ -2326,28 +2326,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v64i8, 9 },
{ ISD::SADDSAT, MVT::v32i16, 1 },
{ ISD::SADDSAT, MVT::v64i8, 1 },
- { ISD::SMAX, MVT::v32i16, 1 },
- { ISD::SMAX, MVT::v64i8, 1 },
- { ISD::SMIN, MVT::v32i16, 1 },
- { ISD::SMIN, MVT::v64i8, 1 },
+ { ISD::SMAX, MVT::v32i16, 1 },
+ { ISD::SMAX, MVT::v64i8, 1 },
+ { ISD::SMIN, MVT::v32i16, 1 },
+ { ISD::SMIN, MVT::v64i8, 1 },
{ ISD::SSUBSAT, MVT::v32i16, 1 },
{ ISD::SSUBSAT, MVT::v64i8, 1 },
{ ISD::UADDSAT, MVT::v32i16, 1 },
{ ISD::UADDSAT, MVT::v64i8, 1 },
- { ISD::UMAX, MVT::v32i16, 1 },
- { ISD::UMAX, MVT::v64i8, 1 },
- { ISD::UMIN, MVT::v32i16, 1 },
- { ISD::UMIN, MVT::v64i8, 1 },
+ { ISD::UMAX, MVT::v32i16, 1 },
+ { ISD::UMAX, MVT::v64i8, 1 },
+ { ISD::UMIN, MVT::v32i16, 1 },
+ { ISD::UMIN, MVT::v64i8, 1 },
{ ISD::USUBSAT, MVT::v32i16, 1 },
{ ISD::USUBSAT, MVT::v64i8, 1 },
};
static const CostTblEntry AVX512CostTbl[] = {
- { ISD::ABS, MVT::v8i64, 1 },
- { ISD::ABS, MVT::v16i32, 1 },
- { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::ABS, MVT::v4i64, 1 },
- { ISD::ABS, MVT::v2i64, 1 },
+ { ISD::ABS, MVT::v8i64, 1 },
+ { ISD::ABS, MVT::v16i32, 1 },
+ { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::ABS, MVT::v4i64, 1 },
+ { ISD::ABS, MVT::v2i64, 1 },
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
{ ISD::BITREVERSE, MVT::v32i16, 10 },
@@ -2364,30 +2364,30 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v16i32, 28 },
{ ISD::CTTZ, MVT::v32i16, 24 },
{ ISD::CTTZ, MVT::v64i8, 18 },
- { ISD::SMAX, MVT::v8i64, 1 },
- { ISD::SMAX, MVT::v16i32, 1 },
- { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::SMAX, MVT::v4i64, 1 },
- { ISD::SMAX, MVT::v2i64, 1 },
- { ISD::SMIN, MVT::v8i64, 1 },
- { ISD::SMIN, MVT::v16i32, 1 },
- { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::SMIN, MVT::v4i64, 1 },
- { ISD::SMIN, MVT::v2i64, 1 },
- { ISD::UMAX, MVT::v8i64, 1 },
- { ISD::UMAX, MVT::v16i32, 1 },
- { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::UMAX, MVT::v4i64, 1 },
- { ISD::UMAX, MVT::v2i64, 1 },
- { ISD::UMIN, MVT::v8i64, 1 },
- { ISD::UMIN, MVT::v16i32, 1 },
- { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
- { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
- { ISD::UMIN, MVT::v4i64, 1 },
- { ISD::UMIN, MVT::v2i64, 1 },
+ { ISD::SMAX, MVT::v8i64, 1 },
+ { ISD::SMAX, MVT::v16i32, 1 },
+ { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMAX, MVT::v4i64, 1 },
+ { ISD::SMAX, MVT::v2i64, 1 },
+ { ISD::SMIN, MVT::v8i64, 1 },
+ { ISD::SMIN, MVT::v16i32, 1 },
+ { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SMIN, MVT::v4i64, 1 },
+ { ISD::SMIN, MVT::v2i64, 1 },
+ { ISD::UMAX, MVT::v8i64, 1 },
+ { ISD::UMAX, MVT::v16i32, 1 },
+ { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMAX, MVT::v4i64, 1 },
+ { ISD::UMAX, MVT::v2i64, 1 },
+ { ISD::UMIN, MVT::v8i64, 1 },
+ { ISD::UMIN, MVT::v16i32, 1 },
+ { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UMIN, MVT::v4i64, 1 },
+ { ISD::UMIN, MVT::v2i64, 1 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
@@ -2428,10 +2428,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::BITREVERSE, MVT::i8, 3 }
};
static const CostTblEntry AVX2CostTbl[] = {
- { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
- { ISD::ABS, MVT::v8i32, 1 },
- { ISD::ABS, MVT::v16i16, 1 },
- { ISD::ABS, MVT::v32i8, 1 },
+ { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
+ { ISD::ABS, MVT::v8i32, 1 },
+ { ISD::ABS, MVT::v16i16, 1 },
+ { ISD::ABS, MVT::v32i8, 1 },
{ ISD::BITREVERSE, MVT::v4i64, 5 },
{ ISD::BITREVERSE, MVT::v8i32, 5 },
{ ISD::BITREVERSE, MVT::v16i16, 5 },
@@ -2453,28 +2453,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v32i8, 9 },
{ ISD::SADDSAT, MVT::v16i16, 1 },
{ ISD::SADDSAT, MVT::v32i8, 1 },
- { ISD::SMAX, MVT::v8i32, 1 },
- { ISD::SMAX, MVT::v16i16, 1 },
- { ISD::SMAX, MVT::v32i8, 1 },
- { ISD::SMIN, MVT::v8i32, 1 },
- { ISD::SMIN, MVT::v16i16, 1 },
- { ISD::SMIN, MVT::v32i8, 1 },
+ { ISD::SMAX, MVT::v8i32, 1 },
+ { ISD::SMAX, MVT::v16i16, 1 },
+ { ISD::SMAX, MVT::v32i8, 1 },
+ { ISD::SMIN, MVT::v8i32, 1 },
+ { ISD::SMIN, MVT::v16i16, 1 },
+ { ISD::SMIN, MVT::v32i8, 1 },
{ ISD::SSUBSAT, MVT::v16i16, 1 },
{ ISD::SSUBSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v16i16, 1 },
{ ISD::UADDSAT, MVT::v32i8, 1 },
{ ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd
- { ISD::UMAX, MVT::v8i32, 1 },
- { ISD::UMAX, MVT::v16i16, 1 },
- { ISD::UMAX, MVT::v32i8, 1 },
- { ISD::UMIN, MVT::v8i32, 1 },
- { ISD::UMIN, MVT::v16i16, 1 },
- { ISD::UMIN, MVT::v32i8, 1 },
+ { ISD::UMAX, MVT::v8i32, 1 },
+ { ISD::UMAX, MVT::v16i16, 1 },
+ { ISD::UMAX, MVT::v32i8, 1 },
+ { ISD::UMIN, MVT::v8i32, 1 },
+ { ISD::UMIN, MVT::v16i16, 1 },
+ { ISD::UMIN, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v16i16, 1 },
{ ISD::USUBSAT, MVT::v32i8, 1 },
{ ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd
- { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
- { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
{ ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
@@ -2483,10 +2483,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};
static const CostTblEntry AVX1CostTbl[] = {
- { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
- { ISD::ABS, MVT::v8i32, 3 },
- { ISD::ABS, MVT::v16i16, 3 },
- { ISD::ABS, MVT::v32i8, 3 },
+ { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X)
+ { ISD::ABS, MVT::v8i32, 3 },
+ { ISD::ABS, MVT::v16i16, 3 },
+ { ISD::ABS, MVT::v32i8, 3 },
{ ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
{ ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
{ ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
@@ -2508,32 +2508,32 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
{ ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert
- { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
- { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
+ { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
- { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
- { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
- { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
- { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
- { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
- { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
+ { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS
+ { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ?
+ { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD
+ { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ?
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -2559,21 +2559,21 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
- static const CostTblEntry SSE41CostTbl[] = {
- { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
- { ISD::SMAX, MVT::v4i32, 1 },
- { ISD::SMAX, MVT::v16i8, 1 },
- { ISD::SMIN, MVT::v4i32, 1 },
- { ISD::SMIN, MVT::v16i8, 1 },
- { ISD::UMAX, MVT::v4i32, 1 },
- { ISD::UMAX, MVT::v8i16, 1 },
- { ISD::UMIN, MVT::v4i32, 1 },
- { ISD::UMIN, MVT::v8i16, 1 },
- };
+ static const CostTblEntry SSE41CostTbl[] = {
+ { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X)
+ { ISD::SMAX, MVT::v4i32, 1 },
+ { ISD::SMAX, MVT::v16i8, 1 },
+ { ISD::SMIN, MVT::v4i32, 1 },
+ { ISD::SMIN, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v4i32, 1 },
+ { ISD::UMAX, MVT::v8i16, 1 },
+ { ISD::UMIN, MVT::v4i32, 1 },
+ { ISD::UMIN, MVT::v8i16, 1 },
+ };
static const CostTblEntry SSSE3CostTbl[] = {
- { ISD::ABS, MVT::v4i32, 1 },
- { ISD::ABS, MVT::v8i16, 1 },
- { ISD::ABS, MVT::v16i8, 1 },
+ { ISD::ABS, MVT::v4i32, 1 },
+ { ISD::ABS, MVT::v8i16, 1 },
+ { ISD::ABS, MVT::v16i8, 1 },
{ ISD::BITREVERSE, MVT::v2i64, 5 },
{ ISD::BITREVERSE, MVT::v4i32, 5 },
{ ISD::BITREVERSE, MVT::v8i16, 5 },
@@ -2595,10 +2595,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v16i8, 9 }
};
static const CostTblEntry SSE2CostTbl[] = {
- { ISD::ABS, MVT::v2i64, 4 },
- { ISD::ABS, MVT::v4i32, 3 },
- { ISD::ABS, MVT::v8i16, 2 },
- { ISD::ABS, MVT::v16i8, 2 },
+ { ISD::ABS, MVT::v2i64, 4 },
+ { ISD::ABS, MVT::v4i32, 3 },
+ { ISD::ABS, MVT::v8i16, 2 },
+ { ISD::ABS, MVT::v16i8, 2 },
{ ISD::BITREVERSE, MVT::v2i64, 29 },
{ ISD::BITREVERSE, MVT::v4i32, 27 },
{ ISD::BITREVERSE, MVT::v8i16, 27 },
@@ -2620,16 +2620,16 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTTZ, MVT::v16i8, 13 },
{ ISD::SADDSAT, MVT::v8i16, 1 },
{ ISD::SADDSAT, MVT::v16i8, 1 },
- { ISD::SMAX, MVT::v8i16, 1 },
- { ISD::SMIN, MVT::v8i16, 1 },
+ { ISD::SMAX, MVT::v8i16, 1 },
+ { ISD::SMIN, MVT::v8i16, 1 },
{ ISD::SSUBSAT, MVT::v8i16, 1 },
{ ISD::SSUBSAT, MVT::v16i8, 1 },
{ ISD::UADDSAT, MVT::v8i16, 1 },
{ ISD::UADDSAT, MVT::v16i8, 1 },
- { ISD::UMAX, MVT::v8i16, 2 },
- { ISD::UMAX, MVT::v16i8, 1 },
- { ISD::UMIN, MVT::v8i16, 2 },
- { ISD::UMIN, MVT::v16i8, 1 },
+ { ISD::UMAX, MVT::v8i16, 2 },
+ { ISD::UMAX, MVT::v16i8, 1 },
+ { ISD::UMIN, MVT::v8i16, 2 },
+ { ISD::UMIN, MVT::v16i8, 1 },
{ ISD::USUBSAT, MVT::v8i16, 1 },
{ ISD::USUBSAT, MVT::v16i8, 1 },
{ ISD::FMAXNUM, MVT::f64, 4 },
@@ -2668,18 +2668,18 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::CTPOP, MVT::i8, 1 },
};
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
- { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
+ { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, 14 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
- { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
+ { ISD::UMULO, MVT::i64, 2 }, // mulq + seto
};
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
- { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
- { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
+ { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV
+ { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },
@@ -2698,9 +2698,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::UADDO, MVT::i32, 1 },
{ ISD::UADDO, MVT::i16, 1 },
{ ISD::UADDO, MVT::i8, 1 },
- { ISD::UMULO, MVT::i32, 2 }, // mul + seto
- { ISD::UMULO, MVT::i16, 2 },
- { ISD::UMULO, MVT::i8, 2 },
+ { ISD::UMULO, MVT::i32, 2 }, // mul + seto
+ { ISD::UMULO, MVT::i16, 2 },
+ { ISD::UMULO, MVT::i8, 2 },
};
Type *RetTy = ICA.getReturnType();
@@ -2710,9 +2710,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
switch (IID) {
default:
break;
- case Intrinsic::abs:
- ISD = ISD::ABS;
- break;
+ case Intrinsic::abs:
+ ISD = ISD::ABS;
+ break;
case Intrinsic::bitreverse:
ISD = ISD::BITREVERSE;
break;
@@ -2736,24 +2736,24 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
case Intrinsic::sadd_sat:
ISD = ISD::SADDSAT;
break;
- case Intrinsic::smax:
- ISD = ISD::SMAX;
- break;
- case Intrinsic::smin:
- ISD = ISD::SMIN;
- break;
+ case Intrinsic::smax:
+ ISD = ISD::SMAX;
+ break;
+ case Intrinsic::smin:
+ ISD = ISD::SMIN;
+ break;
case Intrinsic::ssub_sat:
ISD = ISD::SSUBSAT;
break;
case Intrinsic::uadd_sat:
ISD = ISD::UADDSAT;
break;
- case Intrinsic::umax:
- ISD = ISD::UMAX;
- break;
- case Intrinsic::umin:
- ISD = ISD::UMIN;
- break;
+ case Intrinsic::umax:
+ ISD = ISD::UMAX;
+ break;
+ case Intrinsic::umin:
+ ISD = ISD::UMIN;
+ break;
case Intrinsic::usub_sat:
ISD = ISD::USUBSAT;
break;
@@ -2772,12 +2772,12 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
ISD = ISD::UADDO;
OpTy = RetTy->getContainedType(0);
break;
- case Intrinsic::umul_with_overflow:
- case Intrinsic::smul_with_overflow:
- // SMULO has same costs so don't duplicate.
- ISD = ISD::UMULO;
- OpTy = RetTy->getContainedType(0);
- break;
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ // SMULO has same costs so don't duplicate.
+ ISD = ISD::UMULO;
+ OpTy = RetTy->getContainedType(0);
+ break;
}
if (ISD != ISD::DELETED_NODE) {
@@ -2786,121 +2786,121 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
MVT MTy = LT.second;
// Attempt to lookup cost.
- if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
- MTy.isVector()) {
- // With PSHUFB the code is very similar for all types. If we have integer
- // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
- // we also need a PSHUFB.
- unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
-
- // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
- // instructions. We also need an extract and an insert.
- if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
- (ST->hasBWI() && MTy.is512BitVector())))
- Cost = Cost * 2 + 2;
-
- return LT.first * Cost;
- }
-
- auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
- FastMathFlags FMF) {
- // If there are no NANs to deal with, then these are reduced to a
- // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
- // assume is used in the non-fast case.
- if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
- if (FMF.noNaNs())
- return LegalizationCost * 1;
- }
- return LegalizationCost * (int)Entry.Cost;
- };
-
+ if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() &&
+ MTy.isVector()) {
+ // With PSHUFB the code is very similar for all types. If we have integer
+ // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types
+ // we also need a PSHUFB.
+ unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2;
+
+ // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB
+ // instructions. We also need an extract and an insert.
+ if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) ||
+ (ST->hasBWI() && MTy.is512BitVector())))
+ Cost = Cost * 2 + 2;
+
+ return LT.first * Cost;
+ }
+
+ auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
+ FastMathFlags FMF) {
+ // If there are no NANs to deal with, then these are reduced to a
+ // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
+ // assume is used in the non-fast case.
+ if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) {
+ if (FMF.noNaNs())
+ return LegalizationCost * 1;
+ }
+ return LegalizationCost * (int)Entry.Cost;
+ };
+
if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->isSLM())
if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasCDI())
if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
-
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+
if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (ST->hasBMI()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
if (ST->hasLZCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
if (ST->hasPOPCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
// TODO - add BMI (TZCNT) scalar handling
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
- return adjustTableCost(*Entry, LT.first, ICA.getFlags());
+ return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
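Editor's note: the cascade above probes one cost table per subtarget feature, from the most specific (GLM/SLM, AVX-512 variants) down to baseline x86, scales the matching entry by the legalization count LT.first, and lets adjustTableCost collapse FMAXNUM/FMINNUM to a single instruction when fast-math guarantees no NaNs. A self-contained sketch of that lookup-and-adjust pattern follows; MiniCostEntry, lookup, adjustCost and the example costs are invented for illustration and are not LLVM's real CostTblEntry/CostTableLookup API.

#include <cstdio>
#include <optional>
#include <vector>

// Invented mini types; not LLVM's CostTblEntry/CostTableLookup.
struct MiniCostEntry { int ISD; int MTy; int Cost; };
constexpr int ISD_FMAXNUM = 1;
constexpr int MTy_v4f32 = 10;

std::optional<int> lookup(const std::vector<MiniCostEntry> &Tbl, int ISD, int MTy) {
  for (const auto &E : Tbl)
    if (E.ISD == ISD && E.MTy == MTy)
      return E.Cost;
  return std::nullopt;
}

// Mirrors the adjustTableCost idea: with no NaNs, FMAXNUM/FMINNUM reduce to a
// single MIN/MAX instruction instead of the table's MIN/CMP/SELECT sequence.
int adjustCost(int ISD, int EntryCost, int LegalizationCost, bool NoNaNs) {
  if (ISD == ISD_FMAXNUM && NoNaNs)
    return LegalizationCost * 1;
  return LegalizationCost * EntryCost;
}

int main() {
  std::vector<MiniCostEntry> SSE42Tbl = {{ISD_FMAXNUM, MTy_v4f32, 3}}; // made-up costs
  std::vector<MiniCostEntry> SSE1Tbl  = {{ISD_FMAXNUM, MTy_v4f32, 4}};
  // Probe the most specific table first, like the if-cascade above.
  for (const auto *Tbl : {&SSE42Tbl, &SSE1Tbl})
    if (auto Cost = lookup(*Tbl, ISD_FMAXNUM, MTy_v4f32)) {
      std::printf("cost = %d\n", adjustCost(ISD_FMAXNUM, *Cost, /*LT.first=*/1, true));
      return 0; // prints "cost = 1": the no-NaNs case collapses to one MAX
    }
  return 0;
}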
@@ -3119,32 +3119,32 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
Cost +=
BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
} else {
- // In each 128-lane, if at least one index is demanded but not all
- // indices are demanded and this 128-lane is not the first 128-lane of
- // the legalized-vector, then this 128-lane needs a extracti128; If in
- // each 128-lane, there is at least one demanded index, this 128-lane
- // needs a inserti128.
-
- // The following cases will help you build a better understanding:
- // Assume we insert several elements into a v8i32 vector in avx2,
- // Case#1: inserting into 1th index needs vpinsrd + inserti128.
- // Case#2: inserting into 5th index needs extracti128 + vpinsrd +
- // inserti128.
- // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
- unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
- APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
- unsigned Scale = NumElts / Num128Lanes;
- // We iterate each 128-lane, and check if we need a
- // extracti128/inserti128 for this 128-lane.
- for (unsigned I = 0; I < NumElts; I += Scale) {
- APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
- APInt MaskedDE = Mask & WidenedDemandedElts;
- unsigned Population = MaskedDE.countPopulation();
- Cost += (Population > 0 && Population != Scale &&
- I % LT.second.getVectorNumElements() != 0);
- Cost += Population > 0;
- }
+      // In each 128-lane, if at least one index is demanded but not all
+      // indices are demanded and this 128-lane is not the first 128-lane of
+      // the legalized-vector, then this 128-lane needs an extracti128; if in
+      // each 128-lane there is at least one demanded index, this 128-lane
+      // needs an inserti128.
+
+      // The following cases will help you build a better understanding:
+      // Assume we insert several elements into a v8i32 vector in avx2,
+      // Case#1: inserting into index 1 needs vpinsrd + inserti128.
+      // Case#2: inserting into index 5 needs extracti128 + vpinsrd +
+      // inserti128.
+      // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128.
+ unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
+ unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
+ APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
+ unsigned Scale = NumElts / Num128Lanes;
+ // We iterate each 128-lane, and check if we need a
+ // extracti128/inserti128 for this 128-lane.
+ for (unsigned I = 0; I < NumElts; I += Scale) {
+ APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale);
+ APInt MaskedDE = Mask & WidenedDemandedElts;
+ unsigned Population = MaskedDE.countPopulation();
+ Cost += (Population > 0 && Population != Scale &&
+ I % LT.second.getVectorNumElements() != 0);
+ Cost += Population > 0;
+ }
Cost += DemandedElts.countPopulation();
// For vXf32 cases, insertion into the 0'th index in each v4f32
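Editor's note: the per-128-bit-lane accounting described in the comment block above can be checked with a small stand-alone model. The sketch below assumes a single v8i32 register under AVX2 (two lanes of four elements), counts an extracti128 for a partially demanded non-first lane, an inserti128 for any demanded lane, and one vpinsrd per demanded element; it reproduces Case#1-#3 and deliberately ignores the later vXf32 index-0 adjustment. The function name and bitmask encoding are invented for illustration.

#include <bitset>
#include <cstdio>

int insertCost(std::bitset<8> Demanded) {
  const unsigned Scale = 4;                 // elements per 128-bit lane of v8i32
  int Cost = 0;
  for (unsigned I = 0; I < 8; I += Scale) {
    unsigned Pop = 0;
    for (unsigned J = I; J < I + Scale; ++J)
      Pop += Demanded[J];
    // Partially demanded, non-first lane needs an extracti128.
    Cost += (Pop > 0 && Pop != Scale && I != 0);
    // Any demanded lane needs an inserti128 (modeled as +1 per lane here).
    Cost += (Pop > 0);
  }
  Cost += Demanded.count();                 // one vpinsrd per demanded element
  return Cost;
}

int main() {
  std::printf("%d\n", insertCost(std::bitset<8>("00000010"))); // Case#1: 2
  std::printf("%d\n", insertCost(std::bitset<8>("00100000"))); // Case#2: 3
  std::printf("%d\n", insertCost(std::bitset<8>("11110000"))); // Case#3: 5
  return 0;
}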
@@ -3188,10 +3188,10 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput) {
- if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
+ if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
// Store instruction with index and scale costs 2 Uops.
// Check the preceding GEP to identify non-const indices.
- if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) {
if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
return TTI::TCC_Basic * 2;
}
@@ -3270,7 +3270,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
int ValueSplitCost =
@@ -3691,10 +3691,10 @@ int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// Otherwise fall back to cmp+select.
- return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
- CostKind) +
- getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
+ CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
}
int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
@@ -3923,10 +3923,10 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
return std::max(1, Cost);
}
-int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -4066,28 +4066,28 @@ X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
}
-int X86TTIImpl::getGatherOverhead() const {
- // Some CPUs have more overhead for gather. The specified overhead is relative
- // to the Load operation. "2" is the number provided by Intel architects. This
- // parameter is used for cost estimation of Gather Op and comparison with
- // other alternatives.
- // TODO: Remove the explicit hasAVX512()?, That would mean we would only
- // enable gather with a -march.
- if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
- return 2;
-
- return 1024;
-}
-
-int X86TTIImpl::getScatterOverhead() const {
- if (ST->hasAVX512())
- return 2;
-
- return 1024;
-}
-
-// Return an average cost of Gather / Scatter instruction, maybe improved later.
-// FIXME: Add TargetCostKind support.
+int X86TTIImpl::getGatherOverhead() const {
+ // Some CPUs have more overhead for gather. The specified overhead is relative
+ // to the Load operation. "2" is the number provided by Intel architects. This
+ // parameter is used for cost estimation of Gather Op and comparison with
+ // other alternatives.
+  // TODO: Remove the explicit hasAVX512()? That would mean we would only
+  // enable gather with a -march.
+ if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather()))
+ return 2;
+
+ return 1024;
+}
+
+int X86TTIImpl::getScatterOverhead() const {
+ if (ST->hasAVX512())
+ return 2;
+
+ return 1024;
+}
+
+// Return an average cost of Gather / Scatter instruction, maybe improved later.
+// FIXME: Add TargetCostKind support.
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
Align Alignment, unsigned AddressSpace) {
@@ -4145,8 +4145,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
// The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction at a time.
const int GSOverhead = (Opcode == Instruction::Load)
- ? getGatherOverhead()
- : getScatterOverhead();
+ ? getGatherOverhead()
+ : getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
MaybeAlign(Alignment), AddressSpace,
TTI::TCK_RecipThroughput);
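Editor's note: combining getGatherOverhead()/getScatterOverhead() with the formula directly above, the vector gather cost is a fixed overhead (2 when AVX-512, or AVX2 with fast gather, is available; otherwise 1024, which effectively disables vectorized gathers) plus one scalar memory-op cost per vector lane. A minimal arithmetic sketch with an invented helper, not the real TTI entry point:

#include <cstdio>

int gatherCost(bool HasAVX512, bool HasAVX2FastGather, unsigned VF,
               int ScalarLoadCost) {
  int GSOverhead = (HasAVX512 || HasAVX2FastGather) ? 2 : 1024;
  return GSOverhead + VF * ScalarLoadCost;
}

int main() {
  // v8f32 gather with a scalar load cost of 1:
  std::printf("%d\n", gatherCost(true,  false, 8, 1)); // 10 on AVX-512
  std::printf("%d\n", gatherCost(false, false, 8, 1)); // 1032: effectively disabled
  return 0;
}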
@@ -4160,7 +4160,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
/// Alignment - Alignment for one element.
/// AddressSpace - pointer[s] address space.
///
-/// FIXME: Add TargetCostKind support.
+/// FIXME: Add TargetCostKind support.
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
bool VariableMask, Align Alignment,
unsigned AddressSpace) {
@@ -4174,9 +4174,9 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
- int ScalarCompareCost = getCmpSelInstrCost(
- Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ int ScalarCompareCost = getCmpSelInstrCost(
+ Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
@@ -4207,15 +4207,15 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
Align Alignment,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) {
- if (CostKind != TTI::TCK_RecipThroughput) {
- if ((Opcode == Instruction::Load &&
- isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
- (Opcode == Instruction::Store &&
- isLegalMaskedScatter(SrcVTy, Align(Alignment))))
- return 1;
- return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
- Alignment, CostKind, I);
- }
+ if (CostKind != TTI::TCK_RecipThroughput) {
+ if ((Opcode == Instruction::Load &&
+ isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ (Opcode == Instruction::Store &&
+ isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ return 1;
+ return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+ }
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
@@ -4375,7 +4375,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
// scalarize it.
if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
unsigned NumElts = DataVTy->getNumElements();
- if (NumElts == 1)
+ if (NumElts == 1)
return false;
}
Type *ScalarTy = DataTy->getScalarType();
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h
index 17570f1c04..3ebfef5d65 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h
@@ -22,8 +22,8 @@
namespace llvm {
-class InstCombiner;
-
+class InstCombiner;
+
class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
typedef BasicTTIImplBase<X86TTIImpl> BaseT;
typedef TargetTransformInfo TTI;
@@ -130,10 +130,10 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
+ TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
+ CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -153,18 +153,18 @@ public:
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
const SCEV *Ptr);
- Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
- IntrinsicInst &II) const;
- Optional<Value *>
- simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
- APInt DemandedMask, KnownBits &Known,
- bool &KnownBitsComputed) const;
- Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
- InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
- APInt &UndefElts2, APInt &UndefElts3,
- std::function<void(Instruction *, unsigned, APInt, APInt &)>
- SimplifyAndSetOp) const;
-
+ Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+ IntrinsicInst &II) const;
+ Optional<Value *>
+ simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
+ APInt DemandedMask, KnownBits &Known,
+ bool &KnownBitsComputed) const;
+ Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const;
+
unsigned getAtomicMemIntrinsicMaxElementSize() const;
int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
@@ -204,9 +204,9 @@ public:
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty, TTI::TargetCostKind CostKind,
- Instruction *Inst = nullptr);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind,
+ Instruction *Inst = nullptr);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
@@ -245,9 +245,9 @@ private:
int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
Align Alignment, unsigned AddressSpace);
- int getGatherOverhead() const;
- int getScatterOverhead() const;
-
+ int getGatherOverhead() const;
+ int getScatterOverhead() const;
+
/// @}
};
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp b/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp
index ef010bcd38..6164a84c73 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp
@@ -1,248 +1,248 @@
-//===-- X86TileConfig.cpp - Tile Register Configure----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Pass to config the shape of AMX physical registers
-/// AMX register need to be configured before use. In X86PreTileConfig pass
-/// the pldtilecfg instruction is inserted, however at that time we don't
-/// know the shape of each physical tile registers, because the register
-/// allocation is not done yet. This pass runs after egister allocation
-/// pass. It collects the shape information of each physical tile register
-/// and store the shape in the stack slot that is allocated for load config
-/// to tile config register.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86MachineFunctionInfo.h"
-#include "X86RegisterInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TileShapeInfo.h"
-#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "tile-config"
-
-namespace {
-
-class X86TileConfig : public MachineFunctionPass {
- // context
- MachineFunction *MF = nullptr;
- const X86Subtarget *ST = nullptr;
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
- MachineDominatorTree *DomTree = nullptr;
- MachineRegisterInfo *MRI = nullptr;
- VirtRegMap *VRM = nullptr;
- LiveIntervals *LIS = nullptr;
-
- MachineInstr *getTileConfigPoint();
- void tileConfig();
-
-public:
- X86TileConfig() : MachineFunctionPass(ID) {}
-
- /// Return the pass name.
- StringRef getPassName() const override { return "Tile Register Configure"; }
-
- /// X86TileConfig analysis usage.
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- /// Perform register allocation.
- bool runOnMachineFunction(MachineFunction &mf) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoPHIs);
- }
-
- static char ID;
-};
-
-} // end anonymous namespace
-
-char X86TileConfig::ID = 0;
-
-INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
-INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
- false, false)
-
-void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<LiveIntervals>();
- AU.addPreserved<SlotIndexes>();
- AU.addRequired<VirtRegMap>();
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-static unsigned getTilePhysRegIndex(Register PhysReg) {
- assert((PhysReg >= X86::TMM0 && X86::TMM0 <= X86::TMM7) &&
- "Tile register number is invalid");
- return (PhysReg - X86::TMM0);
-}
-
-static MachineInstr *
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
- const TargetInstrInfo *TII, const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) {
-
- unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
- unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
- if (BitSize == TRI->getRegSizeInBits(*RC))
- SubIdx = 0;
- MachineInstr *NewMI =
- addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
- Offset)
- .addReg(SrcReg, 0, SubIdx);
- return NewMI;
-}
-
-static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- int64_t Imm, unsigned BitSize,
- int FrameIdx, int Offset,
- const TargetInstrInfo *TII) {
- unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
- return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
- FrameIdx, Offset)
- .addImm(Imm);
-}
-
-MachineInstr *X86TileConfig::getTileConfigPoint() {
- for (MachineBasicBlock &MBB : *MF) {
-
- // Traverse the basic block.
- for (MachineInstr &MI : MBB)
- // Refer X86PreTileConfig.cpp.
- // We only support one tile config for now.
- if (MI.getOpcode() == X86::PLDTILECFG)
- return &MI;
- }
-
- return nullptr;
-}
-
-void X86TileConfig::tileConfig() {
- MachineInstr *MI = getTileConfigPoint();
- if (!MI)
- return;
- MachineBasicBlock *MBB = MI->getParent();
- int SS = MI->getOperand(1).getIndex();
- BitVector PhysRegs(TRI->getNumRegs());
-
- // Fill in the palette first.
- auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
- LIS->InsertMachineInstrInMaps(*NewMI);
- // Fill in the shape of each tile physical register.
- for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- Register VirtReg = Register::index2VirtReg(i);
- if (MRI->reg_nodbg_empty(VirtReg))
- continue;
- const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
- if (RC.getID() != X86::TILERegClassID)
- continue;
- Register PhysReg = VRM->getPhys(VirtReg);
- if (PhysRegs.test(PhysReg))
- continue;
- PhysRegs.set(PhysReg);
- ShapeT Shape = VRM->getShape(VirtReg);
- Register RowReg = Shape.getRow()->getReg();
- Register ColReg = Shape.getCol()->getReg();
-
- // Here is the data format for the tile config.
- // 0 palette
- // 1 start_row
- // 2-15 reserved, must be zero
- // 16-17 tile0.colsb Tile 0 bytes per row.
- // 18-19 tile1.colsb Tile 1 bytes per row.
- // 20-21 tile2.colsb Tile 2 bytes per row.
- // ... (sequence continues)
- // 30-31 tile7.colsb Tile 7 bytes per row.
- // 32-47 reserved, must be zero
- // 48 tile0.rows Tile 0 rows.
- // 49 tile1.rows Tile 1 rows.
- // 50 tile2.rows Tile 2 rows.
- // ... (sequence continues)
- // 55 tile7.rows Tile 7 rows.
- // 56-63 reserved, must be zero
- unsigned Index = getTilePhysRegIndex(PhysReg);
- int RowOffset = 48 + Index;
- int ColOffset = 16 + Index * 2;
-
- unsigned BitSize = 8;
- for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
- std::make_pair(ColReg, ColOffset)}) {
- int64_t Imm;
- int ImmCount = 0;
- // All def must be the same value, otherwise it is invalid MIs.
- // Immediate is prefered.
- for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
- const auto *Inst = MO.getParent();
- if (Inst->isMoveImmediate()) {
- ImmCount++;
- Imm = Inst->getOperand(1).getImm();
- break;
- }
- }
- auto StoreConfig = [&](int Offset) {
- MachineInstr *NewMI = nullptr;
- if (ImmCount)
- NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
- else {
- const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
- NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
- Offset, TII, RC, TRI);
- }
- SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
- if (!ImmCount) {
- // Extend the live interval.
- SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
- LiveInterval &Int = LIS->getInterval(Pair.first);
- LIS->extendToIndices(Int, EndPoints);
- }
- };
- StoreConfig(Pair.second);
- BitSize += 8;
- }
- }
-}
-
-bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
- MF = &mf;
- MRI = &mf.getRegInfo();
- ST = &mf.getSubtarget<X86Subtarget>();
- TRI = ST->getRegisterInfo();
- TII = mf.getSubtarget().getInstrInfo();
- DomTree = &getAnalysis<MachineDominatorTree>();
- VRM = &getAnalysis<VirtRegMap>();
- LIS = &getAnalysis<LiveIntervals>();
-
- if (VRM->isShapeMapEmpty())
- return false;
-
- tileConfig();
- return true;
-}
-
-FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }
+//===-- X86TileConfig.cpp - Tile Register Configure----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Pass to configure the shape of AMX physical registers
+/// AMX registers need to be configured before use. In the X86PreTileConfig
+/// pass the pldtilecfg instruction is inserted, but at that time we don't
+/// know the shape of each physical tile register, because register
+/// allocation is not done yet. This pass runs after the register allocation
+/// pass. It collects the shape information of each physical tile register
+/// and stores the shape in the stack slot that is allocated for loading the
+/// config into the tile config register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TileShapeInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tile-config"
+
+namespace {
+
+class X86TileConfig : public MachineFunctionPass {
+ // context
+ MachineFunction *MF = nullptr;
+ const X86Subtarget *ST = nullptr;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ MachineInstr *getTileConfigPoint();
+ void tileConfig();
+
+public:
+ X86TileConfig() : MachineFunctionPass(ID) {}
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Tile Register Configure"; }
+
+ /// X86TileConfig analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86TileConfig::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
+ false, false)
+
+void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<VirtRegMap>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static unsigned getTilePhysRegIndex(Register PhysReg) {
+  assert((PhysReg >= X86::TMM0 && PhysReg <= X86::TMM7) &&
+ "Tile register number is invalid");
+ return (PhysReg - X86::TMM0);
+}
+
+static MachineInstr *
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register SrcReg, unsigned BitSize, int FrameIdx, int Offset,
+ const TargetInstrInfo *TII, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+
+ unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit;
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr;
+ if (BitSize == TRI->getRegSizeInBits(*RC))
+ SubIdx = 0;
+ MachineInstr *NewMI =
+ addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx,
+ Offset)
+ .addReg(SrcReg, 0, SubIdx);
+ return NewMI;
+}
+
+static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ int64_t Imm, unsigned BitSize,
+ int FrameIdx, int Offset,
+ const TargetInstrInfo *TII) {
+ unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi;
+ return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)),
+ FrameIdx, Offset)
+ .addImm(Imm);
+}
+
+MachineInstr *X86TileConfig::getTileConfigPoint() {
+ for (MachineBasicBlock &MBB : *MF) {
+
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
+      // Refer to X86PreTileConfig.cpp.
+ // We only support one tile config for now.
+ if (MI.getOpcode() == X86::PLDTILECFG)
+ return &MI;
+ }
+
+ return nullptr;
+}
+
+void X86TileConfig::tileConfig() {
+ MachineInstr *MI = getTileConfigPoint();
+ if (!MI)
+ return;
+ MachineBasicBlock *MBB = MI->getParent();
+ int SS = MI->getOperand(1).getIndex();
+ BitVector PhysRegs(TRI->getNumRegs());
+
+ // Fill in the palette first.
+ auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ // Fill in the shape of each tile physical register.
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register VirtReg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ if (RC.getID() != X86::TILERegClassID)
+ continue;
+ Register PhysReg = VRM->getPhys(VirtReg);
+ if (PhysRegs.test(PhysReg))
+ continue;
+ PhysRegs.set(PhysReg);
+ ShapeT Shape = VRM->getShape(VirtReg);
+ Register RowReg = Shape.getRow()->getReg();
+ Register ColReg = Shape.getCol()->getReg();
+
+ // Here is the data format for the tile config.
+ // 0 palette
+ // 1 start_row
+ // 2-15 reserved, must be zero
+ // 16-17 tile0.colsb Tile 0 bytes per row.
+ // 18-19 tile1.colsb Tile 1 bytes per row.
+ // 20-21 tile2.colsb Tile 2 bytes per row.
+ // ... (sequence continues)
+ // 30-31 tile7.colsb Tile 7 bytes per row.
+ // 32-47 reserved, must be zero
+ // 48 tile0.rows Tile 0 rows.
+ // 49 tile1.rows Tile 1 rows.
+ // 50 tile2.rows Tile 2 rows.
+ // ... (sequence continues)
+ // 55 tile7.rows Tile 7 rows.
+ // 56-63 reserved, must be zero
+ unsigned Index = getTilePhysRegIndex(PhysReg);
+ int RowOffset = 48 + Index;
+ int ColOffset = 16 + Index * 2;
+
+ unsigned BitSize = 8;
+ for (const auto &Pair : {std::make_pair(RowReg, RowOffset),
+ std::make_pair(ColReg, ColOffset)}) {
+ int64_t Imm;
+ int ImmCount = 0;
+      // All defs must be the same value; otherwise the MIs are invalid.
+      // An immediate is preferred.
+ for (const MachineOperand &MO : MRI->def_operands(Pair.first)) {
+ const auto *Inst = MO.getParent();
+ if (Inst->isMoveImmediate()) {
+ ImmCount++;
+ Imm = Inst->getOperand(1).getImm();
+ break;
+ }
+ }
+ auto StoreConfig = [&](int Offset) {
+ MachineInstr *NewMI = nullptr;
+ if (ImmCount)
+ NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII);
+ else {
+ const TargetRegisterClass *RC = MRI->getRegClass(Pair.first);
+ NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS,
+ Offset, TII, RC, TRI);
+ }
+ SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI);
+ if (!ImmCount) {
+ // Extend the live interval.
+ SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()};
+ LiveInterval &Int = LIS->getInterval(Pair.first);
+ LIS->extendToIndices(Int, EndPoints);
+ }
+ };
+ StoreConfig(Pair.second);
+ BitSize += 8;
+ }
+ }
+}
+
+bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ ST = &mf.getSubtarget<X86Subtarget>();
+ TRI = ST->getRegisterInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ VRM = &getAnalysis<VirtRegMap>();
+ LIS = &getAnalysis<LiveIntervals>();
+
+ if (VRM->isShapeMapEmpty())
+ return false;
+
+ tileConfig();
+ return true;
+}
+
+FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); }
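Editor's note: for reference, the 64-byte configuration area that tileConfig() fills (documented in the comment block inside that function) can be pictured as a plain struct, with the row/column offsets matching the RowOffset/ColOffset computation in the pass. This is a hypothetical mirror for illustration only, not a type used by the pass.

#include <cassert>
#include <cstdint>

struct TileConfigArea {
  uint8_t  Palette;        // byte 0
  uint8_t  StartRow;       // byte 1
  uint8_t  Reserved0[14];  // bytes 2-15, must be zero
  uint16_t Colsb[8];       // bytes 16-31, bytes-per-row of tile0..tile7
  uint8_t  Reserved1[16];  // bytes 32-47, must be zero
  uint8_t  Rows[8];        // bytes 48-55, rows of tile0..tile7
  uint8_t  Reserved2[8];   // bytes 56-63, must be zero
};
static_assert(sizeof(TileConfigArea) == 64, "tile config is one 64-byte block");

// Offsets used when the pass stores shapes into the config stack slot.
constexpr int rowOffset(unsigned TileIdx) { return 48 + TileIdx; }
constexpr int colOffset(unsigned TileIdx) { return 16 + TileIdx * 2; }

int main() {
  assert(rowOffset(2) == 50 && colOffset(2) == 20); // matches tile2 rows/colsb
  return 0;
}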
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp b/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp
index 8d8bd5e6b3..0122a5e83f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp
@@ -109,7 +109,7 @@ private:
/// The linked list node subobject inside of RegNode.
Value *Link = nullptr;
};
-} // namespace
+} // namespace
FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
diff --git a/contrib/libs/llvm12/lib/Target/X86/ya.make b/contrib/libs/llvm12/lib/Target/X86/ya.make
index 1df03a55e7..efb9cc5da8 100644
--- a/contrib/libs/llvm12/lib/Target/X86/ya.make
+++ b/contrib/libs/llvm12/lib/Target/X86/ya.make
@@ -15,26 +15,26 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
PEERDIR(
- contrib/libs/llvm12
- contrib/libs/llvm12/include
- contrib/libs/llvm12/lib/Analysis
- contrib/libs/llvm12/lib/CodeGen
- contrib/libs/llvm12/lib/CodeGen/AsmPrinter
- contrib/libs/llvm12/lib/CodeGen/GlobalISel
- contrib/libs/llvm12/lib/CodeGen/SelectionDAG
- contrib/libs/llvm12/lib/IR
- contrib/libs/llvm12/lib/MC
- contrib/libs/llvm12/lib/ProfileData
- contrib/libs/llvm12/lib/Support
- contrib/libs/llvm12/lib/Target
- contrib/libs/llvm12/lib/Target/X86/MCTargetDesc
- contrib/libs/llvm12/lib/Target/X86/TargetInfo
- contrib/libs/llvm12/lib/Transforms/CFGuard
+ contrib/libs/llvm12
+ contrib/libs/llvm12/include
+ contrib/libs/llvm12/lib/Analysis
+ contrib/libs/llvm12/lib/CodeGen
+ contrib/libs/llvm12/lib/CodeGen/AsmPrinter
+ contrib/libs/llvm12/lib/CodeGen/GlobalISel
+ contrib/libs/llvm12/lib/CodeGen/SelectionDAG
+ contrib/libs/llvm12/lib/IR
+ contrib/libs/llvm12/lib/MC
+ contrib/libs/llvm12/lib/ProfileData
+ contrib/libs/llvm12/lib/Support
+ contrib/libs/llvm12/lib/Target
+ contrib/libs/llvm12/lib/Target/X86/MCTargetDesc
+ contrib/libs/llvm12/lib/Target/X86/TargetInfo
+ contrib/libs/llvm12/lib/Transforms/CFGuard
)
ADDINCL(
- ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
- contrib/libs/llvm12/lib/Target/X86
+ ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86
+ contrib/libs/llvm12/lib/Target/X86
)
NO_CLANG_COVERAGE()
@@ -68,7 +68,7 @@ SRCS(
X86IndirectThunks.cpp
X86InsertPrefetch.cpp
X86InsertWait.cpp
- X86InstCombineIntrinsic.cpp
+ X86InstCombineIntrinsic.cpp
X86InstrFMA3Info.cpp
X86InstrFoldTables.cpp
X86InstrInfo.cpp
@@ -77,14 +77,14 @@ SRCS(
X86LegalizerInfo.cpp
X86LoadValueInjectionLoadHardening.cpp
X86LoadValueInjectionRetHardening.cpp
- X86LowerAMXType.cpp
+ X86LowerAMXType.cpp
X86MCInstLower.cpp
X86MachineFunctionInfo.cpp
X86MacroFusion.cpp
X86OptimizeLEAs.cpp
X86PadShortFunction.cpp
X86PartialReduction.cpp
- X86PreTileConfig.cpp
+ X86PreTileConfig.cpp
X86RegisterBankInfo.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
@@ -95,7 +95,7 @@ SRCS(
X86TargetMachine.cpp
X86TargetObjectFile.cpp
X86TargetTransformInfo.cpp
- X86TileConfig.cpp
+ X86TileConfig.cpp
X86VZeroUpper.cpp
X86WinAllocaExpander.cpp
X86WinEHState.cpp