| author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
| --- | --- | --- |
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
| commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
| tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/X86 | |
| parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
| download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz | |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/X86')
94 files changed, 10714 insertions, 10714 deletions
diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp index 9d9a20183f..f063bdbf6a 100644 --- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -32,7 +32,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -57,53 +57,53 @@ static bool checkScale(unsigned Scale, StringRef &ErrMsg) { namespace { static const char OpPrecedence[] = { - 0, // IC_OR - 1, // IC_XOR - 2, // IC_AND - 4, // IC_LSHIFT - 4, // IC_RSHIFT - 5, // IC_PLUS - 5, // IC_MINUS - 6, // IC_MULTIPLY - 6, // IC_DIVIDE - 6, // IC_MOD - 7, // IC_NOT - 8, // IC_NEG - 9, // IC_RPAREN - 10, // IC_LPAREN - 0, // IC_IMM - 0, // IC_REGISTER - 3, // IC_EQ - 3, // IC_NE - 3, // IC_LT - 3, // IC_LE - 3, // IC_GT - 3 // IC_GE + 0, // IC_OR + 1, // IC_XOR + 2, // IC_AND + 4, // IC_LSHIFT + 4, // IC_RSHIFT + 5, // IC_PLUS + 5, // IC_MINUS + 6, // IC_MULTIPLY + 6, // IC_DIVIDE + 6, // IC_MOD + 7, // IC_NOT + 8, // IC_NEG + 9, // IC_RPAREN + 10, // IC_LPAREN + 0, // IC_IMM + 0, // IC_REGISTER + 3, // IC_EQ + 3, // IC_NE + 3, // IC_LT + 3, // IC_LE + 3, // IC_GT + 3 // IC_GE }; class X86AsmParser : public MCTargetAsmParser { ParseInstructionInfo *InstInfo; bool Code16GCC; - unsigned ForcedDataPrefix = 0; + unsigned ForcedDataPrefix = 0; enum VEXEncoding { VEXEncoding_Default, VEXEncoding_VEX, - VEXEncoding_VEX2, + VEXEncoding_VEX2, VEXEncoding_VEX3, VEXEncoding_EVEX, }; VEXEncoding ForcedVEXEncoding = VEXEncoding_Default; - enum DispEncoding { - DispEncoding_Default, - DispEncoding_Disp8, - DispEncoding_Disp32, - }; - - DispEncoding ForcedDispEncoding = DispEncoding_Default; - + enum DispEncoding { + DispEncoding_Default, + DispEncoding_Disp8, + DispEncoding_Disp32, + }; + + DispEncoding ForcedDispEncoding = DispEncoding_Default; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -149,13 +149,13 @@ private: IC_RPAREN, IC_LPAREN, IC_IMM, - IC_REGISTER, - IC_EQ, - IC_NE, - IC_LT, - IC_LE, - IC_GT, - IC_GE + IC_REGISTER, + IC_EQ, + IC_NE, + IC_LT, + IC_LE, + IC_GT, + IC_GE }; enum IntelOperatorKind { @@ -165,19 +165,19 @@ private: IOK_TYPE, }; - enum MasmOperatorKind { - MOK_INVALID = 0, - MOK_LENGTHOF, - MOK_SIZEOF, - MOK_TYPE, - }; - + enum MasmOperatorKind { + MOK_INVALID = 0, + MOK_LENGTHOF, + MOK_SIZEOF, + MOK_TYPE, + }; + class InfixCalculator { typedef std::pair< InfixCalculatorTok, int64_t > ICToken; SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; SmallVector<ICToken, 4> PostfixStack; - bool isUnaryOperator(InfixCalculatorTok Op) const { + bool isUnaryOperator(InfixCalculatorTok Op) const { return Op == IC_NEG || Op == IC_NOT; } @@ -344,44 +344,44 @@ private: Val = Op1.second >> Op2.second; OperandStack.push_back(std::make_pair(IC_IMM, Val)); break; - case IC_EQ: - assert(Op1.first == IC_IMM && Op2.first == IC_IMM && - "Equals operation with an immediate and a register!"); - Val = (Op1.second == Op2.second) ? -1 : 0; - OperandStack.push_back(std::make_pair(IC_IMM, Val)); - break; - case IC_NE: - assert(Op1.first == IC_IMM && Op2.first == IC_IMM && - "Not-equals operation with an immediate and a register!"); - Val = (Op1.second != Op2.second) ? 
-1 : 0; - OperandStack.push_back(std::make_pair(IC_IMM, Val)); - break; - case IC_LT: - assert(Op1.first == IC_IMM && Op2.first == IC_IMM && - "Less-than operation with an immediate and a register!"); - Val = (Op1.second < Op2.second) ? -1 : 0; - OperandStack.push_back(std::make_pair(IC_IMM, Val)); - break; - case IC_LE: - assert(Op1.first == IC_IMM && Op2.first == IC_IMM && - "Less-than-or-equal operation with an immediate and a " - "register!"); - Val = (Op1.second <= Op2.second) ? -1 : 0; - OperandStack.push_back(std::make_pair(IC_IMM, Val)); - break; - case IC_GT: - assert(Op1.first == IC_IMM && Op2.first == IC_IMM && - "Greater-than operation with an immediate and a register!"); - Val = (Op1.second > Op2.second) ? -1 : 0; - OperandStack.push_back(std::make_pair(IC_IMM, Val)); - break; - case IC_GE: - assert(Op1.first == IC_IMM && Op2.first == IC_IMM && - "Greater-than-or-equal operation with an immediate and a " - "register!"); - Val = (Op1.second >= Op2.second) ? -1 : 0; - OperandStack.push_back(std::make_pair(IC_IMM, Val)); - break; + case IC_EQ: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Equals operation with an immediate and a register!"); + Val = (Op1.second == Op2.second) ? -1 : 0; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_NE: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Not-equals operation with an immediate and a register!"); + Val = (Op1.second != Op2.second) ? -1 : 0; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_LT: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Less-than operation with an immediate and a register!"); + Val = (Op1.second < Op2.second) ? -1 : 0; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_LE: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Less-than-or-equal operation with an immediate and a " + "register!"); + Val = (Op1.second <= Op2.second) ? -1 : 0; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_GT: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Greater-than operation with an immediate and a register!"); + Val = (Op1.second > Op2.second) ? -1 : 0; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_GE: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Greater-than-or-equal operation with an immediate and a " + "register!"); + Val = (Op1.second >= Op2.second) ? 
-1 : 0; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; } } } @@ -395,12 +395,12 @@ private: IES_OR, IES_XOR, IES_AND, - IES_EQ, - IES_NE, - IES_LT, - IES_LE, - IES_GT, - IES_GE, + IES_EQ, + IES_NE, + IES_LT, + IES_LE, + IES_GT, + IES_GE, IES_LSHIFT, IES_RSHIFT, IES_PLUS, @@ -433,7 +433,7 @@ private: bool MemExpr; bool OffsetOperator; SMLoc OffsetOperatorLoc; - AsmTypeInfo CurType; + AsmTypeInfo CurType; bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) { if (Sym) { @@ -452,25 +452,25 @@ private: MemExpr(false), OffsetOperator(false) {} void addImm(int64_t imm) { Imm += imm; } - short getBracCount() const { return BracCount; } - bool isMemExpr() const { return MemExpr; } - bool isOffsetOperator() const { return OffsetOperator; } - SMLoc getOffsetLoc() const { return OffsetOperatorLoc; } - unsigned getBaseReg() const { return BaseReg; } - unsigned getIndexReg() const { return IndexReg; } - unsigned getScale() const { return Scale; } - const MCExpr *getSym() const { return Sym; } - StringRef getSymName() const { return SymName; } - StringRef getType() const { return CurType.Name; } - unsigned getSize() const { return CurType.Size; } - unsigned getElementSize() const { return CurType.ElementSize; } - unsigned getLength() const { return CurType.Length; } + short getBracCount() const { return BracCount; } + bool isMemExpr() const { return MemExpr; } + bool isOffsetOperator() const { return OffsetOperator; } + SMLoc getOffsetLoc() const { return OffsetOperatorLoc; } + unsigned getBaseReg() const { return BaseReg; } + unsigned getIndexReg() const { return IndexReg; } + unsigned getScale() const { return Scale; } + const MCExpr *getSym() const { return Sym; } + StringRef getSymName() const { return SymName; } + StringRef getType() const { return CurType.Name; } + unsigned getSize() const { return CurType.Size; } + unsigned getElementSize() const { return CurType.ElementSize; } + unsigned getLength() const { return CurType.Length; } int64_t getImm() { return Imm + IC.execute(); } - bool isValidEndState() const { + bool isValidEndState() const { return State == IES_RBRAC || State == IES_INTEGER; } - bool hadError() const { return State == IES_ERROR; } - const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; } + bool hadError() const { return State == IES_ERROR; } + const InlineAsmIdentifierInfo &getIdentifierInfo() const { return Info; } void onOr() { IntelExprState CurrState = State; @@ -517,96 +517,96 @@ private: } PrevState = CurrState; } - void onEq() { - IntelExprState CurrState = State; - switch (State) { - default: - State = IES_ERROR; - break; - case IES_INTEGER: - case IES_RPAREN: - case IES_REGISTER: - State = IES_EQ; - IC.pushOperator(IC_EQ); - break; - } - PrevState = CurrState; - } - void onNE() { - IntelExprState CurrState = State; - switch (State) { - default: - State = IES_ERROR; - break; - case IES_INTEGER: - case IES_RPAREN: - case IES_REGISTER: - State = IES_NE; - IC.pushOperator(IC_NE); - break; - } - PrevState = CurrState; - } - void onLT() { - IntelExprState CurrState = State; - switch (State) { - default: - State = IES_ERROR; - break; - case IES_INTEGER: - case IES_RPAREN: - case IES_REGISTER: - State = IES_LT; - IC.pushOperator(IC_LT); - break; - } - PrevState = CurrState; - } - void onLE() { - IntelExprState CurrState = State; - switch (State) { - default: - State = IES_ERROR; - break; - case IES_INTEGER: - case IES_RPAREN: - case IES_REGISTER: - State = IES_LE; - IC.pushOperator(IC_LE); - break; - } - PrevState = CurrState; 
- } - void onGT() { - IntelExprState CurrState = State; - switch (State) { - default: - State = IES_ERROR; - break; - case IES_INTEGER: - case IES_RPAREN: - case IES_REGISTER: - State = IES_GT; - IC.pushOperator(IC_GT); - break; - } - PrevState = CurrState; - } - void onGE() { - IntelExprState CurrState = State; - switch (State) { - default: - State = IES_ERROR; - break; - case IES_INTEGER: - case IES_RPAREN: - case IES_REGISTER: - State = IES_GE; - IC.pushOperator(IC_GE); - break; - } - PrevState = CurrState; - } + void onEq() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_EQ; + IC.pushOperator(IC_EQ); + break; + } + PrevState = CurrState; + } + void onNE() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_NE; + IC.pushOperator(IC_NE); + break; + } + PrevState = CurrState; + } + void onLT() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_LT; + IC.pushOperator(IC_LT); + break; + } + PrevState = CurrState; + } + void onLE() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_LE; + IC.pushOperator(IC_LE); + break; + } + PrevState = CurrState; + } + void onGT() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_GT; + IC.pushOperator(IC_GT); + break; + } + PrevState = CurrState; + } + void onGE() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_GE; + IC.pushOperator(IC_GE); + break; + } + PrevState = CurrState; + } void onLShift() { IntelExprState CurrState = State; switch (State) { @@ -677,12 +677,12 @@ private: case IES_OR: case IES_XOR: case IES_AND: - case IES_EQ: - case IES_NE: - case IES_LT: - case IES_LE: - case IES_GT: - case IES_GE: + case IES_EQ: + case IES_NE: + case IES_LT: + case IES_LE: + case IES_GT: + case IES_GE: case IES_LSHIFT: case IES_RSHIFT: case IES_PLUS: @@ -738,12 +738,12 @@ private: case IES_OR: case IES_XOR: case IES_AND: - case IES_EQ: - case IES_NE: - case IES_LT: - case IES_LE: - case IES_GT: - case IES_GE: + case IES_EQ: + case IES_NE: + case IES_LT: + case IES_LE: + case IES_GT: + case IES_GE: case IES_LSHIFT: case IES_RSHIFT: case IES_PLUS: @@ -799,8 +799,8 @@ private: } bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName, const InlineAsmIdentifierInfo &IDInfo, - const AsmTypeInfo &Type, bool ParsingMSInlineAsm, - StringRef &ErrMsg) { + const AsmTypeInfo &Type, bool ParsingMSInlineAsm, + StringRef &ErrMsg) { // InlineAsm: Treat an enum value as an integer if (ParsingMSInlineAsm) if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) @@ -819,7 +819,7 @@ private: case IES_NOT: case IES_INIT: case IES_LBRAC: - case IES_LPAREN: + case IES_LPAREN: if (setSymRef(SymRef, SymRefName, ErrMsg)) return true; MemExpr = true; @@ -827,7 +827,7 @@ private: IC.pushOperand(IC_IMM); if (ParsingMSInlineAsm) Info = IDInfo; - setTypeInfo(Type); + setTypeInfo(Type); break; } return false; @@ -844,12 +844,12 @@ private: case IES_OR: case IES_XOR: 
case IES_AND: - case IES_EQ: - case IES_NE: - case IES_LT: - case IES_LE: - case IES_GT: - case IES_GE: + case IES_EQ: + case IES_NE: + case IES_LT: + case IES_LE: + case IES_GT: + case IES_GE: case IES_LSHIFT: case IES_RSHIFT: case IES_DIVIDE: @@ -932,8 +932,8 @@ private: case IES_RPAREN: State = IES_PLUS; IC.pushOperator(IC_PLUS); - CurType.Length = 1; - CurType.Size = CurType.ElementSize; + CurType.Length = 1; + CurType.Size = CurType.ElementSize; break; case IES_INIT: case IES_CAST: @@ -986,12 +986,12 @@ private: case IES_OR: case IES_XOR: case IES_AND: - case IES_EQ: - case IES_NE: - case IES_LT: - case IES_LE: - case IES_GT: - case IES_GE: + case IES_EQ: + case IES_NE: + case IES_LT: + case IES_LE: + case IES_GT: + case IES_GE: case IES_LSHIFT: case IES_RSHIFT: case IES_MULTIPLY: @@ -1023,8 +1023,8 @@ private: } } bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID, - const InlineAsmIdentifierInfo &IDInfo, - bool ParsingMSInlineAsm, StringRef &ErrMsg) { + const InlineAsmIdentifierInfo &IDInfo, + bool ParsingMSInlineAsm, StringRef &ErrMsg) { PrevState = State; switch (State) { default: @@ -1048,19 +1048,19 @@ private: } return false; } - void onCast(AsmTypeInfo Info) { + void onCast(AsmTypeInfo Info) { PrevState = State; switch (State) { default: State = IES_ERROR; break; case IES_LPAREN: - setTypeInfo(Info); + setTypeInfo(Info); State = IES_CAST; break; } } - void setTypeInfo(AsmTypeInfo Type) { CurType = Type; } + void setTypeInfo(AsmTypeInfo Type) { CurType = Type; } }; bool Error(SMLoc L, const Twine &Msg, SMRange Range = None, @@ -1089,21 +1089,21 @@ private: std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst); bool VerifyAndAdjustOperands(OperandVector &OrigOperands, OperandVector &FinalOperands); - bool ParseOperand(OperandVector &Operands); - bool ParseATTOperand(OperandVector &Operands); - bool ParseIntelOperand(OperandVector &Operands); + bool ParseOperand(OperandVector &Operands); + bool ParseATTOperand(OperandVector &Operands); + bool ParseIntelOperand(OperandVector &Operands); bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, InlineAsmIdentifierInfo &Info, SMLoc &End); bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); unsigned IdentifyIntelInlineAsmOperator(StringRef Name); unsigned ParseIntelInlineAsmOperator(unsigned OpKind); - unsigned IdentifyMasmOperator(StringRef Name); - bool ParseMasmOperator(unsigned OpKind, int64_t &Val); - bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands); + unsigned IdentifyMasmOperator(StringRef Name); + bool ParseMasmOperator(unsigned OpKind, int64_t &Val); + bool ParseRoundingModeOp(SMLoc Start, OperandVector &Operands); bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM, bool &ParseError, SMLoc &End); - bool ParseMasmNamedOperator(StringRef Name, IntelExprStateMachine &SM, - bool &ParseError, SMLoc &End); + bool ParseMasmNamedOperator(StringRef Name, IntelExprStateMachine &SM, + bool &ParseError, SMLoc &End); void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start, SMLoc End); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); @@ -1112,21 +1112,21 @@ private: bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator = false); - bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc, - SMLoc EndLoc, OperandVector &Operands); + bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc, + SMLoc EndLoc, OperandVector &Operands); X86::CondCode ParseConditionCode(StringRef CCode); bool 
ParseIntelMemoryOperandSize(unsigned &Size); - bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info, - OperandVector &Operands); - - bool parseDirectiveArch(); - bool parseDirectiveNops(SMLoc L); + bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, + const InlineAsmIdentifierInfo &Info, + OperandVector &Operands); + + bool parseDirectiveArch(); + bool parseDirectiveNops(SMLoc L); bool parseDirectiveEven(SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); @@ -1187,7 +1187,7 @@ private: /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. /// return false if no parsing errors occurred, true otherwise. - bool HandleAVX512Operand(OperandVector &Operands); + bool HandleAVX512Operand(OperandVector &Operands); bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc); @@ -1716,17 +1716,17 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, return false; } -bool X86AsmParser::ParseOperand(OperandVector &Operands) { +bool X86AsmParser::ParseOperand(OperandVector &Operands) { if (isParsingIntelSyntax()) - return ParseIntelOperand(Operands); - - return ParseATTOperand(Operands); + return ParseIntelOperand(Operands); + + return ParseATTOperand(Operands); } -bool X86AsmParser::CreateMemForMSInlineAsm( +bool X86AsmParser::CreateMemForMSInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info, OperandVector &Operands) { + const InlineAsmIdentifierInfo &Info, OperandVector &Operands) { // If we found a decl other than a VarDecl, then assume it is a FuncDecl or // some other label reference. if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) { @@ -1738,10 +1738,10 @@ bool X86AsmParser::CreateMemForMSInlineAsm( } // Create an absolute memory reference in order to match against // instructions taking a PC relative operand. - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, - End, Size, Identifier, - Info.Label.Decl)); - return false; + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, + End, Size, Identifier, + Info.Label.Decl)); + return false; } // We either have a direct symbol reference, or an offset from a symbol. The // parser always puts the symbol on the LHS, so look there for size @@ -1758,19 +1758,19 @@ bool X86AsmParser::CreateMemForMSInlineAsm( // It is widely common for MS InlineAsm to use a global variable and one/two // registers in a mmory expression, and though unaccessible via rip/eip. if (IsGlobalLV && (BaseReg || IndexReg)) { - Operands.push_back( - X86Operand::CreateMem(getPointerWidth(), Disp, Start, End)); - return false; - } + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, Start, End)); + return false; + } // Otherwise, we set the base register to a non-zero value // if we don't know the actual value at this time. This is necessary to // get the matching correct in some cases. - BaseReg = BaseReg ? 
BaseReg : 1; - Operands.push_back(X86Operand::CreateMem( - getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End, - Size, - /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize)); - return false; + BaseReg = BaseReg ? BaseReg : 1; + Operands.push_back(X86Operand::CreateMem( + getPointerWidth(), SegReg, Disp, BaseReg, IndexReg, Scale, Start, End, + Size, + /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, FrontendSize)); + return false; } // Some binary bitwise operators have a named synonymous @@ -1779,10 +1779,10 @@ bool X86AsmParser::CreateMemForMSInlineAsm( bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM, bool &ParseError, SMLoc &End) { - // A named operator should be either lower or upper case, but not a mix... - // except in MASM, which uses full case-insensitivity. - if (Name.compare(Name.lower()) && Name.compare(Name.upper()) && - !getParser().isParsingMasm()) + // A named operator should be either lower or upper case, but not a mix... + // except in MASM, which uses full case-insensitivity. + if (Name.compare(Name.lower()) && Name.compare(Name.upper()) && + !getParser().isParsingMasm()) return false; if (Name.equals_lower("not")) { SM.onNot(); @@ -1818,27 +1818,27 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, End = consumeToken(); return true; } -bool X86AsmParser::ParseMasmNamedOperator(StringRef Name, - IntelExprStateMachine &SM, - bool &ParseError, SMLoc &End) { - if (Name.equals_lower("eq")) { - SM.onEq(); - } else if (Name.equals_lower("ne")) { - SM.onNE(); - } else if (Name.equals_lower("lt")) { - SM.onLT(); - } else if (Name.equals_lower("le")) { - SM.onLE(); - } else if (Name.equals_lower("gt")) { - SM.onGT(); - } else if (Name.equals_lower("ge")) { - SM.onGE(); - } else { - return false; - } - End = consumeToken(); - return true; -} +bool X86AsmParser::ParseMasmNamedOperator(StringRef Name, + IntelExprStateMachine &SM, + bool &ParseError, SMLoc &End) { + if (Name.equals_lower("eq")) { + SM.onEq(); + } else if (Name.equals_lower("ne")) { + SM.onNE(); + } else if (Name.equals_lower("lt")) { + SM.onLT(); + } else if (Name.equals_lower("le")) { + SM.onLE(); + } else if (Name.equals_lower("gt")) { + SM.onGT(); + } else if (Name.equals_lower("ge")) { + SM.onGE(); + } else { + return false; + } + End = consumeToken(); + return true; +} bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); @@ -1847,10 +1847,10 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { AsmToken::TokenKind PrevTK = AsmToken::Error; bool Done = false; while (!Done) { - // Get a fresh reference on each loop iteration in case the previous - // iteration moved the token storage during UnLex(). - const AsmToken &Tok = Parser.getTok(); - + // Get a fresh reference on each loop iteration in case the previous + // iteration moved the token storage during UnLex(). 
+ const AsmToken &Tok = Parser.getTok(); + bool UpdateLocLex = true; AsmToken::TokenKind TK = getLexer().getKind(); @@ -1859,9 +1859,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if ((Done = SM.isValidEndState())) break; return Error(Tok.getLoc(), "unknown token in expression"); - case AsmToken::Error: - return Error(getLexer().getErrLoc(), getLexer().getErr()); - break; + case AsmToken::Error: + return Error(getLexer().getErrLoc(), getLexer().getErr()); + break; case AsmToken::EndOfStatement: Done = true; break; @@ -1871,73 +1871,73 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (ParseIntelDotOperator(SM, End)) return true; break; - case AsmToken::Dot: - if (!Parser.isParsingMasm()) { - if ((Done = SM.isValidEndState())) - break; - return Error(Tok.getLoc(), "unknown token in expression"); - } - // MASM allows spaces around the dot operator (e.g., "var . x") - Lex(); - UpdateLocLex = false; - if (ParseIntelDotOperator(SM, End)) - return true; - break; - case AsmToken::Dollar: - if (!Parser.isParsingMasm()) { - if ((Done = SM.isValidEndState())) - break; - return Error(Tok.getLoc(), "unknown token in expression"); - } - LLVM_FALLTHROUGH; - case AsmToken::String: { - if (Parser.isParsingMasm()) { - // MASM parsers handle strings in expressions as constants. - SMLoc ValueLoc = Tok.getLoc(); - int64_t Res; - const MCExpr *Val; - if (Parser.parsePrimaryExpr(Val, End, nullptr)) - return true; - UpdateLocLex = false; - if (!Val->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr())) - return Error(ValueLoc, "expected absolute value"); - if (SM.onInteger(Res, ErrMsg)) - return Error(ValueLoc, ErrMsg); - break; - } - LLVM_FALLTHROUGH; - } + case AsmToken::Dot: + if (!Parser.isParsingMasm()) { + if ((Done = SM.isValidEndState())) + break; + return Error(Tok.getLoc(), "unknown token in expression"); + } + // MASM allows spaces around the dot operator (e.g., "var . x") + Lex(); + UpdateLocLex = false; + if (ParseIntelDotOperator(SM, End)) + return true; + break; + case AsmToken::Dollar: + if (!Parser.isParsingMasm()) { + if ((Done = SM.isValidEndState())) + break; + return Error(Tok.getLoc(), "unknown token in expression"); + } + LLVM_FALLTHROUGH; + case AsmToken::String: { + if (Parser.isParsingMasm()) { + // MASM parsers handle strings in expressions as constants. 
+ SMLoc ValueLoc = Tok.getLoc(); + int64_t Res; + const MCExpr *Val; + if (Parser.parsePrimaryExpr(Val, End, nullptr)) + return true; + UpdateLocLex = false; + if (!Val->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr())) + return Error(ValueLoc, "expected absolute value"); + if (SM.onInteger(Res, ErrMsg)) + return Error(ValueLoc, ErrMsg); + break; + } + LLVM_FALLTHROUGH; + } case AsmToken::At: case AsmToken::Identifier: { SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); UpdateLocLex = false; - if (Parser.isParsingMasm()) { - size_t DotOffset = Identifier.find_first_of('.'); - if (DotOffset != StringRef::npos) { - consumeToken(); - StringRef LHS = Identifier.slice(0, DotOffset); - StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1); - StringRef RHS = Identifier.slice(DotOffset + 1, StringRef::npos); - if (!RHS.empty()) { - getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS)); - } - getLexer().UnLex(AsmToken(AsmToken::Dot, Dot)); - if (!LHS.empty()) { - getLexer().UnLex(AsmToken(AsmToken::Identifier, LHS)); - } - break; - } - } + if (Parser.isParsingMasm()) { + size_t DotOffset = Identifier.find_first_of('.'); + if (DotOffset != StringRef::npos) { + consumeToken(); + StringRef LHS = Identifier.slice(0, DotOffset); + StringRef Dot = Identifier.slice(DotOffset, DotOffset + 1); + StringRef RHS = Identifier.slice(DotOffset + 1, StringRef::npos); + if (!RHS.empty()) { + getLexer().UnLex(AsmToken(AsmToken::Identifier, RHS)); + } + getLexer().UnLex(AsmToken(AsmToken::Dot, Dot)); + if (!LHS.empty()) { + getLexer().UnLex(AsmToken(AsmToken::Identifier, LHS)); + } + break; + } + } // (MASM only) <TYPE> PTR operator if (Parser.isParsingMasm()) { const AsmToken &NextTok = getLexer().peekTok(); if (NextTok.is(AsmToken::Identifier) && NextTok.getIdentifier().equals_lower("ptr")) { - AsmTypeInfo Info; - if (Parser.lookUpType(Identifier, Info)) - return Error(Tok.getLoc(), "unknown type"); - SM.onCast(Info); + AsmTypeInfo Info; + if (Parser.lookUpType(Identifier, Info)) + return Error(Tok.getLoc(), "unknown type"); + SM.onCast(Info); // Eat type and PTR. 
consumeToken(); End = consumeToken(); @@ -1962,15 +1962,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (SM.onRegister(Reg, ErrMsg)) return Error(IdentLoc, ErrMsg); - AsmFieldInfo Info; + AsmFieldInfo Info; SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data()); - if (Parser.lookUpField(Field, Info)) + if (Parser.lookUpField(Field, Info)) return Error(FieldStartLoc, "unknown offset"); else if (SM.onPlus(ErrMsg)) return Error(getTok().getLoc(), ErrMsg); - else if (SM.onInteger(Info.Offset, ErrMsg)) + else if (SM.onInteger(Info.Offset, ErrMsg)) return Error(IdentLoc, ErrMsg); - SM.setTypeInfo(Info.Type); + SM.setTypeInfo(Info.Type); End = consumeToken(); break; @@ -1984,15 +1984,15 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return true; break; } - if (Parser.isParsingMasm() && - ParseMasmNamedOperator(Identifier, SM, ParseError, End)) { - if (ParseError) - return true; - break; - } + if (Parser.isParsingMasm() && + ParseMasmNamedOperator(Identifier, SM, ParseError, End)) { + if (ParseError) + return true; + break; + } // Symbol reference, when parsing assembly content InlineAsmIdentifierInfo Info; - AsmFieldInfo FieldInfo; + AsmFieldInfo FieldInfo; const MCExpr *Val; if (isParsingMSInlineAsm() || Parser.isParsingMasm()) { // MS Dot Operator expression @@ -2009,9 +2009,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) { if (SM.onInteger(Val, ErrMsg)) return Error(IdentLoc, ErrMsg); - } else { + } else { return true; - } + } break; } // MS InlineAsm identifier @@ -2020,49 +2020,49 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(IdentLoc, "expected identifier"); if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End)) return true; - else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type, - true, ErrMsg)) + else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type, + true, ErrMsg)) return Error(IdentLoc, ErrMsg); break; } - if (Parser.isParsingMasm()) { - if (unsigned OpKind = IdentifyMasmOperator(Identifier)) { - int64_t Val; - if (ParseMasmOperator(OpKind, Val)) - return true; - if (SM.onInteger(Val, ErrMsg)) - return Error(IdentLoc, ErrMsg); - break; - } - if (!getParser().lookUpType(Identifier, FieldInfo.Type)) { - // Field offset immediate; <TYPE>.<field specification> - Lex(); // eat type - bool EndDot = parseOptionalToken(AsmToken::Dot); - while (EndDot || (getTok().is(AsmToken::Identifier) && - getTok().getString().startswith("."))) { - getParser().parseIdentifier(Identifier); - if (!EndDot) - Identifier.consume_front("."); - EndDot = Identifier.consume_back("."); - if (getParser().lookUpField(FieldInfo.Type.Name, Identifier, - FieldInfo)) { - SMLoc IDEnd = - SMLoc::getFromPointer(Identifier.data() + Identifier.size()); - return Error(IdentLoc, "Unable to lookup field reference!", - SMRange(IdentLoc, IDEnd)); - } - if (!EndDot) - EndDot = parseOptionalToken(AsmToken::Dot); - } - if (SM.onInteger(FieldInfo.Offset, ErrMsg)) - return Error(IdentLoc, ErrMsg); - break; - } - } - if (getParser().parsePrimaryExpr(Val, End, &FieldInfo.Type)) { + if (Parser.isParsingMasm()) { + if (unsigned OpKind = IdentifyMasmOperator(Identifier)) { + int64_t Val; + if (ParseMasmOperator(OpKind, Val)) + return true; + if (SM.onInteger(Val, ErrMsg)) + return Error(IdentLoc, ErrMsg); + break; + } + if (!getParser().lookUpType(Identifier, FieldInfo.Type)) { + 
// Field offset immediate; <TYPE>.<field specification> + Lex(); // eat type + bool EndDot = parseOptionalToken(AsmToken::Dot); + while (EndDot || (getTok().is(AsmToken::Identifier) && + getTok().getString().startswith("."))) { + getParser().parseIdentifier(Identifier); + if (!EndDot) + Identifier.consume_front("."); + EndDot = Identifier.consume_back("."); + if (getParser().lookUpField(FieldInfo.Type.Name, Identifier, + FieldInfo)) { + SMLoc IDEnd = + SMLoc::getFromPointer(Identifier.data() + Identifier.size()); + return Error(IdentLoc, "Unable to lookup field reference!", + SMRange(IdentLoc, IDEnd)); + } + if (!EndDot) + EndDot = parseOptionalToken(AsmToken::Dot); + } + if (SM.onInteger(FieldInfo.Offset, ErrMsg)) + return Error(IdentLoc, ErrMsg); + break; + } + } + if (getParser().parsePrimaryExpr(Val, End, &FieldInfo.Type)) { return Error(Tok.getLoc(), "Unexpected identifier!"); - } else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type, - false, ErrMsg)) { + } else if (SM.onIdentifierExpr(Val, Identifier, Info, FieldInfo.Type, + false, ErrMsg)) { return Error(IdentLoc, ErrMsg); } break; @@ -2085,9 +2085,9 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); InlineAsmIdentifierInfo Info; - AsmTypeInfo Type; - if (SM.onIdentifierExpr(Val, Identifier, Info, Type, - isParsingMSInlineAsm(), ErrMsg)) + AsmTypeInfo Type; + if (SM.onIdentifierExpr(Val, Identifier, Info, Type, + isParsingMSInlineAsm(), ErrMsg)) return Error(Loc, ErrMsg); End = consumeToken(); } else { @@ -2229,13 +2229,13 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier( } //ParseRoundingModeOp - Parse AVX-512 rounding mode operand -bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) { +bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); // Eat "{" and mark the current place. 
const SMLoc consumedToken = consumeToken(); if (Tok.isNot(AsmToken::Identifier)) - return Error(Tok.getLoc(), "Expected an identifier after {"); + return Error(Tok.getLoc(), "Expected an identifier after {"); if (Tok.getIdentifier().startswith("r")){ int rndMode = StringSwitch<int>(Tok.getIdentifier()) .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) @@ -2244,76 +2244,76 @@ bool X86AsmParser::ParseRoundingModeOp(SMLoc Start, OperandVector &Operands) { .Case("rz", X86::STATIC_ROUNDING::TO_ZERO) .Default(-1); if (-1 == rndMode) - return Error(Tok.getLoc(), "Invalid rounding mode."); + return Error(Tok.getLoc(), "Invalid rounding mode."); Parser.Lex(); // Eat "r*" of r*-sae if (!getLexer().is(AsmToken::Minus)) - return Error(Tok.getLoc(), "Expected - at this point"); + return Error(Tok.getLoc(), "Expected - at this point"); Parser.Lex(); // Eat "-" Parser.Lex(); // Eat the sae if (!getLexer().is(AsmToken::RCurly)) - return Error(Tok.getLoc(), "Expected } at this point"); + return Error(Tok.getLoc(), "Expected } at this point"); SMLoc End = Tok.getEndLoc(); Parser.Lex(); // Eat "}" const MCExpr *RndModeOp = MCConstantExpr::create(rndMode, Parser.getContext()); - Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End)); - return false; + Operands.push_back(X86Operand::CreateImm(RndModeOp, Start, End)); + return false; } if(Tok.getIdentifier().equals("sae")){ Parser.Lex(); // Eat the sae if (!getLexer().is(AsmToken::RCurly)) - return Error(Tok.getLoc(), "Expected } at this point"); + return Error(Tok.getLoc(), "Expected } at this point"); Parser.Lex(); // Eat "}" - Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken)); - return false; + Operands.push_back(X86Operand::CreateToken("{sae}", consumedToken)); + return false; } - return Error(Tok.getLoc(), "unknown token in expression"); + return Error(Tok.getLoc(), "unknown token in expression"); } /// Parse the '.' operator. bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) { const AsmToken &Tok = getTok(); - AsmFieldInfo Info; + AsmFieldInfo Info; // Drop the optional '.'. StringRef DotDispStr = Tok.getString(); if (DotDispStr.startswith(".")) DotDispStr = DotDispStr.drop_front(1); - StringRef TrailingDot; + StringRef TrailingDot; // .Imm gets lexed as a real. 
if (Tok.is(AsmToken::Real)) { APInt DotDisp; DotDispStr.getAsInteger(10, DotDisp); - Info.Offset = DotDisp.getZExtValue(); + Info.Offset = DotDisp.getZExtValue(); } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) && Tok.is(AsmToken::Identifier)) { - if (DotDispStr.endswith(".")) { - TrailingDot = DotDispStr.substr(DotDispStr.size() - 1); - DotDispStr = DotDispStr.drop_back(1); - } + if (DotDispStr.endswith(".")) { + TrailingDot = DotDispStr.substr(DotDispStr.size() - 1); + DotDispStr = DotDispStr.drop_back(1); + } const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); const StringRef Base = BaseMember.first, Member = BaseMember.second; - if (getParser().lookUpField(SM.getType(), DotDispStr, Info) && - getParser().lookUpField(SM.getSymName(), DotDispStr, Info) && - getParser().lookUpField(DotDispStr, Info) && + if (getParser().lookUpField(SM.getType(), DotDispStr, Info) && + getParser().lookUpField(SM.getSymName(), DotDispStr, Info) && + getParser().lookUpField(DotDispStr, Info) && (!SemaCallback || - SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset))) + SemaCallback->LookupInlineAsmField(Base, Member, Info.Offset))) return Error(Tok.getLoc(), "Unable to lookup field reference!"); - } else { + } else { return Error(Tok.getLoc(), "Unexpected token type!"); - } + } // Eat the DotExpression and update End End = SMLoc::getFromPointer(DotDispStr.data()); const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size(); while (Tok.getLoc().getPointer() < DotExprEndLoc) Lex(); - if (!TrailingDot.empty()) - getLexer().UnLex(AsmToken(AsmToken::Dot, TrailingDot)); - SM.addImm(Info.Offset); - SM.setTypeInfo(Info.Type); + if (!TrailingDot.empty()) + getLexer().UnLex(AsmToken(AsmToken::Dot, TrailingDot)); + SM.addImm(Info.Offset); + SM.setTypeInfo(Info.Type); return false; } @@ -2328,7 +2328,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, if (!isParsingMSInlineAsm()) { if ((getTok().isNot(AsmToken::Identifier) && getTok().isNot(AsmToken::String)) || - getParser().parsePrimaryExpr(Val, End, nullptr)) + getParser().parsePrimaryExpr(Val, End, nullptr)) return Error(Start, "unexpected token!"); } else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) { return Error(Start, "unable to lookup expression"); @@ -2364,7 +2364,7 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { SMLoc Start = Tok.getLoc(), End; StringRef Identifier = Tok.getString(); if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, - /*IsUnevaluatedOperand=*/true, End)) + /*IsUnevaluatedOperand=*/true, End)) return 0; if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { @@ -2383,73 +2383,73 @@ unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { return CVal; } -// Query a candidate string for being an Intel assembly operator -// Report back its kind, or IOK_INVALID if does not evaluated as a known one -unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { - return StringSwitch<unsigned>(Name.lower()) - .Case("type", MOK_TYPE) - .Cases("size", "sizeof", MOK_SIZEOF) - .Cases("length", "lengthof", MOK_LENGTHOF) - .Default(MOK_INVALID); -} - -/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator -/// returns the number of elements in an array. It returns the value 1 for -/// non-array variables. The SIZEOF operator returns the size of a type or -/// variable in bytes. A variable's size is the product of its LENGTH and TYPE. 
-/// The TYPE operator returns the size of a variable. If the variable is an -/// array, TYPE returns the size of a single element. -bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { - MCAsmParser &Parser = getParser(); - SMLoc OpLoc = Parser.getTok().getLoc(); - Parser.Lex(); // Eat operator. - - Val = 0; - if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) { - // Check for SIZEOF(<type>) and TYPE(<type>). - bool InParens = Parser.getTok().is(AsmToken::LParen); - const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok(); - AsmTypeInfo Type; - if (IDTok.is(AsmToken::Identifier) && - !Parser.lookUpType(IDTok.getIdentifier(), Type)) { - Val = Type.Size; - - // Eat tokens. - if (InParens) - parseToken(AsmToken::LParen); - parseToken(AsmToken::Identifier); - if (InParens) - parseToken(AsmToken::RParen); - } - } - - if (!Val) { - IntelExprStateMachine SM; - SMLoc End, Start = Parser.getTok().getLoc(); - if (ParseIntelExpression(SM, End)) - return true; - - switch (OpKind) { - default: - llvm_unreachable("Unexpected operand kind!"); - case MOK_SIZEOF: - Val = SM.getSize(); - break; - case MOK_LENGTHOF: - Val = SM.getLength(); - break; - case MOK_TYPE: - Val = SM.getElementSize(); - break; - } - - if (!Val) - return Error(OpLoc, "expression has unknown type", SMRange(Start, End)); - } - - return false; -} - +// Query a candidate string for being an Intel assembly operator +// Report back its kind, or IOK_INVALID if does not evaluated as a known one +unsigned X86AsmParser::IdentifyMasmOperator(StringRef Name) { + return StringSwitch<unsigned>(Name.lower()) + .Case("type", MOK_TYPE) + .Cases("size", "sizeof", MOK_SIZEOF) + .Cases("length", "lengthof", MOK_LENGTHOF) + .Default(MOK_INVALID); +} + +/// Parse the 'LENGTHOF', 'SIZEOF', and 'TYPE' operators. The LENGTHOF operator +/// returns the number of elements in an array. It returns the value 1 for +/// non-array variables. The SIZEOF operator returns the size of a type or +/// variable in bytes. A variable's size is the product of its LENGTH and TYPE. +/// The TYPE operator returns the size of a variable. If the variable is an +/// array, TYPE returns the size of a single element. +bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { + MCAsmParser &Parser = getParser(); + SMLoc OpLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat operator. + + Val = 0; + if (OpKind == MOK_SIZEOF || OpKind == MOK_TYPE) { + // Check for SIZEOF(<type>) and TYPE(<type>). + bool InParens = Parser.getTok().is(AsmToken::LParen); + const AsmToken &IDTok = InParens ? getLexer().peekTok() : Parser.getTok(); + AsmTypeInfo Type; + if (IDTok.is(AsmToken::Identifier) && + !Parser.lookUpType(IDTok.getIdentifier(), Type)) { + Val = Type.Size; + + // Eat tokens. 
+ if (InParens) + parseToken(AsmToken::LParen); + parseToken(AsmToken::Identifier); + if (InParens) + parseToken(AsmToken::RParen); + } + } + + if (!Val) { + IntelExprStateMachine SM; + SMLoc End, Start = Parser.getTok().getLoc(); + if (ParseIntelExpression(SM, End)) + return true; + + switch (OpKind) { + default: + llvm_unreachable("Unexpected operand kind!"); + case MOK_SIZEOF: + Val = SM.getSize(); + break; + case MOK_LENGTHOF: + Val = SM.getLength(); + break; + case MOK_TYPE: + Val = SM.getElementSize(); + break; + } + + if (!Val) + return Error(OpLoc, "expression has unknown type", SMRange(Start, End)); + } + + return false; +} + bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { Size = StringSwitch<unsigned>(getTok().getString()) .Cases("BYTE", "byte", 8) @@ -2476,7 +2476,7 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { return false; } -bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { +bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -2484,31 +2484,31 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { // Parse optional Size directive. unsigned Size; if (ParseIntelMemoryOperandSize(Size)) - return true; + return true; bool PtrInOperand = bool(Size); Start = Tok.getLoc(); // Rounding mode operand. if (getLexer().is(AsmToken::LCurly)) - return ParseRoundingModeOp(Start, Operands); + return ParseRoundingModeOp(Start, Operands); // Register operand. unsigned RegNo = 0; if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) { if (RegNo == X86::RIP) - return Error(Start, "rip can only be used as a base register"); + return Error(Start, "rip can only be used as a base register"); // A Register followed by ':' is considered a segment override - if (Tok.isNot(AsmToken::Colon)) { - if (PtrInOperand) - return Error(Start, "expected memory operand after 'ptr', " + if (Tok.isNot(AsmToken::Colon)) { + if (PtrInOperand) + return Error(Start, "expected memory operand after 'ptr', " "found register operand instead"); - Operands.push_back(X86Operand::CreateReg(RegNo, Start, End)); - return false; - } + Operands.push_back(X86Operand::CreateReg(RegNo, Start, End)); + return false; + } // An alleged segment override. check if we have a valid segment register if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) - return Error(Start, "invalid segment register"); + return Error(Start, "invalid segment register"); // Eat ':' and update Start location Start = Lex().getLoc(); } @@ -2516,7 +2516,7 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { // Immediates and Memory IntelExprStateMachine SM; if (ParseIntelExpression(SM, End)) - return true; + return true; if (isParsingMSInlineAsm()) RewriteIntelExpression(SM, Start, Tok.getLoc()); @@ -2533,27 +2533,27 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { // and we are parsing a segment override if (!SM.isMemExpr() && !RegNo) { if (isParsingMSInlineAsm() && SM.isOffsetOperator()) { - const InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + const InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { // Disp includes the address of a variable; make sure this is recorded // for later handling. 
- Operands.push_back(X86Operand::CreateImm(Disp, Start, End, - SM.getSymName(), Info.Var.Decl, - Info.Var.IsGlobalLV)); - return false; + Operands.push_back(X86Operand::CreateImm(Disp, Start, End, + SM.getSymName(), Info.Var.Decl, + Info.Var.IsGlobalLV)); + return false; } } - Operands.push_back(X86Operand::CreateImm(Disp, Start, End)); - return false; + Operands.push_back(X86Operand::CreateImm(Disp, Start, End)); + return false; } StringRef ErrMsg; unsigned BaseReg = SM.getBaseReg(); unsigned IndexReg = SM.getIndexReg(); unsigned Scale = SM.getScale(); - if (!PtrInOperand) - Size = SM.getElementSize() << 3; + if (!PtrInOperand) + Size = SM.getElementSize() << 3; if (Scale == 0 && BaseReg != X86::ESP && BaseReg != X86::RSP && (IndexReg == X86::ESP || IndexReg == X86::RSP)) @@ -2572,7 +2572,7 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { if (Scale != 0 && X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) - return Error(Start, "16-bit addresses cannot have a scale"); + return Error(Start, "16-bit addresses cannot have a scale"); // If there was no explicit scale specified, change it to 1. if (Scale == 0) @@ -2588,33 +2588,33 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { if ((BaseReg || IndexReg) && CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(), ErrMsg)) - return Error(Start, ErrMsg); + return Error(Start, ErrMsg); if (isParsingMSInlineAsm()) return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start, End, Size, SM.getSymName(), - SM.getIdentifierInfo(), Operands); + SM.getIdentifierInfo(), Operands); // When parsing x64 MS-style assembly, all memory operands default to // RIP-relative when interpreted as non-absolute references. - if (Parser.isParsingMasm() && is64BitMode()) { - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, - End, Size, - /*DefaultBaseReg=*/X86::RIP)); - return false; - } - - if ((BaseReg || IndexReg || RegNo)) - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, - BaseReg, IndexReg, Scale, Start, - End, Size)); - else - Operands.push_back( - X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size)); - return false; + if (Parser.isParsingMasm() && is64BitMode()) { + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, + BaseReg, IndexReg, Scale, Start, + End, Size, + /*DefaultBaseReg=*/X86::RIP)); + return false; + } + + if ((BaseReg || IndexReg || RegNo)) + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, + BaseReg, IndexReg, Scale, Start, + End, Size)); + else + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size)); + return false; } -bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { +bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); switch (getLexer().getKind()) { case AsmToken::Dollar: { @@ -2629,13 +2629,13 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { "expected immediate expression") || getParser().parseExpression(Val, End) || check(isa<X86MCExpr>(Val), L, "expected immediate expression")) - return true; - Operands.push_back(X86Operand::CreateImm(Val, Start, End)); - return false; + return true; + Operands.push_back(X86Operand::CreateImm(Val, Start, End)); + return false; } case AsmToken::LCurly: { SMLoc Start = Parser.getTok().getLoc(); - return ParseRoundingModeOp(Start, Operands); + return ParseRoundingModeOp(Start, Operands); 
} default: { // This a memory operand or a register. We have some parsing complications @@ -2649,7 +2649,7 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { if (getLexer().isNot(AsmToken::LParen)) { // No '(' so this is either a displacement expression or a register. if (Parser.parseExpression(Expr, EndLoc)) - return true; + return true; if (auto *RE = dyn_cast<X86MCExpr>(Expr)) { // Segment Register. Reset Expr and copy value to register. Expr = nullptr; @@ -2657,27 +2657,27 @@ bool X86AsmParser::ParseATTOperand(OperandVector &Operands) { // Sanity check register. if (Reg == X86::EIZ || Reg == X86::RIZ) - return Error( + return Error( Loc, "%eiz and %riz can only be used as index registers", SMRange(Loc, EndLoc)); if (Reg == X86::RIP) - return Error(Loc, "%rip can only be used as a base register", - SMRange(Loc, EndLoc)); + return Error(Loc, "%rip can only be used as a base register", + SMRange(Loc, EndLoc)); // Return register that are not segment prefixes immediately. - if (!Parser.parseOptionalToken(AsmToken::Colon)) { - Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc)); - return false; - } + if (!Parser.parseOptionalToken(AsmToken::Colon)) { + Operands.push_back(X86Operand::CreateReg(Reg, Loc, EndLoc)); + return false; + } if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(Reg)) - return Error(Loc, "invalid segment register"); - // Accept a '*' absolute memory reference after the segment. Place it - // before the full memory operand. - if (getLexer().is(AsmToken::Star)) - Operands.push_back(X86Operand::CreateToken("*", consumeToken())); + return Error(Loc, "invalid segment register"); + // Accept a '*' absolute memory reference after the segment. Place it + // before the full memory operand. + if (getLexer().is(AsmToken::Star)) + Operands.push_back(X86Operand::CreateToken("*", consumeToken())); } } // This is a Memory operand. - return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands); + return ParseMemOperand(Reg, Expr, Loc, EndLoc, Operands); } } } @@ -2727,7 +2727,7 @@ bool X86AsmParser::ParseZ(std::unique_ptr<X86Operand> &Z, } // true on failure, false otherwise -bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { MCAsmParser &Parser = getParser(); if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. @@ -2737,26 +2737,26 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { // Parse memory broadcasting ({1to<NUM>}). if (getLexer().getTok().getIntVal() != 1) return TokError("Expected 1to<NUM> at this point"); - StringRef Prefix = getLexer().getTok().getString(); - Parser.Lex(); // Eat first token of 1to8 - if (!getLexer().is(AsmToken::Identifier)) + StringRef Prefix = getLexer().getTok().getString(); + Parser.Lex(); // Eat first token of 1to8 + if (!getLexer().is(AsmToken::Identifier)) return TokError("Expected 1to<NUM> at this point"); // Recognize only reasonable suffixes. 
- SmallVector<char, 5> BroadcastVector; - StringRef BroadcastString = (Prefix + getLexer().getTok().getIdentifier()) - .toStringRef(BroadcastVector); - if (!BroadcastString.startswith("1to")) - return TokError("Expected 1to<NUM> at this point"); + SmallVector<char, 5> BroadcastVector; + StringRef BroadcastString = (Prefix + getLexer().getTok().getIdentifier()) + .toStringRef(BroadcastVector); + if (!BroadcastString.startswith("1to")) + return TokError("Expected 1to<NUM> at this point"); const char *BroadcastPrimitive = - StringSwitch<const char *>(BroadcastString) - .Case("1to2", "{1to2}") - .Case("1to4", "{1to4}") - .Case("1to8", "{1to8}") - .Case("1to16", "{1to16}") - .Default(nullptr); + StringSwitch<const char *>(BroadcastString) + .Case("1to2", "{1to2}") + .Case("1to4", "{1to4}") + .Case("1to8", "{1to8}") + .Case("1to16", "{1to16}") + .Default(nullptr); if (!BroadcastPrimitive) return TokError("Invalid memory broadcast primitive."); - Parser.Lex(); // Eat trailing token of 1toN + Parser.Lex(); // Eat trailing token of 1toN if (!getLexer().is(AsmToken::RCurly)) return TokError("Expected } at this point"); Parser.Lex(); // Eat "}" @@ -2816,9 +2816,9 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { /// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix /// has already been parsed if present. disp may be provided as well. -bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, - SMLoc StartLoc, SMLoc EndLoc, - OperandVector &Operands) { +bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, + SMLoc StartLoc, SMLoc EndLoc, + OperandVector &Operands) { MCAsmParser &Parser = getParser(); SMLoc Loc; // Based on the initial passed values, we may be in any of these cases, we are @@ -2880,7 +2880,7 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, // Parse immediate if we're not at a mem operand yet. if (!isAtMemOperand()) { if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(Disp, EndLoc)) - return true; + return true; assert(!isa<X86MCExpr>(Disp) && "Expected non-register here."); } else { // Disp is implicitly zero if we haven't parsed it yet. @@ -2893,12 +2893,12 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, if (!parseOptionalToken(AsmToken::LParen)) { if (SegReg == 0) - Operands.push_back( - X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc)); - else - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, - 0, 0, 1, StartLoc, EndLoc)); - return false; + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc)); + else + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + 0, 0, 1, StartLoc, EndLoc)); + return false; } // If we reached here, then eat the '(' and Process @@ -2912,13 +2912,13 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, if (getLexer().isNot(AsmToken::Comma) && getLexer().isNot(AsmToken::RParen)) { if (Parser.parseExpression(E, EndLoc) || check(!isa<X86MCExpr>(E), BaseLoc, "expected register here")) - return true; + return true; // Sanity check register. 
BaseReg = cast<X86MCExpr>(E)->getRegNo(); if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) - return Error(BaseLoc, "eiz and riz can only be used as index registers", - SMRange(BaseLoc, EndLoc)); + return Error(BaseLoc, "eiz and riz can only be used as index registers", + SMRange(BaseLoc, EndLoc)); } if (parseOptionalToken(AsmToken::Comma)) { @@ -2930,14 +2930,14 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, // "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. if (getLexer().isNot(AsmToken::RParen)) { if (Parser.parseTokenLoc(Loc) || Parser.parseExpression(E, EndLoc)) - return true; + return true; if (!isa<X86MCExpr>(E)) { // We've parsed an unexpected Scale Value instead of an index // register. Interpret it as an absolute. int64_t ScaleVal; if (!E->evaluateAsAbsolute(ScaleVal, getStreamer().getAssemblerPtr())) - return Error(Loc, "expected absolute expression"); + return Error(Loc, "expected absolute expression"); if (ScaleVal != 1) Warning(Loc, "scale factor without index register is ignored"); Scale = 1; @@ -2945,10 +2945,10 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, IndexReg = cast<X86MCExpr>(E)->getRegNo(); if (BaseReg == X86::RIP) - return Error(Loc, - "%rip as base register can not have an index register"); + return Error(Loc, + "%rip as base register can not have an index register"); if (IndexReg == X86::RIP) - return Error(Loc, "%rip is not allowed as an index register"); + return Error(Loc, "%rip is not allowed as an index register"); if (parseOptionalToken(AsmToken::Comma)) { // Parse the scale amount: @@ -2959,14 +2959,14 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, int64_t ScaleVal; if (Parser.parseTokenLoc(Loc) || Parser.parseAbsoluteExpression(ScaleVal)) - return Error(Loc, "expected scale expression"); + return Error(Loc, "expected scale expression"); Scale = (unsigned)ScaleVal; // Validate the scale amount. if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && Scale != 1) - return Error(Loc, "scale factor in 16-bit address must be 1"); + return Error(Loc, "scale factor in 16-bit address must be 1"); if (checkScale(Scale, ErrMsg)) - return Error(Loc, ErrMsg); + return Error(Loc, ErrMsg); } } } @@ -2975,30 +2975,30 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. if (parseToken(AsmToken::RParen, "unexpected token in memory operand")) - return true; + return true; // This is to support otherwise illegal operand (%dx) found in various // unofficial manuals examples (e.g. "out[s]?[bwl]? %al, (%dx)") and must now // be supported. Mark such DX variants separately fix only in special cases. 
if (BaseReg == X86::DX && IndexReg == 0 && Scale == 1 && SegReg == 0 && - isa<MCConstantExpr>(Disp) && - cast<MCConstantExpr>(Disp)->getValue() == 0) { - Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc)); - return false; - } + isa<MCConstantExpr>(Disp) && + cast<MCConstantExpr>(Disp)->getValue() == 0) { + Operands.push_back(X86Operand::CreateDXReg(BaseLoc, BaseLoc)); + return false; + } if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(), ErrMsg)) - return Error(BaseLoc, ErrMsg); + return Error(BaseLoc, ErrMsg); if (SegReg || BaseReg || IndexReg) - Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, - BaseReg, IndexReg, Scale, StartLoc, - EndLoc)); - else - Operands.push_back( - X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc)); - return false; + Operands.push_back(X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + BaseReg, IndexReg, Scale, StartLoc, + EndLoc)); + else + Operands.push_back( + X86Operand::CreateMem(getPointerWidth(), Disp, StartLoc, EndLoc)); + return false; } // Parse either a standard primary expression or a register. @@ -3015,7 +3015,7 @@ bool X86AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { Res = X86MCExpr::create(RegNo, Parser.getContext()); return false; } - return Parser.parsePrimaryExpr(Res, EndLoc, nullptr); + return Parser.parsePrimaryExpr(Res, EndLoc, nullptr); } bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -3025,7 +3025,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Reset the forced VEX encoding. ForcedVEXEncoding = VEXEncoding_Default; - ForcedDispEncoding = DispEncoding_Default; + ForcedDispEncoding = DispEncoding_Default; // Parse pseudo prefixes. while (1) { @@ -3038,18 +3038,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(Parser.getTok().getLoc(), "Expected '}'"); Parser.Lex(); // Eat curly. - if (Prefix == "vex") + if (Prefix == "vex") ForcedVEXEncoding = VEXEncoding_VEX; - else if (Prefix == "vex2") - ForcedVEXEncoding = VEXEncoding_VEX2; + else if (Prefix == "vex2") + ForcedVEXEncoding = VEXEncoding_VEX2; else if (Prefix == "vex3") ForcedVEXEncoding = VEXEncoding_VEX3; else if (Prefix == "evex") ForcedVEXEncoding = VEXEncoding_EVEX; - else if (Prefix == "disp8") - ForcedDispEncoding = DispEncoding_Disp8; - else if (Prefix == "disp32") - ForcedDispEncoding = DispEncoding_Disp32; + else if (Prefix == "disp8") + ForcedDispEncoding = DispEncoding_Disp8; + else if (Prefix == "disp32") + ForcedDispEncoding = DispEncoding_Disp32; else return Error(NameLoc, "unknown prefix"); @@ -3066,36 +3066,36 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } continue; } - // Parse MASM style pseudo prefixes. - if (isParsingMSInlineAsm()) { - if (Name.equals_lower("vex")) - ForcedVEXEncoding = VEXEncoding_VEX; - else if (Name.equals_lower("vex2")) - ForcedVEXEncoding = VEXEncoding_VEX2; - else if (Name.equals_lower("vex3")) - ForcedVEXEncoding = VEXEncoding_VEX3; - else if (Name.equals_lower("evex")) - ForcedVEXEncoding = VEXEncoding_EVEX; - - if (ForcedVEXEncoding != VEXEncoding_Default) { - if (getLexer().isNot(AsmToken::Identifier)) - return Error(Parser.getTok().getLoc(), "Expected identifier"); - // FIXME: The mnemonic won't match correctly if its not in lower case. - Name = Parser.getTok().getString(); - NameLoc = Parser.getTok().getLoc(); - Parser.Lex(); - } - } + // Parse MASM style pseudo prefixes. 
+ if (isParsingMSInlineAsm()) { + if (Name.equals_lower("vex")) + ForcedVEXEncoding = VEXEncoding_VEX; + else if (Name.equals_lower("vex2")) + ForcedVEXEncoding = VEXEncoding_VEX2; + else if (Name.equals_lower("vex3")) + ForcedVEXEncoding = VEXEncoding_VEX3; + else if (Name.equals_lower("evex")) + ForcedVEXEncoding = VEXEncoding_EVEX; + + if (ForcedVEXEncoding != VEXEncoding_Default) { + if (getLexer().isNot(AsmToken::Identifier)) + return Error(Parser.getTok().getLoc(), "Expected identifier"); + // FIXME: The mnemonic won't match correctly if its not in lower case. + Name = Parser.getTok().getString(); + NameLoc = Parser.getTok().getLoc(); + Parser.Lex(); + } + } break; } - // Support the suffix syntax for overriding displacement size as well. - if (Name.consume_back(".d32")) { - ForcedDispEncoding = DispEncoding_Disp32; - } else if (Name.consume_back(".d8")) { - ForcedDispEncoding = DispEncoding_Disp8; - } - + // Support the suffix syntax for overriding displacement size as well. + if (Name.consume_back(".d32")) { + ForcedDispEncoding = DispEncoding_Disp32; + } else if (Name.consume_back(".d8")) { + ForcedDispEncoding = DispEncoding_Disp8; + } + StringRef PatchedName = Name; // Hack to skip "short" following Jcc. @@ -3263,13 +3263,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // repz repnz <insn> ; GAS errors for the use of two similar prefixes // lock addq %rax, %rbx ; Destination operand must be of memory type // xacquire <insn> ; xacquire must be accompanied by 'lock' - bool IsPrefix = - StringSwitch<bool>(Name) - .Cases("cs", "ds", "es", "fs", "gs", "ss", true) - .Cases("rex64", "data32", "data16", "addr32", "addr16", true) - .Cases("xacquire", "xrelease", true) - .Cases("acquire", "release", isParsingIntelSyntax()) - .Default(false); + bool IsPrefix = + StringSwitch<bool>(Name) + .Cases("cs", "ds", "es", "fs", "gs", "ss", true) + .Cases("rex64", "data32", "data16", "addr32", "addr16", true) + .Cases("xacquire", "xrelease", true) + .Cases("acquire", "release", isParsingIntelSyntax()) + .Default(false); auto isLockRepeatNtPrefix = [](StringRef N) { return StringSwitch<bool>(N) @@ -3324,22 +3324,22 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(NameLoc, "'data32' is not supported in 64-bit mode"); // Hack to 'data16' for the table lookup. PatchedName = "data16"; - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - StringRef Next = Parser.getTok().getString(); - getLexer().Lex(); - // data32 effectively changes the instruction suffix. - // TODO Generalize. - if (Next == "callw") - Next = "calll"; - if (Next == "ljmpw") - Next = "ljmpl"; - - Name = Next; - PatchedName = Name; - ForcedDataPrefix = X86::Mode32Bit; - IsPrefix = false; - } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + StringRef Next = Parser.getTok().getString(); + getLexer().Lex(); + // data32 effectively changes the instruction suffix. + // TODO Generalize. + if (Next == "callw") + Next = "calll"; + if (Next == "ljmpw") + Next = "ljmpl"; + + Name = Next; + PatchedName = Name; + ForcedDataPrefix = X86::Mode32Bit; + IsPrefix = false; + } } Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); @@ -3355,18 +3355,18 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as // the next one. 
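A few input forms the pseudo-prefix and suffix handling above accepts, written as they would appear in AT&T-syntax source (my own illustrations, not part of the patch; "target" is a placeholder label):

    {evex} vaddps %xmm2, %xmm1, %xmm0    # request the EVEX form where a VEX form would otherwise match
    {disp32} movl %eax, 4(%rbx)          # force a 4-byte displacement
    movl.d32 %eax, 4(%rbx)               # suffix spelling handled just above, same effect as {disp32}
    {disp32} jmp target                  # on jmp/jcc this selects the relaxed rel32 form (see processInstruction below)

The curly-brace prefixes set ForcedVEXEncoding / ForcedDispEncoding, which are later translated into IP_USE_* instruction flags for the encoder and printer.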
- if (getLexer().isNot(AsmToken::EndOfStatement) && !IsPrefix) { + if (getLexer().isNot(AsmToken::EndOfStatement) && !IsPrefix) { // Parse '*' modifier. if (getLexer().is(AsmToken::Star)) Operands.push_back(X86Operand::CreateToken("*", consumeToken())); // Read the operands. while(1) { - if (ParseOperand(Operands)) - return true; - if (HandleAVX512Operand(Operands)) - return true; - + if (ParseOperand(Operands)) + return true; + if (HandleAVX512Operand(Operands)) + return true; + // check for comma and eat it if (getLexer().is(AsmToken::Comma)) Parser.Lex(); @@ -3392,7 +3392,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Consume the EndOfStatement or the prefix separator Slash if (getLexer().is(AsmToken::EndOfStatement) || - (IsPrefix && getLexer().is(AsmToken::Slash))) + (IsPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); else if (CurlyAsEndOfStatement) // Add an actual EndOfStatement before the curly brace @@ -3567,26 +3567,26 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; - case X86::JMP_1: - // {disp32} forces a larger displacement as if the instruction was relaxed. - // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}. - // This matches GNU assembler. - if (ForcedDispEncoding == DispEncoding_Disp32) { - Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4); - return true; - } - - return false; - case X86::JCC_1: - // {disp32} forces a larger displacement as if the instruction was relaxed. - // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}. - // This matches GNU assembler. - if (ForcedDispEncoding == DispEncoding_Disp32) { - Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4); - return true; - } - - return false; + case X86::JMP_1: + // {disp32} forces a larger displacement as if the instruction was relaxed. + // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}. + // This matches GNU assembler. + if (ForcedDispEncoding == DispEncoding_Disp32) { + Inst.setOpcode(is16BitMode() ? X86::JMP_2 : X86::JMP_4); + return true; + } + + return false; + case X86::JCC_1: + // {disp32} forces a larger displacement as if the instruction was relaxed. + // NOTE: 16-bit mode uses 16-bit displacement even though it says {disp32}. + // This matches GNU assembler. + if (ForcedDispEncoding == DispEncoding_Disp32) { + Inst.setOpcode(is16BitMode() ? X86::JCC_2 : X86::JCC_4); + return true; + } + + return false; case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: @@ -3645,123 +3645,123 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { Inst.setOpcode(NewOpc); return true; } - case X86::RCR8ri: case X86::RCR16ri: case X86::RCR32ri: case X86::RCR64ri: - case X86::RCL8ri: case X86::RCL16ri: case X86::RCL32ri: case X86::RCL64ri: - case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri: - case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri: - case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri: - case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri: - case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: { - // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate. - // FIXME: It would be great if we could just do this with an InstAlias. 
- if (!Inst.getOperand(2).isImm() || Inst.getOperand(2).getImm() != 1) - return false; - - unsigned NewOpc; - switch (Inst.getOpcode()) { - default: llvm_unreachable("Invalid opcode"); - case X86::RCR8ri: NewOpc = X86::RCR8r1; break; - case X86::RCR16ri: NewOpc = X86::RCR16r1; break; - case X86::RCR32ri: NewOpc = X86::RCR32r1; break; - case X86::RCR64ri: NewOpc = X86::RCR64r1; break; - case X86::RCL8ri: NewOpc = X86::RCL8r1; break; - case X86::RCL16ri: NewOpc = X86::RCL16r1; break; - case X86::RCL32ri: NewOpc = X86::RCL32r1; break; - case X86::RCL64ri: NewOpc = X86::RCL64r1; break; - case X86::ROR8ri: NewOpc = X86::ROR8r1; break; - case X86::ROR16ri: NewOpc = X86::ROR16r1; break; - case X86::ROR32ri: NewOpc = X86::ROR32r1; break; - case X86::ROR64ri: NewOpc = X86::ROR64r1; break; - case X86::ROL8ri: NewOpc = X86::ROL8r1; break; - case X86::ROL16ri: NewOpc = X86::ROL16r1; break; - case X86::ROL32ri: NewOpc = X86::ROL32r1; break; - case X86::ROL64ri: NewOpc = X86::ROL64r1; break; - case X86::SAR8ri: NewOpc = X86::SAR8r1; break; - case X86::SAR16ri: NewOpc = X86::SAR16r1; break; - case X86::SAR32ri: NewOpc = X86::SAR32r1; break; - case X86::SAR64ri: NewOpc = X86::SAR64r1; break; - case X86::SHR8ri: NewOpc = X86::SHR8r1; break; - case X86::SHR16ri: NewOpc = X86::SHR16r1; break; - case X86::SHR32ri: NewOpc = X86::SHR32r1; break; - case X86::SHR64ri: NewOpc = X86::SHR64r1; break; - case X86::SHL8ri: NewOpc = X86::SHL8r1; break; - case X86::SHL16ri: NewOpc = X86::SHL16r1; break; - case X86::SHL32ri: NewOpc = X86::SHL32r1; break; - case X86::SHL64ri: NewOpc = X86::SHL64r1; break; - } - - MCInst TmpInst; - TmpInst.setOpcode(NewOpc); - TmpInst.addOperand(Inst.getOperand(0)); - TmpInst.addOperand(Inst.getOperand(1)); - Inst = TmpInst; - return true; - } - case X86::RCR8mi: case X86::RCR16mi: case X86::RCR32mi: case X86::RCR64mi: - case X86::RCL8mi: case X86::RCL16mi: case X86::RCL32mi: case X86::RCL64mi: - case X86::ROR8mi: case X86::ROR16mi: case X86::ROR32mi: case X86::ROR64mi: - case X86::ROL8mi: case X86::ROL16mi: case X86::ROL32mi: case X86::ROL64mi: - case X86::SAR8mi: case X86::SAR16mi: case X86::SAR32mi: case X86::SAR64mi: - case X86::SHR8mi: case X86::SHR16mi: case X86::SHR32mi: case X86::SHR64mi: - case X86::SHL8mi: case X86::SHL16mi: case X86::SHL32mi: case X86::SHL64mi: { - // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate. - // FIXME: It would be great if we could just do this with an InstAlias. 
- if (!Inst.getOperand(X86::AddrNumOperands).isImm() || - Inst.getOperand(X86::AddrNumOperands).getImm() != 1) - return false; - - unsigned NewOpc; - switch (Inst.getOpcode()) { - default: llvm_unreachable("Invalid opcode"); - case X86::RCR8mi: NewOpc = X86::RCR8m1; break; - case X86::RCR16mi: NewOpc = X86::RCR16m1; break; - case X86::RCR32mi: NewOpc = X86::RCR32m1; break; - case X86::RCR64mi: NewOpc = X86::RCR64m1; break; - case X86::RCL8mi: NewOpc = X86::RCL8m1; break; - case X86::RCL16mi: NewOpc = X86::RCL16m1; break; - case X86::RCL32mi: NewOpc = X86::RCL32m1; break; - case X86::RCL64mi: NewOpc = X86::RCL64m1; break; - case X86::ROR8mi: NewOpc = X86::ROR8m1; break; - case X86::ROR16mi: NewOpc = X86::ROR16m1; break; - case X86::ROR32mi: NewOpc = X86::ROR32m1; break; - case X86::ROR64mi: NewOpc = X86::ROR64m1; break; - case X86::ROL8mi: NewOpc = X86::ROL8m1; break; - case X86::ROL16mi: NewOpc = X86::ROL16m1; break; - case X86::ROL32mi: NewOpc = X86::ROL32m1; break; - case X86::ROL64mi: NewOpc = X86::ROL64m1; break; - case X86::SAR8mi: NewOpc = X86::SAR8m1; break; - case X86::SAR16mi: NewOpc = X86::SAR16m1; break; - case X86::SAR32mi: NewOpc = X86::SAR32m1; break; - case X86::SAR64mi: NewOpc = X86::SAR64m1; break; - case X86::SHR8mi: NewOpc = X86::SHR8m1; break; - case X86::SHR16mi: NewOpc = X86::SHR16m1; break; - case X86::SHR32mi: NewOpc = X86::SHR32m1; break; - case X86::SHR64mi: NewOpc = X86::SHR64m1; break; - case X86::SHL8mi: NewOpc = X86::SHL8m1; break; - case X86::SHL16mi: NewOpc = X86::SHL16m1; break; - case X86::SHL32mi: NewOpc = X86::SHL32m1; break; - case X86::SHL64mi: NewOpc = X86::SHL64m1; break; - } - - MCInst TmpInst; - TmpInst.setOpcode(NewOpc); - for (int i = 0; i != X86::AddrNumOperands; ++i) - TmpInst.addOperand(Inst.getOperand(i)); - Inst = TmpInst; - return true; - } - case X86::INT: { - // Transforms "int $3" into "int3" as a size optimization. We can't write an - // instalias with an immediate operand yet. - if (!Inst.getOperand(0).isImm() || Inst.getOperand(0).getImm() != 3) - return false; - - MCInst TmpInst; - TmpInst.setOpcode(X86::INT3); - Inst = TmpInst; - return true; - } - } + case X86::RCR8ri: case X86::RCR16ri: case X86::RCR32ri: case X86::RCR64ri: + case X86::RCL8ri: case X86::RCL16ri: case X86::RCL32ri: case X86::RCL64ri: + case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri: + case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri: + case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri: + case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri: + case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: { + // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate. + // FIXME: It would be great if we could just do this with an InstAlias. 
+ if (!Inst.getOperand(2).isImm() || Inst.getOperand(2).getImm() != 1) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::RCR8ri: NewOpc = X86::RCR8r1; break; + case X86::RCR16ri: NewOpc = X86::RCR16r1; break; + case X86::RCR32ri: NewOpc = X86::RCR32r1; break; + case X86::RCR64ri: NewOpc = X86::RCR64r1; break; + case X86::RCL8ri: NewOpc = X86::RCL8r1; break; + case X86::RCL16ri: NewOpc = X86::RCL16r1; break; + case X86::RCL32ri: NewOpc = X86::RCL32r1; break; + case X86::RCL64ri: NewOpc = X86::RCL64r1; break; + case X86::ROR8ri: NewOpc = X86::ROR8r1; break; + case X86::ROR16ri: NewOpc = X86::ROR16r1; break; + case X86::ROR32ri: NewOpc = X86::ROR32r1; break; + case X86::ROR64ri: NewOpc = X86::ROR64r1; break; + case X86::ROL8ri: NewOpc = X86::ROL8r1; break; + case X86::ROL16ri: NewOpc = X86::ROL16r1; break; + case X86::ROL32ri: NewOpc = X86::ROL32r1; break; + case X86::ROL64ri: NewOpc = X86::ROL64r1; break; + case X86::SAR8ri: NewOpc = X86::SAR8r1; break; + case X86::SAR16ri: NewOpc = X86::SAR16r1; break; + case X86::SAR32ri: NewOpc = X86::SAR32r1; break; + case X86::SAR64ri: NewOpc = X86::SAR64r1; break; + case X86::SHR8ri: NewOpc = X86::SHR8r1; break; + case X86::SHR16ri: NewOpc = X86::SHR16r1; break; + case X86::SHR32ri: NewOpc = X86::SHR32r1; break; + case X86::SHR64ri: NewOpc = X86::SHR64r1; break; + case X86::SHL8ri: NewOpc = X86::SHL8r1; break; + case X86::SHL16ri: NewOpc = X86::SHL16r1; break; + case X86::SHL32ri: NewOpc = X86::SHL32r1; break; + case X86::SHL64ri: NewOpc = X86::SHL64r1; break; + } + + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + Inst = TmpInst; + return true; + } + case X86::RCR8mi: case X86::RCR16mi: case X86::RCR32mi: case X86::RCR64mi: + case X86::RCL8mi: case X86::RCL16mi: case X86::RCL32mi: case X86::RCL64mi: + case X86::ROR8mi: case X86::ROR16mi: case X86::ROR32mi: case X86::ROR64mi: + case X86::ROL8mi: case X86::ROL16mi: case X86::ROL32mi: case X86::ROL64mi: + case X86::SAR8mi: case X86::SAR16mi: case X86::SAR32mi: case X86::SAR64mi: + case X86::SHR8mi: case X86::SHR16mi: case X86::SHR32mi: case X86::SHR64mi: + case X86::SHL8mi: case X86::SHL16mi: case X86::SHL32mi: case X86::SHL64mi: { + // Optimize s{hr,ar,hl} $1, <op> to "shift <op>". Similar for rotate. + // FIXME: It would be great if we could just do this with an InstAlias. 
+ if (!Inst.getOperand(X86::AddrNumOperands).isImm() || + Inst.getOperand(X86::AddrNumOperands).getImm() != 1) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::RCR8mi: NewOpc = X86::RCR8m1; break; + case X86::RCR16mi: NewOpc = X86::RCR16m1; break; + case X86::RCR32mi: NewOpc = X86::RCR32m1; break; + case X86::RCR64mi: NewOpc = X86::RCR64m1; break; + case X86::RCL8mi: NewOpc = X86::RCL8m1; break; + case X86::RCL16mi: NewOpc = X86::RCL16m1; break; + case X86::RCL32mi: NewOpc = X86::RCL32m1; break; + case X86::RCL64mi: NewOpc = X86::RCL64m1; break; + case X86::ROR8mi: NewOpc = X86::ROR8m1; break; + case X86::ROR16mi: NewOpc = X86::ROR16m1; break; + case X86::ROR32mi: NewOpc = X86::ROR32m1; break; + case X86::ROR64mi: NewOpc = X86::ROR64m1; break; + case X86::ROL8mi: NewOpc = X86::ROL8m1; break; + case X86::ROL16mi: NewOpc = X86::ROL16m1; break; + case X86::ROL32mi: NewOpc = X86::ROL32m1; break; + case X86::ROL64mi: NewOpc = X86::ROL64m1; break; + case X86::SAR8mi: NewOpc = X86::SAR8m1; break; + case X86::SAR16mi: NewOpc = X86::SAR16m1; break; + case X86::SAR32mi: NewOpc = X86::SAR32m1; break; + case X86::SAR64mi: NewOpc = X86::SAR64m1; break; + case X86::SHR8mi: NewOpc = X86::SHR8m1; break; + case X86::SHR16mi: NewOpc = X86::SHR16m1; break; + case X86::SHR32mi: NewOpc = X86::SHR32m1; break; + case X86::SHR64mi: NewOpc = X86::SHR64m1; break; + case X86::SHL8mi: NewOpc = X86::SHL8m1; break; + case X86::SHL16mi: NewOpc = X86::SHL16m1; break; + case X86::SHL32mi: NewOpc = X86::SHL32m1; break; + case X86::SHL64mi: NewOpc = X86::SHL64m1; break; + } + + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + for (int i = 0; i != X86::AddrNumOperands; ++i) + TmpInst.addOperand(Inst.getOperand(i)); + Inst = TmpInst; + return true; + } + case X86::INT: { + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // instalias with an immediate operand yet. + if (!Inst.getOperand(0).isImm() || Inst.getOperand(0).getImm() != 3) + return false; + + MCInst TmpInst; + TmpInst.setOpcode(X86::INT3); + Inst = TmpInst; + return true; + } + } } bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { @@ -3860,33 +3860,33 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { } } - const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); - // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to - // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. - if ((MCID.TSFlags & X86II::EncodingMask) == 0) { - MCPhysReg HReg = X86::NoRegister; - bool UsesRex = MCID.TSFlags & X86II::REX_W; - unsigned NumOps = Inst.getNumOperands(); - for (unsigned i = 0; i != NumOps; ++i) { - const MCOperand &MO = Inst.getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) - HReg = Reg; - if (X86II::isX86_64NonExtLowByteReg(Reg) || - X86II::isX86_64ExtendedReg(Reg)) - UsesRex = true; - } - - if (UsesRex && HReg != X86::NoRegister) { - StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg); - return Error(Ops[0]->getStartLoc(), - "can't encode '" + RegName + "' in an instruction requiring " - "REX prefix"); - } - } - + const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); + // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to + // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. 
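To make the two size optimizations restored in processInstruction above concrete, here is a small worked example of my own for 32-bit register operands (byte sequences are the standard encodings, not taken from the patch):

    shrl $1, %eax    # matches SHR32ri, rewritten to SHR32r1: encodes as d1 e8 instead of c1 e8 01
    int  $3          # rewritten to INT3: encodes as cc instead of cd 03

In each case the rewrite drops the explicit immediate and saves a byte.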
+ if ((MCID.TSFlags & X86II::EncodingMask) == 0) { + MCPhysReg HReg = X86::NoRegister; + bool UsesRex = MCID.TSFlags & X86II::REX_W; + unsigned NumOps = Inst.getNumOperands(); + for (unsigned i = 0; i != NumOps; ++i) { + const MCOperand &MO = Inst.getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + HReg = Reg; + if (X86II::isX86_64NonExtLowByteReg(Reg) || + X86II::isX86_64ExtendedReg(Reg)) + UsesRex = true; + } + + if (UsesRex && HReg != X86::NoRegister) { + StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg); + return Error(Ops[0]->getStartLoc(), + "can't encode '" + RegName + "' in an instruction requiring " + "REX prefix"); + } + } + return false; } @@ -4080,18 +4080,18 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Unsupported; if ((ForcedVEXEncoding == VEXEncoding_VEX || - ForcedVEXEncoding == VEXEncoding_VEX2 || + ForcedVEXEncoding == VEXEncoding_VEX2 || ForcedVEXEncoding == VEXEncoding_VEX3) && (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX) return Match_Unsupported; - // These instructions are only available with {vex}, {vex2} or {vex3} prefix - if (MCID.TSFlags & X86II::ExplicitVEXPrefix && - (ForcedVEXEncoding != VEXEncoding_VEX && - ForcedVEXEncoding != VEXEncoding_VEX2 && - ForcedVEXEncoding != VEXEncoding_VEX3)) - return Match_Unsupported; - + // These instructions are only available with {vex}, {vex2} or {vex3} prefix + if (MCID.TSFlags & X86II::ExplicitVEXPrefix && + (ForcedVEXEncoding != VEXEncoding_VEX && + ForcedVEXEncoding != VEXEncoding_VEX2 && + ForcedVEXEncoding != VEXEncoding_VEX3)) + return Match_Unsupported; + // These instructions match ambiguously with their VEX encoded counterparts // and appear first in the matching table. Reject them unless we're forcing // EVEX encoding. @@ -4130,39 +4130,39 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst Inst; - // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the - // encoder and printer. - if (ForcedVEXEncoding == VEXEncoding_VEX) - Prefixes |= X86::IP_USE_VEX; - else if (ForcedVEXEncoding == VEXEncoding_VEX2) - Prefixes |= X86::IP_USE_VEX2; - else if (ForcedVEXEncoding == VEXEncoding_VEX3) + // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the + // encoder and printer. + if (ForcedVEXEncoding == VEXEncoding_VEX) + Prefixes |= X86::IP_USE_VEX; + else if (ForcedVEXEncoding == VEXEncoding_VEX2) + Prefixes |= X86::IP_USE_VEX2; + else if (ForcedVEXEncoding == VEXEncoding_VEX3) Prefixes |= X86::IP_USE_VEX3; - else if (ForcedVEXEncoding == VEXEncoding_EVEX) - Prefixes |= X86::IP_USE_EVEX; - - // Set encoded flags for {disp8} and {disp32}. - if (ForcedDispEncoding == DispEncoding_Disp8) - Prefixes |= X86::IP_USE_DISP8; - else if (ForcedDispEncoding == DispEncoding_Disp32) - Prefixes |= X86::IP_USE_DISP32; - + else if (ForcedVEXEncoding == VEXEncoding_EVEX) + Prefixes |= X86::IP_USE_EVEX; + + // Set encoded flags for {disp8} and {disp32}. + if (ForcedDispEncoding == DispEncoding_Disp8) + Prefixes |= X86::IP_USE_DISP8; + else if (ForcedDispEncoding == DispEncoding_Disp32) + Prefixes |= X86::IP_USE_DISP32; + if (Prefixes) Inst.setFlags(Prefixes); - // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode - // when matching the instruction. 
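The high-byte check restored above rejects mixing AH/BH/CH/DH with operands that force a REX prefix. A sketch with inputs of my own, using the error text from the code:

    movb %sil, %al    # accepted: %sil requires REX, and %al is still encodable under REX
    movb %ah, %sil    # rejected: can't encode 'ah' in an instruction requiring REX prefix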
- if (ForcedDataPrefix == X86::Mode32Bit) - SwitchMode(X86::Mode32Bit); + // In 16-bit mode, if data32 is specified, temporarily switch to 32-bit mode + // when matching the instruction. + if (ForcedDataPrefix == X86::Mode32Bit) + SwitchMode(X86::Mode32Bit); // First, try a direct match. FeatureBitset MissingFeatures; unsigned OriginalError = MatchInstruction(Operands, Inst, ErrorInfo, MissingFeatures, MatchingInlineAsm, isParsingIntelSyntax()); - if (ForcedDataPrefix == X86::Mode32Bit) { - SwitchMode(X86::Mode16Bit); - ForcedDataPrefix = 0; - } + if (ForcedDataPrefix == X86::Mode32Bit) { + SwitchMode(X86::Mode16Bit); + ForcedDataPrefix = 0; + } switch (OriginalError) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: @@ -4271,15 +4271,15 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { - if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which - // encoding is selected. Loop on it while changes happen so the - // individual transformations can chain off each other. - if (!MatchingInlineAsm) - while (processInstruction(Inst, Operands)) - ; - + if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) + return true; + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the + // individual transformations can chain off each other. + if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + Inst.setLoc(IDLoc); if (!MatchingInlineAsm) emitInstruction(Inst, Operands, Out); @@ -4393,23 +4393,23 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, MCInst Inst; - // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the - // encoder and printer. - if (ForcedVEXEncoding == VEXEncoding_VEX) - Prefixes |= X86::IP_USE_VEX; - else if (ForcedVEXEncoding == VEXEncoding_VEX2) - Prefixes |= X86::IP_USE_VEX2; - else if (ForcedVEXEncoding == VEXEncoding_VEX3) + // If VEX/EVEX encoding is forced, we need to pass the USE_* flag to the + // encoder and printer. + if (ForcedVEXEncoding == VEXEncoding_VEX) + Prefixes |= X86::IP_USE_VEX; + else if (ForcedVEXEncoding == VEXEncoding_VEX2) + Prefixes |= X86::IP_USE_VEX2; + else if (ForcedVEXEncoding == VEXEncoding_VEX3) Prefixes |= X86::IP_USE_VEX3; - else if (ForcedVEXEncoding == VEXEncoding_EVEX) - Prefixes |= X86::IP_USE_EVEX; - - // Set encoded flags for {disp8} and {disp32}. - if (ForcedDispEncoding == DispEncoding_Disp8) - Prefixes |= X86::IP_USE_DISP8; - else if (ForcedDispEncoding == DispEncoding_Disp32) - Prefixes |= X86::IP_USE_DISP32; - + else if (ForcedVEXEncoding == VEXEncoding_EVEX) + Prefixes |= X86::IP_USE_EVEX; + + // Set encoded flags for {disp8} and {disp32}. 
+ if (ForcedDispEncoding == DispEncoding_Disp8) + Prefixes |= X86::IP_USE_DISP8; + else if (ForcedDispEncoding == DispEncoding_Disp32) + Prefixes |= X86::IP_USE_DISP32; + if (Prefixes) Inst.setFlags(Prefixes); @@ -4603,8 +4603,8 @@ bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getIdentifier(); - if (IDVal.startswith(".arch")) - return parseDirectiveArch(); + if (IDVal.startswith(".arch")) + return parseDirectiveArch(); if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { @@ -4629,9 +4629,9 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { "a '%' prefix in .intel_syntax"); } return false; - } else if (IDVal == ".nops") - return parseDirectiveNops(DirectiveID.getLoc()); - else if (IDVal == ".even") + } else if (IDVal == ".nops") + return parseDirectiveNops(DirectiveID.getLoc()); + else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_proc") return parseDirectiveFPOProc(DirectiveID.getLoc()); @@ -4647,67 +4647,67 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveFPOEndPrologue(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_endproc") return parseDirectiveFPOEndProc(DirectiveID.getLoc()); - else if (IDVal == ".seh_pushreg" || - (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg"))) + else if (IDVal == ".seh_pushreg" || + (Parser.isParsingMasm() && IDVal.equals_lower(".pushreg"))) return parseDirectiveSEHPushReg(DirectiveID.getLoc()); - else if (IDVal == ".seh_setframe" || - (Parser.isParsingMasm() && IDVal.equals_lower(".setframe"))) + else if (IDVal == ".seh_setframe" || + (Parser.isParsingMasm() && IDVal.equals_lower(".setframe"))) return parseDirectiveSEHSetFrame(DirectiveID.getLoc()); - else if (IDVal == ".seh_savereg" || - (Parser.isParsingMasm() && IDVal.equals_lower(".savereg"))) + else if (IDVal == ".seh_savereg" || + (Parser.isParsingMasm() && IDVal.equals_lower(".savereg"))) return parseDirectiveSEHSaveReg(DirectiveID.getLoc()); - else if (IDVal == ".seh_savexmm" || - (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128"))) + else if (IDVal == ".seh_savexmm" || + (Parser.isParsingMasm() && IDVal.equals_lower(".savexmm128"))) return parseDirectiveSEHSaveXMM(DirectiveID.getLoc()); - else if (IDVal == ".seh_pushframe" || - (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe"))) + else if (IDVal == ".seh_pushframe" || + (Parser.isParsingMasm() && IDVal.equals_lower(".pushframe"))) return parseDirectiveSEHPushFrame(DirectiveID.getLoc()); return true; } -bool X86AsmParser::parseDirectiveArch() { - // Ignore .arch for now. 
- getParser().parseStringToEndOfStatement(); - return false; -} - -/// parseDirectiveNops -/// ::= .nops size[, control] -bool X86AsmParser::parseDirectiveNops(SMLoc L) { - int64_t NumBytes = 0, Control = 0; - SMLoc NumBytesLoc, ControlLoc; - const MCSubtargetInfo STI = getSTI(); - NumBytesLoc = getTok().getLoc(); - if (getParser().checkForValidSection() || - getParser().parseAbsoluteExpression(NumBytes)) - return true; - - if (parseOptionalToken(AsmToken::Comma)) { - ControlLoc = getTok().getLoc(); - if (getParser().parseAbsoluteExpression(Control)) - return true; - } - if (getParser().parseToken(AsmToken::EndOfStatement, - "unexpected token in '.nops' directive")) - return true; - - if (NumBytes <= 0) { - Error(NumBytesLoc, "'.nops' directive with non-positive size"); - return false; - } - - if (Control < 0) { - Error(ControlLoc, "'.nops' directive with negative NOP size"); - return false; - } - - /// Emit nops - getParser().getStreamer().emitNops(NumBytes, Control, L); - - return false; -} - +bool X86AsmParser::parseDirectiveArch() { + // Ignore .arch for now. + getParser().parseStringToEndOfStatement(); + return false; +} + +/// parseDirectiveNops +/// ::= .nops size[, control] +bool X86AsmParser::parseDirectiveNops(SMLoc L) { + int64_t NumBytes = 0, Control = 0; + SMLoc NumBytesLoc, ControlLoc; + const MCSubtargetInfo STI = getSTI(); + NumBytesLoc = getTok().getLoc(); + if (getParser().checkForValidSection() || + getParser().parseAbsoluteExpression(NumBytes)) + return true; + + if (parseOptionalToken(AsmToken::Comma)) { + ControlLoc = getTok().getLoc(); + if (getParser().parseAbsoluteExpression(Control)) + return true; + } + if (getParser().parseToken(AsmToken::EndOfStatement, + "unexpected token in '.nops' directive")) + return true; + + if (NumBytes <= 0) { + Error(NumBytesLoc, "'.nops' directive with non-positive size"); + return false; + } + + if (Control < 0) { + Error(ControlLoc, "'.nops' directive with negative NOP size"); + return false; + } + + /// Emit nops + getParser().getStreamer().emitNops(NumBytes, Control, L); + + return false; +} + /// parseDirectiveEven /// ::= .even bool X86AsmParser::parseDirectiveEven(SMLoc L) { diff --git a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make index f88283f4e5..b3115c55b9 100644 --- a/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/AsmParser/ya.make @@ -12,19 +12,19 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/MC/MCParser - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/X86/MCTargetDesc - contrib/libs/llvm12/lib/Target/X86/TargetInfo + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/MC/MCParser + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/X86/MCTargetDesc + contrib/libs/llvm12/lib/Target/X86/TargetInfo ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86/AsmParser + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86/AsmParser ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp index 
4e6d8e8e1a..1d396796e7 100644 --- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -492,7 +492,7 @@ static int readPrefixes(struct InternalInstruction *insn) { insn->addressSize = (insn->hasAdSize ? 4 : 8); insn->displacementSize = 4; insn->immediateSize = 4; - insn->hasOpSize = false; + insn->hasOpSize = false; } else { insn->registerSize = (insn->hasOpSize ? 2 : 4); insn->addressSize = (insn->hasAdSize ? 4 : 8); @@ -1663,9 +1663,9 @@ namespace X86 { sib = 504, sib64 = 505 }; -} // namespace X86 +} // namespace X86 -} // namespace llvm +} // namespace llvm static bool translateInstruction(MCInst &target, InternalInstruction &source, @@ -1690,7 +1690,7 @@ private: DisassemblerMode fMode; }; -} // namespace +} // namespace X86GenericDisassembler::X86GenericDisassembler( const MCSubtargetInfo &STI, diff --git a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make index b55833692f..07b83642cc 100644 --- a/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/Disassembler/ya.make @@ -12,17 +12,17 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/MC/MCDisassembler - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/X86/TargetInfo + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/MC/MCDisassembler + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/X86/TargetInfo ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86/Disassembler + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86/Disassembler ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index c685d7e0db..fef6c33a90 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -16,7 +16,7 @@ #include "X86InstComments.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" @@ -385,16 +385,16 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O) { - // Do not print the exact form of the memory operand if it references a known - // binary object. - if (SymbolizeOperands && MIA) { - uint64_t Target; - if (MIA->evaluateBranch(*MI, 0, 0, Target)) - return; - if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0)) - return; - } - + // Do not print the exact form of the memory operand if it references a known + // binary object. 
+ if (SymbolizeOperands && MIA) { + uint64_t Target; + if (MIA->evaluateBranch(*MI, 0, 0, Target)) + return; + if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0)) + return; + } + const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h index f7a8505712..5926418c57 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h @@ -36,7 +36,7 @@ public: raw_ostream &O); // Autogenerated by tblgen. - std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS); static const char *getRegisterName(unsigned RegNo); diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 95012a148d..0b1812c935 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -109,7 +109,7 @@ cl::opt<unsigned> X86PadMaxPrefixSize( cl::desc("Maximum number of prefixes to use for padding")); cl::opt<bool> X86PadForAlign( - "x86-pad-for-align", cl::init(false), cl::Hidden, + "x86-pad-for-align", cl::init(false), cl::Hidden, cl::desc("Pad previous instructions to implement align directives")); cl::opt<bool> X86PadForBranchAlign( @@ -207,8 +207,8 @@ public: void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override; - unsigned getMaximumNopSize() const override; - + unsigned getMaximumNopSize() const override; + bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; } // end anonymous namespace @@ -957,9 +957,9 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, if (!X86PadForAlign && !X86PadForBranchAlign) return; - // The processed regions are delimitered by LabeledFragments. -g may have more - // MCSymbols and therefore different relaxation results. X86PadForAlign is - // disabled by default to eliminate the -g vs non -g difference. + // The processed regions are delimitered by LabeledFragments. -g may have more + // MCSymbols and therefore different relaxation results. X86PadForAlign is + // disabled by default to eliminate the -g vs non -g difference. DenseSet<MCFragment *> LabeledFragments; for (const MCSymbol &S : Asm.symbols()) LabeledFragments.insert(S.getFragment(false)); @@ -1072,21 +1072,21 @@ void X86AsmBackend::finishLayout(MCAssembler const &Asm, } } -unsigned X86AsmBackend::getMaximumNopSize() const { - if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) - return 1; - if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) - return 7; - if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) - return 15; - if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) - return 11; - // FIXME: handle 32-bit mode - // 15-bytes is the longest single NOP instruction, but 10-bytes is - // commonly the longest that can be efficiently decoded. 
- return 10; -} - +unsigned X86AsmBackend::getMaximumNopSize() const { + if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) + return 1; + if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) + return 7; + if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) + return 15; + if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) + return 11; + // FIXME: handle 32-bit mode + // 15-bytes is the longest single NOP instruction, but 10-bytes is + // commonly the longest that can be efficiently decoded. + return 10; +} + /// Write a sequence of optimal nops to the output, covering \p Count /// bytes. /// \return - true on success, false on failure @@ -1114,7 +1114,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { "\x66\x2e\x0f\x1f\x84\x00\x00\x00\x00\x00", }; - uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(); + uint64_t MaxNopLength = (uint64_t)getMaximumNopSize(); // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining // length. @@ -1241,7 +1241,7 @@ namespace CU { UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF }; -} // namespace CU +} // namespace CU class DarwinX86AsmBackend : public X86AsmBackend { const MCRegisterInfo &MRI; diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 4db1bfc251..fc60496917 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -55,18 +55,18 @@ namespace X86 { /// The constants to describe instr prefixes if there are enum IPREFIXES { IP_NO_PREFIX = 0, - IP_HAS_OP_SIZE = 1U << 0, - IP_HAS_AD_SIZE = 1U << 1, - IP_HAS_REPEAT_NE = 1U << 2, - IP_HAS_REPEAT = 1U << 3, - IP_HAS_LOCK = 1U << 4, - IP_HAS_NOTRACK = 1U << 5, - IP_USE_VEX = 1U << 6, - IP_USE_VEX2 = 1U << 7, - IP_USE_VEX3 = 1U << 8, - IP_USE_EVEX = 1U << 9, - IP_USE_DISP8 = 1U << 10, - IP_USE_DISP32 = 1U << 11, + IP_HAS_OP_SIZE = 1U << 0, + IP_HAS_AD_SIZE = 1U << 1, + IP_HAS_REPEAT_NE = 1U << 2, + IP_HAS_REPEAT = 1U << 3, + IP_HAS_LOCK = 1U << 4, + IP_HAS_NOTRACK = 1U << 5, + IP_USE_VEX = 1U << 6, + IP_USE_VEX2 = 1U << 7, + IP_USE_VEX3 = 1U << 8, + IP_USE_EVEX = 1U << 9, + IP_USE_DISP8 = 1U << 10, + IP_USE_DISP32 = 1U << 11, }; enum OperandType : unsigned { @@ -952,11 +952,11 @@ namespace X86II { // NOTRACK prefix NoTrackShift = EVEX_RCShift + 1, - NOTRACK = 1ULL << NoTrackShift, - - // Force VEX encoding - ExplicitVEXShift = NoTrackShift + 1, - ExplicitVEXPrefix = 1ULL << ExplicitVEXShift + NOTRACK = 1ULL << NoTrackShift, + + // Force VEX encoding + ExplicitVEXShift = NoTrackShift + 1, + ExplicitVEXPrefix = 1ULL << ExplicitVEXShift }; /// \returns true if the instruction with given opcode is a prefix. 
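A worked example of the NOP-padding path above, with numbers of my own (the 2-byte pattern 66 90 is the conventional entry in the backend's NOP table and is an assumption here): on a generic x86-64 target with none of the FeatureFast*ByteNOP bits set, getMaximumNopSize() returns 10, so writeNopData covers a 12-byte pad with one maximum-length NOP plus one NOP for the remainder:

    66 2e 0f 1f 84 00 00 00 00 00    # 10-byte NOP
    66 90                            # 2-byte NOP covering the remaining bytes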
diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index fa937d3816..177f8efdf3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -94,12 +94,12 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) { "32 bit reloc applied to a field with a different size"); } -static void checkIs64(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) { - if (Type != RT64_64) - Ctx.reportError(Loc, - "64 bit reloc applied to a field with a different size"); -} - +static void checkIs64(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) { + if (Type != RT64_64) + Ctx.reportError(Loc, + "64 bit reloc applied to a field with a different size"); +} + static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, MCSymbolRefExpr::VariantKind Modifier, X86_64RelType Type, bool IsPCRel, @@ -218,9 +218,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, return ELF::R_X86_64_REX_GOTPCRELX; } llvm_unreachable("unexpected relocation type!"); - case MCSymbolRefExpr::VK_X86_PLTOFF: - checkIs64(Ctx, Loc, Type); - return ELF::R_X86_64_PLTOFF64; + case MCSymbolRefExpr::VK_X86_PLTOFF: + checkIs64(Ctx, Loc, Type); + return ELF::R_X86_64_PLTOFF64; } } diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index d8dbbbbf27..9c6db0fcb7 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -295,10 +295,10 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op, /// \see MCInstPrinter::printInst void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address, unsigned OpNo, raw_ostream &O) { - // Do not print the numberic target address when symbolizing. - if (SymbolizeOperands) - return; - + // Do not print the numberic target address when symbolizing. 
+ if (SymbolizeOperands) + return; + const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) { if (PrintBranchImmAsAddress) { @@ -346,21 +346,21 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) { O << "\trepne\t"; else if (Flags & X86::IP_HAS_REPEAT) O << "\trep\t"; - - // These all require a pseudo prefix - if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix)) - O << "\t{vex}"; - else if (Flags & X86::IP_USE_VEX2) - O << "\t{vex2}"; - else if (Flags & X86::IP_USE_VEX3) - O << "\t{vex3}"; - else if (Flags & X86::IP_USE_EVEX) - O << "\t{evex}"; - - if (Flags & X86::IP_USE_DISP8) - O << "\t{disp8}"; - else if (Flags & X86::IP_USE_DISP32) - O << "\t{disp32}"; + + // These all require a pseudo prefix + if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix)) + O << "\t{vex}"; + else if (Flags & X86::IP_USE_VEX2) + O << "\t{vex2}"; + else if (Flags & X86::IP_USE_VEX3) + O << "\t{vex3}"; + else if (Flags & X86::IP_USE_EVEX) + O << "\t{evex}"; + + if (Flags & X86::IP_USE_DISP8) + O << "\t{disp8}"; + else if (Flags & X86::IP_USE_DISP32) + O << "\t{disp32}"; } void X86InstPrinterCommon::printVKPair(const MCInst *MI, unsigned OpNo, diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index d5b205ad9a..371f3a223a 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -16,7 +16,7 @@ #include "X86InstComments.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -343,15 +343,15 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O) { - // Do not print the exact form of the memory operand if it references a known - // binary object. - if (SymbolizeOperands && MIA) { - uint64_t Target; - if (MIA->evaluateBranch(*MI, 0, 0, Target)) - return; - if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0)) - return; - } + // Do not print the exact form of the memory operand if it references a known + // binary object. + if (SymbolizeOperands && MIA) { + uint64_t Target; + if (MIA->evaluateBranch(*MI, 0, 0, Target)) + return; + if (MIA->evaluateMemoryOperandAddress(*MI, 0, 0)) + return; + } const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h index aa4d0545ea..48ee4fbfcf 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h @@ -37,7 +37,7 @@ public: raw_ostream &O); // Autogenerated by tblgen. 
- std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 260253a530..d1e4d3bd4f 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -93,8 +93,8 @@ private: bool emitOpcodePrefix(int MemOperand, const MCInst &MI, const MCSubtargetInfo &STI, raw_ostream &OS) const; - bool emitREXPrefix(int MemOperand, const MCInst &MI, - const MCSubtargetInfo &STI, raw_ostream &OS) const; + bool emitREXPrefix(int MemOperand, const MCInst &MI, + const MCSubtargetInfo &STI, raw_ostream &OS) const; }; } // end anonymous namespace @@ -114,28 +114,28 @@ static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) { } } -/// Determine if this immediate can fit in a disp8 or a compressed disp8 for -/// EVEX instructions. \p will be set to the value to pass to the ImmOffset -/// parameter of emitImmediate. -static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) { - bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; +/// Determine if this immediate can fit in a disp8 or a compressed disp8 for +/// EVEX instructions. \p will be set to the value to pass to the ImmOffset +/// parameter of emitImmediate. +static bool isDispOrCDisp8(uint64_t TSFlags, int Value, int &ImmOffset) { + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; - int CD8_Scale = + int CD8_Scale = (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; - if (!HasEVEX || CD8_Scale == 0) - return isInt<8>(Value); + if (!HasEVEX || CD8_Scale == 0) + return isInt<8>(Value); - assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!"); - if (Value & (CD8_Scale - 1)) // Unaligned offset + assert(isPowerOf2_32(CD8_Scale) && "Unexpected CD8 scale!"); + if (Value & (CD8_Scale - 1)) // Unaligned offset return false; - int CDisp8 = Value / CD8_Scale; - if (!isInt<8>(CDisp8)) - return false; - - // ImmOffset will be added to Value in emitImmediate leaving just CDisp8. - ImmOffset = CDisp8 - Value; - return true; + int CDisp8 = Value / CD8_Scale; + if (!isInt<8>(CDisp8)) + return false; + + // ImmOffset will be added to Value in emitImmediate leaving just CDisp8. + ImmOffset = CDisp8 - Value; + return true; } /// \returns the appropriate fixup kind to use for an immediate in an @@ -160,18 +160,18 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { /// \returns true if the specified instruction has a 16-bit memory operand. 
static bool is16BitMemOperand(const MCInst &MI, unsigned Op, const MCSubtargetInfo &STI) { - const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); - const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Index = MI.getOperand(Op + X86::AddrIndexReg); - unsigned BaseReg = Base.getReg(); - unsigned IndexReg = Index.getReg(); - - if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0) + unsigned BaseReg = Base.getReg(); + unsigned IndexReg = Index.getReg(); + + if (STI.hasFeature(X86::Mode16Bit) && BaseReg == 0 && IndexReg == 0) return true; - if ((BaseReg != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || - (IndexReg != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))) + if ((BaseReg != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) || + (IndexReg != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg))) return true; return false; } @@ -398,33 +398,33 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, emitByte(modRMByte(0, RegOpcodeField, 5), OS); unsigned Opcode = MI.getOpcode(); - unsigned FixupKind = [&]() { - // Enable relaxed relocation only for a MCSymbolRefExpr. We cannot use a - // relaxed relocation if an offset is present (e.g. x@GOTPCREL+4). - if (!(Disp.isExpr() && isa<MCSymbolRefExpr>(Disp.getExpr()))) - return X86::reloc_riprel_4byte; - - // Certain loads for GOT references can be relocated against the symbol - // directly if the symbol ends up in the same linkage unit. + unsigned FixupKind = [&]() { + // Enable relaxed relocation only for a MCSymbolRefExpr. We cannot use a + // relaxed relocation if an offset is present (e.g. x@GOTPCREL+4). + if (!(Disp.isExpr() && isa<MCSymbolRefExpr>(Disp.getExpr()))) + return X86::reloc_riprel_4byte; + + // Certain loads for GOT references can be relocated against the symbol + // directly if the symbol ends up in the same linkage unit. switch (Opcode) { default: return X86::reloc_riprel_4byte; case X86::MOV64rm: - // movq loads is a subset of reloc_riprel_4byte_relax_rex. It is a - // special case because COFF and Mach-O don't support ELF's more - // flexible R_X86_64_REX_GOTPCRELX relaxation. + // movq loads is a subset of reloc_riprel_4byte_relax_rex. It is a + // special case because COFF and Mach-O don't support ELF's more + // flexible R_X86_64_REX_GOTPCRELX relaxation. assert(HasREX); return X86::reloc_riprel_4byte_movq_load; - case X86::ADC32rm: - case X86::ADD32rm: - case X86::AND32rm: - case X86::CMP32rm: - case X86::MOV32rm: - case X86::OR32rm: - case X86::SBB32rm: - case X86::SUB32rm: - case X86::TEST32mr: - case X86::XOR32rm: + case X86::ADC32rm: + case X86::ADD32rm: + case X86::AND32rm: + case X86::CMP32rm: + case X86::MOV32rm: + case X86::OR32rm: + case X86::SBB32rm: + case X86::SUB32rm: + case X86::TEST32mr: + case X86::XOR32rm: case X86::CALL64m: case X86::JMP64m: case X86::TAILJMPm64: @@ -497,7 +497,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1); } - if (Disp.isImm() && isInt<8>(Disp.getImm())) { + if (Disp.isImm() && isInt<8>(Disp.getImm())) { if (Disp.getImm() == 0 && RMfield != 6) { // There is no displacement; just the register. emitByte(modRMByte(0, RegOpcodeField, RMfield), OS); @@ -511,7 +511,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // This is the [REG]+disp16 case. 
emitByte(modRMByte(2, RegOpcodeField, RMfield), OS); } else { - assert(IndexReg.getReg() == 0 && "Unexpected index register!"); + assert(IndexReg.getReg() == 0 && "Unexpected index register!"); // There is no BaseReg; this is the plain [disp16] case. emitByte(modRMByte(0, RegOpcodeField, 6), OS); } @@ -521,18 +521,18 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, return; } - // Check for presence of {disp8} or {disp32} pseudo prefixes. - bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8; - bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32; - - // We only allow no displacement if no pseudo prefix is present. - bool AllowNoDisp = !UseDisp8 && !UseDisp32; - // Disp8 is allowed unless the {disp32} prefix is present. - bool AllowDisp8 = !UseDisp32; - + // Check for presence of {disp8} or {disp32} pseudo prefixes. + bool UseDisp8 = MI.getFlags() & X86::IP_USE_DISP8; + bool UseDisp32 = MI.getFlags() & X86::IP_USE_DISP32; + + // We only allow no displacement if no pseudo prefix is present. + bool AllowNoDisp = !UseDisp8 && !UseDisp32; + // Disp8 is allowed unless the {disp32} prefix is present. + bool AllowDisp8 = !UseDisp32; + // Determine whether a SIB byte is needed. - if (// The SIB byte must be used if there is an index register or the - // encoding requires a SIB byte. + if (// The SIB byte must be used if there is an index register or the + // encoding requires a SIB byte. !ForceSIB && IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is @@ -548,12 +548,12 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, return; } - // If the base is not EBP/ESP/R12/R13 and there is no displacement, use - // simple indirect register encoding, this handles addresses like [EAX]. - // The encoding for [EBP] or[R13] with no displacement means [disp32] so we - // handle it by emitting a displacement of 0 later. + // If the base is not EBP/ESP/R12/R13 and there is no displacement, use + // simple indirect register encoding, this handles addresses like [EAX]. + // The encoding for [EBP] or[R13] with no displacement means [disp32] so we + // handle it by emitting a displacement of 0 later. if (BaseRegNo != N86::EBP) { - if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) { + if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp) { emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS); return; } @@ -572,22 +572,22 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, } // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. - // Including a compressed disp8 for EVEX instructions that support it. - // This also handles the 0 displacement for [EBP] or [R13]. We can't use - // disp8 if the {disp32} pseudo prefix is present. - if (Disp.isImm() && AllowDisp8) { - int ImmOffset = 0; - if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { + // Including a compressed disp8 for EVEX instructions that support it. + // This also handles the 0 displacement for [EBP] or [R13]. We can't use + // disp8 if the {disp32} pseudo prefix is present. + if (Disp.isImm() && AllowDisp8) { + int ImmOffset = 0; + if (isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, - ImmOffset); + ImmOffset); return; } } - // Otherwise, emit the most general non-SIB encoding: [REG+disp32]. 
- // Displacement may be 0 for [EBP] or [R13] case if {disp32} pseudo prefix - // prevented using disp8 above. + // Otherwise, emit the most general non-SIB encoding: [REG+disp32]. + // Displacement may be 0 for [EBP] or [R13] case if {disp32} pseudo prefix + // prevented using disp8 above. emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS); unsigned Opcode = MI.getOpcode(); unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax @@ -607,43 +607,43 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. - BaseRegNo = 5; + BaseRegNo = 5; emitByte(modRMByte(0, RegOpcodeField, 4), OS); ForceDisp32 = true; - } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp && - // Base reg can't be EBP/RBP/R13 as that would end up with '5' as - // the base field, but that is the magic [*] nomenclature that - // indicates no base when mod=0. For these cases we'll emit a 0 - // displacement instead. + } else if (Disp.isImm() && Disp.getImm() == 0 && AllowNoDisp && + // Base reg can't be EBP/RBP/R13 as that would end up with '5' as + // the base field, but that is the magic [*] nomenclature that + // indicates no base when mod=0. For these cases we'll emit a 0 + // displacement instead. BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte emitByte(modRMByte(0, RegOpcodeField, 4), OS); - } else if (Disp.isImm() && AllowDisp8 && - isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { - // Displacement fits in a byte or matches an EVEX compressed disp8, use - // disp8 encoding. This also handles EBP/R13 base with 0 displacement unless - // {disp32} pseudo prefix was used. + } else if (Disp.isImm() && AllowDisp8 && + isDispOrCDisp8(TSFlags, Disp.getImm(), ImmOffset)) { + // Displacement fits in a byte or matches an EVEX compressed disp8, use + // disp8 encoding. This also handles EBP/R13 base with 0 displacement unless + // {disp32} pseudo prefix was used. emitByte(modRMByte(1, RegOpcodeField, 4), OS); - ForceDisp8 = true; + ForceDisp8 = true; } else { - // Otherwise, emit the normal disp32 encoding. + // Otherwise, emit the normal disp32 encoding. emitByte(modRMByte(2, RegOpcodeField, 4), OS); - ForceDisp32 = true; + ForceDisp32 = true; } // Calculate what the SS field value should be... static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3}; unsigned SS = SSTable[Scale.getImm()]; - unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4; - - emitSIBByte(SS, IndexRegNo, BaseRegNo, OS); + unsigned IndexRegNo = IndexReg.getReg() ? getX86RegNum(IndexReg) : 4; + emitSIBByte(SS, IndexRegNo, BaseRegNo, OS); + // Do we need to output a displacement? if (ForceDisp8) emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, ImmOffset); - else if (ForceDisp32) + else if (ForceDisp32) emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), StartByte, OS, Fixups); } @@ -1201,7 +1201,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, /// /// \returns true if REX prefix is used, otherwise returns false. 
bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI, - const MCSubtargetInfo &STI, + const MCSubtargetInfo &STI, raw_ostream &OS) const { uint8_t REX = [&, MemOperand]() { uint8_t REX = 0; @@ -1222,28 +1222,28 @@ bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI, // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. for (unsigned i = CurOp; i != NumOps; ++i) { const MCOperand &MO = MI.getOperand(i); - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || - Reg == X86::DH) - UsesHighByteReg = true; - if (X86II::isX86_64NonExtLowByteReg(Reg)) - // FIXME: The caller of determineREXPrefix slaps this prefix onto - // anything that returns non-zero. - REX |= 0x40; // REX fixed encoding prefix - } else if (MO.isExpr() && - STI.getTargetTriple().getEnvironment() == Triple::GNUX32) { - // GOTTPOFF and TLSDESC relocations require a REX prefix to allow - // linker optimizations: even if the instructions we see may not require - // any prefix, they may be replaced by instructions that do. This is - // handled as a special case here so that it also works for hand-written - // assembly without the user needing to write REX, as with GNU as. - const auto *Ref = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); - if (Ref && (Ref->getKind() == MCSymbolRefExpr::VK_GOTTPOFF || - Ref->getKind() == MCSymbolRefExpr::VK_TLSDESC)) { - REX |= 0x40; // REX fixed encoding prefix - } - } + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || + Reg == X86::DH) + UsesHighByteReg = true; + if (X86II::isX86_64NonExtLowByteReg(Reg)) + // FIXME: The caller of determineREXPrefix slaps this prefix onto + // anything that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + } else if (MO.isExpr() && + STI.getTargetTriple().getEnvironment() == Triple::GNUX32) { + // GOTTPOFF and TLSDESC relocations require a REX prefix to allow + // linker optimizations: even if the instructions we see may not require + // any prefix, they may be replaced by instructions that do. This is + // handled as a special case here so that it also works for hand-written + // assembly without the user needing to write REX, as with GNU as. + const auto *Ref = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + if (Ref && (Ref->getKind() == MCSymbolRefExpr::VK_GOTTPOFF || + Ref->getKind() == MCSymbolRefExpr::VK_TLSDESC)) { + REX |= 0x40; // REX fixed encoding prefix + } + } } switch (TSFlags & X86II::FormMask) { @@ -1366,7 +1366,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) && "REX.W requires 64bit mode."); bool HasREX = STI.hasFeature(X86::Mode64Bit) - ? emitREXPrefix(MemOperand, MI, STI, OS) + ? emitREXPrefix(MemOperand, MI, STI, OS) : false; // 0x0F escape code must be emitted just before the opcode. diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5cf8d77519..7214b80941 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -44,10 +44,10 @@ using namespace llvm; std::string X86_MC::ParseX86Triple(const Triple &TT) { std::string FS; - // SSE2 should default to enabled in 64-bit mode, but can be turned off - // explicitly. 
- if (TT.isArch64Bit()) - FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2"; + // SSE2 should default to enabled in 64-bit mode, but can be turned off + // explicitly. + if (TT.isArch64Bit()) + FS = "+64bit-mode,-32bit-mode,-16bit-mode,+sse2"; else if (TT.getEnvironment() != Triple::CODE16) FS = "-64bit-mode,+32bit-mode,-16bit-mode"; else @@ -292,10 +292,10 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (!FS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); - if (CPU.empty()) - CPU = "generic"; + if (CPU.empty()) + CPU = "generic"; - return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS); + return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS); } static MCInstrInfo *createX86MCInstrInfo() { diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 35604cd3ec..69fc238074 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -85,11 +85,11 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, /// Implements X86-only directives for assembly emission. MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, - MCInstPrinter *InstPrinter, - bool IsVerboseAsm); + MCInstPrinter *InstPrinter, + bool IsVerboseAsm); /// Implements X86-only directives for object files. -MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S, +MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI); /// Construct an X86 Windows COFF machine code streamer which will generate diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index b98e58d653..40fc8527c3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -68,7 +68,7 @@ public: FixedValue); } }; -} // namespace +} // namespace static bool isFixupKindRIPRel(unsigned Kind) { return Kind == X86::reloc_riprel_4byte || diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp index 201b22d623..7e3f6d8335 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp @@ -568,4 +568,4 @@ void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, } } -} // namespace llvm +} // namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index c292112461..72bb41e94b 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -26,7 +26,7 @@ public: : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} void EmitWinEHHandlerData(SMLoc Loc) override; - void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; void EmitWindowsUnwindTables() override; void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; void finishImpl() override; @@ -38,13 +38,13 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { // We have to emit the 
unwind info now, because this directive // actually switches to the .xdata section. if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo()) - EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true); -} - -void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { - EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); + EHStreamer.EmitUnwindInfo(*this, CurFrame, /* HandlerData = */ true); } +void X86WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { + EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); +} + void X86WinCOFFStreamer::EmitWindowsUnwindTables() { if (!getNumWinFrameInfos()) return; @@ -63,7 +63,7 @@ void X86WinCOFFStreamer::finishImpl() { MCWinCOFFStreamer::finishImpl(); } -} // namespace +} // namespace MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> &&AB, diff --git a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make index 8da0d02f5b..77b54a6412 100644 --- a/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/MCTargetDesc/ya.make @@ -12,19 +12,19 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/BinaryFormat - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/MC/MCDisassembler - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/X86/TargetInfo + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/BinaryFormat + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/MC/MCDisassembler + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/X86/TargetInfo ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86/MCTargetDesc + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86/MCTargetDesc ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make index 2f30db941e..b21991ca46 100644 --- a/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/TargetInfo/ya.make @@ -12,13 +12,13 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/lib/Support + contrib/libs/llvm12 + contrib/libs/llvm12/lib/Support ) ADDINCL( - contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86/TargetInfo + contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86/TargetInfo ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/X86/X86.h b/contrib/libs/llvm12/lib/Target/X86/X86.h index e17b9ba550..f5a9baefa2 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86.h @@ -76,10 +76,10 @@ FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands WinAlloca pseudo-instructions. FunctionPass *createX86WinAllocaExpander(); -FunctionPass *createX86TileConfigPass(); - -FunctionPass *createX86PreTileConfigPass(); - +FunctionPass *createX86TileConfigPass(); + +FunctionPass *createX86PreTileConfigPass(); + /// Return a pass that inserts int3 at the end of the function if it ends with a /// CALL instruction. 
The pass does the same for each funclet as well. This /// ensures that the open interval of function start and end PCs contains all @@ -166,9 +166,9 @@ void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86PartialReductionPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); -void initializeX86PreTileConfigPass(PassRegistry &); -void initializeX86TileConfigPass(PassRegistry &); -void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); +void initializeX86PreTileConfigPass(PassRegistry &); +void initializeX86TileConfigPass(PassRegistry &); +void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); namespace X86AS { enum : unsigned { diff --git a/contrib/libs/llvm12/lib/Target/X86/X86.td b/contrib/libs/llvm12/lib/Target/X86/X86.td index c492d686c5..d17c7f4f9b 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86.td @@ -171,9 +171,9 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", "Enable AVX-512 Vector Neural Network Instructions", [FeatureAVX512]>; -def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true", - "Support AVX_VNNI encoding", - [FeatureAVX2]>; +def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true", + "Support AVX_VNNI encoding", + [FeatureAVX2]>; def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true", "Support bfloat16 floating point", [FeatureBWI]>; @@ -237,8 +237,8 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; -def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true", - "Support LAHF and SAHF instructions in 64-bit mode">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF64", "true", + "Support LAHF and SAHF instructions in 64-bit mode">; def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true", "Enable MONITORX/MWAITX timer functionality">; def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true", @@ -282,20 +282,20 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", "Has ENQCMD instructions">; -def FeatureKL : SubtargetFeature<"kl", "HasKL", "true", - "Support Key Locker kl Instructions", - [FeatureSSE2]>; -def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true", - "Support Key Locker wide Instructions", - [FeatureKL]>; -def FeatureHRESET : SubtargetFeature<"hreset", "HasHRESET", "true", - "Has hreset instruction">; +def FeatureKL : SubtargetFeature<"kl", "HasKL", "true", + "Support Key Locker kl Instructions", + [FeatureSSE2]>; +def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true", + "Support Key Locker wide Instructions", + [FeatureKL]>; +def FeatureHRESET : SubtargetFeature<"hreset", "HasHRESET", "true", + "Has hreset instruction">; def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true", "Has serialize instruction">; def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true", "Support TSXLDTRK instructions">; -def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true", - "Has UINTR Instructions">; +def FeatureUINTR : SubtargetFeature<"uintr", "HasUINTR", "true", + "Has UINTR 
Instructions">; // On some processors, instructions that implicitly take two memory operands are // slow. In practice, this means that CALL, PUSH, and POP with memory operands // should be avoided in favor of a MOV + register CALL/PUSH/POP. @@ -385,12 +385,12 @@ def FeatureERMSB "ermsb", "HasERMSB", "true", "REP MOVS/STOS are fast">; -// Icelake and newer processors have Fast Short REP MOV. -def FeatureFSRM - : SubtargetFeature< - "fsrm", "HasFSRM", "true", - "REP MOVSB of short lengths is faster">; - +// Icelake and newer processors have Fast Short REP MOV. +def FeatureFSRM + : SubtargetFeature< + "fsrm", "HasFSRM", "true", + "REP MOVSB of short lengths is faster">; + // Bulldozer and newer processors can merge CMP/TEST (but not other // instructions) with conditional branches. def FeatureBranchFusion @@ -565,59 +565,59 @@ include "X86SchedSkylakeServer.td" //===----------------------------------------------------------------------===// def ProcessorFeatures { - // x86-64 and x86-64-v[234] - list<SubtargetFeature> X86_64V1Features = [ - FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, Feature64Bit - ]; - list<SubtargetFeature> X86_64V2Features = !listconcat( - X86_64V1Features, - [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]); - list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [ - FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT, - FeatureMOVBE, FeatureXSAVE - ]); - list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [ - FeatureBWI, - FeatureCDI, - FeatureDQI, - FeatureVLX, - ]); - + // x86-64 and x86-64-v[234] + list<SubtargetFeature> X86_64V1Features = [ + FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureNOPL, Feature64Bit + ]; + list<SubtargetFeature> X86_64V2Features = !listconcat( + X86_64V1Features, + [FeatureCMPXCHG16B, FeatureLAHFSAHF, FeaturePOPCNT, FeatureSSE42]); + list<SubtargetFeature> X86_64V3Features = !listconcat(X86_64V2Features, [ + FeatureAVX2, FeatureBMI, FeatureBMI2, FeatureF16C, FeatureFMA, FeatureLZCNT, + FeatureMOVBE, FeatureXSAVE + ]); + list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [ + FeatureBWI, + FeatureCDI, + FeatureDQI, + FeatureVLX, + ]); + // Nehalem - list<SubtargetFeature> NHMFeatures = X86_64V2Features; - list<SubtargetFeature> NHMTuning = [FeatureMacroFusion, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> NHMFeatures = X86_64V2Features; + list<SubtargetFeature> NHMTuning = [FeatureMacroFusion, + FeatureInsertVZEROUPPER]; // Westmere list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL]; - list<SubtargetFeature> WSMTuning = NHMTuning; + list<SubtargetFeature> WSMTuning = NHMTuning; list<SubtargetFeature> WSMFeatures = - !listconcat(NHMFeatures, WSMAdditionalFeatures); + !listconcat(NHMFeatures, WSMAdditionalFeatures); // Sandybridge list<SubtargetFeature> SNBAdditionalFeatures = [FeatureAVX, FeatureXSAVE, - FeatureXSAVEOPT]; - list<SubtargetFeature> SNBTuning = [FeatureMacroFusion, - FeatureSlow3OpsLEA, - FeatureSlowDivide64, - FeatureSlowUAMem32, - FeatureFastScalarFSQRT, - FeatureFastSHLDRotate, - FeatureFast15ByteNOP, - FeaturePOPCNTFalseDeps, - FeatureInsertVZEROUPPER]; + FeatureXSAVEOPT]; + list<SubtargetFeature> SNBTuning = [FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureSlowUAMem32, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeaturePOPCNTFalseDeps, + 
FeatureInsertVZEROUPPER]; list<SubtargetFeature> SNBFeatures = - !listconcat(WSMFeatures, SNBAdditionalFeatures); + !listconcat(WSMFeatures, SNBAdditionalFeatures); // Ivybridge list<SubtargetFeature> IVBAdditionalFeatures = [FeatureRDRAND, FeatureF16C, FeatureFSGSBase]; - list<SubtargetFeature> IVBTuning = SNBTuning; + list<SubtargetFeature> IVBTuning = SNBTuning; list<SubtargetFeature> IVBFeatures = - !listconcat(SNBFeatures, IVBAdditionalFeatures); + !listconcat(SNBFeatures, IVBAdditionalFeatures); // Haswell list<SubtargetFeature> HSWAdditionalFeatures = [FeatureAVX2, @@ -627,86 +627,86 @@ def ProcessorFeatures { FeatureFMA, FeatureINVPCID, FeatureLZCNT, - FeatureMOVBE]; - list<SubtargetFeature> HSWTuning = [FeatureMacroFusion, - FeatureSlow3OpsLEA, - FeatureSlowDivide64, - FeatureFastScalarFSQRT, - FeatureFastSHLDRotate, - FeatureFast15ByteNOP, - FeatureFastVariableShuffle, - FeaturePOPCNTFalseDeps, - FeatureLZCNTFalseDeps, - FeatureInsertVZEROUPPER]; + FeatureMOVBE]; + list<SubtargetFeature> HSWTuning = [FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> HSWFeatures = - !listconcat(IVBFeatures, HSWAdditionalFeatures); + !listconcat(IVBFeatures, HSWAdditionalFeatures); // Broadwell list<SubtargetFeature> BDWAdditionalFeatures = [FeatureADX, FeatureRDSEED, FeaturePRFCHW]; - list<SubtargetFeature> BDWTuning = HSWTuning; + list<SubtargetFeature> BDWTuning = HSWTuning; list<SubtargetFeature> BDWFeatures = - !listconcat(HSWFeatures, BDWAdditionalFeatures); + !listconcat(HSWFeatures, BDWAdditionalFeatures); // Skylake list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES, FeatureXSAVEC, FeatureXSAVES, FeatureCLFLUSHOPT, - FeatureSGX]; - list<SubtargetFeature> SKLTuning = [FeatureHasFastGather, - FeatureMacroFusion, - FeatureSlow3OpsLEA, - FeatureSlowDivide64, - FeatureFastScalarFSQRT, - FeatureFastVectorFSQRT, - FeatureFastSHLDRotate, - FeatureFast15ByteNOP, - FeatureFastVariableShuffle, - FeaturePOPCNTFalseDeps, - FeatureInsertVZEROUPPER]; + FeatureSGX]; + list<SubtargetFeature> SKLTuning = [FeatureHasFastGather, + FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastVectorFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> SKLFeatures = - !listconcat(BDWFeatures, SKLAdditionalFeatures); + !listconcat(BDWFeatures, SKLAdditionalFeatures); // Skylake-AVX512 - list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAES, - FeatureXSAVEC, - FeatureXSAVES, - FeatureCLFLUSHOPT, - FeatureAVX512, + list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAES, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureAVX512, FeatureCDI, FeatureDQI, FeatureBWI, FeatureVLX, FeaturePKU, FeatureCLWB]; - list<SubtargetFeature> SKXTuning = [FeatureHasFastGather, - FeatureMacroFusion, - FeatureSlow3OpsLEA, - FeatureSlowDivide64, - FeatureFastScalarFSQRT, - FeatureFastVectorFSQRT, - FeatureFastSHLDRotate, - FeatureFast15ByteNOP, - FeatureFastVariableShuffle, - FeaturePrefer256Bit, - FeaturePOPCNTFalseDeps, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> SKXTuning = [FeatureHasFastGather, + FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastVectorFSQRT, + 
FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePrefer256Bit, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> SKXFeatures = - !listconcat(BDWFeatures, SKXAdditionalFeatures); + !listconcat(BDWFeatures, SKXAdditionalFeatures); // Cascadelake list<SubtargetFeature> CLXAdditionalFeatures = [FeatureVNNI]; - list<SubtargetFeature> CLXTuning = SKXTuning; + list<SubtargetFeature> CLXTuning = SKXTuning; list<SubtargetFeature> CLXFeatures = - !listconcat(SKXFeatures, CLXAdditionalFeatures); + !listconcat(SKXFeatures, CLXAdditionalFeatures); // Cooperlake list<SubtargetFeature> CPXAdditionalFeatures = [FeatureBF16]; - list<SubtargetFeature> CPXTuning = SKXTuning; + list<SubtargetFeature> CPXTuning = SKXTuning; list<SubtargetFeature> CPXFeatures = - !listconcat(CLXFeatures, CPXAdditionalFeatures); + !listconcat(CLXFeatures, CPXAdditionalFeatures); // Cannonlake list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, @@ -717,20 +717,20 @@ def ProcessorFeatures { FeaturePKU, FeatureVBMI, FeatureIFMA, - FeatureSHA]; - list<SubtargetFeature> CNLTuning = [FeatureHasFastGather, - FeatureMacroFusion, - FeatureSlow3OpsLEA, - FeatureSlowDivide64, - FeatureFastScalarFSQRT, - FeatureFastVectorFSQRT, - FeatureFastSHLDRotate, - FeatureFast15ByteNOP, - FeatureFastVariableShuffle, - FeaturePrefer256Bit, - FeatureInsertVZEROUPPER]; + FeatureSHA]; + list<SubtargetFeature> CNLTuning = [FeatureHasFastGather, + FeatureMacroFusion, + FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureFastScalarFSQRT, + FeatureFastVectorFSQRT, + FeatureFastSHLDRotate, + FeatureFast15ByteNOP, + FeatureFastVariableShuffle, + FeaturePrefer256Bit, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> CNLFeatures = - !listconcat(SKLFeatures, CNLAdditionalFeatures); + !listconcat(SKLFeatures, CNLAdditionalFeatures); // Icelake list<SubtargetFeature> ICLAdditionalFeatures = [FeatureBITALG, @@ -741,81 +741,81 @@ def ProcessorFeatures { FeatureVPOPCNTDQ, FeatureGFNI, FeatureCLWB, - FeatureRDPID, - FeatureFSRM]; - list<SubtargetFeature> ICLTuning = CNLTuning; + FeatureRDPID, + FeatureFSRM]; + list<SubtargetFeature> ICLTuning = CNLTuning; list<SubtargetFeature> ICLFeatures = - !listconcat(CNLFeatures, ICLAdditionalFeatures); + !listconcat(CNLFeatures, ICLAdditionalFeatures); // Icelake Server - list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG, - FeatureWBNOINVD]; - list<SubtargetFeature> ICXTuning = CNLTuning; + list<SubtargetFeature> ICXAdditionalFeatures = [FeaturePCONFIG, + FeatureWBNOINVD]; + list<SubtargetFeature> ICXTuning = CNLTuning; list<SubtargetFeature> ICXFeatures = - !listconcat(ICLFeatures, ICXAdditionalFeatures); + !listconcat(ICLFeatures, ICXAdditionalFeatures); //Tigerlake list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT, FeatureMOVDIRI, FeatureMOVDIR64B, FeatureSHSTK]; - list<SubtargetFeature> TGLTuning = CNLTuning; + list<SubtargetFeature> TGLTuning = CNLTuning; list<SubtargetFeature> TGLFeatures = - !listconcat(ICLFeatures, TGLAdditionalFeatures ); - - //Sapphirerapids - list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE, - FeatureAMXINT8, - FeatureAMXBF16, - FeatureBF16, - FeatureSERIALIZE, - FeatureCLDEMOTE, - FeatureWAITPKG, - FeaturePTWRITE, - FeatureAVXVNNI, - FeatureTSXLDTRK, - FeatureENQCMD, - FeatureSHSTK, - FeatureVP2INTERSECT, - FeatureMOVDIRI, - FeatureMOVDIR64B, - FeatureUINTR]; - list<SubtargetFeature> SPRTuning = ICXTuning; - list<SubtargetFeature> SPRFeatures = - !listconcat(ICXFeatures, 
SPRAdditionalFeatures); - - // Alderlake - list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI, - FeatureCLDEMOTE, - FeatureHRESET, - FeaturePTWRITE, - FeatureSERIALIZE, - FeatureWAITPKG]; - list<SubtargetFeature> ADLTuning = SKLTuning; - list<SubtargetFeature> ADLFeatures = - !listconcat(SKLFeatures, ADLAdditionalFeatures); - + !listconcat(ICLFeatures, TGLAdditionalFeatures ); + + //Sapphirerapids + list<SubtargetFeature> SPRAdditionalFeatures = [FeatureAMXTILE, + FeatureAMXINT8, + FeatureAMXBF16, + FeatureBF16, + FeatureSERIALIZE, + FeatureCLDEMOTE, + FeatureWAITPKG, + FeaturePTWRITE, + FeatureAVXVNNI, + FeatureTSXLDTRK, + FeatureENQCMD, + FeatureSHSTK, + FeatureVP2INTERSECT, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureUINTR]; + list<SubtargetFeature> SPRTuning = ICXTuning; + list<SubtargetFeature> SPRFeatures = + !listconcat(ICXFeatures, SPRAdditionalFeatures); + + // Alderlake + list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI, + FeatureCLDEMOTE, + FeatureHRESET, + FeaturePTWRITE, + FeatureSERIALIZE, + FeatureWAITPKG]; + list<SubtargetFeature> ADLTuning = SKLTuning; + list<SubtargetFeature> ADLFeatures = + !listconcat(SKLFeatures, ADLAdditionalFeatures); + // Atom - list<SubtargetFeature> AtomFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureLAHFSAHF]; - list<SubtargetFeature> AtomTuning = [ProcIntelAtom, - FeatureSlowUAMem16, - FeatureLEAForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureSlowTwoMemOps, - FeatureLEAUsesAG, - FeaturePadShortFunctions, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> AtomFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureLAHFSAHF]; + list<SubtargetFeature> AtomTuning = [ProcIntelAtom, + FeatureSlowUAMem16, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureSlowTwoMemOps, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureInsertVZEROUPPER]; // Silvermont list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42, @@ -823,17 +823,17 @@ def ProcessorFeatures { FeaturePCLMUL, FeaturePRFCHW, FeatureRDRAND]; - list<SubtargetFeature> SLMTuning = [ProcIntelSLM, - FeatureSlowTwoMemOps, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureSlowDivide64, - FeatureSlowPMULLD, - FeatureFast7ByteNOP, - FeaturePOPCNTFalseDeps, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> SLMTuning = [ProcIntelSLM, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowDivide64, + FeatureSlowPMULLD, + FeatureFast7ByteNOP, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> SLMFeatures = - !listconcat(AtomFeatures, SLMAdditionalFeatures); + !listconcat(AtomFeatures, SLMAdditionalFeatures); // Goldmont list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES, @@ -845,33 +845,33 @@ def ProcessorFeatures { FeatureXSAVES, FeatureCLFLUSHOPT, FeatureFSGSBase]; - list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts, - FeatureSlowTwoMemOps, - FeatureSlowLEA, - FeatureSlowIncDec, - FeaturePOPCNTFalseDeps, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> GLMTuning = [FeatureUseGLMDivSqrtCosts, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeaturePOPCNTFalseDeps, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> GLMFeatures = - !listconcat(SLMFeatures, GLMAdditionalFeatures); + 
!listconcat(SLMFeatures, GLMAdditionalFeatures); // Goldmont Plus list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE, FeatureRDPID, FeatureSGX]; - list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts, - FeatureSlowTwoMemOps, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> GLPTuning = [FeatureUseGLMDivSqrtCosts, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> GLPFeatures = - !listconcat(GLMFeatures, GLPAdditionalFeatures); + !listconcat(GLMFeatures, GLPAdditionalFeatures); // Tremont list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB, FeatureGFNI]; - list<SubtargetFeature> TRMTuning = GLPTuning; + list<SubtargetFeature> TRMTuning = GLPTuning; list<SubtargetFeature> TRMFeatures = - !listconcat(GLPFeatures, TRMAdditionalFeatures); + !listconcat(GLPFeatures, TRMAdditionalFeatures); // Knights Landing list<SubtargetFeature> KNLFeatures = [FeatureX87, @@ -903,56 +903,56 @@ def ProcessorFeatures { FeatureBMI, FeatureBMI2, FeatureFMA, - FeaturePRFCHW]; - list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64, - FeatureSlow3OpsLEA, - FeatureSlowIncDec, - FeatureSlowTwoMemOps, - FeaturePreferMaskRegisters, - FeatureHasFastGather, - FeatureSlowPMADDWD]; + FeaturePRFCHW]; + list<SubtargetFeature> KNLTuning = [FeatureSlowDivide64, + FeatureSlow3OpsLEA, + FeatureSlowIncDec, + FeatureSlowTwoMemOps, + FeaturePreferMaskRegisters, + FeatureHasFastGather, + FeatureSlowPMADDWD]; // TODO Add AVX5124FMAPS/AVX5124VNNIW features list<SubtargetFeature> KNMFeatures = !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); // Barcelona - list<SubtargetFeature> BarcelonaFeatures = [FeatureX87, - FeatureCMPXCHG8B, - FeatureSSE4A, - Feature3DNowA, - FeatureFXSR, - FeatureNOPL, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureLZCNT, - FeaturePOPCNT, - FeatureLAHFSAHF, - FeatureCMOV, - Feature64Bit]; - list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks, - FeatureSlowSHLD, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> BarcelonaFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureSSE4A, + Feature3DNowA, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureLAHFSAHF, + FeatureCMOV, + Feature64Bit]; + list<SubtargetFeature> BarcelonaTuning = [FeatureFastScalarShiftMasks, + FeatureSlowSHLD, + FeatureInsertVZEROUPPER]; // Bobcat - list<SubtargetFeature> BtVer1Features = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureMMX, - FeatureSSSE3, - FeatureSSE4A, - FeatureFXSR, - FeatureNOPL, - Feature64Bit, - FeatureCMPXCHG16B, - FeaturePRFCHW, - FeatureLZCNT, - FeaturePOPCNT, - FeatureLAHFSAHF]; - list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP, - FeatureFastScalarShiftMasks, - FeatureFastVectorShiftMasks, - FeatureSlowSHLD, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> BtVer1Features = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureLAHFSAHF]; + list<SubtargetFeature> BtVer1Tuning = [FeatureFast15ByteNOP, + FeatureFastScalarShiftMasks, + FeatureFastVectorShiftMasks, + FeatureSlowSHLD, + FeatureInsertVZEROUPPER]; // Jaguar list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX, @@ -963,39 +963,39 @@ def ProcessorFeatures { FeatureMOVBE, FeatureXSAVE, FeatureXSAVEOPT]; - list<SubtargetFeature> BtVer2Tuning = 
[FeatureFastLZCNT, - FeatureFastBEXTR, - FeatureFastHorizontalOps, - FeatureFast15ByteNOP, - FeatureFastScalarShiftMasks, - FeatureFastVectorShiftMasks, - FeatureSlowSHLD]; + list<SubtargetFeature> BtVer2Tuning = [FeatureFastLZCNT, + FeatureFastBEXTR, + FeatureFastHorizontalOps, + FeatureFast15ByteNOP, + FeatureFastScalarShiftMasks, + FeatureFastVectorShiftMasks, + FeatureSlowSHLD]; list<SubtargetFeature> BtVer2Features = - !listconcat(BtVer1Features, BtVer2AdditionalFeatures); + !listconcat(BtVer1Features, BtVer2AdditionalFeatures); // Bulldozer - list<SubtargetFeature> BdVer1Features = [FeatureX87, - FeatureCMPXCHG8B, - FeatureCMOV, - FeatureXOP, - Feature64Bit, - FeatureCMPXCHG16B, - FeatureAES, - FeaturePRFCHW, - FeaturePCLMUL, - FeatureMMX, - FeatureFXSR, - FeatureNOPL, - FeatureLZCNT, - FeaturePOPCNT, - FeatureXSAVE, - FeatureLWP, - FeatureLAHFSAHF]; - list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD, - FeatureFast11ByteNOP, - FeatureFastScalarShiftMasks, - FeatureBranchFusion, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> BdVer1Features = [FeatureX87, + FeatureCMPXCHG8B, + FeatureCMOV, + FeatureXOP, + Feature64Bit, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureFXSR, + FeatureNOPL, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureLWP, + FeatureLAHFSAHF]; + list<SubtargetFeature> BdVer1Tuning = [FeatureSlowSHLD, + FeatureFast11ByteNOP, + FeatureFastScalarShiftMasks, + FeatureBranchFusion, + FeatureInsertVZEROUPPER]; // PileDriver list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C, @@ -1003,16 +1003,16 @@ def ProcessorFeatures { FeatureTBM, FeatureFMA, FeatureFastBEXTR]; - list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning; - list<SubtargetFeature> BdVer2Features = - !listconcat(BdVer1Features, BdVer2AdditionalFeatures); + list<SubtargetFeature> BdVer2Tuning = BdVer1Tuning; + list<SubtargetFeature> BdVer2Features = + !listconcat(BdVer1Features, BdVer2AdditionalFeatures); // Steamroller list<SubtargetFeature> BdVer3AdditionalFeatures = [FeatureXSAVEOPT, FeatureFSGSBase]; - list<SubtargetFeature> BdVer3Tuning = BdVer2Tuning; - list<SubtargetFeature> BdVer3Features = - !listconcat(BdVer2Features, BdVer3AdditionalFeatures); + list<SubtargetFeature> BdVer3Tuning = BdVer2Tuning; + list<SubtargetFeature> BdVer3Features = + !listconcat(BdVer2Features, BdVer3AdditionalFeatures); // Excavator list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2, @@ -1020,9 +1020,9 @@ def ProcessorFeatures { FeatureMOVBE, FeatureRDRAND, FeatureMWAITX]; - list<SubtargetFeature> BdVer4Tuning = BdVer3Tuning; - list<SubtargetFeature> BdVer4Features = - !listconcat(BdVer3Features, BdVer4AdditionalFeatures); + list<SubtargetFeature> BdVer4Tuning = BdVer3Tuning; + list<SubtargetFeature> BdVer4Features = + !listconcat(BdVer3Features, BdVer4AdditionalFeatures); // AMD Zen Processors common ISAs @@ -1058,80 +1058,80 @@ def ProcessorFeatures { FeatureXSAVEC, FeatureXSAVEOPT, FeatureXSAVES]; - list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT, - FeatureFastBEXTR, - FeatureFast15ByteNOP, - FeatureBranchFusion, - FeatureFastScalarShiftMasks, - FeatureSlowSHLD, - FeatureInsertVZEROUPPER]; + list<SubtargetFeature> ZNTuning = [FeatureFastLZCNT, + FeatureFastBEXTR, + FeatureFast15ByteNOP, + FeatureBranchFusion, + FeatureFastScalarShiftMasks, + FeatureSlowSHLD, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, FeatureWBNOINVD]; - list<SubtargetFeature> ZN2Tuning = ZNTuning; + 
list<SubtargetFeature> ZN2Tuning = ZNTuning; list<SubtargetFeature> ZN2Features = !listconcat(ZNFeatures, ZN2AdditionalFeatures); - list<SubtargetFeature> ZN3AdditionalFeatures = [FeatureFSRM, - FeatureINVPCID, - FeaturePKU, - FeatureVAES, - FeatureVPCLMULQDQ]; - list<SubtargetFeature> ZN3Tuning = ZNTuning; - list<SubtargetFeature> ZN3Features = - !listconcat(ZN2Features, ZN3AdditionalFeatures); + list<SubtargetFeature> ZN3AdditionalFeatures = [FeatureFSRM, + FeatureINVPCID, + FeaturePKU, + FeatureVAES, + FeatureVPCLMULQDQ]; + list<SubtargetFeature> ZN3Tuning = ZNTuning; + list<SubtargetFeature> ZN3Features = + !listconcat(ZN2Features, ZN3AdditionalFeatures); } //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// -class Proc<string Name, list<SubtargetFeature> Features, - list<SubtargetFeature> TuneFeatures> - : ProcessorModel<Name, GenericModel, Features, TuneFeatures>; - -class ProcModel<string Name, SchedMachineModel Model, - list<SubtargetFeature> Features, - list<SubtargetFeature> TuneFeatures> - : ProcessorModel<Name, Model, Features, TuneFeatures>; +class Proc<string Name, list<SubtargetFeature> Features, + list<SubtargetFeature> TuneFeatures> + : ProcessorModel<Name, GenericModel, Features, TuneFeatures>; +class ProcModel<string Name, SchedMachineModel Model, + list<SubtargetFeature> Features, + list<SubtargetFeature> TuneFeatures> + : ProcessorModel<Name, Model, Features, TuneFeatures>; + // NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled // if i386/i486 is specifically requested. -// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget -// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled. -// It has no effect on code generation. -def : ProcModel<"generic", SandyBridgeModel, - [FeatureX87, FeatureCMPXCHG8B, Feature64Bit], - [FeatureSlow3OpsLEA, - FeatureSlowDivide64, - FeatureSlowIncDec, - FeatureMacroFusion, - FeatureInsertVZEROUPPER]>; - -def : Proc<"i386", [FeatureX87], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"i486", [FeatureX87], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; - -def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, - FeatureNOPL], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; - -def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, - FeatureFXSR, FeatureNOPL], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; - +// NOTE: 64Bit is here as "generic" is the default llc CPU. The X86Subtarget +// constructor checks that any CPU used in 64-bit mode has Feature64Bit enabled. +// It has no effect on code generation. 
+def : ProcModel<"generic", SandyBridgeModel, + [FeatureX87, FeatureCMPXCHG8B, Feature64Bit], + [FeatureSlow3OpsLEA, + FeatureSlowDivide64, + FeatureSlowIncDec, + FeatureMacroFusion, + FeatureInsertVZEROUPPER]>; + +def : Proc<"i386", [FeatureX87], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"i486", [FeatureX87], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"i586", [FeatureX87, FeatureCMPXCHG8B], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"pentium", [FeatureX87, FeatureCMPXCHG8B], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"pentium-mmx", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + +def : Proc<"i686", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"pentiumpro", [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, + FeatureNOPL], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + +def : Proc<"pentium2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, + FeatureFXSR, FeatureNOPL], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + foreach P = ["pentium3", "pentium3m"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, - FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, + FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; } // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. @@ -1144,34 +1144,34 @@ foreach P = ["pentium3", "pentium3m"] in { // measure to avoid performance surprises, in case clang's default cpu // changes slightly. -def : ProcModel<"pentium-m", GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : ProcModel<"pentium-m", GenericPostRAModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { - def : ProcModel<P, GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + def : ProcModel<P, GenericPostRAModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; } // Intel Quark. -def : Proc<"lakemont", [FeatureCMPXCHG8B], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"lakemont", [FeatureCMPXCHG8B], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // Intel Core Duo. -def : ProcModel<"yonah", SandyBridgeModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureNOPL, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : ProcModel<"yonah", SandyBridgeModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // NetBurst. 
-def : ProcModel<"prescott", GenericPostRAModel, - [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureNOPL, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : ProcModel<"nocona", GenericPostRAModel, [ +def : ProcModel<"prescott", GenericPostRAModel, + [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureNOPL, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : ProcModel<"nocona", GenericPostRAModel, [ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, @@ -1181,14 +1181,14 @@ def : ProcModel<"nocona", GenericPostRAModel, [ FeatureNOPL, Feature64Bit, FeatureCMPXCHG16B, -], -[ - FeatureSlowUAMem16, +], +[ + FeatureSlowUAMem16, FeatureInsertVZEROUPPER ]>; // Intel Core 2 Solo/Duo. -def : ProcModel<"core2", SandyBridgeModel, [ +def : ProcModel<"core2", SandyBridgeModel, [ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, @@ -1198,14 +1198,14 @@ def : ProcModel<"core2", SandyBridgeModel, [ FeatureNOPL, Feature64Bit, FeatureCMPXCHG16B, - FeatureLAHFSAHF -], -[ + FeatureLAHFSAHF +], +[ FeatureMacroFusion, - FeatureSlowUAMem16, + FeatureSlowUAMem16, FeatureInsertVZEROUPPER ]>; -def : ProcModel<"penryn", SandyBridgeModel, [ +def : ProcModel<"penryn", SandyBridgeModel, [ FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, @@ -1215,171 +1215,171 @@ def : ProcModel<"penryn", SandyBridgeModel, [ FeatureNOPL, Feature64Bit, FeatureCMPXCHG16B, - FeatureLAHFSAHF -], -[ + FeatureLAHFSAHF +], +[ FeatureMacroFusion, - FeatureSlowUAMem16, + FeatureSlowUAMem16, FeatureInsertVZEROUPPER ]>; // Atom CPUs. foreach P = ["bonnell", "atom"] in { - def : ProcModel<P, AtomModel, ProcessorFeatures.AtomFeatures, - ProcessorFeatures.AtomTuning>; + def : ProcModel<P, AtomModel, ProcessorFeatures.AtomFeatures, + ProcessorFeatures.AtomTuning>; } foreach P = ["silvermont", "slm"] in { - def : ProcModel<P, SLMModel, ProcessorFeatures.SLMFeatures, - ProcessorFeatures.SLMTuning>; + def : ProcModel<P, SLMModel, ProcessorFeatures.SLMFeatures, + ProcessorFeatures.SLMTuning>; } -def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures, - ProcessorFeatures.GLMTuning>; -def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures, - ProcessorFeatures.GLPTuning>; -def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures, - ProcessorFeatures.TRMTuning>; +def : ProcModel<"goldmont", SLMModel, ProcessorFeatures.GLMFeatures, + ProcessorFeatures.GLMTuning>; +def : ProcModel<"goldmont-plus", SLMModel, ProcessorFeatures.GLPFeatures, + ProcessorFeatures.GLPTuning>; +def : ProcModel<"tremont", SLMModel, ProcessorFeatures.TRMFeatures, + ProcessorFeatures.TRMTuning>; // "Arrandale" along with corei3 and corei5 foreach P = ["nehalem", "corei7"] in { - def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures, - ProcessorFeatures.NHMTuning>; + def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.NHMFeatures, + ProcessorFeatures.NHMTuning>; } // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures, - ProcessorFeatures.WSMTuning>; +def : ProcModel<"westmere", SandyBridgeModel, ProcessorFeatures.WSMFeatures, + ProcessorFeatures.WSMTuning>; foreach P = ["sandybridge", "corei7-avx"] in { - def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures, - ProcessorFeatures.SNBTuning>; + def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.SNBFeatures, + ProcessorFeatures.SNBTuning>; } foreach P = ["ivybridge", "core-avx-i"] in { - def : ProcModel<P, 
SandyBridgeModel, ProcessorFeatures.IVBFeatures, - ProcessorFeatures.IVBTuning>; + def : ProcModel<P, SandyBridgeModel, ProcessorFeatures.IVBFeatures, + ProcessorFeatures.IVBTuning>; } foreach P = ["haswell", "core-avx2"] in { - def : ProcModel<P, HaswellModel, ProcessorFeatures.HSWFeatures, - ProcessorFeatures.HSWTuning>; + def : ProcModel<P, HaswellModel, ProcessorFeatures.HSWFeatures, + ProcessorFeatures.HSWTuning>; } -def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures, - ProcessorFeatures.BDWTuning>; +def : ProcModel<"broadwell", BroadwellModel, ProcessorFeatures.BDWFeatures, + ProcessorFeatures.BDWTuning>; -def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures, - ProcessorFeatures.SKLTuning>; +def : ProcModel<"skylake", SkylakeClientModel, ProcessorFeatures.SKLFeatures, + ProcessorFeatures.SKLTuning>; // FIXME: define KNL scheduler model -def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures, - ProcessorFeatures.KNLTuning>; -def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures, - ProcessorFeatures.KNLTuning>; +def : ProcModel<"knl", HaswellModel, ProcessorFeatures.KNLFeatures, + ProcessorFeatures.KNLTuning>; +def : ProcModel<"knm", HaswellModel, ProcessorFeatures.KNMFeatures, + ProcessorFeatures.KNLTuning>; foreach P = ["skylake-avx512", "skx"] in { - def : ProcModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures, - ProcessorFeatures.SKXTuning>; + def : ProcModel<P, SkylakeServerModel, ProcessorFeatures.SKXFeatures, + ProcessorFeatures.SKXTuning>; } -def : ProcModel<"cascadelake", SkylakeServerModel, - ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>; -def : ProcModel<"cooperlake", SkylakeServerModel, - ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>; -def : ProcModel<"cannonlake", SkylakeServerModel, - ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>; -def : ProcModel<"icelake-client", SkylakeServerModel, - ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>; -def : ProcModel<"icelake-server", SkylakeServerModel, - ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>; -def : ProcModel<"tigerlake", SkylakeServerModel, - ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>; -def : ProcModel<"sapphirerapids", SkylakeServerModel, - ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>; -def : ProcModel<"alderlake", SkylakeClientModel, - ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>; +def : ProcModel<"cascadelake", SkylakeServerModel, + ProcessorFeatures.CLXFeatures, ProcessorFeatures.CLXTuning>; +def : ProcModel<"cooperlake", SkylakeServerModel, + ProcessorFeatures.CPXFeatures, ProcessorFeatures.CPXTuning>; +def : ProcModel<"cannonlake", SkylakeServerModel, + ProcessorFeatures.CNLFeatures, ProcessorFeatures.CNLTuning>; +def : ProcModel<"icelake-client", SkylakeServerModel, + ProcessorFeatures.ICLFeatures, ProcessorFeatures.ICLTuning>; +def : ProcModel<"icelake-server", SkylakeServerModel, + ProcessorFeatures.ICXFeatures, ProcessorFeatures.ICXTuning>; +def : ProcModel<"tigerlake", SkylakeServerModel, + ProcessorFeatures.TGLFeatures, ProcessorFeatures.TGLTuning>; +def : ProcModel<"sapphirerapids", SkylakeServerModel, + ProcessorFeatures.SPRFeatures, ProcessorFeatures.SPRTuning>; +def : ProcModel<"alderlake", SkylakeClientModel, + ProcessorFeatures.ADLFeatures, ProcessorFeatures.ADLTuning>; // AMD CPUs. 
-def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"k6", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"k6-2", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"k6-3", [FeatureX87, FeatureCMPXCHG8B, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA, - FeatureNOPL], - [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, Feature3DNowA, + FeatureNOPL], + [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, - FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL], - [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureCMOV, + FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL], + [FeatureSlowSHLD, FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA, - FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV], - [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16, - FeatureInsertVZEROUPPER]>; + def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA, + FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV], + [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16, + FeatureInsertVZEROUPPER]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA, - FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV, - Feature64Bit], - [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16, - FeatureInsertVZEROUPPER]>; + def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV, + Feature64Bit], + [FeatureFastScalarShiftMasks, FeatureSlowSHLD, FeatureSlowUAMem16, + FeatureInsertVZEROUPPER]>; } foreach P = ["amdfam10", "barcelona"] in { - def : Proc<P, ProcessorFeatures.BarcelonaFeatures, - ProcessorFeatures.BarcelonaTuning>; + def : Proc<P, ProcessorFeatures.BarcelonaFeatures, + ProcessorFeatures.BarcelonaTuning>; } // Bobcat -def : Proc<"btver1", ProcessorFeatures.BtVer1Features, - ProcessorFeatures.BtVer1Tuning>; +def : Proc<"btver1", ProcessorFeatures.BtVer1Features, + ProcessorFeatures.BtVer1Tuning>; // Jaguar -def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features, - ProcessorFeatures.BtVer2Tuning>; +def : ProcModel<"btver2", BtVer2Model, ProcessorFeatures.BtVer2Features, + ProcessorFeatures.BtVer2Tuning>; // Bulldozer -def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features, - ProcessorFeatures.BdVer1Tuning>; +def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features, + ProcessorFeatures.BdVer1Tuning>; // Piledriver -def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features, - 
ProcessorFeatures.BdVer2Tuning>; +def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features, + ProcessorFeatures.BdVer2Tuning>; // Steamroller -def : Proc<"bdver3", ProcessorFeatures.BdVer3Features, - ProcessorFeatures.BdVer3Tuning>; +def : Proc<"bdver3", ProcessorFeatures.BdVer3Features, + ProcessorFeatures.BdVer3Tuning>; // Excavator -def : Proc<"bdver4", ProcessorFeatures.BdVer4Features, - ProcessorFeatures.BdVer4Tuning>; - -def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures, - ProcessorFeatures.ZNTuning>; -def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features, - ProcessorFeatures.ZN2Tuning>; -def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features, - ProcessorFeatures.ZN3Tuning>; - -def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; - -def : Proc<"winchip-c6", [FeatureX87, FeatureMMX], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"winchip2", [FeatureX87, Feature3DNow], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"c3", [FeatureX87, Feature3DNow], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; -def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, - FeatureSSE1, FeatureFXSR, FeatureCMOV], - [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"bdver4", ProcessorFeatures.BdVer4Features, + ProcessorFeatures.BdVer4Tuning>; + +def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures, + ProcessorFeatures.ZNTuning>; +def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features, + ProcessorFeatures.ZN2Tuning>; +def : ProcModel<"znver3", Znver2Model, ProcessorFeatures.ZN3Features, + ProcessorFeatures.ZN3Tuning>; + +def : Proc<"geode", [FeatureX87, FeatureCMPXCHG8B, Feature3DNowA], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; + +def : Proc<"winchip-c6", [FeatureX87, FeatureMMX], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"winchip2", [FeatureX87, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"c3", [FeatureX87, Feature3DNow], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; +def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, + FeatureSSE1, FeatureFXSR, FeatureCMOV], + [FeatureSlowUAMem16, FeatureInsertVZEROUPPER]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -1391,8 +1391,8 @@ def : Proc<"c3-2", [FeatureX87, FeatureCMPXCHG8B, FeatureMMX, // covers a huge swath of x86 processors. If there are specific scheduling // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. -def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features, -[ +def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features, +[ FeatureSlow3OpsLEA, FeatureSlowDivide64, FeatureSlowIncDec, @@ -1400,16 +1400,16 @@ def : ProcModel<"x86-64", SandyBridgeModel, ProcessorFeatures.X86_64V1Features, FeatureInsertVZEROUPPER ]>; -// x86-64 micro-architecture levels. -def : ProcModel<"x86-64-v2", SandyBridgeModel, ProcessorFeatures.X86_64V2Features, - ProcessorFeatures.SNBTuning>; -// Close to Haswell. -def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features, - ProcessorFeatures.HSWTuning>; -// Close to the AVX-512 level implemented by Xeon Scalable Processors. 
-def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
- ProcessorFeatures.SKXTuning>;
-
+// x86-64 micro-architecture levels.
+def : ProcModel<"x86-64-v2", SandyBridgeModel, ProcessorFeatures.X86_64V2Features,
+ ProcessorFeatures.SNBTuning>;
+// Close to Haswell.
+def : ProcModel<"x86-64-v3", HaswellModel, ProcessorFeatures.X86_64V3Features,
+ ProcessorFeatures.HSWTuning>;
+// Close to the AVX-512 level implemented by Xeon Scalable Processors.
+def : ProcModel<"x86-64-v4", HaswellModel, ProcessorFeatures.X86_64V4Features,
+ ProcessorFeatures.SKXTuning>;
+
//===----------------------------------------------------------------------===//
// Calling Conventions
//===----------------------------------------------------------------------===//
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp
index 2d434bda55..7086ee858b 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.cpp
@@ -404,7 +404,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
O << ']';
}
-static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
+static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
Register Reg = MO.getReg();
bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
@@ -446,9 +446,9 @@ static bool printAsmMRegister(const X86AsmPrinter &P, const MachineOperand &MO,
return false;
}
-static bool printAsmVRegister(const MachineOperand &MO, char Mode,
- raw_ostream &O) {
- Register Reg = MO.getReg();
+static bool printAsmVRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ Register Reg = MO.getReg();
bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
unsigned Index;
@@ -560,7 +560,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
case 't': // Print V8SFmode register
case 'g': // Print V16SFmode register
if (MO.isReg())
- return printAsmVRegister(MO, ExtraCode[0], O);
+ return printAsmVRegister(MO, ExtraCode[0], O);
PrintOperand(MI, OpNo, O);
return false;
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h
index a3b74c8ee3..fe0a4d551a 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/libs/llvm12/lib/Target/X86/X86AsmPrinter.h
@@ -134,9 +134,9 @@ public:
}
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- const char *ExtraCode, raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- const char *ExtraCode, raw_ostream &O) override;
+ const char *ExtraCode, raw_ostream &O) override;
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
@@ -145,7 +145,7 @@ public:
return AsmPrinter::doInitialization(M);
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
void emitFunctionBodyStart() override;
void emitFunctionBodyEnd() override;
};
diff --git a/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index fdc65acffe..f95e27173f 100644
--- a/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/contrib/libs/llvm12/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -154,7 +154,7 @@ static bool
isPotentialBlockedMemCpyLd(unsigned Opcode) { return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode); } -static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) { +static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) { switch (LdOpcode) { case X86::MOVUPSrm: case X86::MOVAPSrm: @@ -206,7 +206,7 @@ static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) { } } -static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) { +static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) { bool PBlock = false; PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 || Opcode == X86::MOV32mr || Opcode == X86::MOV32mi || diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp index fae4e688c8..46e18b6c46 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86CallFrameOptimization.cpp @@ -105,7 +105,7 @@ private: void adjustCallSequence(MachineFunction &MF, const CallContext &Context); MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, - Register Reg); + Register Reg); enum InstClassification { Convert, Skip, Exit }; @@ -202,7 +202,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, Align StackAlign = TFL->getStackAlign(); int64_t Advantage = 0; - for (const auto &CC : CallSeqVector) { + for (const auto &CC : CallSeqVector) { // Call sites where no parameters are passed on the stack // do not affect the cost, since there needs to be no // stack adjustment. @@ -265,7 +265,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { if (!isProfitable(MF, CallSeqVector)) return false; - for (const auto &CC : CallSeqVector) { + for (const auto &CC : CallSeqVector) { if (CC.UsePush) { adjustCallSequence(MF, CC); Changed = true; @@ -288,13 +288,13 @@ X86CallFrameOptimization::classifyInstruction( case X86::AND16mi8: case X86::AND32mi8: case X86::AND64mi8: { - const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands); + const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands); return ImmOp.getImm() == 0 ? Convert : Exit; } case X86::OR16mi8: case X86::OR32mi8: case X86::OR64mi8: { - const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands); + const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands); return ImmOp.getImm() == -1 ? Convert : Exit; } case X86::MOV32mi: @@ -336,7 +336,7 @@ X86CallFrameOptimization::classifyInstruction( if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (!Reg.isPhysical()) + if (!Reg.isPhysical()) continue; if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) return Exit; @@ -454,7 +454,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (Reg.isPhysical()) + if (Reg.isPhysical()) UsedRegs.insert(Reg); } } @@ -506,7 +506,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // replace uses. 
for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) { MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx]; - const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands); + const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands); MachineBasicBlock::iterator Push = nullptr; unsigned PushOpcode; switch (Store->getOpcode()) { @@ -563,7 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) Push->addOperand(DefMov->getOperand(i)); - Push->cloneMergedMemRefs(MF, {DefMov, &*Store}); + Push->cloneMergedMemRefs(MF, {DefMov, &*Store}); DefMov->eraseFromParent(); } else { PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r; @@ -599,7 +599,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, } MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( - MachineBasicBlock::iterator FrameSetup, Register Reg) { + MachineBasicBlock::iterator FrameSetup, Register Reg) { // Do an extremely restricted form of load folding. // ISel will often create patterns like: // movl 4(%edi), %eax @@ -610,7 +610,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( // movl %eax, (%esp) // call // Get rid of those with prejudice. - if (!Reg.isVirtual()) + if (!Reg.isVirtual()) return nullptr; // Make sure this is the only use of Reg. diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp index 53f57565d5..a497375bd0 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.cpp @@ -95,11 +95,11 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, namespace { -struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler { - X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder, - MachineRegisterInfo &MRI, MachineInstrBuilder &MIB, - CCAssignFn *AssignFn) - : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), +struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler { + X86OutgoingValueHandler(MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI, MachineInstrBuilder &MIB, + CCAssignFn *AssignFn) + : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), DL(MIRBuilder.getMF().getDataLayout()), STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {} @@ -134,10 +134,10 @@ struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler { unsigned ValSize = VA.getValVT().getSizeInBits(); unsigned LocSize = VA.getLocVT().getSizeInBits(); if (PhysRegSize > ValSize && LocSize == ValSize) { - assert((PhysRegSize == 128 || PhysRegSize == 80) && - "We expect that to be 128 bit"); - ExtReg = - MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0); + assert((PhysRegSize == 128 || PhysRegSize == 80) && + "We expect that to be 128 bit"); + ExtReg = + MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg).getReg(0); } else ExtReg = extendRegister(ValVReg, VA); @@ -149,9 +149,9 @@ struct X86OutgoingValueHandler : public CallLowering::OutgoingValueHandler { MachineFunction &MF = MIRBuilder.getMF(); Register ExtReg = extendRegister(ValVReg, VA); - auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, - VA.getLocVT().getStoreSize(), - inferAlignFromPtrInfo(MF, MPO)); + auto *MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, + VA.getLocVT().getStoreSize(), 
+ inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } @@ -184,9 +184,9 @@ protected: } // end anonymous namespace -bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, ArrayRef<Register> VRegs, - FunctionLoweringInfo &FLI) const { +bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, ArrayRef<Register> VRegs, + FunctionLoweringInfo &FLI) const { assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && "Return value without a vreg"); auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0); @@ -195,7 +195,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const DataLayout &DL = MF.getDataLayout(); + const DataLayout &DL = MF.getDataLayout(); LLVMContext &Ctx = Val->getType()->getContext(); const X86TargetLowering &TLI = *getTLI<X86TargetLowering>(); @@ -215,7 +215,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, return false; } - X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); + X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; } @@ -226,10 +226,10 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, namespace { -struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler { - X86IncomingValueHandler(MachineIRBuilder &MIRBuilder, - MachineRegisterInfo &MRI, CCAssignFn *AssignFn) - : IncomingValueHandler(MIRBuilder, MRI, AssignFn), +struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler { + X86IncomingValueHandler(MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI, CCAssignFn *AssignFn) + : IncomingValueHandler(MIRBuilder, MRI, AssignFn), DL(MIRBuilder.getMF().getDataLayout()) {} Register getStackAddress(uint64_t Size, int64_t Offset, @@ -246,7 +246,7 @@ struct X86IncomingValueHandler : public CallLowering::IncomingValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); - auto *MMO = MF.getMachineMemOperand( + auto *MMO = MF.getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); @@ -296,10 +296,10 @@ protected: const DataLayout &DL; }; -struct FormalArgHandler : public X86IncomingValueHandler { +struct FormalArgHandler : public X86IncomingValueHandler { FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, CCAssignFn *AssignFn) - : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} + : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { MIRBuilder.getMRI()->addLiveIn(PhysReg); @@ -307,10 +307,10 @@ struct FormalArgHandler : public X86IncomingValueHandler { } }; -struct CallReturnHandler : public X86IncomingValueHandler { +struct CallReturnHandler : public X86IncomingValueHandler { CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, CCAssignFn *AssignFn, MachineInstrBuilder &MIB) - : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + : X86IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} void markPhysRegUsed(unsigned PhysReg) override { MIB.addDef(PhysReg, RegState::Implicit); @@ -322,10 +322,10 @@ protected: } // end 
anonymous namespace -bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef<ArrayRef<Register>> VRegs, - FunctionLoweringInfo &FLI) const { +bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const { if (F.arg_empty()) return true; @@ -339,7 +339,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, SmallVector<ArgInfo, 8> SplitArgs; unsigned Idx = 0; - for (const auto &Arg : F.args()) { + for (const auto &Arg : F.args()) { // TODO: handle not simple cases. if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || @@ -378,10 +378,10 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const DataLayout &DL = F.getParent()->getDataLayout(); + const DataLayout &DL = F.getParent()->getDataLayout(); const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); const TargetInstrInfo &TII = *STI.getInstrInfo(); - const X86RegisterInfo *TRI = STI.getRegisterInfo(); + const X86RegisterInfo *TRI = STI.getRegisterInfo(); // Handle only Linux C, X86_64_SysV calling conventions for now. if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C || @@ -419,7 +419,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; } // Do the actual argument marshalling. - X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86); + X86OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h index 9390122d76..33d2143ef8 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86CallLowering.h @@ -29,12 +29,12 @@ public: X86CallLowering(const X86TargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<Register> VRegs, - FunctionLoweringInfo &FLI) const override; + ArrayRef<Register> VRegs, + FunctionLoweringInfo &FLI) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef<ArrayRef<Register>> VRegs, - FunctionLoweringInfo &FLI) const override; + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp index c80a5d5bb3..eada1f1540 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.cpp @@ -330,15 +330,15 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } -static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { - if (LocVT != MVT::i64) { - LocVT = MVT::i64; - LocInfo = CCValAssign::ZExt; - } - return false; -} - +static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + if (LocVT != MVT::i64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::ZExt; + } + return false; +} + // Provides entry points of CC_X86 and 
RetCC_X86. #include "X86GenCallingConv.inc" diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td index 3735fab818..2fbd0f5cfd 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86CallingConv.td @@ -336,9 +336,9 @@ def RetCC_X86_64_C : CallingConv<[ // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, - // Pointers are always returned in full 64-bit registers. - CCIfPtr<CCCustom<"CC_X86_64_Pointer">>, - + // Pointers are always returned in full 64-bit registers. + CCIfPtr<CCCustom<"CC_X86_64_Pointer">>, + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, CCDelegateTo<RetCC_X86Common> @@ -521,9 +521,9 @@ def CC_X86_64_C : CallingConv<[ CCIfCC<"CallingConv::Swift", CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>, - // Pointers are always passed in full 64-bit registers. - CCIfPtr<CCCustom<"CC_X86_64_Pointer">>, - + // Pointers are always passed in full 64-bit registers. + CCIfPtr<CCCustom<"CC_X86_64_Pointer">>, + // The first 6 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, @@ -1102,7 +1102,7 @@ def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP) // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, - R8, R9, R10)>; + R8, R9, R10)>; // All registers - except r11 def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs, @@ -1160,16 +1160,16 @@ def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15, def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; // Register calling convention preserves few GPR and XMM8-15 -def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; +def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, (sequence "XMM%u", 4, 7))>; def CSR_Win32_CFGuard_Check_NoSSE : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, ECX)>; def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>; -def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, +def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, (sequence "R%u", 10, 15))>; def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE, (sequence "XMM%u", 8, 15))>; -def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, +def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, (sequence "R%u", 12, 15))>; def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, (sequence "XMM%u", 8, 15))>; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp b/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp index a2de0dc082..e840d30ce3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86CmovConversion.cpp @@ -439,7 +439,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - auto &RDM = RegDefMaps[Reg.isVirtual()]; + auto &RDM = RegDefMaps[Reg.isVirtual()]; if (MachineInstr *DefMI = RDM.lookup(Reg)) { OperandToDefMap[&MO] = DefMI; DepthInfo Info = DepthMap.lookup(DefMI); @@ -459,7 +459,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( if (!MO.isReg() || !MO.isDef()) continue; Register Reg = MO.getReg(); - 
RegDefMaps[Reg.isVirtual()][Reg] = &MI; + RegDefMaps[Reg.isVirtual()][Reg] = &MI; } unsigned Latency = TSchedModel.computeInstrLatency(&MI); @@ -537,7 +537,7 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( // This is another conservative check to avoid converting CMOV instruction // used with tree-search like algorithm, where the branch is unpredicted. auto UIs = MRI->use_instructions(MI->defs().begin()->getReg()); - if (!UIs.empty() && ++UIs.begin() == UIs.end()) { + if (!UIs.empty() && ++UIs.begin() == UIs.end()) { unsigned Op = UIs.begin()->getOpcode(); if (Op == X86::MOV64rm || Op == X86::MOV32rm) { WorthOpGroup = false; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp b/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp index a2ae6345c0..8fd8b38f70 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86DomainReassignment.cpp @@ -141,7 +141,7 @@ public: return false; // It's illegal to replace an instruction that implicitly defines a register // with an instruction that doesn't, unless that register dead. - for (const auto &MO : MI->implicit_operands()) + for (const auto &MO : MI->implicit_operands()) if (MO.isReg() && MO.isDef() && !MO.isDead() && !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg())) return false; @@ -180,7 +180,7 @@ public: MachineRegisterInfo *MRI) const override { assert(isLegal(MI, TII) && "Cannot convert instruction"); MachineBasicBlock *MBB = MI->getParent(); - const DebugLoc &DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); Register Reg = MRI->createVirtualRegister( TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), @@ -220,12 +220,12 @@ public: // Don't allow copies to/flow GR8/GR16 physical registers. // FIXME: Is there some better way to support this? Register DstReg = MI->getOperand(0).getReg(); - if (DstReg.isPhysical() && (X86::GR8RegClass.contains(DstReg) || - X86::GR16RegClass.contains(DstReg))) + if (DstReg.isPhysical() && (X86::GR8RegClass.contains(DstReg) || + X86::GR16RegClass.contains(DstReg))) return false; Register SrcReg = MI->getOperand(1).getReg(); - if (SrcReg.isPhysical() && (X86::GR8RegClass.contains(SrcReg) || - X86::GR16RegClass.contains(SrcReg))) + if (SrcReg.isPhysical() && (X86::GR8RegClass.contains(SrcReg) || + X86::GR16RegClass.contains(SrcReg))) return false; return true; @@ -235,7 +235,7 @@ public: MachineRegisterInfo *MRI) const override { assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY"); - for (const auto &MO : MI->operands()) { + for (const auto &MO : MI->operands()) { // Physical registers will not be converted. Assume that converting the // COPY to the destination domain will eventually result in a actual // instruction. @@ -298,7 +298,7 @@ typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>> class Closure { private: /// Virtual registers in the closure. - DenseSet<Register> Edges; + DenseSet<Register> Edges; /// Instructions in the closure. 
SmallVector<MachineInstr *, 8> Instrs; @@ -330,9 +330,9 @@ public: bool empty() const { return Edges.empty(); } - bool insertEdge(Register Reg) { return Edges.insert(Reg).second; } + bool insertEdge(Register Reg) { return Edges.insert(Reg).second; } - using const_edge_iterator = DenseSet<Register>::const_iterator; + using const_edge_iterator = DenseSet<Register>::const_iterator; iterator_range<const_edge_iterator> edges() const { return iterator_range<const_edge_iterator>(Edges.begin(), Edges.end()); } @@ -348,7 +348,7 @@ public: LLVM_DUMP_METHOD void dump(const MachineRegisterInfo *MRI) const { dbgs() << "Registers: "; bool First = true; - for (Register Reg : Edges) { + for (Register Reg : Edges) { if (!First) dbgs() << ", "; First = false; @@ -403,10 +403,10 @@ private: void initConverters(); /// Starting from \Reg, expand the closure as much as possible. - void buildClosure(Closure &, Register Reg); + void buildClosure(Closure &, Register Reg); /// Enqueue \p Reg to be considered for addition to the closure. - void visitRegister(Closure &, Register Reg, RegDomain &Domain, + void visitRegister(Closure &, Register Reg, RegDomain &Domain, SmallVectorImpl<unsigned> &Worklist); /// Reassign the closure to \p Domain. @@ -426,13 +426,13 @@ char X86DomainReassignment::ID = 0; } // End anonymous namespace. -void X86DomainReassignment::visitRegister(Closure &C, Register Reg, +void X86DomainReassignment::visitRegister(Closure &C, Register Reg, RegDomain &Domain, SmallVectorImpl<unsigned> &Worklist) { if (EnclosedEdges.count(Reg)) return; - if (!Reg.isVirtual()) + if (!Reg.isVirtual()) return; if (!MRI->hasOneDef(Reg)) @@ -503,7 +503,7 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const { // Iterate all registers in the closure, replace them with registers in the // destination domain. - for (Register Reg : C.edges()) { + for (Register Reg : C.edges()) { MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain)); for (auto &MO : MRI->use_operands(Reg)) { if (MO.isReg()) @@ -513,13 +513,13 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const { } } - for (auto *MI : ToErase) + for (auto *MI : ToErase) MI->eraseFromParent(); } /// \returns true when \p Reg is used as part of an address calculation in \p /// MI. -static bool usedAsAddr(const MachineInstr &MI, Register Reg, +static bool usedAsAddr(const MachineInstr &MI, Register Reg, const TargetInstrInfo *TII) { if (!MI.mayLoadOrStore()) return false; @@ -533,14 +533,14 @@ static bool usedAsAddr(const MachineInstr &MI, Register Reg, for (unsigned MemOpIdx = MemOpStart, MemOpEnd = MemOpStart + X86::AddrNumOperands; MemOpIdx < MemOpEnd; ++MemOpIdx) { - const MachineOperand &Op = MI.getOperand(MemOpIdx); + const MachineOperand &Op = MI.getOperand(MemOpIdx); if (Op.isReg() && Op.getReg() == Reg) return true; } return false; } -void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { +void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { SmallVector<unsigned, 4> Worklist; RegDomain Domain = NoDomain; visitRegister(C, Reg, Domain, Worklist); @@ -590,7 +590,7 @@ void X86DomainReassignment::buildClosure(Closure &C, Register Reg) { continue; Register DefReg = DefOp.getReg(); - if (!DefReg.isVirtual()) { + if (!DefReg.isVirtual()) { C.setAllIllegal(); continue; } @@ -749,7 +749,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { // Go over all virtual registers and calculate a closure. 
unsigned ClosureID = 0; for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { - Register Reg = Register::index2VirtReg(Idx); + Register Reg = Register::index2VirtReg(Idx); // GPR only current source domain supported. if (!isGPR(MRI->getRegClass(Reg))) diff --git a/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp b/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp index 97f843fa24..1ac8851ecd 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86EvexToVex.cpp @@ -85,8 +85,8 @@ public: private: /// Machine instruction info used throughout the class. const X86InstrInfo *TII = nullptr; - - const X86Subtarget *ST = nullptr; + + const X86Subtarget *ST = nullptr; }; } // end anonymous namespace @@ -96,8 +96,8 @@ char EvexToVexInstPass::ID = 0; bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); - ST = &MF.getSubtarget<X86Subtarget>(); - if (!ST->hasAVX512()) + ST = &MF.getSubtarget<X86Subtarget>(); + if (!ST->hasAVX512()) return false; bool Changed = false; @@ -146,29 +146,29 @@ static bool usesExtendedRegister(const MachineInstr &MI) { } // Do any custom cleanup needed to finalize the conversion. -static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc, - const X86Subtarget *ST) { +static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc, + const X86Subtarget *ST) { (void)NewOpc; unsigned Opc = MI.getOpcode(); switch (Opc) { - case X86::VPDPBUSDSZ256m: - case X86::VPDPBUSDSZ256r: - case X86::VPDPBUSDSZ128m: - case X86::VPDPBUSDSZ128r: - case X86::VPDPBUSDZ256m: - case X86::VPDPBUSDZ256r: - case X86::VPDPBUSDZ128m: - case X86::VPDPBUSDZ128r: - case X86::VPDPWSSDSZ256m: - case X86::VPDPWSSDSZ256r: - case X86::VPDPWSSDSZ128m: - case X86::VPDPWSSDSZ128r: - case X86::VPDPWSSDZ256m: - case X86::VPDPWSSDZ256r: - case X86::VPDPWSSDZ128m: - case X86::VPDPWSSDZ128r: - // These can only VEX convert if AVXVNNI is enabled. - return ST->hasAVXVNNI(); + case X86::VPDPBUSDSZ256m: + case X86::VPDPBUSDSZ256r: + case X86::VPDPBUSDSZ128m: + case X86::VPDPBUSDSZ128r: + case X86::VPDPBUSDZ256m: + case X86::VPDPBUSDZ256r: + case X86::VPDPBUSDZ128m: + case X86::VPDPBUSDZ128r: + case X86::VPDPWSSDSZ256m: + case X86::VPDPWSSDSZ256r: + case X86::VPDPWSSDSZ128m: + case X86::VPDPWSSDSZ128r: + case X86::VPDPWSSDZ256m: + case X86::VPDPWSSDZ256r: + case X86::VPDPWSSDZ128m: + case X86::VPDPWSSDZ128r: + // These can only VEX convert if AVXVNNI is enabled. + return ST->hasAVXVNNI(); case X86::VALIGNDZ128rri: case X86::VALIGNDZ128rmi: case X86::VALIGNQZ128rri: @@ -271,7 +271,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { (Desc.TSFlags & X86II::VEX_L) ? 
makeArrayRef(X86EvexToVex256CompressTable) : makeArrayRef(X86EvexToVex128CompressTable); - const auto *I = llvm::lower_bound(Table, MI.getOpcode()); + const auto *I = llvm::lower_bound(Table, MI.getOpcode()); if (I == Table.end() || I->EvexOpcode != MI.getOpcode()) return false; @@ -280,7 +280,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { if (usesExtendedRegister(MI)) return false; - if (!performCustomAdjustments(MI, NewOpc, ST)) + if (!performCustomAdjustments(MI, NewOpc, ST)) return false; MI.setDesc(TII->get(NewOpc)); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp index 15af0fb2e8..9998b30754 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86ExpandPseudo.cpp @@ -338,24 +338,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Perform the following transformation. // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx // => - // RBX = InArg + // RBX = InArg // actualcmpxchg Addr - // RBX = SaveRbx + // RBX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); Register SaveRbx = MBBI->getOperand(7).getReg(); // Copy the input argument of the pseudo into the argument of the // actual instruction. - // NOTE: We don't copy the kill flag since the input might be the same reg - // as one of the other operands of LCMPXCHG16B. - TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false); + // NOTE: We don't copy the kill flag since the input might be the same reg + // as one of the other operands of LCMPXCHG16B. + TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false); // Create the actual instruction. - MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B)); + MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B)); // Copy the operands related to the address. for (unsigned Idx = 1; Idx < 6; ++Idx) NewInstr->addOperand(MBBI->getOperand(Idx)); // Finally, restore the value of RBX. - TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, + TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); // Delete the pseudo. @@ -438,69 +438,69 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBB.erase(MBBI); return true; } - case X86::MWAITX_SAVE_RBX: { - // Perform the following transformation. - // SaveRbx = pseudomwaitx InArg, SaveRbx - // => - // [E|R]BX = InArg - // actualmwaitx - // [E|R]BX = SaveRbx - const MachineOperand &InArg = MBBI->getOperand(1); - // Copy the input argument of the pseudo into the argument of the - // actual instruction. - TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill()); - // Create the actual instruction. - BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr)); - // Finally, restore the value of RBX. - Register SaveRbx = MBBI->getOperand(2).getReg(); - TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); - // Delete the pseudo. - MBBI->eraseFromParent(); - return true; - } + case X86::MWAITX_SAVE_RBX: { + // Perform the following transformation. + // SaveRbx = pseudomwaitx InArg, SaveRbx + // => + // [E|R]BX = InArg + // actualmwaitx + // [E|R]BX = SaveRbx + const MachineOperand &InArg = MBBI->getOperand(1); + // Copy the input argument of the pseudo into the argument of the + // actual instruction. + TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill()); + // Create the actual instruction. 
+ BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr)); + // Finally, restore the value of RBX. + Register SaveRbx = MBBI->getOperand(2).getReg(); + TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); + // Delete the pseudo. + MBBI->eraseFromParent(); + return true; + } case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; - case X86::PLDTILECFG: { - MI.RemoveOperand(0); - MI.setDesc(TII->get(X86::LDTILECFG)); - return true; - } - case X86::PSTTILECFG: { - MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg - MI.setDesc(TII->get(X86::STTILECFG)); - return true; - } - case X86::PTILELOADDV: { - MI.RemoveOperand(8); // Remove $tmmcfg - for (unsigned i = 2; i > 0; --i) - MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TILELOADD)); - return true; - } - case X86::PTDPBSSDV: { - MI.RemoveOperand(7); // Remove $tmmcfg - MI.untieRegOperand(4); - for (unsigned i = 3; i > 0; --i) - MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TDPBSSD)); - MI.tieOperands(0, 1); - return true; - } - case X86::PTILESTOREDV: { - MI.RemoveOperand(8); // Remove $tmmcfg - for (int i = 1; i >= 0; --i) - MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TILESTORED)); - return true; - } - case X86::PTILEZEROV: { - for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg - MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TILEZERO)); - return true; - } + case X86::PLDTILECFG: { + MI.RemoveOperand(0); + MI.setDesc(TII->get(X86::LDTILECFG)); + return true; } + case X86::PSTTILECFG: { + MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg + MI.setDesc(TII->get(X86::STTILECFG)); + return true; + } + case X86::PTILELOADDV: { + MI.RemoveOperand(8); // Remove $tmmcfg + for (unsigned i = 2; i > 0; --i) + MI.RemoveOperand(i); + MI.setDesc(TII->get(X86::TILELOADD)); + return true; + } + case X86::PTDPBSSDV: { + MI.RemoveOperand(7); // Remove $tmmcfg + MI.untieRegOperand(4); + for (unsigned i = 3; i > 0; --i) + MI.RemoveOperand(i); + MI.setDesc(TII->get(X86::TDPBSSD)); + MI.tieOperands(0, 1); + return true; + } + case X86::PTILESTOREDV: { + MI.RemoveOperand(8); // Remove $tmmcfg + for (int i = 1; i >= 0; --i) + MI.RemoveOperand(i); + MI.setDesc(TII->get(X86::TILESTORED)); + return true; + } + case X86::PTILEZEROV: { + for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg + MI.RemoveOperand(i); + MI.setDesc(TII->get(X86::TILEZERO)); + return true; + } + } llvm_unreachable("Previous switch has a fallthrough?"); } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp index a1a16a19f5..b53aa41575 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86FastISel.cpp @@ -284,14 +284,14 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, return false; } - // Make sure no potentially eflags clobbering phi moves can be inserted in - // between. - auto HasPhis = [](const BasicBlock *Succ) { - return !llvm::empty(Succ->phis()); - }; - if (I->isTerminator() && llvm::any_of(successors(I), HasPhis)) - return false; - + // Make sure no potentially eflags clobbering phi moves can be inserted in + // between. 
+ auto HasPhis = [](const BasicBlock *Succ) { + return !llvm::empty(Succ->phis()); + }; + if (I->isTerminator() && llvm::any_of(successors(I), HasPhis)) + return false; + CC = TmpCC; return true; } @@ -792,9 +792,9 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { RC = &X86::GR32RegClass; } - if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL) - StubAM.Base.Reg = X86::RIP; - + if (Subtarget->isPICStyleRIPRel() || GVFlags == X86II::MO_GOTPCREL) + StubAM.Base.Reg = X86::RIP; + LoadReg = createResultReg(RC); MachineInstrBuilder LoadMI = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); @@ -1090,35 +1090,35 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { // If all else fails, try to materialize the value in a register. if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { - auto GetCallRegForValue = [this](const Value *V) { - Register Reg = getRegForValue(V); - - // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits. - if (Reg && Subtarget->isTarget64BitILP32()) { - Register CopyReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr), - CopyReg) - .addReg(Reg); - - Register ExtReg = createResultReg(&X86::GR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg) - .addImm(0) - .addReg(CopyReg) - .addImm(X86::sub_32bit); - Reg = ExtReg; - } - - return Reg; - }; - + auto GetCallRegForValue = [this](const Value *V) { + Register Reg = getRegForValue(V); + + // In 64-bit mode, we need a 64-bit register even if pointers are 32 bits. + if (Reg && Subtarget->isTarget64BitILP32()) { + Register CopyReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32rr), + CopyReg) + .addReg(Reg); + + Register ExtReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ExtReg) + .addImm(0) + .addReg(CopyReg) + .addImm(X86::sub_32bit); + Reg = ExtReg; + } + + return Reg; + }; + if (AM.Base.Reg == 0) { - AM.Base.Reg = GetCallRegForValue(V); + AM.Base.Reg = GetCallRegForValue(V); return AM.Base.Reg != 0; } if (AM.IndexReg == 0) { assert(AM.Scale == 1 && "Scale with no index!"); - AM.IndexReg = GetCallRegForValue(V); + AM.IndexReg = GetCallRegForValue(V); return AM.IndexReg != 0; } } @@ -1261,15 +1261,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; - // TODO - SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false); + // TODO + SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*Op0IsKill=*/false); SrcVT = MVT::i8; } unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - // TODO - SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg, - /*Op0IsKill=*/false); + // TODO + SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg, + /*Op0IsKill=*/false); } // Make the copy. 
@@ -1463,8 +1463,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { ResultReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), ResultReg); - ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, - /*Op0IsKill=*/true, X86::sub_8bit); + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, + /*Op0IsKill=*/true, X86::sub_8bit); if (!ResultReg) return false; break; @@ -1587,11 +1587,11 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8), Result32).addReg(ResultReg); - ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, - /*Op0IsKill=*/true, X86::sub_16bit); + ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, + /*Op0IsKill=*/true, X86::sub_16bit); } else if (DstVT != MVT::i8) { ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, - ResultReg, /*Op0IsKill=*/true); + ResultReg, /*Op0IsKill=*/true); if (ResultReg == 0) return false; } @@ -1633,11 +1633,11 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8), Result32).addReg(ResultReg); - ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, - /*Op0IsKill=*/true, X86::sub_16bit); + ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, + /*Op0IsKill=*/true, X86::sub_16bit); } else if (DstVT != MVT::i8) { ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND, - ResultReg, /*Op0IsKill=*/true); + ResultReg, /*Op0IsKill=*/true); if (ResultReg == 0) return false; } @@ -1789,7 +1789,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), OpReg) .addReg(KOpReg); - OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true, + OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Op0IsKill=*/true, X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) @@ -2021,7 +2021,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { // Now reference the 8-bit subreg of the result. ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, - /*Op0IsKill=*/true, X86::sub_8bit); + /*Op0IsKill=*/true, X86::sub_8bit); } // Copy the result out of the physreg if we haven't already. 
if (!ResultReg) { @@ -2135,7 +2135,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CondReg) .addReg(KCondReg, getKillRegState(CondIsKill)); - CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true, + CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true, X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) @@ -2289,12 +2289,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { const TargetRegisterClass *VR128 = &X86::VR128RegClass; Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, - /*Op0IsKill=*/false, LHSReg, LHSIsKill); - Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, - /*Op0IsKill=*/true, RHSReg, RHSIsKill); - Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true, - AndReg, /*Op1IsKill=*/true); + Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, + /*Op0IsKill=*/false, LHSReg, LHSIsKill); + Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, + /*Op0IsKill=*/true, RHSReg, RHSIsKill); + Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*Op0IsKill=*/true, + AndReg, /*Op1IsKill=*/true); ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(OrReg); @@ -2353,7 +2353,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CondReg) .addReg(KCondReg, getKillRegState(CondIsKill)); - CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true, + CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Op0IsKill=*/true, X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) @@ -2610,7 +2610,7 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, unsigned Reg; bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); - RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM); + RV &= X86FastEmitStore(VT, Reg, /*ValIsKill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; @@ -2674,15 +2674,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); // Explicitly zero-extend the input to 32-bit. InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg, - /*Op0IsKill=*/false); + /*Op0IsKill=*/false); // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, - InputReg, /*Op0IsKill=*/true); + InputReg, /*Op0IsKill=*/true); unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr : X86::VCVTPH2PSrr; - InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true); + InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Op0IsKill=*/true); // The result value is in the lower 32-bits of ResultReg. // Emit an explicit copy from register class VR128 to register class FR32. @@ -2740,7 +2740,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // ... 
unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); while (Depth--) { - Register DestReg = createResultReg(RC); + Register DestReg = createResultReg(RC); addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg), SrcReg); SrcReg = DestReg; @@ -2910,7 +2910,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. - if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) std::swap(LHS, RHS); unsigned BaseOpc, CondCode; @@ -3723,10 +3723,10 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: case MVT::i8: - return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true, + return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Op0IsKill=*/true, X86::sub_8bit); case MVT::i16: - return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true, + return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Op0IsKill=*/true, X86::sub_16bit); case MVT::i32: return SrcReg; @@ -3823,7 +3823,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { .addConstantPoolIndex(CPI, 0, OpFlag); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); - addRegReg(MIB, AddrReg, false, PICBase, false); + addRegReg(MIB, AddrReg, false, PICBase, false); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(*FuncInfo.MF), MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp index f8d822aebc..ddf96bc7b6 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86FixupBWInsts.cpp @@ -187,7 +187,7 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { /// If so, return that super register in \p SuperDestReg. bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, Register &SuperDestReg) const { - const X86RegisterInfo *TRI = &TII->getRegisterInfo(); + const X86RegisterInfo *TRI = &TII->getRegisterInfo(); Register OrigDestReg = OrigMI->getOperand(0).getReg(); SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32); @@ -319,7 +319,7 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { // This is only correct if we access the same subregister index: otherwise, // we could try to replace "movb %ah, %al" with "movl %eax, %eax". 
- const X86RegisterInfo *TRI = &TII->getRegisterInfo(); + const X86RegisterInfo *TRI = &TII->getRegisterInfo(); if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) != TRI->getSubRegIndex(NewDestReg, OldDest.getReg())) return nullptr; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp index 0054d5818a..482708e30d 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86FixupLEAs.cpp @@ -376,8 +376,8 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); if (Segment.getReg() != 0 || !Disp.isImm() || Scale.getImm() > 1 || - MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I) != - MachineBasicBlock::LQR_Dead) + MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I) != + MachineBasicBlock::LQR_Dead) return false; Register DestReg = MI.getOperand(0).getReg(); @@ -450,7 +450,7 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, } else return false; - MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); + MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); MBB.erase(I); I = NewMI; return true; @@ -486,7 +486,7 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p, LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); // now to replace with an equivalent LEA... LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); - MBB.getParent()->substituteDebugValuesForInst(*MBI, *NewMI, 1); + MBB.getParent()->substituteDebugValuesForInst(*MBI, *NewMI, 1); MBB.erase(MBI); MachineBasicBlock::iterator J = static_cast<MachineBasicBlock::iterator>(NewMI); @@ -508,8 +508,8 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); if (Segment.getReg() != 0 || !Offset.isImm() || - MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) != - MachineBasicBlock::LQR_Dead) + MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) != + MachineBasicBlock::LQR_Dead) return; const Register DstR = Dst.getReg(); const Register SrcR1 = Base.getReg(); @@ -540,7 +540,7 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, LLVM_DEBUG(NewMI->dump();); } if (NewMI) { - MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); + MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); MBB.erase(I); I = NewMI; } @@ -560,8 +560,8 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || - MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) != - MachineBasicBlock::LQR_Dead || + MBB.computeRegisterLiveness(TRI, X86::EFLAGS, I, 4) != + MachineBasicBlock::LQR_Dead || Segment.getReg() != X86::NoRegister) return; @@ -647,7 +647,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, } } - MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); + MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); MBB.erase(I); I = NewMI; return; @@ -673,7 +673,7 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, .add(Index); LLVM_DEBUG(NewMI->dump();); - MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); + MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); MBB.erase(I); I = NewMI; return; @@ -696,7 +696,7 @@ void 
FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, .add(Base); LLVM_DEBUG(NewMI->dump();); - MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); + MBB.getParent()->substituteDebugValuesForInst(*I, *NewMI, 1); MBB.erase(I); I = NewMI; } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp index 269f8ce6bd..a46da8aa48 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86FixupSetCC.cpp @@ -101,24 +101,24 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass; - if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) { - // If we cannot constrain the register, we would need an additional copy - // and are better off keeping the MOVZX32rr8 we have now. - continue; - } - - ++NumSubstZexts; - Changed = true; - + if (!MRI->constrainRegClass(ZExt->getOperand(0).getReg(), RC)) { + // If we cannot constrain the register, we would need an additional copy + // and are better off keeping the MOVZX32rr8 we have now. + continue; + } + + ++NumSubstZexts; + Changed = true; + // Initialize a register with 0. This must go before the eflags def - Register ZeroReg = MRI->createVirtualRegister(RC); + Register ZeroReg = MRI->createVirtualRegister(RC); BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), ZeroReg); // X86 setcc only takes an output GR8, so fake a GR32 input by inserting // the setcc result into the low byte of the zeroed register. BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(), - TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg()) + TII->get(X86::INSERT_SUBREG), ZExt->getOperand(0).getReg()) .addReg(ZeroReg) .addReg(MI.getOperand(0).getReg()) .addImm(X86::sub_8bit); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp index d43fd807a5..539a3e2a20 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -97,7 +97,7 @@ private: CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, MachineBasicBlock::iterator CopyDefI); - Register promoteCondToReg(MachineBasicBlock &MBB, + Register promoteCondToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond); std::pair<unsigned, bool> @@ -739,7 +739,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETCC(MI); if (Cond != X86::COND_INVALID && !MI.mayStore() && - MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) { + MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual()) { assert(MI.getOperand(0).isDef() && "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); @@ -753,7 +753,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( return CondRegs; } -Register X86FlagsCopyLoweringPass::promoteCondToReg( +Register X86FlagsCopyLoweringPass::promoteCondToReg( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond) { Register Reg = MRI->createVirtualRegister(PromoteRC); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp index 
866f113640..54003a72c1 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.cpp @@ -28,7 +28,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetOptions.h" @@ -235,7 +235,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, if (isSub && !isEAXLiveIn(MBB)) Reg = Rax; else - Reg = TRI->findDeadCallerSavedReg(MBB, MBBI); + Reg = TRI->findDeadCallerSavedReg(MBB, MBBI); unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; unsigned AddSubRROpc = @@ -292,7 +292,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, // need to find a dead register when using pop. unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) - : TRI->findDeadCallerSavedReg(MBB, MBBI); + : TRI->findDeadCallerSavedReg(MBB, MBBI); if (Reg) { unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) @@ -437,9 +437,9 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( } const MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const Register FramePtr = TRI->getFrameRegister(MF); - const Register MachineFramePtr = - STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64)) + const Register FramePtr = TRI->getFrameRegister(MF); + const Register MachineFramePtr = + STI.isTarget64BitILP32() ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true); // Offset = space for return address + size of the frame pointer itself. @@ -1690,7 +1690,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, assert(Personality == EHPersonality::MSVC_CXX); Register FrameReg; int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; - int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed(); + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg).getFixed(); // ESP is the first field, so no extra displacement is needed. addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, false, EHRegOffset) @@ -1711,9 +1711,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (IsWin64Prologue && IsFunclet) Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg); else - Offset = - getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() + - SEHFrameOffset; + Offset = + getFrameIndexReference(MF, FI, IgnoredFrameReg).getFixed() + + SEHFrameOffset; HasWinCFI = true; assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); @@ -1785,8 +1785,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; Register UsedReg; int Offset = - getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg) - .getFixed(); + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg) + .getFixed(); assert(UsedReg == BasePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) @@ -1864,8 +1864,8 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); Register SPReg; int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg, - /*IgnoreSPUpdates*/ true) - .getFixed(); + /*IgnoreSPUpdates*/ true) + .getFixed(); assert(Offset >= 0 && SPReg == TRI->getStackRegister()); return static_cast<unsigned>(Offset); } @@ -1920,7 +1920,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); Register FramePtr = TRI->getFrameRegister(MF); - Register MachineFramePtr = + Register MachineFramePtr = Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -2091,16 +2091,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } - - // Emit tilerelease for AMX kernel. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (!MRI.reg_nodbg_empty(X86::TMMCFG)) - BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + + // Emit tilerelease for AMX kernel. + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!MRI.reg_nodbg_empty(X86::TMMCFG)) + BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); } -StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - Register &FrameReg) const { +StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsFixed = MFI.isFixedObjectIndex(FI); @@ -2147,7 +2147,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); if (FI && FI == X86FI->getFAIndex()) - return StackOffset::getFixed(-SEHFrameOffset); + return StackOffset::getFixed(-SEHFrameOffset); // FPDelta is the offset from the "traditional" FP location of the old base // pointer followed by return address and the location required by the @@ -2163,23 +2163,23 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. - return StackOffset::getFixed(Offset + SlotSize + FPDelta); + return StackOffset::getFixed(Offset + SlotSize + FPDelta); } else { assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); - return StackOffset::getFixed(Offset + StackSize); + return StackOffset::getFixed(Offset + StackSize); } } else if (TRI->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. 
- return StackOffset::getFixed(Offset + SlotSize + FPDelta); + return StackOffset::getFixed(Offset + SlotSize + FPDelta); } else { assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); - return StackOffset::getFixed(Offset + StackSize); + return StackOffset::getFixed(Offset + StackSize); } // FIXME: Support tail calls } else { if (!HasFP) - return StackOffset::getFixed(Offset + StackSize); + return StackOffset::getFixed(Offset + StackSize); // Skip the saved EBP. Offset += SlotSize; @@ -2190,7 +2190,7 @@ StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, Offset -= TailCallReturnAddrDelta; } - return StackOffset::getFixed(Offset + FPDelta); + return StackOffset::getFixed(Offset + FPDelta); } int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, @@ -2201,27 +2201,27 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, const auto it = WinEHXMMSlotInfo.find(FI); if (it == WinEHXMMSlotInfo.end()) - return getFrameIndexReference(MF, FI, FrameReg).getFixed(); + return getFrameIndexReference(MF, FI, FrameReg).getFixed(); FrameReg = TRI->getStackRegister(); return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) + it->second; } -StackOffset -X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI, - Register &FrameReg, - int Adjustment) const { +StackOffset +X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI, + Register &FrameReg, + int Adjustment) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); FrameReg = TRI->getStackRegister(); - return StackOffset::getFixed(MFI.getObjectOffset(FI) - - getOffsetOfLocalArea() + Adjustment); + return StackOffset::getFixed(MFI.getObjectOffset(FI) - + getOffsetOfLocalArea() + Adjustment); } -StackOffset -X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, - int FI, Register &FrameReg, - bool IgnoreSPUpdates) const { +StackOffset +X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, + int FI, Register &FrameReg, + bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // Does not include any dynamic realign. @@ -2898,8 +2898,8 @@ static unsigned getHiPELiteral( // non-meta instructions between MBBI and MBB.end(). static bool blockEndIsUnreachable(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator MBBI) { - return llvm::all_of( - MBB.successors(), + return llvm::all_of( + MBB.successors(), [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) && std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) { return MI.isMetaInstruction(); @@ -3082,8 +3082,8 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, unsigned Regs[2]; unsigned FoundRegs = 0; - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const MachineOperand &RegMask = Prev->getOperand(1); + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const MachineOperand &RegMask = Prev->getOperand(1); auto &RegClass = Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; @@ -3269,14 +3269,14 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // If we may need to emit frameless compact unwind information, give // up as this is currently broken: PR25614. 
- bool CompactUnwind = - MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() != - nullptr; - return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) || - !CompactUnwind) && - // The lowering of segmented stack and HiPE only support entry - // blocks as prologue blocks: PR26107. This limitation may be - // lifted if we fix: + bool CompactUnwind = + MF.getMMI().getContext().getObjectFileInfo()->getCompactUnwindSection() != + nullptr; + return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF) || + !CompactUnwind) && + // The lowering of segmented stack and HiPE only support entry + // blocks as prologue blocks: PR26107. This limitation may be + // lifted if we fix: // - adjustForSegmentedStacks // - adjustForHiPEPrologue MF.getFunction().getCallingConv() != CallingConv::HiPE && @@ -3311,7 +3311,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( } Register UsedReg; - int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed(); + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg).getFixed(); int EndOffset = -EHRegOffset - EHRegSize; FuncInfo.EHRegNodeEndOffset = EndOffset; @@ -3334,8 +3334,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( // MOV32rm SavedEBPOffset(%esi), %ebp assert(X86FI->getHasSEHFramePtrSave()); int Offset = - getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg) - .getFixed(); + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg) + .getFixed(); assert(UsedReg == BasePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), UsedReg, true, Offset) @@ -3380,7 +3380,7 @@ struct X86FrameSortingObject { // at the end of our list. struct X86FrameSortingComparator { inline bool operator()(const X86FrameSortingObject &A, - const X86FrameSortingObject &B) const { + const X86FrameSortingObject &B) const { uint64_t DensityAScaled, DensityBScaled; // For consistency in our comparison, all invalid objects are placed @@ -3516,21 +3516,21 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // emitPrologue if it gets called and emits CFI. MF.setHasWinCFI(false); - // If we are using Windows x64 CFI, ensure that the stack is always 8 byte - // aligned. The format doesn't support misaligned stack adjustments. - if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) - MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize)); - + // If we are using Windows x64 CFI, ensure that the stack is always 8 byte + // aligned. The format doesn't support misaligned stack adjustments. + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) + MF.getFrameInfo().ensureMaxAlignment(Align(SlotSize)); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. - if (STI.is64Bit() && MF.hasEHFunclets() && - classifyEHPersonality(MF.getFunction().getPersonalityFn()) == - EHPersonality::MSVC_CXX) { - adjustFrameForMsvcCxxEh(MF); - } -} - -void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const { + if (STI.is64Bit() && MF.hasEHFunclets() && + classifyEHPersonality(MF.getFunction().getPersonalityFn()) == + EHPersonality::MSVC_CXX) { + adjustFrameForMsvcCxxEh(MF); + } +} + +void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const { // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset // relative to RSP after the prologue. Find the offset of the last fixed // object, so that we can allocate a slot immediately following it. 
If there diff --git a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h index 26e80811af..a8c6320afa 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,7 @@ #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/Support/TypeSize.h" +#include "llvm/Support/TypeSize.h" namespace llvm { @@ -103,17 +103,17 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; - StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, - Register &FrameReg) const override; + StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const override; int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, Register &SPReg) const; - StackOffset getFrameIndexReferenceSP(const MachineFunction &MF, int FI, - Register &SPReg, int Adjustment) const; - StackOffset - getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - Register &FrameReg, - bool IgnoreSPUpdates) const override; + StackOffset getFrameIndexReferenceSP(const MachineFunction &MF, int FI, + Register &SPReg, int Adjustment) const; + StackOffset + getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, + Register &FrameReg, + bool IgnoreSPUpdates) const override; MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, @@ -224,7 +224,7 @@ private: const DebugLoc &DL, uint64_t Offset, uint64_t Align) const; - void adjustFrameForMsvcCxxEh(MachineFunction &MF) const; + void adjustFrameForMsvcCxxEh(MachineFunction &MF) const; /// Aligns the stack pointer by ANDing it with -MaxAlign. 
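Editor's note on the doc comment just above ("Aligns the stack pointer by ANDing it with -MaxAlign"): a minimal value-level sketch of why that single AND is enough. The numbers are hypothetical, and the BuildStackAlignAND declaration continues right after this note.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // For a power-of-two MaxAlign, -MaxAlign is an all-ones mask with the
  // low log2(MaxAlign) bits cleared, so SP & -MaxAlign rounds the stack
  // pointer down to the next MaxAlign-aligned address (the stack grows
  // down, so rounding down stays inside the allocated area).
  const uint64_t MaxAlign = 32;            // hypothetical alignment
  uint64_t SP = 0x7fffffffe01cULL;         // hypothetical unaligned stack pointer
  uint64_t Aligned = SP & (0 - MaxAlign);  // same mask as ~(MaxAlign - 1)
  assert(Aligned % MaxAlign == 0 && Aligned <= SP);
  return 0;
}
```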
void BuildStackAlignAND(MachineBasicBlock &MBB, diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp index 1df9a0d170..c4f21ed402 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -17,7 +17,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/ConstantRange.h" @@ -45,8 +45,8 @@ static cl::opt<bool> EnablePromoteAnyextLoad( "x86-promote-anyext-load", cl::init(true), cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); -extern cl::opt<bool> IndirectBranchTracking; - +extern cl::opt<bool> IndirectBranchTracking; + //===----------------------------------------------------------------------===// // Pattern Matcher Implementation //===----------------------------------------------------------------------===// @@ -207,8 +207,8 @@ namespace { void Select(SDNode *N) override; bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); - bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, - bool AllowSegmentRegForX32 = false); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, + bool AllowSegmentRegForX32 = false); bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); @@ -503,8 +503,8 @@ namespace { bool tryShiftAmountMod(SDNode *N); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTERNLOG(SDNode *N); - bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC, - SDValue A, SDValue B, SDValue C, uint8_t Imm); + bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC, + SDValue A, SDValue B, SDValue C, uint8_t Imm); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); bool tryMatchBitSelect(SDNode *N); @@ -527,9 +527,9 @@ namespace { // type. static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); - if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || - Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || - Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { + if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || + Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || + Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. @@ -801,69 +801,69 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { return false; } -static bool isEndbrImm64(uint64_t Imm) { -// There may be some other prefix bytes between 0xF3 and 0x0F1EFA. -// i.g: 0xF3660F1EFA, 0xF3670F1EFA - if ((Imm & 0x00FFFFFF) != 0x0F1EFA) - return false; - - uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, - 0x65, 0x66, 0x67, 0xf0, 0xf2}; - int i = 24; // 24bit 0x0F1EFA has matched - while (i < 64) { - uint8_t Byte = (Imm >> i) & 0xFF; - if (Byte == 0xF3) - return true; - if (!llvm::is_contained(OptionalPrefixBytes, Byte)) - return false; - i += 8; - } - - return false; -} - +static bool isEndbrImm64(uint64_t Imm) { +// There may be some other prefix bytes between 0xF3 and 0x0F1EFA. 
+// i.g: 0xF3660F1EFA, 0xF3670F1EFA + if ((Imm & 0x00FFFFFF) != 0x0F1EFA) + return false; + + uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, + 0x65, 0x66, 0x67, 0xf0, 0xf2}; + int i = 24; // 24bit 0x0F1EFA has matched + while (i < 64) { + uint8_t Byte = (Imm >> i) & 0xFF; + if (Byte == 0xF3) + return true; + if (!llvm::is_contained(OptionalPrefixBytes, Byte)) + return false; + i += 8; + } + + return false; +} + void X86DAGToDAGISel::PreprocessISelDAG() { bool MadeChange = false; for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. - // This is for CET enhancement. - // - // ENDBR32 and ENDBR64 have specific opcodes: - // ENDBR32: F3 0F 1E FB - // ENDBR64: F3 0F 1E FA - // And we want that attackers won’t find unintended ENDBR32/64 - // opcode matches in the binary - // Here’s an example: - // If the compiler had to generate asm for the following code: - // a = 0xF30F1EFA - // it could, for example, generate: - // mov 0xF30F1EFA, dword ptr[a] - // In such a case, the binary would include a gadget that starts - // with a fake ENDBR64 opcode. Therefore, we split such generation - // into multiple operations, let it not shows in the binary - if (N->getOpcode() == ISD::Constant) { - MVT VT = N->getSimpleValueType(0); - int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue(); - int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; - if (Imm == EndbrImm || isEndbrImm64(Imm)) { - // Check that the cf-protection-branch is enabled. - Metadata *CFProtectionBranch = - MF->getMMI().getModule()->getModuleFlag("cf-protection-branch"); - if (CFProtectionBranch || IndirectBranchTracking) { - SDLoc dl(N); - SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true); - Complement = CurDAG->getNOT(dl, Complement, VT); - --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement); - ++I; - MadeChange = true; - continue; - } - } - } - + // This is for CET enhancement. + // + // ENDBR32 and ENDBR64 have specific opcodes: + // ENDBR32: F3 0F 1E FB + // ENDBR64: F3 0F 1E FA + // And we want that attackers won’t find unintended ENDBR32/64 + // opcode matches in the binary + // Here’s an example: + // If the compiler had to generate asm for the following code: + // a = 0xF30F1EFA + // it could, for example, generate: + // mov 0xF30F1EFA, dword ptr[a] + // In such a case, the binary would include a gadget that starts + // with a fake ENDBR64 opcode. Therefore, we split such generation + // into multiple operations, let it not shows in the binary + if (N->getOpcode() == ISD::Constant) { + MVT VT = N->getSimpleValueType(0); + int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue(); + int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; + if (Imm == EndbrImm || isEndbrImm64(Imm)) { + // Check that the cf-protection-branch is enabled. + Metadata *CFProtectionBranch = + MF->getMMI().getModule()->getModuleFlag("cf-protection-branch"); + if (CFProtectionBranch || IndirectBranchTracking) { + SDLoc dl(N); + SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true); + Complement = CurDAG->getNOT(dl, Complement, VT); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement); + ++I; + MadeChange = true; + continue; + } + } + } + // If this is a target specific AND node with no flag usages, turn it back // into ISD::AND to enable test instruction matching. 
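Editor's note on the ENDBR constant splitting above: a minimal value-level sketch (plain C++, not the SelectionDAG transform itself) of why materializing the bitwise complement and then applying a separate NOT reproduces the immediate without the ENDBR64 encoding ever appearing as a single literal.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // The problematic immediate from the comment above: materializing it
  // directly could embed an unintended ENDBR64 pattern in the binary.
  const uint32_t Imm = 0xF30F1EFAu;

  // What PreprocessISelDAG emits instead: the complement as the literal...
  const uint32_t Materialized = ~Imm;      // 0x0CF0E105, no ENDBR pattern
  // ...followed by a separate NOT that restores the intended value.
  const uint32_t Restored = ~Materialized;

  assert(Restored == Imm);
  return 0;
}
```

As the code above shows, the rewrite is gated on the cf-protection-branch module flag or the IndirectBranchTracking option, so ordinary builds are unaffected.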
if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { @@ -1068,8 +1068,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::STRICT_FFLOOR: case ISD::FTRUNC: case ISD::STRICT_FTRUNC: - case ISD::FROUNDEVEN: - case ISD::STRICT_FROUNDEVEN: + case ISD::FROUNDEVEN: + case ISD::STRICT_FROUNDEVEN: case ISD::FNEARBYINT: case ISD::STRICT_FNEARBYINT: case ISD::FRINT: @@ -1085,8 +1085,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::FFLOOR: Imm = 0x9; break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: Imm = 0xB; break; - case ISD::STRICT_FROUNDEVEN: - case ISD::FROUNDEVEN: Imm = 0x8; break; + case ISD::STRICT_FROUNDEVEN: + case ISD::FROUNDEVEN: Imm = 0x8; break; case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: Imm = 0xC; break; case ISD::STRICT_FRINT: @@ -1099,11 +1099,11 @@ void X86DAGToDAGISel::PreprocessISelDAG() { Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, {N->getValueType(0), MVT::Other}, {N->getOperand(0), N->getOperand(1), - CurDAG->getTargetConstant(Imm, dl, MVT::i32)}); + CurDAG->getTargetConstant(Imm, dl, MVT::i32)}); else Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), - CurDAG->getTargetConstant(Imm, dl, MVT::i32)); + CurDAG->getTargetConstant(Imm, dl, MVT::i32)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; @@ -1614,26 +1614,26 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, } -bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, - bool AllowSegmentRegForX32) { +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, + bool AllowSegmentRegForX32) { SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. // load fs:0 -> FS segment register. // - // This optimization is generally valid because the GNU TLS model defines that - // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode - // with 32-bit registers, as we get in ILP32 mode, those registers are first - // zero-extended to 64 bits and then added it to the base address, which gives - // unwanted results when the register holds a negative value. + // This optimization is generally valid because the GNU TLS model defines that + // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode + // with 32-bit registers, as we get in ILP32 mode, those registers are first + // zero-extended to 64 bits and then added it to the base address, which gives + // unwanted results when the register holds a negative value. // For more information see http://people.redhat.com/drepper/tls.pdf - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) { if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && !IndirectTlsSegRefs && (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || - Subtarget->isTargetFuchsia())) { - if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) - return true; + Subtarget->isTargetFuchsia())) { + if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) + return true; switch (N->getPointerInfo().getAddrSpace()) { case X86AS::GS: AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); @@ -1644,8 +1644,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, // Address space X86AS::SS is not handled here, because it is not used to // address TLS areas. 
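Editor's note on the matchLoadInAddress fold above: it leans on the GNU TLS layout in which gs:0 (32-bit) or fs:0 (x86-64) holds the address of the thread block itself. Below is a small host-only illustration of that self-pointer, assuming an x86-64 Linux/glibc system and GCC/Clang inline-asm syntax; it is not part of the patch.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // On x86-64 Linux/glibc the thread control block stores a pointer to
  // itself at %fs:0, which is what makes a "load fs:0" foldable into a
  // bare FS segment override in the address mode.
  uintptr_t Self = 0;
  asm volatile("movq %%fs:0, %0" : "=r"(Self));
  std::printf("fs:0 holds %#llx (the TCB's own address)\n",
              static_cast<unsigned long long>(Self));
  return 0;
}
```

The ILP32 caveat in the comment (a 32-bit register is zero-extended before being added to the segment base) is why the x32 path only re-attempts this fold in the second matchAddress pass, once it knows no other base register will be used.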
} - } - } + } + } return true; } @@ -1729,21 +1729,21 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { if (matchAddressRecursively(N, AM, 0)) return true; - // Post-processing: Make a second attempt to fold a load, if we now know - // that there will not be any other register. This is only performed for - // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded - // any foldable load the first time. - if (Subtarget->isTarget64BitILP32() && - AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { - SDValue Save_Base_Reg = AM.Base_Reg; - if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) { - AM.Base_Reg = SDValue(); - if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true)) - AM.Base_Reg = Save_Base_Reg; - } - } - + // Post-processing: Make a second attempt to fold a load, if we now know + // that there will not be any other register. This is only performed for + // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded + // any foldable load the first time. + if (Subtarget->isTarget64BitILP32() && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { + SDValue Save_Base_Reg = AM.Base_Reg; + if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) { + AM.Base_Reg = SDValue(); + if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true)) + AM.Base_Reg = Save_Base_Reg; + } + } + // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has // a smaller encoding and avoids a scaled-index. if (AM.Scale == 2 && @@ -2718,12 +2718,12 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, AM.Disp += GA->getOffset(); AM.SymbolFlags = GA->getTargetFlags(); - if (Subtarget->is32Bit()) { + if (Subtarget->is32Bit()) { AM.Scale = 1; AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); } - MVT VT = N.getSimpleValueType(); + MVT VT = N.getSimpleValueType(); getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); return true; } @@ -2813,10 +2813,10 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { return false; Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); - if (!CR) - return Width == 32 && TM.getCodeModel() == CodeModel::Small; - - return CR->getSignedMin().sge(-1ull << Width) && + if (!CR) + return Width == 32 && TM.getCodeModel() == CodeModel::Small; + + return CR->getSignedMin().sge(-1ull << Width) && CR->getSignedMax().slt(1ull << Width); } @@ -3210,7 +3210,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { - unsigned NewOpc = + unsigned NewOpc = ((Opc == X86ISD::ADD) == IsOne) ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); @@ -3466,7 +3466,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // Match the shift amount as: (bitwidth - y). It should go away, too. 
if (ShiftAmt.getOpcode() != ISD::SUB) return false; - auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0)); + auto *V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0)); if (!V0 || V0->getZExtValue() != Bitwidth) return false; NBits = ShiftAmt.getOperand(1); @@ -3589,7 +3589,7 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { // Shift NBits left by 8 bits, thus producing 'control'. // This makes the low 8 bits to be zero. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); - insertDAGNode(*CurDAG, SDValue(Node, 0), C8); + insertDAGNode(*CurDAG, SDValue(Node, 0), C8); SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); insertDAGNode(*CurDAG, SDValue(Node, 0), Control); @@ -4019,129 +4019,129 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { return true; } -bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, - SDNode *ParentBC, SDValue A, SDValue B, - SDValue C, uint8_t Imm) { - assert(A.isOperandOf(ParentA)); - assert(B.isOperandOf(ParentBC)); - assert(C.isOperandOf(ParentBC)); - - auto tryFoldLoadOrBCast = - [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp, SDValue &Segment) { - if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) - return true; - - // Not a load, check for broadcast which may be behind a bitcast. - if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { - P = L.getNode(); - L = L.getOperand(0); - } - - if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) - return false; - - // Only 32 and 64 bit broadcasts are supported. - auto *MemIntr = cast<MemIntrinsicSDNode>(L); - unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); - if (Size != 32 && Size != 64) - return false; - - return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); - }; - - bool FoldedLoad = false; - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { - FoldedLoad = true; - } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, - Tmp4)) { - FoldedLoad = true; - std::swap(A, C); - // Swap bits 1/4 and 3/6. - uint8_t OldImm = Imm; - Imm = OldImm & 0xa5; - if (OldImm & 0x02) Imm |= 0x10; - if (OldImm & 0x10) Imm |= 0x02; - if (OldImm & 0x08) Imm |= 0x40; - if (OldImm & 0x40) Imm |= 0x08; - } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3, - Tmp4)) { - FoldedLoad = true; - std::swap(B, C); - // Swap bits 1/2 and 5/6. - uint8_t OldImm = Imm; - Imm = OldImm & 0x99; - if (OldImm & 0x02) Imm |= 0x04; - if (OldImm & 0x04) Imm |= 0x02; - if (OldImm & 0x20) Imm |= 0x40; - if (OldImm & 0x40) Imm |= 0x20; - } - - SDLoc DL(Root); - - SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); - - MVT NVT = Root->getSimpleValueType(0); - - MachineSDNode *MNode; - if (FoldedLoad) { - SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); - - unsigned Opc; - if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { - auto *MemIntr = cast<MemIntrinsicSDNode>(C); - unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); - assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); - - bool UseD = EltSize == 32; - if (NVT.is128BitVector()) - Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; - else if (NVT.is256BitVector()) - Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; - else if (NVT.is512BitVector()) - Opc = UseD ? 
X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; - else - llvm_unreachable("Unexpected vector size!"); - } else { - bool UseD = NVT.getVectorElementType() == MVT::i32; - if (NVT.is128BitVector()) - Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; - else if (NVT.is256BitVector()) - Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; - else if (NVT.is512BitVector()) - Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; - else - llvm_unreachable("Unexpected vector size!"); - } - - SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; - MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); - - // Update the chain. - ReplaceUses(C.getValue(1), SDValue(MNode, 1)); - // Record the mem-refs - CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()}); - } else { - bool UseD = NVT.getVectorElementType() == MVT::i32; - unsigned Opc; - if (NVT.is128BitVector()) - Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; - else if (NVT.is256BitVector()) - Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; - else if (NVT.is512BitVector()) - Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; - else - llvm_unreachable("Unexpected vector size!"); - - MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); - } - - ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0)); - CurDAG->RemoveDeadNode(Root); - return true; -} - +bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, + SDNode *ParentBC, SDValue A, SDValue B, + SDValue C, uint8_t Imm) { + assert(A.isOperandOf(ParentA)); + assert(B.isOperandOf(ParentBC)); + assert(C.isOperandOf(ParentBC)); + + auto tryFoldLoadOrBCast = + [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment) { + if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) + return true; + + // Not a load, check for broadcast which may be behind a bitcast. + if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { + P = L.getNode(); + L = L.getOperand(0); + } + + if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) + return false; + + // Only 32 and 64 bit broadcasts are supported. + auto *MemIntr = cast<MemIntrinsicSDNode>(L); + unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); + if (Size != 32 && Size != 64) + return false; + + return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); + }; + + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + FoldedLoad = true; + } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, + Tmp4)) { + FoldedLoad = true; + std::swap(A, C); + // Swap bits 1/4 and 3/6. + uint8_t OldImm = Imm; + Imm = OldImm & 0xa5; + if (OldImm & 0x02) Imm |= 0x10; + if (OldImm & 0x10) Imm |= 0x02; + if (OldImm & 0x08) Imm |= 0x40; + if (OldImm & 0x40) Imm |= 0x08; + } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3, + Tmp4)) { + FoldedLoad = true; + std::swap(B, C); + // Swap bits 1/2 and 5/6. 
+ uint8_t OldImm = Imm; + Imm = OldImm & 0x99; + if (OldImm & 0x02) Imm |= 0x04; + if (OldImm & 0x04) Imm |= 0x02; + if (OldImm & 0x20) Imm |= 0x40; + if (OldImm & 0x40) Imm |= 0x20; + } + + SDLoc DL(Root); + + SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); + + MVT NVT = Root->getSimpleValueType(0); + + MachineSDNode *MNode; + if (FoldedLoad) { + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + + unsigned Opc; + if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast<MemIntrinsicSDNode>(C); + unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); + assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); + + bool UseD = EltSize == 32; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; + else + llvm_unreachable("Unexpected vector size!"); + } else { + bool UseD = NVT.getVectorElementType() == MVT::i32; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; + else + llvm_unreachable("Unexpected vector size!"); + } + + SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; + MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); + + // Update the chain. + ReplaceUses(C.getValue(1), SDValue(MNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()}); + } else { + bool UseD = NVT.getVectorElementType() == MVT::i32; + unsigned Opc; + if (NVT.is128BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; + else if (NVT.is256BitVector()) + Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; + else if (NVT.is512BitVector()) + Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; + else + llvm_unreachable("Unexpected vector size!"); + + MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); + } + + ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0)); + CurDAG->RemoveDeadNode(Root); + return true; +} + // Try to match two logic ops to a VPTERNLOG. // FIXME: Handle inverted inputs? // FIXME: Handle more complex patterns that use an operand more than once? @@ -4160,62 +4160,62 @@ bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - auto getFoldableLogicOp = [](SDValue Op) { - // Peek through single use bitcast. - if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) - Op = Op.getOperand(0); - - if (!Op.hasOneUse()) - return SDValue(); - - unsigned Opc = Op.getOpcode(); - if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || - Opc == X86ISD::ANDNP) - return Op; - - return SDValue(); + auto getFoldableLogicOp = [](SDValue Op) { + // Peek through single use bitcast. 
+ if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) + Op = Op.getOperand(0); + + if (!Op.hasOneUse()) + return SDValue(); + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || + Opc == X86ISD::ANDNP) + return Op; + + return SDValue(); }; - SDValue A, FoldableOp; - if ((FoldableOp = getFoldableLogicOp(N1))) { + SDValue A, FoldableOp; + if ((FoldableOp = getFoldableLogicOp(N1))) { A = N0; - } else if ((FoldableOp = getFoldableLogicOp(N0))) { + } else if ((FoldableOp = getFoldableLogicOp(N0))) { A = N1; } else return false; - SDValue B = FoldableOp.getOperand(0); - SDValue C = FoldableOp.getOperand(1); - - // We can build the appropriate control immediate by performing the logic - // operation we're matching using these constants for A, B, and C. - const uint8_t TernlogMagicA = 0xf0; - const uint8_t TernlogMagicB = 0xcc; - const uint8_t TernlogMagicC = 0xaa; - - uint8_t Imm; - switch (FoldableOp.getOpcode()) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; - case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; - case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; - case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; - } - - switch (N->getOpcode()) { + SDValue B = FoldableOp.getOperand(0); + SDValue C = FoldableOp.getOperand(1); + + // We can build the appropriate control immediate by performing the logic + // operation we're matching using these constants for A, B, and C. + const uint8_t TernlogMagicA = 0xf0; + const uint8_t TernlogMagicB = 0xcc; + const uint8_t TernlogMagicC = 0xaa; + + uint8_t Imm; + switch (FoldableOp.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); - case X86ISD::ANDNP: - if (A == N0) - Imm &= ~TernlogMagicA; - else - Imm = ~(Imm) & TernlogMagicA; + case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; + case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; + case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; + case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; + } + + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86ISD::ANDNP: + if (A == N0) + Imm &= ~TernlogMagicA; + else + Imm = ~(Imm) & TernlogMagicA; break; - case ISD::AND: Imm &= TernlogMagicA; break; - case ISD::OR: Imm |= TernlogMagicA; break; - case ISD::XOR: Imm ^= TernlogMagicA; break; + case ISD::AND: Imm &= TernlogMagicA; break; + case ISD::OR: Imm |= TernlogMagicA; break; + case ISD::XOR: Imm ^= TernlogMagicA; break; } - return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm); + return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm); } /// If the high bits of an 'and' operand are known zero, try setting the @@ -4282,7 +4282,7 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { // A negative mask allows a smaller encoding. Create a new 'and' node. 
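Editor's note on the VPTERNLOG immediate handling above (matchVPTERNLOG and tryVPTERNLOG): each magic byte is one input's column of the 3-input truth table (bit index a*4 + b*2 + c), so evaluating the matched expression on the magic bytes yields the control byte, and swapping two operands is just a permutation of truth-table bit positions, which is what the hand-coded "swap bits 1/4 and 3/6" and "swap bits 1/2 and 5/6" fix-ups implement. A standalone sketch follows; SwapInputs is an illustrative helper, not LLVM API.

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

int main() {
  // Truth-table columns for the three inputs (bit index = a*4 + b*2 + c).
  const uint8_t A = 0xf0, B = 0xcc, C = 0xaa;

  // Evaluating the matched expression on the magic bytes gives the imm.
  assert((A & (B | C)) == 0xe0);        // control byte for A & (B | C)
  assert(((A & B) | (~A & C)) == 0xca); // bit-select imm tryMatchBitSelect uses below

  // Swapping two inputs permutes the truth-table bit positions.
  auto SwapInputs = [](uint8_t Imm, int X, int Y) {
    uint8_t Out = 0;
    for (int Idx = 0; Idx < 8; ++Idx) {
      int Bit[3] = {(Idx >> 2) & 1, (Idx >> 1) & 1, Idx & 1}; // a, b, c
      std::swap(Bit[X], Bit[Y]);
      if (Imm & (1 << Idx))
        Out |= uint8_t(1 << ((Bit[0] << 2) | (Bit[1] << 1) | Bit[2]));
    }
    return Out;
  };

  const uint8_t NotB = uint8_t(~B), NotC = uint8_t(~C);
  // A<->C swap: bits 1<->4 and 3<->6 move, bits 0,2,5,7 (mask 0xa5) stay.
  assert(SwapInputs(A & (B | C), /*A=*/0, /*C=*/2) == (C & (B | A)));
  // B<->C swap: bits 1<->2 and 5<->6 move, bits 0,3,4,7 (mask 0x99) stay.
  assert(SwapInputs(A & NotB & C, /*B=*/1, /*C=*/2) == (A & NotC & B));
  return 0;
}
```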
SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); - insertDAGNode(*CurDAG, SDValue(And, 0), NewMask); + insertDAGNode(*CurDAG, SDValue(And, 0), NewMask); SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); ReplaceNode(And, NewAnd.getNode()); SelectCode(NewAnd.getNode()); @@ -4316,15 +4316,15 @@ VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ VPTESTM_CASE(v64i8, BZ##SUFFIX) \ VPTESTM_CASE(v32i16, WZ##SUFFIX) - if (FoldedBCast) { + if (FoldedBCast) { switch (TestVT.SimpleTy) { - VPTESTM_BROADCAST_CASES(rmb) + VPTESTM_BROADCAST_CASES(rmb) } } - if (FoldedLoad) { + if (FoldedLoad) { switch (TestVT.SimpleTy) { - VPTESTM_FULL_CASES(rm) + VPTESTM_FULL_CASES(rm) } } @@ -4383,56 +4383,56 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, } } - // Without VLX we need to widen the operation. + // Without VLX we need to widen the operation. bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); - auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, - SDValue &Base, SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { - // If we need to widen, we can't fold the load. - if (!Widen) - if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) - return true; + auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, + SDValue &Base, SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + // If we need to widen, we can't fold the load. + if (!Widen) + if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) + return true; - // If we didn't fold a load, try to match broadcast. No widening limitation - // for this. But only 32 and 64 bit types are supported. - if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) - return false; + // If we didn't fold a load, try to match broadcast. No widening limitation + // for this. But only 32 and 64 bit types are supported. + if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) + return false; // Look through single use bitcasts. - if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { - P = L.getNode(); - L = L.getOperand(0); + if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { + P = L.getNode(); + L = L.getOperand(0); } - if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) - return false; - - auto *MemIntr = cast<MemIntrinsicSDNode>(L); - if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) - return false; + if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) + return false; - return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); + auto *MemIntr = cast<MemIntrinsicSDNode>(L); + if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) + return false; + + return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); }; - // We can only fold loads if the sources are unique. - bool CanFoldLoads = Src0 != Src1; - - bool FoldedLoad = false; - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (CanFoldLoads) { - FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, - Tmp3, Tmp4); - if (!FoldedLoad) { - // And is commutative. - FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, - Tmp2, Tmp3, Tmp4); - if (FoldedLoad) - std::swap(Src0, Src1); + // We can only fold loads if the sources are unique. + bool CanFoldLoads = Src0 != Src1; + + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (CanFoldLoads) { + FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4); + if (!FoldedLoad) { + // And is commutative. 
+ FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, + Tmp2, Tmp3, Tmp4); + if (FoldedLoad) + std::swap(Src0, Src1); } } - bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; + bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; bool IsMasked = InMask.getNode() != nullptr; @@ -4456,7 +4456,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, if (IsMasked) { // Widen the mask. - unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID(); + unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, MaskVT, InMask, RC), 0); @@ -4468,23 +4468,23 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, IsMasked); MachineSDNode *CNode; - if (FoldedLoad) { + if (FoldedLoad) { SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); if (IsMasked) { SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, - Src1.getOperand(0) }; + Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } else { SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, - Src1.getOperand(0) }; + Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } // Update the chain. - ReplaceUses(Src1.getValue(1), SDValue(CNode, 1)); + ReplaceUses(Src1.getValue(1), SDValue(CNode, 1)); // Record the mem-refs - CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()}); + CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); @@ -4494,7 +4494,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, // If we widened, we need to shrink the mask VT. 
if (Widen) { - unsigned RegClass = TLI->getRegClassFor(ResVT)->getID(); + unsigned RegClass = TLI->getRegClassFor(ResVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, ResVT, SDValue(CNode, 0), RC); @@ -4550,9 +4550,9 @@ bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); ReplaceNode(N, Ternlog.getNode()); - - return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(), - A, B, C, 0xCA); + + return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(), + A, B, C, 0xCA); } void X86DAGToDAGISel::Select(SDNode *Node) { @@ -4568,95 +4568,95 @@ void X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = Node->getConstantOperandVal(1); - switch (IntNo) { - default: break; - case Intrinsic::x86_encodekey128: - case Intrinsic::x86_encodekey256: { - if (!Subtarget->hasKL()) - break; - - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); - case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break; - case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break; - } - - SDValue Chain = Node->getOperand(0); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3), - SDValue()); - if (Opcode == X86::ENCODEKEY256) - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4), - Chain.getValue(1)); - - MachineSDNode *Res = CurDAG->getMachineNode( - Opcode, dl, Node->getVTList(), - {Node->getOperand(2), Chain, Chain.getValue(1)}); - ReplaceNode(Node, Res); - return; - } - case Intrinsic::x86_tileloadd64_internal: { - if (!Subtarget->hasAMXTILE()) - break; - unsigned Opc = X86::PTILELOADDV; - // _tile_loadd_internal(row, col, buf, STRIDE) - SDValue Base = Node->getOperand(4); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(5); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; - SDValue Ops[] = {Node->getOperand(2), - Node->getOperand(3), - Base, - Scale, - Index, - Disp, - Segment, - CFG, - Chain}; - CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); - ReplaceNode(Node, CNode); - return; - } - case Intrinsic::x86_tdpbssd_internal: { - if (!Subtarget->hasAMXTILE()) - break; - SDValue Chain = Node->getOperand(0); - unsigned Opc = X86::PTDPBSSDV; - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Ops[] = {Node->getOperand(2), - Node->getOperand(3), - Node->getOperand(4), - Node->getOperand(5), - Node->getOperand(6), - Node->getOperand(7), - CFG, - Chain}; - MachineSDNode *CNode = - CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); - ReplaceNode(Node, CNode); - return; - } - case Intrinsic::x86_tilezero_internal: { - if (!Subtarget->hasAMXTILE()) - break; - unsigned Opc = X86::PTILEZEROV; - SDValue Chain = Node->getOperand(0); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; - MachineSDNode *CNode = - CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); - ReplaceNode(Node, CNode); - return; - } - } - break; - } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo 
= Node->getConstantOperandVal(1); + switch (IntNo) { + default: break; + case Intrinsic::x86_encodekey128: + case Intrinsic::x86_encodekey256: { + if (!Subtarget->hasKL()) + break; + + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break; + case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break; + } + + SDValue Chain = Node->getOperand(0); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3), + SDValue()); + if (Opcode == X86::ENCODEKEY256) + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4), + Chain.getValue(1)); + + MachineSDNode *Res = CurDAG->getMachineNode( + Opcode, dl, Node->getVTList(), + {Node->getOperand(2), Chain, Chain.getValue(1)}); + ReplaceNode(Node, Res); + return; + } + case Intrinsic::x86_tileloadd64_internal: { + if (!Subtarget->hasAMXTILE()) + break; + unsigned Opc = X86::PTILELOADDV; + // _tile_loadd_internal(row, col, buf, STRIDE) + SDValue Base = Node->getOperand(4); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(5); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + SDValue Ops[] = {Node->getOperand(2), + Node->getOperand(3), + Base, + Scale, + Index, + Disp, + Segment, + CFG, + Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } + case Intrinsic::x86_tdpbssd_internal: { + if (!Subtarget->hasAMXTILE()) + break; + SDValue Chain = Node->getOperand(0); + unsigned Opc = X86::PTDPBSSDV; + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Ops[] = {Node->getOperand(2), + Node->getOperand(3), + Node->getOperand(4), + Node->getOperand(5), + Node->getOperand(6), + Node->getOperand(7), + CFG, + Chain}; + MachineSDNode *CNode = + CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } + case Intrinsic::x86_tilezero_internal: { + if (!Subtarget->hasAMXTILE()) + break; + unsigned Opc = X86::PTILEZEROV; + SDValue Chain = Node->getOperand(0); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; + MachineSDNode *CNode = + CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } + } + break; + } case ISD::INTRINSIC_VOID: { unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { @@ -4711,31 +4711,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; } - case Intrinsic::x86_tilestored64_internal: { - unsigned Opc = X86::PTILESTOREDV; - // _tile_stored_internal(row, col, buf, STRIDE, c) - SDValue Base = Node->getOperand(4); - SDValue Scale = getI8Imm(1, dl); - SDValue Index = Node->getOperand(5); - SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Chain = Node->getOperand(0); - MachineSDNode *CNode; - SDValue Ops[] = {Node->getOperand(2), - Node->getOperand(3), - Base, - Scale, - Index, - Disp, - Segment, - Node->getOperand(6), - CFG, - Chain}; - CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - ReplaceNode(Node, CNode); - return; - } + case Intrinsic::x86_tilestored64_internal: { + unsigned Opc = X86::PTILESTOREDV; + // 
_tile_stored_internal(row, col, buf, STRIDE, c) + SDValue Base = Node->getOperand(4); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(5); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + SDValue Ops[] = {Node->getOperand(2), + Node->getOperand(3), + Base, + Scale, + Index, + Disp, + Segment, + Node->getOperand(6), + CFG, + Chain}; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + ReplaceNode(Node, CNode); + return; + } case Intrinsic::x86_tileloadd64: case Intrinsic::x86_tileloaddt164: case Intrinsic::x86_tilestored64: { @@ -4816,19 +4816,19 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; break; - case X86ISD::VPTERNLOG: { - uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue(); - if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2), Imm)) - return; - break; - } - - case X86ISD::ANDNP: - if (tryVPTERNLOG(Node)) - return; - break; - + case X86ISD::VPTERNLOG: { + uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue(); + if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), Imm)) + return; + break; + } + + case X86ISD::ANDNP: + if (tryVPTERNLOG(Node)) + return; + break; + case ISD::AND: if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { // Try to form a masked VPTESTM. Operands can be in either order. @@ -5927,63 +5927,63 @@ void X86DAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } - case X86ISD::AESENCWIDE128KL: - case X86ISD::AESDECWIDE128KL: - case X86ISD::AESENCWIDE256KL: - case X86ISD::AESDECWIDE256KL: { - if (!Subtarget->hasWIDEKL()) - break; - - unsigned Opcode; - switch (Node->getOpcode()) { - default: - llvm_unreachable("Unexpected opcode!"); - case X86ISD::AESENCWIDE128KL: - Opcode = X86::AESENCWIDE128KL; - break; - case X86ISD::AESDECWIDE128KL: - Opcode = X86::AESDECWIDE128KL; - break; - case X86ISD::AESENCWIDE256KL: - Opcode = X86::AESENCWIDE256KL; - break; - case X86ISD::AESDECWIDE256KL: - Opcode = X86::AESDECWIDE256KL; - break; - } - - SDValue Chain = Node->getOperand(0); - SDValue Addr = Node->getOperand(1); - - SDValue Base, Scale, Index, Disp, Segment; - if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment)) - break; - - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2), - SDValue()); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3), - Chain.getValue(1)); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4), - Chain.getValue(1)); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5), - Chain.getValue(1)); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6), - Chain.getValue(1)); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7), - Chain.getValue(1)); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8), - Chain.getValue(1)); - Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9), - Chain.getValue(1)); - - MachineSDNode *Res = CurDAG->getMachineNode( - Opcode, dl, Node->getVTList(), - {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)}); - CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand()); - ReplaceNode(Node, Res); - return; - } + case X86ISD::AESENCWIDE128KL: + case 
X86ISD::AESDECWIDE128KL: + case X86ISD::AESENCWIDE256KL: + case X86ISD::AESDECWIDE256KL: { + if (!Subtarget->hasWIDEKL()) + break; + + unsigned Opcode; + switch (Node->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case X86ISD::AESENCWIDE128KL: + Opcode = X86::AESENCWIDE128KL; + break; + case X86ISD::AESDECWIDE128KL: + Opcode = X86::AESDECWIDE128KL; + break; + case X86ISD::AESENCWIDE256KL: + Opcode = X86::AESENCWIDE256KL; + break; + case X86ISD::AESDECWIDE256KL: + Opcode = X86::AESDECWIDE256KL; + break; + } + + SDValue Chain = Node->getOperand(0); + SDValue Addr = Node->getOperand(1); + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment)) + break; + + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2), + SDValue()); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3), + Chain.getValue(1)); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4), + Chain.getValue(1)); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5), + Chain.getValue(1)); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6), + Chain.getValue(1)); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7), + Chain.getValue(1)); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8), + Chain.getValue(1)); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9), + Chain.getValue(1)); + + MachineSDNode *Res = CurDAG->getMachineNode( + Opcode, dl, Node->getVTList(), + {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)}); + CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand()); + ReplaceNode(Node, Res); + return; } + } SelectCode(Node); } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp index 1e2407c7e7..c1320facd0 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.cpp @@ -35,7 +35,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -77,14 +77,14 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); -static cl::opt<int> ExperimentalPrefInnermostLoopAlignment( - "x86-experimental-pref-innermost-loop-alignment", cl::init(4), - cl::desc( - "Sets the preferable loop alignment for experiments (as log2 bytes) " - "for innermost loops only. If specified, this option overrides " - "alignment set by x86-experimental-pref-loop-alignment."), - cl::Hidden); - +static cl::opt<int> ExperimentalPrefInnermostLoopAlignment( + "x86-experimental-pref-innermost-loop-alignment", cl::init(4), + cl::desc( + "Sets the preferable loop alignment for experiments (as log2 bytes) " + "for innermost loops only. 
If specified, this option overrides " + "alignment set by x86-experimental-pref-loop-alignment."), + cl::Hidden); + static cl::opt<bool> MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " @@ -144,24 +144,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addBypassSlowDiv(64, 32); } - // Setup Windows compiler runtime calls. - if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { - static const struct { - const RTLIB::Libcall Op; - const char * const Name; - const CallingConv::ID CC; - } LibraryCalls[] = { - { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall }, - { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall }, - { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall }, - { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall }, - { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall }, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallName(LC.Op, LC.Name); - setLibcallCallingConv(LC.Op, LC.CC); - } + // Setup Windows compiler runtime calls. + if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall }, + { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall }, + { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall }, + { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall }, + { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } } if (Subtarget.getTargetTriple().isOSMSVCRT()) { @@ -207,8 +207,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); - if (Subtarget.is64Bit()) - setOperationAction(ISD::ABS , MVT::i64 , Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::ABS , MVT::i64 , Custom); } // Funnel shifts. @@ -293,19 +293,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - if (Subtarget.hasSSE2()) { - // Custom lowering for saturating float to int conversions. - // We handle promotion to larger result types manually. - for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) { - setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); - } - if (Subtarget.is64Bit()) { - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); - } - } - + if (Subtarget.hasSSE2()) { + // Custom lowering for saturating float to int conversions. + // We handle promotion to larger result types manually. + for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) { + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); + } + if (Subtarget.is64Bit()) { + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); + } + } + // Handle address space casts between mixed sized pointers. 
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); @@ -412,7 +412,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); - setOperationAction(ISD::PARITY, MVT::i8, Custom); + setOperationAction(ISD::PARITY, MVT::i8, Custom); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { @@ -423,11 +423,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP , MVT::i64 , Expand); else setOperationAction(ISD::CTPOP , MVT::i64 , Custom); - - setOperationAction(ISD::PARITY, MVT::i16, Custom); - setOperationAction(ISD::PARITY, MVT::i32, Custom); - if (Subtarget.is64Bit()) - setOperationAction(ISD::PARITY, MVT::i64, Custom); + + setOperationAction(ISD::PARITY, MVT::i16, Custom); + setOperationAction(ISD::PARITY, MVT::i32, Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::PARITY, MVT::i64, Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -521,7 +521,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); - setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); + setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); @@ -1114,8 +1114,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); - setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); - setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); + setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); setOperationAction(ISD::FROUND, RoundedTy, Custom); } @@ -1129,8 +1129,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); - setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); - + setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); + // FIXME: Do we need to handle scalar-to-vector here? 
setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1171,10 +1171,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) { - setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); - } - + if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) { + setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) @@ -1216,8 +1216,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); - setOperationAction(ISD::FROUNDEVEN, VT, Legal); - setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); @@ -1345,10 +1345,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); - setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); - setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); - setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); - setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); + setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); + setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); + setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? 
Legal : Custom); @@ -1607,8 +1607,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); - setOperationAction(ISD::FROUNDEVEN, VT, Legal); - setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); + setOperationAction(ISD::FROUNDEVEN, VT, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); } @@ -1737,17 +1737,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (Subtarget.hasVBMI2()) { - for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, - MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { + for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, + MVT::v16i16, MVT::v8i32, MVT::v4i64, + MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } - - setOperationAction(ISD::ROTL, MVT::v32i16, Custom); - setOperationAction(ISD::ROTR, MVT::v8i16, Custom); - setOperationAction(ISD::ROTR, MVT::v16i16, Custom); - setOperationAction(ISD::ROTR, MVT::v32i16, Custom); + + setOperationAction(ISD::ROTL, MVT::v32i16, Custom); + setOperationAction(ISD::ROTR, MVT::v8i16, Custom); + setOperationAction(ISD::ROTR, MVT::v16i16, Custom); + setOperationAction(ISD::ROTR, MVT::v32i16, Custom); } }// useAVX512Regs @@ -1919,10 +1919,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } - if (Subtarget.hasAMXTILE()) { - addRegisterClass(MVT::x86amx, &X86::TILERegClass); - } - + if (Subtarget.hasAMXTILE()) { + addRegisterClass(MVT::x86amx, &X86::TILERegClass); + } + // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -1952,8 +1952,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); - setOperationAction(ISD::SADDO_CARRY, VT, Custom); - setOperationAction(ISD::SSUBO_CARRY, VT, Custom); + setOperationAction(ISD::SADDO_CARRY, VT, Custom); + setOperationAction(ISD::SSUBO_CARRY, VT, Custom); } if (!Subtarget.is64Bit()) { @@ -2507,21 +2507,21 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. return SegmentOffset(IRB, 0x10, getAddressSpace()); } else { - unsigned AddressSpace = getAddressSpace(); - // Specially, some users may customize the base reg and offset. - unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset; - // If we don't set -stack-protector-guard-offset value: + unsigned AddressSpace = getAddressSpace(); + // Specially, some users may customize the base reg and offset. + unsigned Offset = getTargetMachine().Options.StackProtectorGuardOffset; + // If we don't set -stack-protector-guard-offset value: // %fs:0x28, unless we're using a Kernel code model, in which case // it's %gs:0x28. gs:0x14 on i386. - if (Offset == (unsigned)-1) - Offset = (Subtarget.is64Bit()) ? 
0x28 : 0x14; - - const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg; - if (GuardReg == "fs") - AddressSpace = X86AS::FS; - else if (GuardReg == "gs") - AddressSpace = X86AS::GS; - return SegmentOffset(IRB, Offset, AddressSpace); + if (Offset == (unsigned)-1) + Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; + + const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg; + if (GuardReg == "fs") + AddressSpace = X86AS::FS; + else if (GuardReg == "gs") + AddressSpace = X86AS::GS; + return SegmentOffset(IRB, Offset, AddressSpace); } } return TargetLowering::getIRStackGuard(IRB); @@ -2545,13 +2545,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { } return; } - - auto GuardMode = getTargetMachine().Options.StackProtectorGuard; - + + auto GuardMode = getTargetMachine().Options.StackProtectorGuard; + // glibc, bionic, and Fuchsia have a special slot for the stack guard. - if ((GuardMode == llvm::StackProtectorGuards::TLS || - GuardMode == llvm::StackProtectorGuards::None) - && hasStackGuardSlotTLS(Subtarget.getTargetTriple())) + if ((GuardMode == llvm::StackProtectorGuards::TLS || + GuardMode == llvm::StackProtectorGuards::None) + && hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } @@ -3101,9 +3101,9 @@ SDValue X86TargetLowering::LowerCallResult( // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); - if (VA.isExtInLoc()) { + if (VA.isExtInLoc()) { if (VA.getValVT().isVector() && - VA.getValVT().getScalarType() == MVT::i1 && + VA.getValVT().getScalarType() == MVT::i1 && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 @@ -3171,7 +3171,7 @@ argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) { static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { - SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl); + SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl); return DAG.getMemcpy( Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), @@ -3420,8 +3420,8 @@ private: void forwardMustTailParameters(SDValue &Chain); - bool is64Bit() const { return Subtarget.is64Bit(); } - bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); } + bool is64Bit() const { return Subtarget.is64Bit(); } + bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); } X86MachineFunctionInfo *FuncInfo; const SDLoc &DL; @@ -3532,10 +3532,10 @@ void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back( - DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32)); + DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32)); SaveXMMOps.push_back( - DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32)); - llvm::append_range(SaveXMMOps, LiveXMMRegs); + DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32)); + llvm::append_range(SaveXMMOps, LiveXMMRegs); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL, MVT::Other, SaveXMMOps)); } @@ -3809,7 +3809,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to 
accommodate holding this slot at the correct offset). - int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); + int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -3916,7 +3916,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB); bool HasNoCfCheck = (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()); - bool IsIndirectCall = (CI && CI->isIndirectCall()); + bool IsIndirectCall = (CI && CI->isIndirectCall()); const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); @@ -4156,13 +4156,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT - // GOT pointer (except regcall). + // GOT pointer (except regcall). if (!isTailCall) { - // Indirect call with RegCall calling convertion may use up all the - // general registers, so it is not suitable to bind EBX reister for - // GOT address, just let register allocator handle it. - if (CallConv != CallingConv::X86_RegCall) - RegsToPass.push_back(std::make_pair( + // Indirect call with RegCall calling convertion may use up all the + // general registers, so it is not suitable to bind EBX reister for + // GOT address, just let register allocator handle it. + if (CallConv != CallingConv::X86_RegCall) + RegsToPass.push_back(std::make_pair( Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { @@ -4329,7 +4329,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(Callee); if (isTailCall) - Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32)); + Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. @@ -4403,7 +4403,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return Ret; } - if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) { + if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) { Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); } else { Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); @@ -4522,7 +4522,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!VR.isVirtual()) + if (!VR.isVirtual()) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -4574,8 +4574,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) return false; - if (VA.getLocVT().getFixedSizeInBits() > - Arg.getValueSizeInBits().getFixedSize()) { + if (VA.getLocVT().getFixedSizeInBits() > + Arg.getValueSizeInBits().getFixedSize()) { // If the argument location is wider than the argument type, check that any // extension flags match. 
if (Flags.isZExt() != MFI.isObjectZExt(FI) || @@ -5083,47 +5083,47 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { - Info.flags = MachineMemOperand::MONone; - Info.offset = 0; + Info.flags = MachineMemOperand::MONone; + Info.offset = 0; const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); - if (!IntrData) { - switch (Intrinsic) { - case Intrinsic::x86_aesenc128kl: - case Intrinsic::x86_aesdec128kl: - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = I.getArgOperand(1); - Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - case Intrinsic::x86_aesenc256kl: - case Intrinsic::x86_aesdec256kl: - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = I.getArgOperand(1); - Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - case Intrinsic::x86_aesencwide128kl: - case Intrinsic::x86_aesdecwide128kl: - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = I.getArgOperand(0); - Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - case Intrinsic::x86_aesencwide256kl: - case Intrinsic::x86_aesdecwide256kl: - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.ptrVal = I.getArgOperand(0); - Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); - Info.align = Align(1); - Info.flags |= MachineMemOperand::MOLoad; - return true; - } + if (!IntrData) { + switch (Intrinsic) { + case Intrinsic::x86_aesenc128kl: + case Intrinsic::x86_aesdec128kl: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(1); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::x86_aesenc256kl: + case Intrinsic::x86_aesdec256kl: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(1); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::x86_aesencwide128kl: + case Intrinsic::x86_aesdecwide128kl: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(0); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::x86_aesencwide256kl: + case Intrinsic::x86_aesdecwide256kl: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = I.getArgOperand(0); + Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); + Info.align = Align(1); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } return false; - } + } switch (IntrData->Type) { case TRUNCATE_TO_MEM_VI8: @@ -5193,7 +5193,7 @@ bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow"); - + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); @@ -5366,7 +5366,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, // width. 
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) return false; - + return true; } @@ -5510,14 +5510,14 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); } -/// Return true if every element in Mask is the undef sentinel value or equal to -/// the specified value.. -static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) { - return llvm::all_of(Mask, [CmpVal](int M) { - return (M == SM_SentinelUndef) || (M == CmpVal); - }); -} - +/// Return true if every element in Mask is the undef sentinel value or equal to +/// the specified value.. +static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) { + return llvm::all_of(Mask, [CmpVal](int M) { + return (M == SM_SentinelUndef) || (M == CmpVal); + }); +} + /// Val is either the undef or zero sentinel value. static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); @@ -5924,7 +5924,7 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { - assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() && + assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && "Unsupported vector widening type"); SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl) @@ -6288,22 +6288,22 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { return DAG.getBitcast(VT, Vec); } -// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode. -static unsigned getOpcode_EXTEND(unsigned Opcode) { - switch (Opcode) { - case ISD::ANY_EXTEND: - case ISD::ANY_EXTEND_VECTOR_INREG: - return ISD::ANY_EXTEND; - case ISD::ZERO_EXTEND: - case ISD::ZERO_EXTEND_VECTOR_INREG: - return ISD::ZERO_EXTEND; - case ISD::SIGN_EXTEND: - case ISD::SIGN_EXTEND_VECTOR_INREG: - return ISD::SIGN_EXTEND; - } - llvm_unreachable("Unknown opcode"); -} - +// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode. +static unsigned getOpcode_EXTEND(unsigned Opcode) { + switch (Opcode) { + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: + return ISD::ANY_EXTEND; + case ISD::ZERO_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + return ISD::ZERO_EXTEND; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: + return ISD::SIGN_EXTEND; + } + llvm_unreachable("Unknown opcode"); +} + // Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode. 
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { switch (Opcode) { @@ -6320,8 +6320,8 @@ static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) { llvm_unreachable("Unknown opcode"); } -static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue In, SelectionDAG &DAG) { +static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, + SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || @@ -6373,10 +6373,10 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) { return SDValue(); } -void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, +void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo, bool Unary) { - assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 && - "Illegal vector type to unpack"); + assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 && + "Illegal vector type to unpack"); assert(Mask.empty() && "Expected an empty shuffle mask vector"); int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); @@ -6405,7 +6405,7 @@ void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, } /// Returns a vector_shuffle node for an unpackl operation. -static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, +static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector<int, 8> Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); @@ -6413,7 +6413,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, } /// Returns a vector_shuffle node for an unpackh operation. -static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, +static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector<int, 8> Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); @@ -6660,30 +6660,30 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Extract constant bits from a subvector broadcast. 
- if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { - auto *MemIntr = cast<MemIntrinsicSDNode>(Op); - SDValue Ptr = MemIntr->getBasePtr(); - if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) { - Type *CstTy = Cst->getType(); - unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); - if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0) - return false; - unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits(); - unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits; - unsigned NumSubVecs = SizeInBits / CstSizeInBits; - APInt UndefSubElts(NumSubElts, 0); - SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs, - APInt(SubEltSizeInBits, 0)); - for (unsigned i = 0; i != NumSubElts; ++i) { - if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i], - UndefSubElts, i)) - return false; - for (unsigned j = 1; j != NumSubVecs; ++j) - SubEltBits[i + (j * NumSubElts)] = SubEltBits[i]; - } - UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(), - UndefSubElts); - return CastBitData(UndefSubElts, SubEltBits); + if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op); + SDValue Ptr = MemIntr->getBasePtr(); + if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) { + Type *CstTy = Cst->getType(); + unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); + if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0) + return false; + unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits(); + unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits; + unsigned NumSubVecs = SizeInBits / CstSizeInBits; + APInt UndefSubElts(NumSubElts, 0); + SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs, + APInt(SubEltSizeInBits, 0)); + for (unsigned i = 0; i != NumSubElts; ++i) { + if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i], + UndefSubElts, i)) + return false; + for (unsigned j = 1; j != NumSubVecs; ++j) + SubEltBits[i + (j * NumSubElts)] = SubEltBits[i]; + } + UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(), + UndefSubElts); + return CastBitData(UndefSubElts, SubEltBits); } } @@ -6704,26 +6704,26 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, // Insert constant bits from a base and sub vector sources. if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) { - // If bitcasts to larger elements we might lose track of undefs - don't - // allow any to be safe. - unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); - bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits; - - APInt UndefSrcElts, UndefSubElts; - SmallVector<APInt, 32> EltSrcBits, EltSubBits; - if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits, + // If bitcasts to larger elements we might lose track of undefs - don't + // allow any to be safe. 
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); + bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits; + + APInt UndefSrcElts, UndefSubElts; + SmallVector<APInt, 32> EltSrcBits, EltSubBits; + if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits, UndefSubElts, EltSubBits, - AllowWholeUndefs && AllowUndefs, - AllowPartialUndefs && AllowUndefs) && - getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits, - UndefSrcElts, EltSrcBits, - AllowWholeUndefs && AllowUndefs, - AllowPartialUndefs && AllowUndefs)) { + AllowWholeUndefs && AllowUndefs, + AllowPartialUndefs && AllowUndefs) && + getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits, + UndefSrcElts, EltSrcBits, + AllowWholeUndefs && AllowUndefs, + AllowPartialUndefs && AllowUndefs)) { unsigned BaseIdx = Op.getConstantOperandVal(2); - UndefSrcElts.insertBits(UndefSubElts, BaseIdx); + UndefSrcElts.insertBits(UndefSubElts, BaseIdx); for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) - EltSrcBits[BaseIdx + i] = EltSubBits[i]; - return CastBitData(UndefSrcElts, EltSrcBits); + EltSrcBits[BaseIdx + i] = EltSubBits[i]; + return CastBitData(UndefSrcElts, EltSrcBits); } } @@ -6836,7 +6836,7 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, return false; // Insert the extracted elements into the mask. - for (const APInt &Elt : EltBits) + for (const APInt &Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; @@ -7517,8 +7517,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, case ISD::OR: { // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. - SDValue N0 = peekThroughBitcasts(N.getOperand(0)); - SDValue N1 = peekThroughBitcasts(N.getOperand(1)); + SDValue N0 = peekThroughBitcasts(N.getOperand(0)); + SDValue N1 = peekThroughBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector<int, 64> SrcMask0, SrcMask1; @@ -7533,20 +7533,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVector<int, 64> Mask0, Mask1; narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0); narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1); - for (int i = 0; i != (int)MaskSize; ++i) { + for (int i = 0; i != (int)MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) - Mask.push_back(i); + Mask.push_back(i); else if (Mask0[i] == SM_SentinelZero) - Mask.push_back(i + MaskSize); + Mask.push_back(i + MaskSize); else return false; } - Ops.push_back(N0); - Ops.push_back(N1); + Ops.push_back(N0); + Ops.push_back(N1); return true; } case ISD::INSERT_SUBVECTOR: { @@ -7578,8 +7578,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Subvector shuffle inputs must not be larger than the subvector. 
if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) { - return SubVT.getFixedSizeInBits() < - SubInput.getValueSizeInBits().getFixedSize(); + return SubVT.getFixedSizeInBits() < + SubInput.getValueSizeInBits().getFixedSize(); })) return false; @@ -7600,11 +7600,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } Ops.push_back(Src); Ops.append(SubInputs.begin(), SubInputs.end()); - if (ISD::isBuildVectorAllZeros(Src.getNode())) - Mask.append(NumElts, SM_SentinelZero); - else - for (int i = 0; i != (int)NumElts; ++i) - Mask.push_back(i); + if (ISD::isBuildVectorAllZeros(Src.getNode())) + Mask.append(NumElts, SM_SentinelZero); + else + for (int i = 0; i != (int)NumElts; ++i) + Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { @@ -7705,33 +7705,33 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, APInt EltsLHS, EltsRHS; getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); - // If we know input saturation won't happen (or we don't care for particular - // lanes), we can treat this as a truncation shuffle. - bool Offset0 = false, Offset1 = false; + // If we know input saturation won't happen (or we don't care for particular + // lanes), we can treat this as a truncation shuffle. + bool Offset0 = false, Offset1 = false; if (Opcode == X86ISD::PACKSS) { - if ((!(N0.isUndef() || EltsLHS.isNullValue()) && + if ((!(N0.isUndef() || EltsLHS.isNullValue()) && DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || - (!(N1.isUndef() || EltsRHS.isNullValue()) && + (!(N1.isUndef() || EltsRHS.isNullValue()) && DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; - // We can't easily fold ASHR into a shuffle, but if it was feeding a - // PACKSS then it was likely being used for sign-extension for a - // truncation, so just peek through and adjust the mask accordingly. - if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) && - N0.getConstantOperandAPInt(1) == NumBitsPerElt) { - Offset0 = true; - N0 = N0.getOperand(0); - } - if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) && - N1.getConstantOperandAPInt(1) == NumBitsPerElt) { - Offset1 = true; - N1 = N1.getOperand(0); - } + // We can't easily fold ASHR into a shuffle, but if it was feeding a + // PACKSS then it was likely being used for sign-extension for a + // truncation, so just peek through and adjust the mask accordingly. 
+ if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) && + N0.getConstantOperandAPInt(1) == NumBitsPerElt) { + Offset0 = true; + N0 = N0.getOperand(0); + } + if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) && + N1.getConstantOperandAPInt(1) == NumBitsPerElt) { + Offset1 = true; + N1 = N1.getOperand(0); + } } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!(N0.isUndef() || EltsLHS.isNullValue()) && + if ((!(N0.isUndef() || EltsLHS.isNullValue()) && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || - (!(N1.isUndef() || EltsRHS.isNullValue()) && + (!(N1.isUndef() || EltsRHS.isNullValue()) && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } @@ -7743,13 +7743,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); - - if (Offset0 || Offset1) { - for (int &M : Mask) - if ((Offset0 && isInRange(M, 0, NumElts)) || - (Offset1 && isInRange(M, NumElts, 2 * NumElts))) - ++M; - } + + if (Offset0 || Offset1) { + for (int &M : Mask) + if ((Offset0 && isInRange(M, 0, NumElts)) || + (Offset1 && isInRange(M, NumElts, 2 * NumElts))) + ++M; + } return true; } case X86ISD::VTRUNC: { @@ -8037,7 +8037,7 @@ static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, } // Use PINSRB/PINSRW/PINSRD to create a build vector. -static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask, +static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8052,7 +8052,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask, bool First = true; for (unsigned i = 0; i < NumElts; ++i) { - bool IsNonZero = NonZeroMask[i]; + bool IsNonZero = NonZeroMask[i]; if (!IsNonZero) continue; @@ -8079,7 +8079,7 @@ static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask, } /// Custom lower build_vector of v16i8. -static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask, +static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8088,7 +8088,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask, // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) - return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG, + return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget); SDLoc dl(Op); @@ -8096,8 +8096,8 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask, // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; i += 2) { - bool ThisIsNonZero = NonZeroMask[i]; - bool NextIsNonZero = NonZeroMask[i + 1]; + bool ThisIsNonZero = NonZeroMask[i]; + bool NextIsNonZero = NonZeroMask[i + 1]; if (!ThisIsNonZero && !NextIsNonZero) continue; @@ -8145,7 +8145,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask, } /// Custom lower build_vector of v8i16. 
-static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask, +static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8153,7 +8153,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask, return SDValue(); // Use PINSRW to insert each byte directly. - return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG, + return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget); } @@ -8487,8 +8487,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; - int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt); - int LoadSizeInBits = NumLoadedElts * BaseSizeInBits; + int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt); + int LoadSizeInBits = NumLoadedElts * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // TODO: Support offsetting the base load. @@ -8550,7 +8550,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // base pointer. If the vector contains zeros, then attempt to shuffle those // elements. if (FirstLoadedElt == 0 && - (NumLoadedElts == (int)NumElems || IsDereferenceable) && + (NumLoadedElts == (int)NumElems || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); @@ -8638,11 +8638,11 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, if (!Subtarget.hasAVX2() && ScalarSize < 32) continue; - // Don't attempt a 1:N subvector broadcast - it should be caught by - // combineConcatVectorOps, else will cause infinite loops. - if (RepeatSize > ScalarSize && SubElems == 1) - continue; - + // Don't attempt a 1:N subvector broadcast - it should be caught by + // combineConcatVectorOps, else will cause infinite loops. 
+ if (RepeatSize > ScalarSize && SubElems == 1) + continue; + bool Match = true; SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); for (unsigned i = 0; i != NumElems && Match; ++i) { @@ -8674,14 +8674,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { - SDValue Broadcast = RepeatLoad; - if (RepeatSize > ScalarSize) { - while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) - Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL); - } else { - Broadcast = - DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad); - } + SDValue Broadcast = RepeatLoad; + if (RepeatSize > ScalarSize) { + while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) + Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL); + } else { + Broadcast = + DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad); + } return DAG.getBitcast(VT, Broadcast); } } @@ -8769,21 +8769,21 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, return SDValue(); MVT VT = BVOp->getSimpleValueType(0); - unsigned NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); SDLoc dl(BVOp); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); - // See if the build vector is a repeating sequence of scalars (inc. splat). - SDValue Ld; + // See if the build vector is a repeating sequence of scalars (inc. splat). + SDValue Ld; BitVector UndefElements; - SmallVector<SDValue, 16> Sequence; - if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) { - assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit."); - if (Sequence.size() == 1) - Ld = Sequence[0]; - } + SmallVector<SDValue, 16> Sequence; + if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) { + assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit."); + if (Sequence.size() == 1) + Ld = Sequence[0]; + } // Attempt to use VBROADCASTM // From this pattern: @@ -8791,34 +8791,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) - if (!Sequence.empty() && Subtarget.hasCDI()) { - // If not a splat, are the upper sequence values zeroable? - unsigned SeqLen = Sequence.size(); - bool UpperZeroOrUndef = - SeqLen == 1 || - llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) { - return !V || V.isUndef() || isNullConstant(V); - }); - SDValue Op0 = Sequence[0]; - if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) || - (Op0.getOpcode() == ISD::ZERO_EXTEND && - Op0.getOperand(0).getOpcode() == ISD::BITCAST))) { - SDValue BOperand = Op0.getOpcode() == ISD::BITCAST - ? Op0.getOperand(0) - : Op0.getOperand(0).getOperand(0); + if (!Sequence.empty() && Subtarget.hasCDI()) { + // If not a splat, are the upper sequence values zeroable? + unsigned SeqLen = Sequence.size(); + bool UpperZeroOrUndef = + SeqLen == 1 || + llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) { + return !V || V.isUndef() || isNullConstant(V); + }); + SDValue Op0 = Sequence[0]; + if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) || + (Op0.getOpcode() == ISD::ZERO_EXTEND && + Op0.getOperand(0).getOpcode() == ISD::BITCAST))) { + SDValue BOperand = Op0.getOpcode() == ISD::BITCAST + ? 
Op0.getOperand(0) + : Op0.getOperand(0).getOperand(0); MVT MaskVT = BOperand.getSimpleValueType(); - MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen); - if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q + MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen); + if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d - MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen); - if (!VT.is512BitVector() && !Subtarget.hasVLX()) { - unsigned Scale = 512 / VT.getSizeInBits(); - BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen)); - } - SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand); - if (BcstVT.getSizeInBits() != VT.getSizeInBits()) - Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits()); - return DAG.getBitcast(VT, Bcst); + MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen); + if (!VT.is512BitVector() && !Subtarget.hasVLX()) { + unsigned Scale = 512 / VT.getSizeInBits(); + BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen)); + } + SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand); + if (BcstVT.getSizeInBits() != VT.getSizeInBits()) + Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits()); + return DAG.getBitcast(VT, Bcst); } } } @@ -8868,15 +8868,15 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); - MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm); + MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm); Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign(); - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = {DAG.getEntryNode(), VCP}; - MachinePointerInfo MPI = - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); - return DAG.getMemIntrinsicNode( - X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment, - MachineMemOperand::MOLoad); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), VCP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + return DAG.getMemIntrinsicNode( + X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment, + MachineMemOperand::MOLoad); } } } @@ -8897,8 +8897,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - // TODO: Handle broadcasts of non-constant sequences. - + // TODO: Handle broadcasts of non-constant sequences. + // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. // FIXME: Is the use count needed for non-constant, non-load case? 
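
The hunk above ends inside lowerBuildVectorAsBroadcast, which now asks BuildVectorSDNode::getRepeatedSequence whether the build_vector operands form a repeating run of scalars (a splat is just the length-1 case) before choosing between VBROADCAST, VBROADCASTM and the SUBV_BROADCAST_LOAD constant-pool path. As a rough, self-contained model of that repeated-sequence test (plain ints with -1 standing in for undef, and a helper name chosen here for illustration rather than LLVM's real API), the core check looks like this:

#include <cstdio>
#include <vector>

// Sketch only: does Elts consist of a length-SeqLen pattern repeated to fill
// the whole vector?  -1 plays the role of an undef element and matches any
// value in the pattern.
static bool isRepeatedSequence(const std::vector<int> &Elts, unsigned SeqLen,
                               std::vector<int> &Seq) {
  if (SeqLen == 0 || Elts.size() % SeqLen != 0)
    return false;
  Seq.assign(SeqLen, -1);
  for (unsigned i = 0; i != Elts.size(); ++i) {
    int V = Elts[i];
    if (V == -1)
      continue;                 // undef matches every repetition
    int &S = Seq[i % SeqLen];
    if (S == -1)
      S = V;                    // first defined value fixes this slot
    else if (S != V)
      return false;             // disagrees with an earlier repetition
  }
  return true;
}

int main() {
  std::vector<int> Elts = {1, 2, 1, 2, -1, 2, 1, -1};
  std::vector<int> Seq;
  if (isRepeatedSequence(Elts, 2, Seq))
    std::printf("length-2 repeat: {%d, %d}\n", Seq[0], Seq[1]); // {1, 2}
  std::vector<int> Tmp;
  std::printf("splat? %s\n",
              isRepeatedSequence(Elts, 1, Tmp) ? "yes" : "no"); // "no"
  return 0;
}

In the lowering itself, a sequence of size one becomes the broadcast source Ld, while longer repeats feed the subvector-broadcast constant-pool load and the concatSubVectors widening seen in the hunk above.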
@@ -10233,69 +10233,69 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return VectorConstant; unsigned EVTBits = EltVT.getSizeInBits(); - APInt UndefMask = APInt::getNullValue(NumElems); - APInt ZeroMask = APInt::getNullValue(NumElems); - APInt NonZeroMask = APInt::getNullValue(NumElems); + APInt UndefMask = APInt::getNullValue(NumElems); + APInt ZeroMask = APInt::getNullValue(NumElems); + APInt NonZeroMask = APInt::getNullValue(NumElems); bool IsAllConstants = true; SmallSet<SDValue, 8> Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); - if (Elt.isUndef()) { - UndefMask.setBit(i); + if (Elt.isUndef()) { + UndefMask.setBit(i); continue; - } + } Values.insert(Elt); if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) { IsAllConstants = false; NumConstants--; } - if (X86::isZeroNode(Elt)) { - ZeroMask.setBit(i); - } else { - NonZeroMask.setBit(i); + if (X86::isZeroNode(Elt)) { + ZeroMask.setBit(i); + } else { + NonZeroMask.setBit(i); } } - // All undef vector. Return an UNDEF. All zero vectors were handled above. - if (NonZeroMask == 0) { - assert(UndefMask.isAllOnesValue() && "Fully undef mask expected"); + // All undef vector. Return an UNDEF. All zero vectors were handled above. + if (NonZeroMask == 0) { + assert(UndefMask.isAllOnesValue() && "Fully undef mask expected"); return DAG.getUNDEF(VT); - } - - BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); - - // If the upper elts of a ymm/zmm are undef/zero then we might be better off - // lowering to a smaller build vector and padding with undef/zero. - if ((VT.is256BitVector() || VT.is512BitVector()) && - !isFoldableUseOfShuffle(BV)) { - unsigned UpperElems = NumElems / 2; - APInt UndefOrZeroMask = UndefMask | ZeroMask; - unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes(); - if (NumUpperUndefsOrZeros >= UpperElems) { - if (VT.is512BitVector() && - NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4))) - UpperElems = NumElems - (NumElems / 4); - bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems; - MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems); - SDValue NewBV = - DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems)); - return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl); - } - } - - if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) - return AddSub; - if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) - return HorizontalOp; - if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) - return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) - return BitOp; - - unsigned NumZero = ZeroMask.countPopulation(); - unsigned NumNonZero = NonZeroMask.countPopulation(); - + } + + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); + + // If the upper elts of a ymm/zmm are undef/zero then we might be better off + // lowering to a smaller build vector and padding with undef/zero. 
+ if ((VT.is256BitVector() || VT.is512BitVector()) && + !isFoldableUseOfShuffle(BV)) { + unsigned UpperElems = NumElems / 2; + APInt UndefOrZeroMask = UndefMask | ZeroMask; + unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes(); + if (NumUpperUndefsOrZeros >= UpperElems) { + if (VT.is512BitVector() && + NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4))) + UpperElems = NumElems - (NumElems / 4); + bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems; + MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems); + SDValue NewBV = + DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems)); + return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl); + } + } + + if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) + return AddSub; + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + return HorizontalOp; + if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) + return Broadcast; + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) + return BitOp; + + unsigned NumZero = ZeroMask.countPopulation(); + unsigned NumNonZero = NonZeroMask.countPopulation(); + // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not @@ -10358,7 +10358,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { - unsigned Idx = NonZeroMask.countTrailingZeros(); + unsigned Idx = NonZeroMask.countTrailingZeros(); SDValue Item = Op.getOperand(Idx); // If we have a constant or non-constant insertion into the low element of @@ -10422,7 +10422,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> - unsigned Idx = NonZeroMask.countTrailingZeros(); + unsigned Idx = NonZeroMask.countTrailingZeros(); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); @@ -10491,7 +10491,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. - unsigned Idx = NonZeroMask.countTrailingZeros(); + unsigned Idx = NonZeroMask.countTrailingZeros(); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); @@ -10501,12 +10501,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. 
if (EVTBits == 8 && NumElems == 16) - if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero, + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) - if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero, + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; @@ -10519,7 +10519,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (NumElems == 4 && NumZero > 0) { SmallVector<SDValue, 8> Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { - bool isZero = !NonZeroMask[i]; + bool isZero = !NonZeroMask[i]; if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -10527,7 +10527,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } for (unsigned i = 0; i < 2; ++i) { - switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) { + switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. @@ -10544,8 +10544,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2; - bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2; + bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2; + bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, @@ -10817,35 +10817,35 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask); } -/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come -/// from multiple lanes - this is different to isLaneCrossingShuffleMask to -/// better support 'repeated mask + lane permute' style shuffles. -static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, - unsigned ScalarSizeInBits, - ArrayRef<int> Mask) { - assert(LaneSizeInBits && ScalarSizeInBits && - (LaneSizeInBits % ScalarSizeInBits) == 0 && - "Illegal shuffle lane size"); - int NumElts = Mask.size(); - int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits; - int NumLanes = NumElts / NumEltsPerLane; - if (NumLanes > 1) { - for (int i = 0; i != NumLanes; ++i) { - int SrcLane = -1; - for (int j = 0; j != NumEltsPerLane; ++j) { - int M = Mask[(i * NumEltsPerLane) + j]; - if (M < 0) - continue; - int Lane = (M % NumElts) / NumEltsPerLane; - if (SrcLane >= 0 && SrcLane != Lane) - return true; - SrcLane = Lane; - } - } - } - return false; -} - +/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come +/// from multiple lanes - this is different to isLaneCrossingShuffleMask to +/// better support 'repeated mask + lane permute' style shuffles. 
+static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, + unsigned ScalarSizeInBits, + ArrayRef<int> Mask) { + assert(LaneSizeInBits && ScalarSizeInBits && + (LaneSizeInBits % ScalarSizeInBits) == 0 && + "Illegal shuffle lane size"); + int NumElts = Mask.size(); + int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits; + int NumLanes = NumElts / NumEltsPerLane; + if (NumLanes > 1) { + for (int i = 0; i != NumLanes; ++i) { + int SrcLane = -1; + for (int j = 0; j != NumEltsPerLane; ++j) { + int M = Mask[(i * NumEltsPerLane) + j]; + if (M < 0) + continue; + int Lane = (M % NumElts) / NumEltsPerLane; + if (SrcLane >= 0 && SrcLane != Lane) + return true; + SrcLane = Lane; + } + } + } + return false; +} + /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same @@ -10907,11 +10907,11 @@ is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. -static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, - unsigned EltSizeInBits, +static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, + unsigned EltSizeInBits, ArrayRef<int> Mask, SmallVectorImpl<int> &RepeatedMask) { - int LaneSize = LaneSizeInBits / EltSizeInBits; + int LaneSize = LaneSizeInBits / EltSizeInBits; RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { @@ -10942,67 +10942,67 @@ static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, return true; } -/// Test whether a target shuffle mask is equivalent within each sub-lane. -/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. -static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, - ArrayRef<int> Mask, - SmallVectorImpl<int> &RepeatedMask) { - return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(), - Mask, RepeatedMask); -} - -/// Checks whether the vector elements referenced by two shuffle masks are -/// equivalent. -static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, - int Idx, int ExpectedIdx) { - assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx && - ExpectedIdx < MaskSize && "Out of range element index"); - if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode()) - return false; - - switch (Op.getOpcode()) { - case ISD::BUILD_VECTOR: - // If the values are build vectors, we can look through them to find - // equivalent inputs that make the shuffles equivalent. - // TODO: Handle MaskSize != Op.getNumOperands()? - if (MaskSize == (int)Op.getNumOperands() && - MaskSize == (int)ExpectedOp.getNumOperands()) - return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx); - break; - case X86ISD::VBROADCAST: - case X86ISD::VBROADCAST_LOAD: - // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()? - return (Op == ExpectedOp && - (int)Op.getValueType().getVectorNumElements() == MaskSize); - case X86ISD::HADD: - case X86ISD::HSUB: - case X86ISD::FHADD: - case X86ISD::FHSUB: - case X86ISD::PACKSS: - case X86ISD::PACKUS: - // HOP(X,X) can refer to the elt from the lower/upper half of a lane. - // TODO: Handle MaskSize != NumElts? - // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases. 
- if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) { - MVT VT = Op.getSimpleValueType(); - int NumElts = VT.getVectorNumElements(); - if (MaskSize == NumElts) { - int NumLanes = VT.getSizeInBits() / 128; - int NumEltsPerLane = NumElts / NumLanes; - int NumHalfEltsPerLane = NumEltsPerLane / 2; - bool SameLane = - (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane); - bool SameElt = - (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane); - return SameLane && SameElt; - } - } - break; - } - - return false; -} - +/// Test whether a target shuffle mask is equivalent within each sub-lane. +/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. +static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, + ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(), + Mask, RepeatedMask); +} + +/// Checks whether the vector elements referenced by two shuffle masks are +/// equivalent. +static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, + int Idx, int ExpectedIdx) { + assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx && + ExpectedIdx < MaskSize && "Out of range element index"); + if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode()) + return false; + + switch (Op.getOpcode()) { + case ISD::BUILD_VECTOR: + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + // TODO: Handle MaskSize != Op.getNumOperands()? + if (MaskSize == (int)Op.getNumOperands() && + MaskSize == (int)ExpectedOp.getNumOperands()) + return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx); + break; + case X86ISD::VBROADCAST: + case X86ISD::VBROADCAST_LOAD: + // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()? + return (Op == ExpectedOp && + (int)Op.getValueType().getVectorNumElements() == MaskSize); + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: + case X86ISD::PACKSS: + case X86ISD::PACKUS: + // HOP(X,X) can refer to the elt from the lower/upper half of a lane. + // TODO: Handle MaskSize != NumElts? + // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases. + if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) { + MVT VT = Op.getSimpleValueType(); + int NumElts = VT.getVectorNumElements(); + if (MaskSize == NumElts) { + int NumLanes = VT.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + int NumHalfEltsPerLane = NumEltsPerLane / 2; + bool SameLane = + (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane); + bool SameElt = + (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane); + return SameLane && SameElt; + } + } + break; + } + + return false; +} + /// Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// @@ -11013,23 +11013,23 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. 
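[Editor's illustrative aside] The restored isMultiLaneShuffleMask helper above decides whether any destination lane gathers elements from more than one source lane, which is what forces the 'repeated mask + lane permute' style lowerings. A minimal standalone sketch of that check, with std::vector<int> standing in for ArrayRef<int>:

#include <cassert>
#include <cstdio>
#include <vector>

// Returns true if some destination lane pulls elements from more than one
// source lane (undef elements, encoded as -1, are ignored).
static bool isMultiLaneMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits,
                            const std::vector<int> &Mask) {
  assert(LaneSizeInBits % ScalarSizeInBits == 0 && "Illegal lane size");
  int NumElts = (int)Mask.size();
  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
  int NumLanes = NumElts / NumEltsPerLane;
  for (int i = 0; i != NumLanes; ++i) {
    int SrcLane = -1;
    for (int j = 0; j != NumEltsPerLane; ++j) {
      int M = Mask[i * NumEltsPerLane + j];
      if (M < 0)
        continue; // undef element, ignore
      int Lane = (M % NumElts) / NumEltsPerLane;
      if (SrcLane >= 0 && SrcLane != Lane)
        return true; // this destination lane mixes two source lanes
      SrcLane = Lane;
    }
  }
  return false;
}

int main() {
  // v8i32 with 128-bit lanes: <0,1,2,3,4,5,6,7> stays in-lane,
  // <0,4,2,6,...> mixes source lanes 0 and 1 inside the first lane.
  std::printf("%d\n", isMultiLaneMask(128, 32, {0, 1, 2, 3, 4, 5, 6, 7})); // 0
  std::printf("%d\n", isMultiLaneMask(128, 32, {0, 4, 2, 6, 1, 5, 3, 7})); // 1
  return 0;
}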
-static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask, - SDValue V1 = SDValue(), - SDValue V2 = SDValue()) { - int Size = Mask.size(); - if (Size != (int)ExpectedMask.size()) +static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask, + SDValue V1 = SDValue(), + SDValue V2 = SDValue()) { + int Size = Mask.size(); + if (Size != (int)ExpectedMask.size()) return false; for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); - int MaskIdx = Mask[i]; - int ExpectedIdx = ExpectedMask[i]; - if (0 <= MaskIdx && MaskIdx != ExpectedIdx) { - SDValue MaskV = MaskIdx < Size ? V1 : V2; - SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; - MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); - ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); - if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) + int MaskIdx = Mask[i]; + int ExpectedIdx = ExpectedMask[i]; + if (0 <= MaskIdx && MaskIdx != ExpectedIdx) { + SDValue MaskV = MaskIdx < Size ? V1 : V2; + SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; + MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); + ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); + if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) return false; } } @@ -11045,7 +11045,7 @@ static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask, /// /// SM_SentinelZero is accepted as a valid negative index but must match in /// both. -static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, +static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, ArrayRef<int> ExpectedMask, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { @@ -11059,23 +11059,23 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) return false; - // Don't use V1/V2 if they're not the same size as the shuffle mask type. - if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits()) - V1 = SDValue(); - if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits()) - V2 = SDValue(); + // Don't use V1/V2 if they're not the same size as the shuffle mask type. + if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits()) + V1 = SDValue(); + if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits()) + V2 = SDValue(); for (int i = 0; i < Size; ++i) { - int MaskIdx = Mask[i]; - int ExpectedIdx = ExpectedMask[i]; - if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx) + int MaskIdx = Mask[i]; + int ExpectedIdx = ExpectedMask[i]; + if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx) continue; - if (0 <= MaskIdx && 0 <= ExpectedIdx) { - SDValue MaskV = MaskIdx < Size ? V1 : V2; - SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; - MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); - ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); - if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) + if (0 <= MaskIdx && 0 <= ExpectedIdx) { + SDValue MaskV = MaskIdx < Size ? V1 : V2; + SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; + MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); + ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); + if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) continue; } // TODO - handle SM_Sentinel equivalences. @@ -11087,25 +11087,25 @@ static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask, // Attempt to create a shuffle mask from a VSELECT condition mask. 
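[Editor's illustrative aside] The core of isShuffleEquivalent above is an undef-tolerant comparison of a mask against an expected mask; the IsElementEquivalent look-through only kicks in when the indices differ but the referenced operands provably hold the same value. A sketch of just the comparison, without the SDValue look-through:

#include <cstdio>
#include <vector>

// -1 (undef) matches anything; a defined element must match exactly.
static bool maskMatches(const std::vector<int> &Mask,
                        const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue;
    if (Mask[i] != Expected[i])
      return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", maskMatches({0, -1, 4, -1}, {0, 1, 4, 5})); // 1
  std::printf("%d\n", maskMatches({0, 2, 4, 5}, {0, 1, 4, 5}));   // 0
  return 0;
}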
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask, SDValue Cond) { - EVT CondVT = Cond.getValueType(); - unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); - unsigned NumElts = CondVT.getVectorNumElements(); - - APInt UndefElts; - SmallVector<APInt, 32> EltBits; - if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, - true, false)) + EVT CondVT = Cond.getValueType(); + unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); + unsigned NumElts = CondVT.getVectorNumElements(); + + APInt UndefElts; + SmallVector<APInt, 32> EltBits; + if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, + true, false)) return false; - Mask.resize(NumElts, SM_SentinelUndef); + Mask.resize(NumElts, SM_SentinelUndef); - for (int i = 0; i != (int)NumElts; ++i) { + for (int i = 0; i != (int)NumElts; ++i) { Mask[i] = i; // Arbitrarily choose from the 2nd operand if the select condition element // is undef. // TODO: Can we do better by matching patterns such as even/odd? - if (UndefElts[i] || EltBits[i].isNullValue()) - Mask[i] += NumElts; + if (UndefElts[i] || EltBits[i].isNullValue()) + Mask[i] += NumElts; } return true; @@ -11123,8 +11123,8 @@ static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { SmallVector<int, 8> Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); - bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) || - isTargetShuffleEquivalent(VT, Mask, Unpckhwd)); + bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) || + isTargetShuffleEquivalent(VT, Mask, Unpckhwd)); return IsUnpackwdMask; } @@ -11141,8 +11141,8 @@ static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) { for (unsigned i = 0; i != 4; ++i) { SmallVector<int, 16> UnpackMask; createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); - if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) || - isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask)) + if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) || + isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask)) return true; } return false; @@ -11177,15 +11177,15 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); - // If the mask only uses one non-undef element, then fully 'splat' it to - // improve later broadcast matching. - int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin(); - assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask"); - - int FirstElt = Mask[FirstIndex]; - if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; })) - return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt; - + // If the mask only uses one non-undef element, then fully 'splat' it to + // improve later broadcast matching. + int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin(); + assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask"); + + int FirstElt = Mask[FirstIndex]; + if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; })) + return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt; + unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; @@ -11335,8 +11335,8 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // Attempt to match the target mask against the unpack lo/hi mask patterns. 
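[Editor's illustrative aside] The restored lines in getV4X86ShuffleImm above add a 'splat the single defined element' fast path before packing the 2-bits-per-element immediate. A standalone sketch of that encoding (undef slots default to their own index, exactly as in the original shifts):

#include <array>
#include <cassert>
#include <cstdio>

static unsigned v4ShuffleImm(const std::array<int, 4> &Mask) {
  // Find the first defined element.
  int FirstElt = -1;
  for (int M : Mask)
    if (M >= 0) { FirstElt = M; break; }
  assert(FirstElt >= 0 && "all-undef mask");

  // If every defined element equals FirstElt, fully splat it so later
  // broadcast matching sees a uniform immediate.
  bool AllSame = true;
  for (int M : Mask)
    AllSame &= (M < 0 || M == FirstElt);
  if (AllSame)
    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;

  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(Mask[i] < 0 ? i : Mask[i]) << (2 * i);
  return Imm;
}

int main() {
  std::array<int, 4> Splat = {2, -1, -1, 2};
  std::array<int, 4> Mixed = {3, 1, -1, 0};
  std::printf("0x%02x\n", v4ShuffleImm(Splat)); // 0xaa: broadcast of element 2
  std::printf("0x%02x\n", v4ShuffleImm(Mixed)); // 0x27: undef slot keeps its index
  return 0;
}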
SmallVector<int, 64> Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1, - (IsUnary ? V1 : V2))) { + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1, + (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); @@ -11344,8 +11344,8 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1, - (IsUnary ? V1 : V2))) { + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1, + (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); @@ -11383,14 +11383,14 @@ static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) { + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); - if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) { + if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; @@ -11407,21 +11407,21 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SelectionDAG &DAG) { SmallVector<int, 8> Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); - if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) + if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector<int, 8> Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); - if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) + if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); - if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) + if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); - if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) + if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); @@ -11437,9 +11437,9 @@ static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false); unsigned UnpackOpcode; - if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) + if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) UnpackOpcode = X86ISD::UNPCKL; - else if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) + else if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) UnpackOpcode = X86ISD::UNPCKH; else return SDValue(); @@ -11491,51 +11491,51 @@ static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, return false; } -// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper -// element padding to the final DstVT. 
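[Editor's illustrative aside] lowerShuffleWithUNPCK above probes the mask against generated UNPCKL/UNPCKH reference masks, commuting the operands if the direct match fails. A sketch restricted to a single 128-bit lane for brevity (the real createUnpackShuffleMask repeats this pattern per 128-bit lane):

#include <cstdio>
#include <vector>

// Build the unpack-lo/hi reference mask for one lane of NumElts elements.
static std::vector<int> makeUnpackMask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);                         // from V1
    Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // from V2 (or V1 again)
  }
  return Mask;
}

// Undef-tolerant comparison, as in isShuffleEquivalent.
static bool matches(const std::vector<int> &Mask, const std::vector<int> &Ref) {
  if (Mask.size() != Ref.size())
    return false;
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Ref[i])
      return false;
  return true;
}

int main() {
  // v4i32 binary unpck-lo reference mask is <0,4,1,5>.
  std::vector<int> Unpckl = makeUnpackMask(4, /*Lo=*/true, /*Unary=*/false);
  std::printf("%d\n", matches({0, -1, 1, 5}, Unpckl)); // 1
  std::printf("%d\n", matches({2, 6, 3, 7}, Unpckl));  // 0 (that's unpck-hi)
  return 0;
}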
-static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, - const X86Subtarget &Subtarget, - SelectionDAG &DAG, bool ZeroUppers) { - MVT SrcVT = Src.getSimpleValueType(); - MVT DstSVT = DstVT.getScalarType(); - unsigned NumDstElts = DstVT.getVectorNumElements(); - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits(); - - if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) - return SDValue(); - - // Perform a direct ISD::TRUNCATE if possible. - if (NumSrcElts == NumDstElts) - return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); - - if (NumSrcElts > NumDstElts) { - MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); - return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); - } - - if ((NumSrcElts * DstEltSizeInBits) >= 128) { - MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); - return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, - DstVT.getSizeInBits()); - } - - // Non-VLX targets must truncate from a 512-bit type, so we need to - // widen, truncate and then possibly extract the original subvector. - if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) { - SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512); - return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers); - } - - // Fallback to a X86ISD::VTRUNC, padding if necessary. - MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits); - SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src); - if (DstVT != TruncVT) - Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, - DstVT.getSizeInBits()); - return Trunc; +// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper +// element padding to the final DstVT. +static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, bool ZeroUppers) { + MVT SrcVT = Src.getSimpleValueType(); + MVT DstSVT = DstVT.getScalarType(); + unsigned NumDstElts = DstVT.getVectorNumElements(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits(); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) + return SDValue(); + + // Perform a direct ISD::TRUNCATE if possible. + if (NumSrcElts == NumDstElts) + return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); + + if (NumSrcElts > NumDstElts) { + MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); + return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); + } + + if ((NumSrcElts * DstEltSizeInBits) >= 128) { + MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); + return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, + DstVT.getSizeInBits()); + } + + // Non-VLX targets must truncate from a 512-bit type, so we need to + // widen, truncate and then possibly extract the original subvector. + if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) { + SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512); + return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers); + } + + // Fallback to a X86ISD::VTRUNC, padding if necessary. 
+ MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits); + SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src); + if (DstVT != TruncVT) + Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, + DstVT.getSizeInBits()); + return Trunc; } // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. @@ -11551,99 +11551,99 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 // t18: v2i64 = bitcast t51 // -// One can just use a single vpmovdw instruction, without avx512vl we need to -// use the zmm variant and extract the lower subvector, padding with zeroes. -// TODO: Merge with lowerShuffleAsVTRUNC. -static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type"); - if (!Subtarget.hasAVX512()) - return SDValue(); - - unsigned NumElts = VT.getVectorNumElements(); - unsigned EltSizeInBits = VT.getScalarSizeInBits(); - unsigned MaxScale = 64 / EltSizeInBits; - for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { - unsigned NumSrcElts = NumElts / Scale; - unsigned UpperElts = NumElts - NumSrcElts; - if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || - !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue()) - continue; - - SDValue Src = V1; - if (!Src.hasOneUse()) - return SDValue(); - - Src = peekThroughOneUseBitcasts(Src); - if (Src.getOpcode() != ISD::TRUNCATE || - Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale)) - return SDValue(); - Src = Src.getOperand(0); - - // VPMOVWB is only available with avx512bw. - MVT SrcVT = Src.getSimpleValueType(); - if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 && - !Subtarget.hasBWI()) +// One can just use a single vpmovdw instruction, without avx512vl we need to +// use the zmm variant and extract the lower subvector, padding with zeroes. +// TODO: Merge with lowerShuffleAsVTRUNC. +static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type"); + if (!Subtarget.hasAVX512()) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned MaxScale = 64 / EltSizeInBits; + for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { + unsigned NumSrcElts = NumElts / Scale; + unsigned UpperElts = NumElts - NumSrcElts; + if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || + !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue()) + continue; + + SDValue Src = V1; + if (!Src.hasOneUse()) + return SDValue(); + + Src = peekThroughOneUseBitcasts(Src); + if (Src.getOpcode() != ISD::TRUNCATE || + Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale)) return SDValue(); - - bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts); - return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); - } - - return SDValue(); -} - -// Attempt to match binary shuffle patterns as a truncate. 
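[Editor's illustrative aside] lowerShuffleWithVPMOV and lowerShuffleAsVTRUNC above both look for the same mask shape: the low NumSrcElts elements step through the source with stride Scale, and everything above them is undef or known zero. A sketch of that shape test, with a plain vector<bool> standing in for the Zeroable APInt (which already treats undef mask elements as zeroable):

#include <cstdio>
#include <vector>

static bool looksLikeTruncate(const std::vector<int> &Mask, unsigned Scale,
                              const std::vector<bool> &Zeroable) {
  unsigned NumElts = Mask.size();
  unsigned NumSrcElts = NumElts / Scale;
  // Low part must be <0, Scale, 2*Scale, ...>, with undef (-1) allowed.
  for (unsigned i = 0; i != NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != int(i * Scale))
      return false;
  // Upper part must be entirely undef/zero.
  for (unsigned i = NumSrcElts; i != NumElts; ++i)
    if (!Zeroable[i])
      return false;
  return true;
}

int main() {
  // v8i16 <0,2,4,6,Z,Z,Z,Z>: a truncate with Scale = 2, uppers zeroable.
  std::vector<bool> Zeroable = {false, false, false, false, true, true, true, true};
  std::printf("%d\n", looksLikeTruncate({0, 2, 4, 6, 8, 9, 10, 11}, 2, Zeroable)); // 1
  std::printf("%d\n", looksLikeTruncate({0, 3, 4, 6, 8, 9, 10, 11}, 2, Zeroable)); // 0
  return 0;
}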
-static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Unexpected VTRUNC type"); - if (!Subtarget.hasAVX512()) - return SDValue(); - - unsigned NumElts = VT.getVectorNumElements(); - unsigned EltSizeInBits = VT.getScalarSizeInBits(); - unsigned MaxScale = 64 / EltSizeInBits; - for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { - // TODO: Support non-BWI VPMOVWB truncations? - unsigned SrcEltBits = EltSizeInBits * Scale; - if (SrcEltBits < 32 && !Subtarget.hasBWI()) - continue; - - // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...> - // Bail if the V2 elements are undef. - unsigned NumHalfSrcElts = NumElts / Scale; - unsigned NumSrcElts = 2 * NumHalfSrcElts; - if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || - isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts)) - continue; - - // The elements beyond the truncation must be undef/zero. - unsigned UpperElts = NumElts - NumSrcElts; - if (UpperElts > 0 && - !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue()) - continue; - bool UndefUppers = - UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts); - - // As we're using both sources then we need to concat them together - // and truncate from the double-sized src. - MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2); - SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2); - - MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); - MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); - Src = DAG.getBitcast(SrcVT, Src); - return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); - } - - return SDValue(); + Src = Src.getOperand(0); + + // VPMOVWB is only available with avx512bw. + MVT SrcVT = Src.getSimpleValueType(); + if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 && + !Subtarget.hasBWI()) + return SDValue(); + + bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts); + return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); + } + + return SDValue(); +} + +// Attempt to match binary shuffle patterns as a truncate. +static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unexpected VTRUNC type"); + if (!Subtarget.hasAVX512()) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned MaxScale = 64 / EltSizeInBits; + for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { + // TODO: Support non-BWI VPMOVWB truncations? + unsigned SrcEltBits = EltSizeInBits * Scale; + if (SrcEltBits < 32 && !Subtarget.hasBWI()) + continue; + + // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...> + // Bail if the V2 elements are undef. + unsigned NumHalfSrcElts = NumElts / Scale; + unsigned NumSrcElts = 2 * NumHalfSrcElts; + if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || + isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts)) + continue; + + // The elements beyond the truncation must be undef/zero. 
+ unsigned UpperElts = NumElts - NumSrcElts; + if (UpperElts > 0 && + !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue()) + continue; + bool UndefUppers = + UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts); + + // As we're using both sources then we need to concat them together + // and truncate from the double-sized src. + MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2); + SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2); + + MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); + MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); + Src = DAG.getBitcast(SrcVT, Src); + return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); + } + + return SDValue(); } /// Check whether a compaction lowering can be done by dropping even @@ -11761,14 +11761,14 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, // Try binary shuffle. SmallVector<int, 32> BinaryMask; createPackShuffleMask(VT, BinaryMask, false, NumStages); - if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2)) + if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2)) if (MatchPACK(V1, V2, PackVT)) return true; // Try unary shuffle. SmallVector<int, 32> UnaryMask; createPackShuffleMask(VT, UnaryMask, true, NumStages); - if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1)) + if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1)) if (MatchPACK(V1, V1, PackVT)) return true; } @@ -12317,32 +12317,32 @@ static SDValue lowerShuffleAsByteRotateAndPermute( /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and -/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. -static SDValue lowerShuffleAsDecomposedShuffleMerge( +/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. +static SDValue lowerShuffleAsDecomposedShuffleMerge( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - int NumElts = Mask.size(); - int NumLanes = VT.getSizeInBits() / 128; - int NumEltsPerLane = NumElts / NumLanes; - + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + // Shuffle the input elements into the desired positions in V1 and V2 and - // unpack/blend them together. - bool IsAlternating = true; - SmallVector<int, 32> V1Mask(NumElts, -1); - SmallVector<int, 32> V2Mask(NumElts, -1); - SmallVector<int, 32> FinalMask(NumElts, -1); - for (int i = 0; i < NumElts; ++i) { - int M = Mask[i]; - if (M >= 0 && M < NumElts) { - V1Mask[i] = M; - FinalMask[i] = i; - IsAlternating &= (i & 1) == 0; - } else if (M >= NumElts) { - V2Mask[i] = M - NumElts; - FinalMask[i] = i + NumElts; - IsAlternating &= (i & 1) == 1; - } - } + // unpack/blend them together. + bool IsAlternating = true; + SmallVector<int, 32> V1Mask(NumElts, -1); + SmallVector<int, 32> V2Mask(NumElts, -1); + SmallVector<int, 32> FinalMask(NumElts, -1); + for (int i = 0; i < NumElts; ++i) { + int M = Mask[i]; + if (M >= 0 && M < NumElts) { + V1Mask[i] = M; + FinalMask[i] = i; + IsAlternating &= (i & 1) == 0; + } else if (M >= NumElts) { + V2Mask[i] = M - NumElts; + FinalMask[i] = i + NumElts; + IsAlternating &= (i & 1) == 1; + } + } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. 
We prefer to shuffle inputs as @@ -12366,30 +12366,30 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge( return BlendPerm; } - // If the final mask is an alternating blend of vXi8/vXi16, convert to an - // UNPCKL(SHUFFLE, SHUFFLE) pattern. - // TODO: It doesn't have to be alternating - but each lane mustn't have more - // than half the elements coming from each source. - if (IsAlternating && VT.getScalarSizeInBits() < 32) { - V1Mask.assign(NumElts, -1); - V2Mask.assign(NumElts, -1); - FinalMask.assign(NumElts, -1); - for (int i = 0; i != NumElts; i += NumEltsPerLane) - for (int j = 0; j != NumEltsPerLane; ++j) { - int M = Mask[i + j]; - if (M >= 0 && M < NumElts) { - V1Mask[i + (j / 2)] = M; - FinalMask[i + j] = i + (j / 2); - } else if (M >= NumElts) { - V2Mask[i + (j / 2)] = M - NumElts; - FinalMask[i + j] = i + (j / 2) + NumElts; - } - } - } - + // If the final mask is an alternating blend of vXi8/vXi16, convert to an + // UNPCKL(SHUFFLE, SHUFFLE) pattern. + // TODO: It doesn't have to be alternating - but each lane mustn't have more + // than half the elements coming from each source. + if (IsAlternating && VT.getScalarSizeInBits() < 32) { + V1Mask.assign(NumElts, -1); + V2Mask.assign(NumElts, -1); + FinalMask.assign(NumElts, -1); + for (int i = 0; i != NumElts; i += NumEltsPerLane) + for (int j = 0; j != NumEltsPerLane; ++j) { + int M = Mask[i + j]; + if (M >= 0 && M < NumElts) { + V1Mask[i + (j / 2)] = M; + FinalMask[i + j] = i + (j / 2); + } else if (M >= NumElts) { + V2Mask[i + (j / 2)] = M - NumElts; + FinalMask[i + j] = i + (j / 2) + NumElts; + } + } + } + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); - return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); + return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); } /// Try to lower a vector shuffle as a bit rotation. @@ -13047,8 +13047,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, - DL, ExtVT, InputV, DAG); + InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, + DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -13656,8 +13656,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, MVT SVT = VT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); - SDValue NewAddr = - DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL); + SDValue NewAddr = + DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL); // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather // than MOVDDUP. @@ -13830,7 +13830,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. - unsigned InsertPSMask = 0; + unsigned InsertPSMask = 0; if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); @@ -14018,8 +14018,8 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. 
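[Editor's illustrative aside] lowerShuffleAsDecomposedShuffleMerge above splits a two-input mask into one unary mask per input plus a final blend mask, tracking whether the blend is a strict even/odd alternation (the case the vXi8/vXi16 UNPCKL rewrite targets). A standalone sketch of that decomposition:

#include <cstdio>
#include <vector>

struct Decomposed {
  std::vector<int> V1Mask, V2Mask, FinalMask;
  bool IsAlternating = true;
};

static Decomposed decompose(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  Decomposed D;
  D.V1Mask.assign(NumElts, -1);
  D.V2Mask.assign(NumElts, -1);
  D.FinalMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      D.V1Mask[i] = M;           // element comes from V1
      D.FinalMask[i] = i;
      D.IsAlternating &= (i & 1) == 0;
    } else if (M >= NumElts) {
      D.V2Mask[i] = M - NumElts; // element comes from V2
      D.FinalMask[i] = i + NumElts;
      D.IsAlternating &= (i & 1) == 1;
    }
  }
  return D;
}

int main() {
  // <0,9,2,11>: even lanes from V1, odd lanes from V2 -> alternating blend.
  Decomposed D = decompose({0, 9, 2, 11});
  std::printf("alternating: %d, final[1] = %d\n", D.IsAlternating, D.FinalMask[1]);
  return 0;
}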
- if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) || - isShuffleEquivalent(Mask, {1, 3}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) || + isShuffleEquivalent(Mask, {1, 3}, V1, V2)) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. @@ -14065,10 +14065,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); - int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2), - Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1), - Mask[1] < 0 ? -1 : (Mask[1] * 2), - Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)}; + int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2), + Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1), + Mask[1] < 0 ? -1 : (Mask[1] * 2), + Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, @@ -14128,7 +14128,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely @@ -14222,12 +14222,12 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } - } else if (NumV2Elements == 3) { - // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but - // we can get here due to other paths (e.g repeated mask matching) that we - // don't want to do another round of lowerVECTOR_SHUFFLE. - ShuffleVectorSDNode::commuteMask(NewMask); - return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG); + } else if (NumV2Elements == 3) { + // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but + // we can get here due to other paths (e.g repeated mask matching) that we + // don't want to do another round of lowerVECTOR_SHUFFLE. + ShuffleVectorSDNode::commuteMask(NewMask); + return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG); } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); @@ -14256,9 +14256,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { - if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); - if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2)) + if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2)) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } @@ -14272,9 +14272,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. 
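[Editor's illustrative aside] The lowerV2I64Shuffle hunk above maps a v2i64 mask onto a v4i32 PSHUFD mask by expanding each 64-bit index m into the 32-bit pair (2m, 2m+1), propagating undef to both halves. A sketch of that widening:

#include <array>
#include <cstdio>

static std::array<int, 4> widenV2MaskToV4(int M0, int M1) {
  return {M0 < 0 ? -1 : M0 * 2, M0 < 0 ? -1 : M0 * 2 + 1,
          M1 < 0 ? -1 : M1 * 2, M1 < 0 ? -1 : M1 * 2 + 1};
}

int main() {
  // v2i64 mask <1,0> (swap the two quadwords) -> v4i32 mask <2,3,0,1>.
  std::array<int, 4> W = widenV2MaskToV4(1, 0);
  std::printf("<%d,%d,%d,%d>\n", W[0], W[1], W[2], W[3]);
  return 0;
}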
if (!Subtarget.hasSSE2()) { - if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2)) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); - if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2)) + if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2)) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } @@ -14316,9 +14316,9 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { - if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2)) + if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2)) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); } @@ -14366,9 +14366,9 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; - if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2)) Mask = UnpackLoMask; - else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2)) + else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2)) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, @@ -14426,7 +14426,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. @@ -15035,11 +15035,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return ZExt; - // Try to use lower using a truncation. - if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable, - Subtarget, DAG)) - return V; - + // Try to use lower using a truncation. + if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { @@ -15120,11 +15120,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget)) return V; - // Try to use lower using a truncation. - if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable, - Subtarget, DAG)) - return V; - + // Try to use lower using a truncation. + if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) @@ -15176,49 +15176,49 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, } // We can always bit-blend if we have to so the fallback strategy is to - // decompose into single-input permutes and blends/unpacks. - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, + // decompose into single-input permutes and blends/unpacks. 
+ return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } -// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets, -// sub-512-bit shuffles are padded to 512-bits for the shuffle and then -// the active subvector is extracted. +// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets, +// sub-512-bit shuffles are padded to 512-bits for the shuffle and then +// the active subvector is extracted. static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, SDValue V2, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - MVT MaskVT = VT.changeTypeToInteger(); - SDValue MaskNode; - MVT ShuffleVT = VT; - if (!VT.is512BitVector() && !Subtarget.hasVLX()) { - V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512); - V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512); - ShuffleVT = V1.getSimpleValueType(); - - // Adjust mask to correct indices for the second input. - int NumElts = VT.getVectorNumElements(); - unsigned Scale = 512 / VT.getSizeInBits(); - SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end()); - for (int &M : AdjustedMask) - if (NumElts <= M) - M += (Scale - 1) * NumElts; - MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true); - MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512); - } else { - MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true); - } - - SDValue Result; + ArrayRef<int> Mask, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT MaskVT = VT.changeTypeToInteger(); + SDValue MaskNode; + MVT ShuffleVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) { + V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512); + V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512); + ShuffleVT = V1.getSimpleValueType(); + + // Adjust mask to correct indices for the second input. + int NumElts = VT.getVectorNumElements(); + unsigned Scale = 512 / VT.getSizeInBits(); + SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end()); + for (int &M : AdjustedMask) + if (NumElts <= M) + M += (Scale - 1) * NumElts; + MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true); + MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512); + } else { + MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true); + } + + SDValue Result; if (V2.isUndef()) - Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1); - else - Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2); + Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1); + else + Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2); - if (VT != ShuffleVT) - Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits()); - - return Result; + if (VT != ShuffleVT) + Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits()); + + return Result; } /// Generic lowering of v16i8 shuffles. @@ -15256,15 +15256,15 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return ZExt; - // Try to use lower using a truncation. - if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable, - Subtarget, DAG)) - return V; - - if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable, - Subtarget, DAG)) - return V; - + // Try to use lower using a truncation. 
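[Editor's illustrative aside] The rewritten lowerShuffleWithPERMV above pads sub-512-bit inputs to 512 bits on non-VLX targets; because VPERMV3 selects the second operand with indices at or above the (now larger) element count, every mask entry that referred to V2 has to be bumped by (Scale - 1) * NumElts. A sketch of just that index fix-up:

#include <cstdio>
#include <vector>

static void adjustMaskForWidening(std::vector<int> &Mask, int NumElts,
                                  unsigned Scale) {
  for (int &M : Mask)
    if (M >= NumElts)
      M += int(Scale - 1) * NumElts; // keep pointing into the widened V2
}

int main() {
  // v4i64 shuffle on 256-bit inputs widened to 512 bits (Scale = 2).
  // Mask <0,5,2,7>: entries 5 and 7 refer to V2 and become 9 and 11.
  std::vector<int> Mask = {0, 5, 2, 7};
  adjustMaskForWidening(Mask, /*NumElts=*/4, /*Scale=*/2);
  std::printf("<%d,%d,%d,%d>\n", Mask[0], Mask[1], Mask[2], Mask[3]);
  return 0;
}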
+ if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + + if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, @@ -15447,17 +15447,17 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; - // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). - if (Subtarget.hasVBMI()) - return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget, - DAG); - - // If we have XOP we can use one VPPERM instead of multiple PSHUFBs. - if (Subtarget.hasXOP()) { - SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true); - return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode); - } - + // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). + if (Subtarget.hasVBMI()) + return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget, + DAG); + + // If we have XOP we can use one VPPERM instead of multiple PSHUFBs. + if (Subtarget.hasXOP()) { + SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true); + return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode); + } + // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. if (SDValue V = lowerShuffleAsByteRotateAndPermute( @@ -15512,9 +15512,9 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; } - // Handle multi-input cases by blending/unpacking single-input shuffles. + // Handle multi-input cases by blending/unpacking single-input shuffles. if (NumV2Elements > 0) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 @@ -15694,7 +15694,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, } /// Either split a vector in halves or decompose the shuffles and the -/// blend/unpack. +/// blend/unpack. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select @@ -15729,8 +15729,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, return true; }; if (DoBothBroadcast()) - return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, - DAG); + return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, + DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -15746,9 +15746,9 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); - // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This - // requires that the decomposed single-input shuffles don't end up here. - return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, + // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This + // requires that the decomposed single-input shuffles don't end up here. 
+ return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, DAG); } @@ -15796,94 +15796,94 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; - bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef(); - - /// Attempts to find a sublane permute with the given size - /// that gets all elements into their target lanes. - /// - /// If successful, fills CrossLaneMask and InLaneMask and returns true. - /// If unsuccessful, returns false and may overwrite InLaneMask. - auto getSublanePermute = [&](int NumSublanes) -> SDValue { - int NumSublanesPerLane = NumSublanes / NumLanes; - int NumEltsPerSublane = NumElts / NumSublanes; - - SmallVector<int, 16> CrossLaneMask; - SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef); - // CrossLaneMask but one entry == one sublane. - SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef); - - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - if (M < 0) - continue; - - int SrcSublane = M / NumEltsPerSublane; - int DstLane = i / NumEltsPerLane; - - // We only need to get the elements into the right lane, not sublane. - // So search all sublanes that make up the destination lane. - bool Found = false; - int DstSubStart = DstLane * NumSublanesPerLane; - int DstSubEnd = DstSubStart + NumSublanesPerLane; - for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) { - if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane)) - continue; - - Found = true; - CrossLaneMaskLarge[DstSublane] = SrcSublane; - int DstSublaneOffset = DstSublane * NumEltsPerSublane; - InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane; - break; - } - if (!Found) - return SDValue(); - } - - // Fill CrossLaneMask using CrossLaneMaskLarge. - narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask); - - if (!CanUseSublanes) { - // If we're only shuffling a single lowest lane and the rest are identity - // then don't bother. - // TODO - isShuffleMaskInputInPlace could be extended to something like - // this. - int NumIdentityLanes = 0; - bool OnlyShuffleLowestLane = true; - for (int i = 0; i != NumLanes; ++i) { - int LaneOffset = i * NumEltsPerLane; - if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane, - i * NumEltsPerLane)) - NumIdentityLanes++; - else if (CrossLaneMask[LaneOffset] != 0) - OnlyShuffleLowestLane = false; - } - if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) - return SDValue(); - } - - SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); - return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), - InLaneMask); - }; - - // First attempt a solution with full lanes. - if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes)) - return V; - - // The rest of the solutions use sublanes. - if (!CanUseSublanes) - return SDValue(); - - // Then attempt a solution with 64-bit sublanes (vpermq). - if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2)) - return V; - - // If that doesn't work and we have fast variable shuffle, - // attempt 32-bit sublanes (vpermd). - if (!Subtarget.hasFastVariableShuffle()) - return SDValue(); - - return getSublanePermute(/*NumSublanes=*/NumLanes * 4); + bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef(); + + /// Attempts to find a sublane permute with the given size + /// that gets all elements into their target lanes. 
+ /// + /// If successful, fills CrossLaneMask and InLaneMask and returns true. + /// If unsuccessful, returns false and may overwrite InLaneMask. + auto getSublanePermute = [&](int NumSublanes) -> SDValue { + int NumSublanesPerLane = NumSublanes / NumLanes; + int NumEltsPerSublane = NumElts / NumSublanes; + + SmallVector<int, 16> CrossLaneMask; + SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef); + // CrossLaneMask but one entry == one sublane. + SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef); + + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + + int SrcSublane = M / NumEltsPerSublane; + int DstLane = i / NumEltsPerLane; + + // We only need to get the elements into the right lane, not sublane. + // So search all sublanes that make up the destination lane. + bool Found = false; + int DstSubStart = DstLane * NumSublanesPerLane; + int DstSubEnd = DstSubStart + NumSublanesPerLane; + for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) { + if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane)) + continue; + + Found = true; + CrossLaneMaskLarge[DstSublane] = SrcSublane; + int DstSublaneOffset = DstSublane * NumEltsPerSublane; + InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane; + break; + } + if (!Found) + return SDValue(); + } + + // Fill CrossLaneMask using CrossLaneMaskLarge. + narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask); + + if (!CanUseSublanes) { + // If we're only shuffling a single lowest lane and the rest are identity + // then don't bother. + // TODO - isShuffleMaskInputInPlace could be extended to something like + // this. + int NumIdentityLanes = 0; + bool OnlyShuffleLowestLane = true; + for (int i = 0; i != NumLanes; ++i) { + int LaneOffset = i * NumEltsPerLane; + if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane, + i * NumEltsPerLane)) + NumIdentityLanes++; + else if (CrossLaneMask[LaneOffset] != 0) + OnlyShuffleLowestLane = false; + } + if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) + return SDValue(); + } + + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); + return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), + InLaneMask); + }; + + // First attempt a solution with full lanes. + if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes)) + return V; + + // The rest of the solutions use sublanes. + if (!CanUseSublanes) + return SDValue(); + + // Then attempt a solution with 64-bit sublanes (vpermq). + if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2)) + return V; + + // If that doesn't work and we have fast variable shuffle, + // attempt 32-bit sublanes (vpermd). + if (!Subtarget.hasFastVariableShuffle()) + return SDValue(); + + return getSublanePermute(/*NumSublanes=*/NumLanes * 4); } /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one @@ -15996,8 +15996,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. - bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2); - if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) { + bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2); + if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) { // With AVX1, use vperm2f128 (below) to allow load folding. 
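[Editor's illustrative aside] getSublanePermute above factors a lane-crossing shuffle into a cross-lane move of whole (sub)lanes followed by an in-lane reorder. A simplified standalone sketch at whole 128-bit-lane granularity (the real code also tries 64-bit and 32-bit sublanes):

#include <cstdio>
#include <vector>

// Returns false if some destination lane would need elements from two
// different source lanes; otherwise fills the two component masks.
static bool splitIntoLanePermutes(const std::vector<int> &Mask,
                                  int NumEltsPerLane,
                                  std::vector<int> &CrossLaneMask,
                                  std::vector<int> &InLaneMask) {
  int NumElts = (int)Mask.size();
  int NumLanes = NumElts / NumEltsPerLane;
  std::vector<int> LaneSrc(NumLanes, -1); // source lane chosen per dest lane
  InLaneMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int SrcLane = M / NumEltsPerLane;
    int DstLane = i / NumEltsPerLane;
    if (LaneSrc[DstLane] >= 0 && LaneSrc[DstLane] != SrcLane)
      return false; // this lane needs two source lanes
    LaneSrc[DstLane] = SrcLane;
    InLaneMask[i] = DstLane * NumEltsPerLane + M % NumEltsPerLane;
  }
  // Expand the per-lane source map into an element-level cross-lane mask.
  CrossLaneMask.assign(NumElts, -1);
  for (int L = 0; L != NumLanes; ++L)
    for (int j = 0; j != NumEltsPerLane; ++j)
      if (LaneSrc[L] >= 0)
        CrossLaneMask[L * NumEltsPerLane + j] = LaneSrc[L] * NumEltsPerLane + j;
  return true;
}

int main() {
  // v8f32 mask <5,4,6,7,1,0,2,3>: lane 0 reads lane 1 and vice versa, so the
  // cross-lane step is <4,5,6,7,0,1,2,3> and the in-lane step swaps pairs.
  std::vector<int> Cross, InLane;
  bool OK = splitIntoLanePermutes({5, 4, 6, 7, 1, 0, 2, 3}, 4, Cross, InLane);
  std::printf("ok=%d cross[0]=%d inlane[0]=%d\n", OK, Cross[0], InLane[0]);
  return 0;
}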
Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. @@ -16739,7 +16739,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Broadcast; // Use low duplicate instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { @@ -16800,7 +16800,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -16828,7 +16828,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. @@ -16910,7 +16910,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -16930,7 +16930,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; // Otherwise fall back on generic blend lowering. - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); } @@ -16963,9 +16963,9 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. - if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2)) + if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2)) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); - if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2)) + if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2)) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (V2.isUndef()) @@ -17019,13 +17019,13 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) - return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, - DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, + DAG); // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. 
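[Editor's illustrative aside] Several hunks above probe for duplicate patterns such as {0,0,2,2} (MOVDDUP/MOVSLDUP) and {1,1,3,3} (MOVSHDUP), treating -1 as a wildcard just like isShuffleEquivalent. A compact sketch of that probe:

#include <cstdio>
#include <vector>

// Even duplicate: each even element is repeated into the next odd slot;
// odd duplicate is the mirror image.
static bool isDupMask(const std::vector<int> &Mask, bool Odd) {
  for (size_t i = 0; i != Mask.size(); ++i) {
    int Expected = int(i & ~size_t(1)) + (Odd ? 1 : 0); // 0,0,2,2,... or 1,1,3,3,...
    if (Mask[i] >= 0 && Mask[i] != Expected)
      return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", isDupMask({0, 0, 2, 2}, /*Odd=*/false));  // 1: MOVSLDUP-style
  std::printf("%d\n", isDupMask({1, -1, 3, 3}, /*Odd=*/true));  // 1: MOVSHDUP-style
  std::printf("%d\n", isDupMask({0, 1, 2, 3}, /*Odd=*/false));  // 0: identity
  return 0;
}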
if (Subtarget.hasAVX2()) - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. @@ -17058,8 +17058,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) - return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, - DAG); + return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, + DAG); if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -17144,7 +17144,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Result; // Otherwise fall back on generic blend lowering. - return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); } @@ -17186,11 +17186,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget)) return V; - // Try to use lower using a truncation. - if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable, - Subtarget, DAG)) - return V; - + // Try to use lower using a truncation. + if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -17243,9 +17243,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16). - if (Subtarget.hasBWI()) - return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG); + // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16). + if (Subtarget.hasBWI()) + return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. @@ -17301,11 +17301,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget)) return V; - // Try to use lower using a truncation. - if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable, - Subtarget, DAG)) - return V; - + // Try to use lower using a truncation. + if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -17348,9 +17348,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). - if (Subtarget.hasVBMI()) - return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG); + // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). + if (Subtarget.hasVBMI()) + return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. @@ -17477,9 +17477,9 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, // Check for patterns which can be matched with a single insert of a 256-bit // subvector. 
- bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2); + bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2); if (OnlyUsesV1 || - isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) { + isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, @@ -17564,7 +17564,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, if (V2.isUndef()) { // Use low duplicate instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2)) + if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2)) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { @@ -17604,7 +17604,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Blend; - return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. @@ -17623,9 +17623,9 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); // Use even/odd duplicate instructions for masks that match their pattern. - if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2)) + if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2)) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); - if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2)) + if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2)) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); if (V2.isUndef()) @@ -17663,7 +17663,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; - return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. @@ -17711,14 +17711,14 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Rotate; // Try to use PALIGNR. - if (Subtarget.hasBWI()) - if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, - Subtarget, DAG)) - return Rotate; + if (Subtarget.hasBWI()) + if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, + Subtarget, DAG)) + return Rotate; if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; - + // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) @@ -17728,7 +17728,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Blend; - return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. 
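For reference alongside the isShuffleEquivalent checks in the hunks above (the MOVDDUP/MOVSLDUP/MOVSHDUP duplicate patterns and the single-insert subvector case), here is a minimal standalone sketch of what such a mask-equivalence test means. This is plain C++, not the LLVM helper itself; the only assumption is the usual shuffle-mask convention that a negative entry means "undef" and may match anything.

    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for the mask check used above: a defined lane must
    // match the expected index exactly, an undef (-1) lane matches anything.
    static bool masksEquivalent(const std::vector<int> &Mask,
                                const std::vector<int> &Expected) {
      if (Mask.size() != Expected.size())
        return false;
      for (size_t i = 0; i < Mask.size(); ++i)
        if (Mask[i] >= 0 && Mask[i] != Expected[i])
          return false;
      return true;
    }

    int main() {
      // {0, 0, 2, 2} is the v4f64 MOVDDUP pattern: each even lane duplicated.
      std::printf("%d\n", masksEquivalent({0, -1, 2, 2}, {0, 0, 2, 2})); // 1
      std::printf("%d\n", masksEquivalent({0, 1, 2, 2}, {0, 0, 2, 2}));  // 0
      return 0;
    }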
@@ -17805,7 +17805,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Blend; - return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. @@ -17868,7 +17868,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. @@ -17924,7 +17924,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) - return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. @@ -18378,7 +18378,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, // Modify the new Mask to take all zeros from the all-zero vector. // Choose indices that are blend-friendly. bool UsedZeroVector = false; - assert(is_contained(WidenedMask, SM_SentinelZero) && + assert(is_contained(WidenedMask, SM_SentinelZero) && "V2's non-undef elements are used?!"); for (int i = 0; i != NewNumElts; ++i) if (WidenedMask[i] == SM_SentinelZero) { @@ -18431,11 +18431,11 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. 
- if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { - SmallVector<int, 32> Mask; - if (createShuffleMaskFromVSELECT(Mask, Cond)) - return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); - } + if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { + SmallVector<int, 32> Mask; + if (createShuffleMaskFromVSELECT(Mask, Cond)) + return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); + } return SDValue(); } @@ -18549,9 +18549,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, - DAG.getTargetConstant(IdxVal, dl, MVT::i8)); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } @@ -18706,8 +18706,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, - DAG.getTargetConstant(IdxVal, dl, MVT::i8)); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } @@ -18901,9 +18901,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, Opc = X86ISD::PINSRB; } - assert(N1.getValueType() != MVT::i32 && "Unexpected VT"); - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8); + assert(N1.getValueType() != MVT::i32 && "Unexpected VT"); + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } @@ -19151,12 +19151,12 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, if (GV) { // Create a target global address if this is a global. If possible, fold the // offset into the global address reference. Otherwise, ADD it on later. - // Suppress the folding if Offset is negative: movl foo-1, %eax is not - // allowed because if the address of foo is 0, the ELF R_X86_64_32 - // relocation will compute to a negative value, which is invalid. + // Suppress the folding if Offset is negative: movl foo-1, %eax is not + // allowed because if the address of foo is 0, the ELF R_X86_64_32 + // relocation will compute to a negative value, which is invalid. 
int64_t GlobalOffset = 0; - if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 && - X86::isOffsetSuitableForCodeModel(Offset, M, true)) { + if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 && + X86::isOffsetSuitableForCodeModel(Offset, M, true)) { std::swap(GlobalOffset, Offset); } Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); @@ -19243,7 +19243,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } -// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64 +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64 static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { @@ -19251,17 +19251,17 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, X86::RAX, X86II::MO_TLSGD); } -// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32 -static SDValue -LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, - const EVT PtrVT) { - return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, - X86::EAX, X86II::MO_TLSGD); -} - +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32 +static SDValue +LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, + X86::EAX, X86II::MO_TLSGD); +} + static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, - SelectionDAG &DAG, const EVT PtrVT, - bool Is64Bit, bool Is64BitLP64) { + SelectionDAG &DAG, const EVT PtrVT, + bool Is64Bit, bool Is64BitLP64) { SDLoc dl(GA); // Get the start address of the TLS block for this module. @@ -19270,9 +19270,9 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; - if (Is64Bit) { - unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX; - Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg, + if (Is64Bit) { + unsigned ReturnReg = Is64BitLP64 ? 
X86::RAX : X86::EAX; + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; @@ -19369,15 +19369,15 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: - if (Subtarget.is64Bit()) { - if (Subtarget.isTarget64BitLP64()) - return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); - return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT); - } + if (Subtarget.is64Bit()) { + if (Subtarget.isTarget64BitLP64()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); + return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT); + } return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: - return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(), - Subtarget.isTarget64BitLP64()); + return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(), + Subtarget.isTarget64BitLP64()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), @@ -19477,7 +19477,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); - const DataLayout &DL = DAG.getDataLayout(); + const DataLayout &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); @@ -19570,29 +19570,29 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, if (IsFSHR) std::swap(Op0, Op1); - // With AVX512, but not VLX we need to widen to get a 512-bit result type. - if (!Subtarget.hasVLX() && !VT.is512BitVector()) { - Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512); - Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512); - } - - SDValue Funnel; + // With AVX512, but not VLX we need to widen to get a 512-bit result type. + if (!Subtarget.hasVLX() && !VT.is512BitVector()) { + Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512); + Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512); + } + + SDValue Funnel; APInt APIntShiftAmt; - MVT ResultVT = Op0.getSimpleValueType(); + MVT ResultVT = Op0.getSimpleValueType(); if (X86::isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); - Funnel = - DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0, - Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); - } else { - if (!Subtarget.hasVLX() && !VT.is512BitVector()) - Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512); - Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, - ResultVT, Op0, Op1, Amt); - } - if (!Subtarget.hasVLX() && !VT.is512BitVector()) - Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits()); - return Funnel; + Funnel = + DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0, + Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + } else { + if (!Subtarget.hasVLX() && !VT.is512BitVector()) + Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512); + Funnel = DAG.getNode(IsFSHR ? 
X86ISD::VSHRDV : X86ISD::VSHLDV, DL, + ResultVT, Op0, Op1, Amt); + } + if (!Subtarget.hasVLX() && !VT.is512BitVector()) + Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits()); + return Funnel; } assert( (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -19944,7 +19944,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, } if (VT == MVT::f128) - return SDValue(); + return SDValue(); SDValue ValueToStore = Src; if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) @@ -20025,10 +20025,10 @@ static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0 - // when converting 0 when rounding toward negative infinity. Caller will - // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode. - assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!"); + // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0 + // when converting 0 when rounding toward negative infinity. Caller will + // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode. + assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!"); // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 @@ -20063,27 +20063,27 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, // Load the 64-bit value into an XMM register. SDValue XR1 = - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); - SDValue CLod0 = DAG.getLoad( - MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); + SDValue CLod0 = DAG.getLoad( + MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); - SDValue CLod1 = DAG.getLoad( - MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); + SDValue CLod1 = DAG.getLoad( + MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3() && + if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); @@ -20385,7 +20385,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Chain = IsStrict ? 
Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) - return SDValue(); + return SDValue(); if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); @@ -20412,30 +20412,30 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; - // The transform for i64->f64 isn't correct for 0 when rounding to negative - // infinity. It produces -0.0, so disable under strictfp. - if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict) + // The transform for i64->f64 isn't correct for 0 when rounding to negative + // infinity. It produces -0.0, so disable under strictfp. + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); - if (Subtarget.is64Bit() && SrcVT == MVT::i64 && - (DstVT == MVT::f32 || DstVT == MVT::f64)) + if (Subtarget.is64Bit() && SrcVT == MVT::i64 && + (DstVT == MVT::f32 || DstVT == MVT::f64)) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8); int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - Align SlotAlign(8); + Align SlotAlign(8); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); if (SrcVT == MVT::i32) { - SDValue OffsetSlot = - DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl); - SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign); + SDValue OffsetSlot = + DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl); + SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), - OffsetSlot, MPI.getWithOffset(4), SlotAlign); + OffsetSlot, MPI.getWithOffset(4), SlotAlign); std::pair<SDValue, SDValue> Tmp = - BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG); + BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -20451,15 +20451,15 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); } SDValue Store = - DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign); + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign); // For i64 source, we need to add the appropriate power of 2 if the input - // was negative. We must be careful to do the computation in x87 extended - // precision, not in SSE. + // was negative. We must be careful to do the computation in x87 extended + // precision, not in SSE. SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI, - SlotAlign, MachineMemOperand::MOLoad); + SlotAlign, MachineMemOperand::MOLoad); Chain = Fild.getValue(1); @@ -20562,8 +20562,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // - // Adjust = (Value >= Thresh) ? 0x80000000 : 0; - // FltOfs = (Value >= Thresh) ? 0x80000000 : 0; + // Adjust = (Value >= Thresh) ? 0x80000000 : 0; + // FltOfs = (Value >= Thresh) ? 
0x80000000 : 0; // FistSrc = (Value - FltOfs); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent @@ -20593,31 +20593,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, *DAG.getContext(), TheVT); SDValue Cmp; if (IsStrict) { - Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain, - /*IsSignaling*/ true); + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain, + /*IsSignaling*/ true); Chain = Cmp.getValue(1); } else { - Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE); - } - - // Our preferred lowering of - // - // (Value >= Thresh) ? 0x8000000000000000ULL : 0 - // - // is - // - // (Value >= Thresh) << 63 - // - // but since we can get here after LegalOperations, DAGCombine might do the - // wrong thing if we create a select. So, directly create the preferred - // version. - SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp); - SDValue Const63 = DAG.getConstant(63, DL, MVT::i8); - Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63); - - SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal, - DAG.getConstantFP(0.0, DL, TheVT)); - + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE); + } + + // Our preferred lowering of + // + // (Value >= Thresh) ? 0x8000000000000000ULL : 0 + // + // is + // + // (Value >= Thresh) << 63 + // + // but since we can get here after LegalOperations, DAGCombine might do the + // wrong thing if we create a select. So, directly create the preferred + // version. + SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp); + SDValue Const63 = DAG.getConstant(63, DL, MVT::i8); + Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63); + + SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal, + DAG.getConstantFP(0.0, DL, TheVT)); + if (IsStrict) { Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, { Chain, Value, FltOfs }); @@ -21075,8 +21075,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { - In = DAG.getBitcast(MVT::v8i32, In); - + In = DAG.getBitcast(MVT::v8i32, In); + // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; @@ -21085,17 +21085,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0, DL)); } - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(4, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(4, DL)); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { - In = DAG.getBitcast(MVT::v32i8, In); - + In = DAG.getBitcast(MVT::v32i8, In); + // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
if (Subtarget.hasInt256()) { // The PSHUFB mask: @@ -21106,17 +21106,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); - static const int ShufMask2[] = {0, 2, -1, -1}; - In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, - DAG.getBitcast(MVT::v16i16, In), - DAG.getIntPtrConstant(0, DL)); + static const int ShufMask2[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, + DAG.getBitcast(MVT::v16i16, In), + DAG.getIntPtrConstant(0, DL)); } - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In, + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In, DAG.getIntPtrConstant(0, DL)); - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In, - DAG.getIntPtrConstant(16, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In, + DAG.getIntPtrConstant(16, DL)); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, @@ -21452,155 +21452,155 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI); } -SDValue -X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { - // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation, - // but making use of X86 specifics to produce better instruction sequences. - SDNode *Node = Op.getNode(); - bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT; - unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; - SDLoc dl(SDValue(Node, 0)); - SDValue Src = Node->getOperand(0); - - // There are three types involved here: SrcVT is the source floating point - // type, DstVT is the type of the result, and TmpVT is the result of the - // intermediate FP_TO_*INT operation we'll use (which may be a promotion of - // DstVT). - EVT SrcVT = Src.getValueType(); - EVT DstVT = Node->getValueType(0); - EVT TmpVT = DstVT; - - // This code is only for floats and doubles. Fall back to generic code for - // anything else. - if (!isScalarFPTypeInSSEReg(SrcVT)) - return SDValue(); - - unsigned SatWidth = Node->getConstantOperandVal(1); - unsigned DstWidth = DstVT.getScalarSizeInBits(); - unsigned TmpWidth = TmpVT.getScalarSizeInBits(); - assert(SatWidth <= DstWidth && SatWidth <= TmpWidth && - "Expected saturation width smaller than result width"); - - // Promote result of FP_TO_*INT to at least 32 bits. - if (TmpWidth < 32) { - TmpVT = MVT::i32; - TmpWidth = 32; - } - - // Promote conversions to unsigned 32-bit to 64-bit, because it will allow - // us to use a native signed conversion instead. - if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) { - TmpVT = MVT::i64; - TmpWidth = 64; - } - - // If the saturation width is smaller than the size of the temporary result, - // we can always use signed conversion, which is native. - if (SatWidth < TmpWidth) - FpToIntOpcode = ISD::FP_TO_SINT; - - // Determine minimum and maximum integer values and their corresponding - // floating-point values. 
- APInt MinInt, MaxInt; - if (IsSigned) { - MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth); - MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth); - } else { - MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth); - MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth); - } - - APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT)); - APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT)); - - APFloat::opStatus MinStatus = MinFloat.convertFromAPInt( - MinInt, IsSigned, APFloat::rmTowardZero); - APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt( - MaxInt, IsSigned, APFloat::rmTowardZero); - bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) - && !(MaxStatus & APFloat::opStatus::opInexact); - - SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT); - SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT); - - // If the integer bounds are exactly representable as floats, emit a - // min+max+fptoi sequence. Otherwise use comparisons and selects. - if (AreExactFloatBounds) { - if (DstVT != TmpVT) { - // Clamp by MinFloat from below. If Src is NaN, propagate NaN. - SDValue MinClamped = DAG.getNode( - X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src); - // Clamp by MaxFloat from above. If Src is NaN, propagate NaN. - SDValue BothClamped = DAG.getNode( - X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped); - // Convert clamped value to integer. - SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped); - - // NaN will become INDVAL, with the top bit set and the rest zero. - // Truncation will discard the top bit, resulting in zero. - return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt); - } - - // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat. - SDValue MinClamped = DAG.getNode( - X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode); - // Clamp by MaxFloat from above. NaN cannot occur. - SDValue BothClamped = DAG.getNode( - X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode); - // Convert clamped value to integer. - SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped); - - if (!IsSigned) { - // In the unsigned case we're done, because we mapped NaN to MinFloat, - // which is zero. - return FpToInt; - } - - // Otherwise, select zero if Src is NaN. - SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC( - dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); - } - - SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT); - SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT); - - // Result of direct conversion, which may be selected away. - SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src); - - if (DstVT != TmpVT) { - // NaN will become INDVAL, with the top bit set and the rest zero. - // Truncation will discard the top bit, resulting in zero. - FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt); - } - - SDValue Select = FpToInt; - // For signed conversions where we saturate to the same size as the - // result type of the fptoi instructions, INDVAL coincides with integer - // minimum, so we don't need to explicitly check it. - if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) { - // If Src ULT MinFloat, select MinInt. In particular, this also selects - // MinInt if Src is NaN. - Select = DAG.getSelectCC( - dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT); - } - - // If Src OGT MaxFloat, select MaxInt. 
- Select = DAG.getSelectCC( - dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT); - - // In the unsigned case we are done, because we mapped NaN to MinInt, which - // is already zero. The promoted case was already handled above. - if (!IsSigned || DstVT != TmpVT) { - return Select; - } - - // Otherwise, select 0 if Src is NaN. - SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC( - dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); -} - +SDValue +X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { + // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation, + // but making use of X86 specifics to produce better instruction sequences. + SDNode *Node = Op.getNode(); + bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT; + unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + SDLoc dl(SDValue(Node, 0)); + SDValue Src = Node->getOperand(0); + + // There are three types involved here: SrcVT is the source floating point + // type, DstVT is the type of the result, and TmpVT is the result of the + // intermediate FP_TO_*INT operation we'll use (which may be a promotion of + // DstVT). + EVT SrcVT = Src.getValueType(); + EVT DstVT = Node->getValueType(0); + EVT TmpVT = DstVT; + + // This code is only for floats and doubles. Fall back to generic code for + // anything else. + if (!isScalarFPTypeInSSEReg(SrcVT)) + return SDValue(); + + unsigned SatWidth = Node->getConstantOperandVal(1); + unsigned DstWidth = DstVT.getScalarSizeInBits(); + unsigned TmpWidth = TmpVT.getScalarSizeInBits(); + assert(SatWidth <= DstWidth && SatWidth <= TmpWidth && + "Expected saturation width smaller than result width"); + + // Promote result of FP_TO_*INT to at least 32 bits. + if (TmpWidth < 32) { + TmpVT = MVT::i32; + TmpWidth = 32; + } + + // Promote conversions to unsigned 32-bit to 64-bit, because it will allow + // us to use a native signed conversion instead. + if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) { + TmpVT = MVT::i64; + TmpWidth = 64; + } + + // If the saturation width is smaller than the size of the temporary result, + // we can always use signed conversion, which is native. + if (SatWidth < TmpWidth) + FpToIntOpcode = ISD::FP_TO_SINT; + + // Determine minimum and maximum integer values and their corresponding + // floating-point values. + APInt MinInt, MaxInt; + if (IsSigned) { + MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth); + MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth); + } else { + MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth); + MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth); + } + + APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT)); + APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT)); + + APFloat::opStatus MinStatus = MinFloat.convertFromAPInt( + MinInt, IsSigned, APFloat::rmTowardZero); + APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt( + MaxInt, IsSigned, APFloat::rmTowardZero); + bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) + && !(MaxStatus & APFloat::opStatus::opInexact); + + SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT); + SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT); + + // If the integer bounds are exactly representable as floats, emit a + // min+max+fptoi sequence. Otherwise use comparisons and selects. + if (AreExactFloatBounds) { + if (DstVT != TmpVT) { + // Clamp by MinFloat from below. If Src is NaN, propagate NaN. 
+ SDValue MinClamped = DAG.getNode( + X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src); + // Clamp by MaxFloat from above. If Src is NaN, propagate NaN. + SDValue BothClamped = DAG.getNode( + X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped); + // Convert clamped value to integer. + SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped); + + // NaN will become INDVAL, with the top bit set and the rest zero. + // Truncation will discard the top bit, resulting in zero. + return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt); + } + + // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat. + SDValue MinClamped = DAG.getNode( + X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode); + // Clamp by MaxFloat from above. NaN cannot occur. + SDValue BothClamped = DAG.getNode( + X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode); + // Convert clamped value to integer. + SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped); + + if (!IsSigned) { + // In the unsigned case we're done, because we mapped NaN to MinFloat, + // which is zero. + return FpToInt; + } + + // Otherwise, select zero if Src is NaN. + SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); + return DAG.getSelectCC( + dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); + } + + SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT); + SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT); + + // Result of direct conversion, which may be selected away. + SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src); + + if (DstVT != TmpVT) { + // NaN will become INDVAL, with the top bit set and the rest zero. + // Truncation will discard the top bit, resulting in zero. + FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt); + } + + SDValue Select = FpToInt; + // For signed conversions where we saturate to the same size as the + // result type of the fptoi instructions, INDVAL coincides with integer + // minimum, so we don't need to explicitly check it. + if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) { + // If Src ULT MinFloat, select MinInt. In particular, this also selects + // MinInt if Src is NaN. + Select = DAG.getSelectCC( + dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT); + } + + // If Src OGT MaxFloat, select MaxInt. + Select = DAG.getSelectCC( + dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT); + + // In the unsigned case we are done, because we mapped NaN to MinInt, which + // is already zero. The promoted case was already handled above. + if (!IsSigned || DstVT != TmpVT) { + return Select; + } + + // Otherwise, select 0 if Src is NaN. + SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); + return DAG.getSelectCC( + dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); +} + SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); @@ -21609,8 +21609,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); - if (VT == MVT::f128) - return SDValue(); + if (VT == MVT::f128) + return SDValue(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -21626,10 +21626,10 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); SDValue In = Op.getOperand(IsStrict ? 
1 : 0); // It's legal except when f128 is involved - if (In.getSimpleValueType() != MVT::f128) + if (In.getSimpleValueType() != MVT::f128) return Op; - return SDValue(); + return SDValue(); } static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -21994,7 +21994,7 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, if (M == SrcOpMap.end()) { VT = Src.getValueType(); // Quit if not the same type. - if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType()) + if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType()) return false; unsigned NumElts = VT.getVectorNumElements(); APInt EltCount = APInt::getNullValue(NumElts); @@ -22032,11 +22032,11 @@ static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC) { EVT VT = V.getValueType(); - unsigned ScalarSize = VT.getScalarSizeInBits(); - if (Mask.getBitWidth() != ScalarSize) { - assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch"); - return SDValue(); - } + unsigned ScalarSize = VT.getScalarSizeInBits(); + if (Mask.getBitWidth() != ScalarSize) { + assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch"); + return SDValue(); + } assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE); @@ -22940,8 +22940,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } - if (VT.getFixedSizeInBits() > - Op.getSimpleValueType().getFixedSizeInBits()) { + if (VT.getFixedSizeInBits() > + Op.getSimpleValueType().getFixedSizeInBits()) { // We emitted a compare with an XMM/YMM result. Finish converting to a // mask register using a vptestm. EVT CastVT = EVT(VT).changeVectorElementTypeToInteger(); @@ -23116,10 +23116,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } // Try to use SUBUS and PCMPEQ. - if (FlipSigns) - if (SDValue V = - LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG)) - return V; + if (FlipSigns) + if (SDValue V = + LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG)) + return V; // We are handling one of the integer comparisons here. 
Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple @@ -23914,7 +23914,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); - assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits()); + assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); @@ -24089,8 +24089,8 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL); unsigned HalfOffset = Value0.getValueType().getStoreSize(); SDValue Ptr0 = Store->getBasePtr(); - SDValue Ptr1 = - DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL); + SDValue Ptr1 = + DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), Store->getOriginalAlign(), @@ -24125,8 +24125,8 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SmallVector<SDValue, 4> Stores; for (unsigned i = 0; i != NumElems; ++i) { unsigned Offset = i * ScalarSize; - SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), - TypeSize::Fixed(Offset), DL); + SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), + TypeSize::Fixed(Offset), DL); SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, DAG.getIntPtrConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, @@ -24147,22 +24147,22 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { - unsigned NumElts = StoredVal.getValueType().getVectorNumElements(); - assert(NumElts <= 8 && "Unexpected VT"); + unsigned NumElts = StoredVal.getValueType().getVectorNumElements(); + assert(NumElts <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); - // We must pad with zeros to ensure we store zeroes to any unused bits. + // We must pad with zeros to ensure we store zeroes to any unused bits. StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, DAG.getIntPtrConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); - // Make sure we store zeros in the extra bits. - if (NumElts < 8) - StoredVal = DAG.getZeroExtendInReg( - StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts)); + // Make sure we store zeros in the extra bits. 
+ if (NumElts < 8) + StoredVal = DAG.getZeroExtendInReg( + StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), @@ -24418,7 +24418,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); + Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); @@ -24519,7 +24519,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store fp_offset - FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL); + FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, @@ -24584,18 +24584,18 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Subtarget.hasSSE1()); } - // Insert VAARG node into the DAG - // VAARG returns two values: Variable Argument Address, Chain - SDValue InstOps[] = {Chain, SrcPtr, - DAG.getTargetConstant(ArgSize, dl, MVT::i32), - DAG.getTargetConstant(ArgMode, dl, MVT::i8), - DAG.getTargetConstant(Align, dl, MVT::i32)}; + // Insert VAARG node into the DAG + // VAARG returns two values: Variable Argument Address, Chain + SDValue InstOps[] = {Chain, SrcPtr, + DAG.getTargetConstant(ArgSize, dl, MVT::i32), + DAG.getTargetConstant(ArgMode, dl, MVT::i8), + DAG.getTargetConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( - Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl, - VTs, InstOps, MVT::i64, MachinePointerInfo(SV), - /*Alignment=*/None, - MachineMemOperand::MOLoad | MachineMemOperand::MOStore); + Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl, + VTs, InstOps, MVT::i64, MachinePointerInfo(SV), + /*Alignment=*/None, + MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it @@ -24619,11 +24619,11 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); SDLoc DL(Op); - return DAG.getMemcpy( - Chain, DL, DstPtr, SrcPtr, - DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL), - Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false, - false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); + return DAG.getMemcpy( + Chain, DL, DstPtr, SrcPtr, + DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL), + Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false, + false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // Helper to get immediate/variable SSE shift opcode from other shift opcodes. 
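The LowerFP_TO_INT_SAT hunk above saturates by comparing against the representable integer bounds, selecting the saturated value, and sending NaN to zero. A minimal scalar sketch of that compare-and-select strategy follows, written in plain standalone C++ rather than SelectionDAG nodes; the bounds are illustrative values chosen to be exact in float, and the ordering of the checks is simplified relative to the DAG form.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Saturating float -> int32_t: out-of-range inputs clamp to
    // INT32_MIN/INT32_MAX, NaN maps to 0, in-range values truncate.
    static int32_t fptosi_sat_i32(float X) {
      // Bounds chosen so the comparisons are exact in float:
      // -2147483648.0f is INT32_MIN exactly; 2147483648.0f is INT32_MAX + 1.
      const float MinFloat = -2147483648.0f;
      const float MaxFloat = 2147483648.0f;
      if (std::isnan(X))
        return 0;                      // NaN -> 0, as in the lowering above.
      if (X < MinFloat)
        return INT32_MIN;              // Saturate below.
      if (X >= MaxFloat)
        return INT32_MAX;              // Saturate above.
      return static_cast<int32_t>(X);  // In range: ordinary truncating convert.
    }

    int main() {
      std::printf("%d %d %d\n", fptosi_sat_i32(1e20f), fptosi_sat_i32(-1e20f),
                  fptosi_sat_i32(3.7f)); // 2147483647 -2147483648 3
      return 0;
    }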
@@ -25070,12 +25070,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); - if (IntrData->Type == INTR_TYPE_3OP_IMM8 && - Src3.getValueType() != MVT::i8) { - Src3 = DAG.getTargetConstant( - cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8); - } - + if (IntrData->Type == INTR_TYPE_3OP_IMM8 && + Src3.getValueType() != MVT::i8) { + Src3 = DAG.getTargetConstant( + cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8); + } + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -25094,18 +25094,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src1, Src2, Src3}); } - case INTR_TYPE_4OP_IMM8: { - assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant); - SDValue Src4 = Op.getOperand(4); - if (Src4.getValueType() != MVT::i8) { - Src4 = DAG.getTargetConstant( - cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8); - } - - return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Src4); - } + case INTR_TYPE_4OP_IMM8: { + assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant); + SDValue Src4 = Op.getOperand(4); + if (Src4.getValueType() != MVT::i8) { + Src4 = DAG.getTargetConstant( + cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8); + } + + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Src4); + } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); @@ -25338,21 +25338,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue CC = Op.getOperand(3); - SDValue Mask = Op.getOperand(4); + SDValue Mask = Op.getOperand(4); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { - SDValue Sae = Op.getOperand(5); + SDValue Sae = Op.getOperand(5); if (isRoundModeSAE(Sae)) return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), CC, Mask, Sae); + Op.getOperand(2), CC, Mask, Sae); if (!isRoundModeCurDirection(Sae)) return SDValue(); } //default rounding mode return DAG.getNode(IntrData->Opc0, dl, MaskVT, - {Op.getOperand(1), Op.getOperand(2), CC, Mask}); + {Op.getOperand(1), Op.getOperand(2), CC, Mask}); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -25507,11 +25507,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2), RoundingMode); } case BEXTRI: { - assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode"); + assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode"); uint64_t Imm = Op.getConstantOperandVal(2); - SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl, - Op.getValueType()); + SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl, + Op.getValueType()); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Control); } @@ -25902,8 +25902,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // MMX register. 
ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), - DAG.getTargetConstant(NewIntrinsic, DL, - getPointerTy(DAG.getDataLayout())), + DAG.getTargetConstant(NewIntrinsic, DL, + getPointerTy(DAG.getDataLayout())), Op.getOperand(1), ShAmt); } } @@ -26273,97 +26273,97 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, Operation.getValue(1)); } - case Intrinsic::x86_aesenc128kl: - case Intrinsic::x86_aesdec128kl: - case Intrinsic::x86_aesenc256kl: - case Intrinsic::x86_aesdec256kl: { - SDLoc DL(Op); - SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other); - SDValue Chain = Op.getOperand(0); - unsigned Opcode; - - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); - case Intrinsic::x86_aesenc128kl: - Opcode = X86ISD::AESENC128KL; - break; - case Intrinsic::x86_aesdec128kl: - Opcode = X86ISD::AESDEC128KL; - break; - case Intrinsic::x86_aesenc256kl: - Opcode = X86ISD::AESENC256KL; - break; - case Intrinsic::x86_aesdec256kl: - Opcode = X86ISD::AESDEC256KL; - break; - } - - MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); - MachineMemOperand *MMO = MemIntr->getMemOperand(); - EVT MemVT = MemIntr->getMemoryVT(); - SDValue Operation = DAG.getMemIntrinsicNode( - Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT, - MMO); - SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG); - - return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), - {ZF, Operation.getValue(0), Operation.getValue(2)}); - } - case Intrinsic::x86_aesencwide128kl: - case Intrinsic::x86_aesdecwide128kl: - case Intrinsic::x86_aesencwide256kl: - case Intrinsic::x86_aesdecwide256kl: { - SDLoc DL(Op); - SDVTList VTs = DAG.getVTList( - {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, - MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other}); - SDValue Chain = Op.getOperand(0); - unsigned Opcode; - - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); - case Intrinsic::x86_aesencwide128kl: - Opcode = X86ISD::AESENCWIDE128KL; - break; - case Intrinsic::x86_aesdecwide128kl: - Opcode = X86ISD::AESDECWIDE128KL; - break; - case Intrinsic::x86_aesencwide256kl: - Opcode = X86ISD::AESENCWIDE256KL; - break; - case Intrinsic::x86_aesdecwide256kl: - Opcode = X86ISD::AESDECWIDE256KL; - break; - } - - MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); - MachineMemOperand *MMO = MemIntr->getMemOperand(); - EVT MemVT = MemIntr->getMemoryVT(); - SDValue Operation = DAG.getMemIntrinsicNode( - Opcode, DL, VTs, - {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - Op.getOperand(5), Op.getOperand(6), Op.getOperand(7), - Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)}, - MemVT, MMO); - SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG); - - return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), - {ZF, Operation.getValue(1), Operation.getValue(2), - Operation.getValue(3), Operation.getValue(4), - Operation.getValue(5), Operation.getValue(6), - Operation.getValue(7), Operation.getValue(8), - Operation.getValue(9)}); - } - case Intrinsic::x86_testui: { - SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); - SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain); - SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); - return DAG.getNode(ISD::MERGE_VALUES, dl, 
Op->getVTList(), SetCC, - Operation.getValue(1)); - } - } + case Intrinsic::x86_aesenc128kl: + case Intrinsic::x86_aesdec128kl: + case Intrinsic::x86_aesenc256kl: + case Intrinsic::x86_aesdec256kl: { + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other); + SDValue Chain = Op.getOperand(0); + unsigned Opcode; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_aesenc128kl: + Opcode = X86ISD::AESENC128KL; + break; + case Intrinsic::x86_aesdec128kl: + Opcode = X86ISD::AESDEC128KL; + break; + case Intrinsic::x86_aesenc256kl: + Opcode = X86ISD::AESENC256KL; + break; + case Intrinsic::x86_aesdec256kl: + Opcode = X86ISD::AESDEC256KL; + break; + } + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + MachineMemOperand *MMO = MemIntr->getMemOperand(); + EVT MemVT = MemIntr->getMemoryVT(); + SDValue Operation = DAG.getMemIntrinsicNode( + Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT, + MMO); + SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG); + + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + {ZF, Operation.getValue(0), Operation.getValue(2)}); + } + case Intrinsic::x86_aesencwide128kl: + case Intrinsic::x86_aesdecwide128kl: + case Intrinsic::x86_aesencwide256kl: + case Intrinsic::x86_aesdecwide256kl: { + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList( + {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, + MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other}); + SDValue Chain = Op.getOperand(0); + unsigned Opcode; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_aesencwide128kl: + Opcode = X86ISD::AESENCWIDE128KL; + break; + case Intrinsic::x86_aesdecwide128kl: + Opcode = X86ISD::AESDECWIDE128KL; + break; + case Intrinsic::x86_aesencwide256kl: + Opcode = X86ISD::AESENCWIDE256KL; + break; + case Intrinsic::x86_aesdecwide256kl: + Opcode = X86ISD::AESDECWIDE256KL; + break; + } + + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); + MachineMemOperand *MMO = MemIntr->getMemOperand(); + EVT MemVT = MemIntr->getMemoryVT(); + SDValue Operation = DAG.getMemIntrinsicNode( + Opcode, DL, VTs, + {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Op.getOperand(5), Op.getOperand(6), Op.getOperand(7), + Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)}, + MemVT, MMO); + SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG); + + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + {ZF, Operation.getValue(1), Operation.getValue(2), + Operation.getValue(3), Operation.getValue(4), + Operation.getValue(5), Operation.getValue(6), + Operation.getValue(7), Operation.getValue(8), + Operation.getValue(9)}); + } + case Intrinsic::x86_testui: { + SDLoc dl(Op); + SDValue Chain = Op.getOperand(0); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain); + SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, + Operation.getValue(1)); + } + } return SDValue(); } @@ -26733,8 +26733,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); - OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, - MachinePointerInfo(TrmpAddr, 2), Align(2)); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), Align(2)); // Load the 'nest' 
parameter value into R10. // R10 is specified in X86CallingConv.td @@ -26746,8 +26746,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); - OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, - MachinePointerInfo(TrmpAddr, 12), Align(2)); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), Align(2)); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... @@ -26789,7 +26789,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) { - const DataLayout &DL = DAG.getDataLayout(); + const DataLayout &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } @@ -26827,20 +26827,20 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); - OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, - MachinePointerInfo(TrmpAddr, 1), Align(1)); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), Align(1)); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); - OutChains[2] = - DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, - MachinePointerInfo(TrmpAddr, 5), Align(1)); + OutChains[2] = + DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, + MachinePointerInfo(TrmpAddr, 5), Align(1)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); - OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, - MachinePointerInfo(TrmpAddr, 6), Align(1)); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), Align(1)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } @@ -27134,47 +27134,47 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDValue X = Op.getOperand(0), Y = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); - SDLoc DL(Op); - + SDLoc DL(Op); + if (VT.getScalarType() == MVT::i1) { switch (Opcode) { default: llvm_unreachable("Expected saturated arithmetic opcode"); case ISD::UADDSAT: case ISD::SADDSAT: // *addsat i1 X, Y --> X | Y - return DAG.getNode(ISD::OR, DL, VT, X, Y); + return DAG.getNode(ISD::OR, DL, VT, X, Y); case ISD::USUBSAT: case ISD::SSUBSAT: // *subsat i1 X, Y --> X & ~Y - return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT)); + return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, Y, VT)); } } - if (VT == MVT::v32i16 || VT == MVT::v64i8 || - (VT.is256BitVector() && !Subtarget.hasInt256())) { - assert(Op.getSimpleValueType().isInteger() && - "Only handle AVX vector integer operation"); - return splitVectorIntBinary(Op, DAG); + if (VT == MVT::v32i16 || VT == MVT::v64i8 || + (VT.is256BitVector() && !Subtarget.hasInt256())) { + assert(Op.getSimpleValueType().isInteger() && + "Only handle AVX vector integer operation"); + return splitVectorIntBinary(Op, DAG); } - // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. 
- const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT SetCCResultType = - TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - - if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { - // usubsat X, Y --> (X >u Y) ? X - Y : 0 - SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); - SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); - // TODO: Move this to DAGCombiner? - if (SetCCResultType == VT && - DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits()) - return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub); - return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); - } + // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT SetCCResultType = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - // Use default expansion. - return SDValue(); + if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) { + // usubsat X, Y --> (X >u Y) ? X - Y : 0 + SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); + SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); + // TODO: Move this to DAGCombiner? + if (SetCCResultType == VT && + DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits()) + return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub); + return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); + } + + // Use default expansion. + return SDValue(); } static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, @@ -27224,8 +27224,8 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { if (VT == MVT::v32i16 || VT == MVT::v64i8) return splitVectorIntBinary(Op, DAG); - // Default to expand. - return SDValue(); + // Default to expand. + return SDValue(); } static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, @@ -27597,8 +27597,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Entry.Node = StackPtr; - InChain = - DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); + InChain = + DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; @@ -27889,7 +27889,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, MVT VT = Amt.getSimpleValueType(); if (!(VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget.hasInt256() && VT == MVT::v16i16) || - (Subtarget.hasVBMI2() && VT == MVT::v32i16) || + (Subtarget.hasVBMI2() && VT == MVT::v32i16) || (!Subtarget.hasAVX512() && VT == MVT::v16i8))) return SDValue(); @@ -28467,12 +28467,12 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return Op; } - // AVX512 VBMI2 vXi16 - lower to funnel shifts. - if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) { - unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR); - return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); - } - + // AVX512 VBMI2 vXi16 - lower to funnel shifts. + if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) { + unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR); + return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); + } + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. 
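For reference, the expansions re-emitted in the hunks above have simple scalar equivalents: `usubsat` without a legal UMAX becomes `(X >u Y) ? X - Y : 0`, the i1 saturating ops collapse to OR / AND-NOT, and the VBMI2 rotate path is just a funnel shift of a value with itself. The following standalone C++ sketch mirrors that (function names are illustrative, not part of the LLVM sources):

```cpp
#include <cstdint>

// Scalar models of the expansions above. Illustrative names only.
uint32_t usubsat32(uint32_t X, uint32_t Y) {
  return X > Y ? X - Y : 0; // usubsat: never wraps below zero
}

bool addsat_i1(bool X, bool Y) { return X || Y; }  // *addsat i1 --> X | Y
bool subsat_i1(bool X, bool Y) { return X && !Y; } // *subsat i1 --> X & ~Y

uint16_t rotl16(uint16_t X, unsigned N) {
  N &= 15; // rotl == fshl(X, X, N): the two shifts cover the whole word
  return N ? uint16_t((X << N) | (X >> (16 - N))) : X;
}
```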
@@ -28499,8 +28499,8 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return splitVectorIntBinary(Op, DAG); assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || - ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 || - VT == MVT::v32i16) && + ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 || + VT == MVT::v32i16) && Subtarget.hasAVX2())) && "Only vXi32/vXi16/vXi8 vector rotates supported"); @@ -28797,8 +28797,8 @@ bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { /// a) very likely accessed only by a single thread to minimize cache traffic, /// and b) definitely dereferenceable. Returns the new Chain result. static SDValue emitLockedStackOp(SelectionDAG &DAG, - const X86Subtarget &Subtarget, SDValue Chain, - const SDLoc &DL) { + const X86Subtarget &Subtarget, SDValue Chain, + const SDLoc &DL) { // Implementation notes: // 1) LOCK prefix creates a full read/write reordering barrier for memory // operations issued by the current processor. As such, the location @@ -29236,28 +29236,28 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); - assert(VT.getScalarType() == MVT::i8 && - "Only byte vector BITREVERSE supported"); - + assert(VT.getScalarType() == MVT::i8 && + "Only byte vector BITREVERSE supported"); + // Split v64i8 without BWI so that we can still use the PSHUFB lowering. if (VT == MVT::v64i8 && !Subtarget.hasBWI()) return splitVectorIntUnary(Op, DAG); // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. - if (VT == MVT::v32i8 && !Subtarget.hasInt256()) + if (VT == MVT::v32i8 && !Subtarget.hasInt256()) return splitVectorIntUnary(Op, DAG); - unsigned NumElts = VT.getVectorNumElements(); - - // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits. - if (Subtarget.hasGFNI()) { - MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8); - SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT); - Matrix = DAG.getBitcast(VT, Matrix); - return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix, - DAG.getTargetConstant(0, DL, MVT::i8)); - } - + unsigned NumElts = VT.getVectorNumElements(); + + // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits. + if (Subtarget.hasGFNI()) { + MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8); + SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT); + Matrix = DAG.getBitcast(VT, Matrix); + return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix, + DAG.getTargetConstant(0, DL, MVT::i8)); + } + // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each // 0-15 value (moved to the other nibble). @@ -29289,58 +29289,58 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } -static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - SDLoc DL(Op); - SDValue X = Op.getOperand(0); - MVT VT = Op.getSimpleValueType(); - - // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. - if (VT == MVT::i8 || - DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { - X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); - SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, - DAG.getConstant(0, DL, MVT::i8)); - // Copy the inverse of the parity flag into a register with setcc. 
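The BITREVERSE hunk above mentions two strategies: a GFNI path (one GF2P8AFFINEQB with the 0x8040201008040201 bit matrix) and a PSHUFB fallback that reverses each nibble through a 16-entry table. A scalar C++ sketch of the table trick, for reference only:

```cpp
#include <cstdint>

// Scalar version of the PSHUFB fallback: split the byte into nibbles, look
// both up in a table of bit-reversed nibbles, and swap them.
uint8_t bitreverse8(uint8_t B) {
  static const uint8_t RevNibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                        0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  return uint8_t((RevNibble[B & 0xF] << 4) | RevNibble[B >> 4]);
}
```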
- SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Extend to the original type. - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); - } - - if (VT == MVT::i64) { - // Xor the high and low 16-bits together using a 32-bit operation. - SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, - DAG.getNode(ISD::SRL, DL, MVT::i64, X, - DAG.getConstant(32, DL, MVT::i8))); - SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); - } - - if (VT != MVT::i16) { - // Xor the high and low 16-bits together using a 32-bit operation. - SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X, - DAG.getConstant(16, DL, MVT::i8)); - X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16); - } else { - // If the input is 16-bits, we need to extend to use an i32 shift below. - X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X); - } - - // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. - // This should allow an h-reg to be used to save a shift. - SDValue Hi = DAG.getNode( - ISD::TRUNCATE, DL, MVT::i8, - DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8))); - SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); - SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); - SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); - - // Copy the inverse of the parity flag into a register with setcc. - SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); - // Extend to the original type. - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); -} - +static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + SDValue X = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + // Special case. If the input fits in 8-bits we can use a single 8-bit TEST. + if (VT == MVT::i8 || + DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, + DAG.getConstant(0, DL, MVT::i8)); + // Copy the inverse of the parity flag into a register with setcc. + SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend to the original type. + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); + } + + if (VT == MVT::i64) { + // Xor the high and low 16-bits together using a 32-bit operation. + SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i64, X, + DAG.getConstant(32, DL, MVT::i8))); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); + } + + if (VT != MVT::i16) { + // Xor the high and low 16-bits together using a 32-bit operation. + SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X, + DAG.getConstant(16, DL, MVT::i8)); + X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16); + } else { + // If the input is 16-bits, we need to extend to use an i32 shift below. + X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X); + } + + // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. + // This should allow an h-reg to be used to save a shift. + SDValue Hi = DAG.getNode( + ISD::TRUNCATE, DL, MVT::i8, + DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8))); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); + SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); + SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); + + // Copy the inverse of the parity flag into a register with setcc. 
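The LowerPARITY hunk re-added here folds the value down with XORs until only one byte matters and then reads the parity flag that the final 8-bit XOR sets. A standalone scalar equivalent (the last three XORs stand in for EFLAGS.PF; names are illustrative):

```cpp
#include <cstdint>

// Scalar equivalent of the folding strategy in LowerPARITY above.
uint32_t parity64(uint64_t X) {
  uint32_t V = uint32_t(X) ^ uint32_t(X >> 32); // i64: fold the 32-bit halves
  V ^= V >> 16;                                 // fold to 16 bits
  V ^= V >> 8;                                  // fold to 8 bits
  V &= 0xFF;
  V ^= V >> 4;                                  // software stand-in for PF
  V ^= V >> 2;
  V ^= V >> 1;
  return V & 1;
}
```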
+ SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); + // Extend to the original type. + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); +} + static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NewOpc = 0; @@ -29477,7 +29477,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Chain = DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr, - MPI, MaybeAlign(), MachineMemOperand::MOStore); + MPI, MaybeAlign(), MachineMemOperand::MOStore); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue LdOps[] = {Chain, StackPtr}; SDValue Value = @@ -29517,7 +29517,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); - unsigned Opc = Op.getOpcode(); + unsigned Opc = Op.getOpcode(); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -29532,14 +29532,14 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getAllOnesConstant(DL, CarryVT)); - bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY; - SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, - Op.getOperand(0), Op.getOperand(1), - Carry.getValue(1)); + bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY; + SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, + Op.getOperand(0), Op.getOperand(1), + Carry.getValue(1)); - bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY; - SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B, - Sum.getValue(1), DL, DAG); + bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY; + SDValue SetCC = getSETCC(IsSigned ? 
X86::COND_O : X86::COND_B, + Sum.getValue(1), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); @@ -29944,7 +29944,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); - case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG); + case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG); @@ -29979,8 +29979,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); - case ISD::FP_TO_SINT_SAT: - case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); + case ISD::FP_TO_SINT_SAT: + case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); case ISD::FP_EXTEND: case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FP_ROUND: @@ -30047,8 +30047,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); - case ISD::SADDO_CARRY: - case ISD::SSUBO_CARRY: + case ISD::SADDO_CARRY: + case ISD::SSUBO_CARRY: case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: @@ -30116,9 +30116,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Chain); return; } - case X86ISD::CVTPS2PH: - Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG)); - return; + case X86ISD::CVTPS2PH: + Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG)); + return; case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); // Use a v2i64 if possible. @@ -30360,7 +30360,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); assert(isTypeLegal(LoVT) && "Split VT not legal?"); - SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG); + SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG); // We need to shift the input over by half the number of elements. unsigned NumElts = InVT.getVectorNumElements(); @@ -30370,7 +30370,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); - Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG); + Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); @@ -30721,30 +30721,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); - - // In 64-bit mode we might need the base pointer in RBX, but we can't know - // until later. So we keep the RBX input in a vreg and use a custom - // inserter. - // Since RBX will be a reserved register the register allocator will not - // make sure its value will be properly saved and restored around this - // live-range. 
+ + // In 64-bit mode we might need the base pointer in RBX, but we can't know + // until later. So we keep the RBX input in a vreg and use a custom + // inserter. + // Since RBX will be a reserved register the register allocator will not + // make sure its value will be properly saved and restored around this + // live-range. SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); - if (Regs64bit) { - SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL, - swapInH.getValue(1)}; - Result = - DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO); + if (Regs64bit) { + SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL, + swapInH.getValue(1)}; + Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO); } else { - swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL, + swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; - Result = - DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO); + Result = + DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO); } - + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -30989,9 +30989,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(COMI) NODE_NAME_CASE(UCOMI) NODE_NAME_CASE(CMPM) - NODE_NAME_CASE(CMPMM) + NODE_NAME_CASE(CMPMM) NODE_NAME_CASE(STRICT_CMPM) - NODE_NAME_CASE(CMPMM_SAE) + NODE_NAME_CASE(CMPMM_SAE) NODE_NAME_CASE(SETCC) NODE_NAME_CASE(SETCC_CARRY) NODE_NAME_CASE(FSETCC) @@ -31109,7 +31109,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(XOR) NODE_NAME_CASE(AND) NODE_NAME_CASE(BEXTR) - NODE_NAME_CASE(BEXTRI) + NODE_NAME_CASE(BEXTRI) NODE_NAME_CASE(BZHI) NODE_NAME_CASE(PDEP) NODE_NAME_CASE(PEXT) @@ -31147,7 +31147,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VBROADCAST) NODE_NAME_CASE(VBROADCAST_LOAD) NODE_NAME_CASE(VBROADCASTM) - NODE_NAME_CASE(SUBV_BROADCAST_LOAD) + NODE_NAME_CASE(SUBV_BROADCAST_LOAD) NODE_NAME_CASE(VPERMILPV) NODE_NAME_CASE(VPERMILPI) NODE_NAME_CASE(VPERM2X128) @@ -31169,7 +31169,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(DBPSADBW) NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) NODE_NAME_CASE(VAARG_64) - NODE_NAME_CASE(VAARG_X32) + NODE_NAME_CASE(VAARG_X32) NODE_NAME_CASE(WIN_ALLOCA) NODE_NAME_CASE(MEMBARRIER) NODE_NAME_CASE(MFENCE) @@ -31326,15 +31326,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ENQCMD) NODE_NAME_CASE(ENQCMDS) NODE_NAME_CASE(VP2INTERSECT) - NODE_NAME_CASE(AESENC128KL) - NODE_NAME_CASE(AESDEC128KL) - NODE_NAME_CASE(AESENC256KL) - NODE_NAME_CASE(AESDEC256KL) - NODE_NAME_CASE(AESENCWIDE128KL) - NODE_NAME_CASE(AESDECWIDE128KL) - NODE_NAME_CASE(AESENCWIDE256KL) - NODE_NAME_CASE(AESDECWIDE256KL) - NODE_NAME_CASE(TESTUI) + NODE_NAME_CASE(AESENC128KL) + NODE_NAME_CASE(AESDEC128KL) + NODE_NAME_CASE(AESENC256KL) + NODE_NAME_CASE(AESDEC256KL) + NODE_NAME_CASE(AESENCWIDE128KL) + NODE_NAME_CASE(AESDECWIDE128KL) + NODE_NAME_CASE(AESENCWIDE256KL) + NODE_NAME_CASE(AESDECWIDE256KL) + NODE_NAME_CASE(TESTUI) } return nullptr; #undef NODE_NAME_CASE @@ -31680,7 +31680,7 @@ static bool 
isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); @@ -31760,8 +31760,8 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, } MachineBasicBlock * -X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, - MachineBasicBlock *MBB) const { +X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: @@ -31772,8 +31772,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) - assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!"); - static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands"); + assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!"); + static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands"); Register DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); @@ -31788,7 +31788,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, MachineFunction *MF = MBB->getParent(); // Memory Reference - assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand"); + assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand"); MachineMemOperand *OldMMO = MI.memoperands().front(); @@ -31801,10 +31801,10 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const TargetRegisterClass *AddrRegClass = - getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout())); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout())); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset @@ -31913,35 +31913,35 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // Read the reg_save_area address. Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI( - offsetMBB, DL, - TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), - RegSaveReg) + BuildMI( + offsetMBB, DL, + TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), + RegSaveReg) .add(Base) .add(Scale) .add(Index) - .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12) + .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12) .add(Segment) .setMemRefs(LoadOnlyMMO); - if (Subtarget.isTarget64BitLP64()) { - // Zero-extend the offset - Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) - .addImm(0) - .addReg(OffsetReg) - .addImm(X86::sub_32bit); - - // Add the offset to the reg_save_area to get the final address. - BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) - .addReg(OffsetReg64) - .addReg(RegSaveReg); - } else { - // Add the offset to the reg_save_area to get the final address. 
- BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg) - .addReg(OffsetReg) - .addReg(RegSaveReg); - } + if (Subtarget.isTarget64BitLP64()) { + // Zero-extend the offset + Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + .addReg(OffsetReg64) + .addReg(RegSaveReg); + } else { + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg) + .addReg(OffsetReg) + .addReg(RegSaveReg); + } // Compute the offset for the next argument Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); @@ -31970,9 +31970,9 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // Load the overflow_area address into a register. Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI(overflowMBB, DL, - TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), - OverflowAddrReg) + BuildMI(overflowMBB, DL, + TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), + OverflowAddrReg) .add(Base) .add(Scale) .add(Index) @@ -31987,17 +31987,17 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, Register TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) - BuildMI( - overflowMBB, DL, - TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), - TmpReg) + BuildMI( + overflowMBB, DL, + TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), + TmpReg) .addReg(OverflowAddrReg) .addImm(Alignment.value() - 1); - BuildMI( - overflowMBB, DL, - TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri), - OverflowDestReg) + BuildMI( + overflowMBB, DL, + TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri), + OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Alignment.value() - 1)); } else { @@ -32008,16 +32008,16 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI( - overflowMBB, DL, - TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), - NextAddrReg) - .addReg(OverflowDestReg) - .addImm(ArgSizeA8); + BuildMI( + overflowMBB, DL, + TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), + NextAddrReg) + .addReg(OverflowDestReg) + .addImm(ArgSizeA8); // Store the new overflow address. - BuildMI(overflowMBB, DL, - TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr)) + BuildMI(overflowMBB, DL, + TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr)) .add(Base) .add(Scale) .add(Index) @@ -32073,10 +32073,10 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // Now add the instructions. 
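The VAARG hunks above implement the two paths of the x86-64 SysV `va_arg` protocol described in the quoted `va_list` comment: serve the argument from `reg_save_area` while a register slot remains, otherwise read it from `overflow_arg_area` after aligning that pointer. A simplified standalone C++ walk-through, restricted to one 8-byte integer argument (struct and function names are illustrative):

```cpp
#include <cstdint>

// Simplified model of the offsetMBB / overflowMBB paths emitted above.
struct VaListSysV {
  unsigned gp_offset;      // next free GP-register slot in reg_save_area
  unsigned fp_offset;      // next free XMM slot (unused in this sketch)
  void *overflow_arg_area; // next stack-passed argument
  void *reg_save_area;     // spilled register arguments
};

static char *alignUp(char *P, std::uintptr_t A) {
  // aligned_addr = (addr + (align - 1)) & ~(align - 1), as in the overflow path.
  return reinterpret_cast<char *>(
      (reinterpret_cast<std::uintptr_t>(P) + (A - 1)) & ~(A - 1));
}

long nextIntArg(VaListSysV &AP) {
  if (AP.gp_offset + 8 <= 48) { // offsetMBB: a GP register slot is still free
    long V = *reinterpret_cast<long *>(
        static_cast<char *>(AP.reg_save_area) + AP.gp_offset);
    AP.gp_offset += 8;
    return V;
  }
  // overflowMBB: read from the stack and bump the 8-byte aligned pointer.
  char *Addr = alignUp(static_cast<char *>(AP.overflow_arg_area), 8);
  long V = *reinterpret_cast<long *>(Addr);
  AP.overflow_arg_area = Addr + 8;
  return V;
}
```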
const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); Register CountReg = MI.getOperand(0).getReg(); - int RegSaveFrameIndex = MI.getOperand(1).getImm(); + int RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { @@ -32385,7 +32385,7 @@ MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the @@ -32540,7 +32540,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); const unsigned ProbeSize = getStackProbeSize(*MF); @@ -32633,7 +32633,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); @@ -32668,7 +32668,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); - Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), @@ -32768,7 +32768,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && @@ -32806,7 +32806,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. @@ -32835,7 +32835,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, // be in the normal return register. 
MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); @@ -32974,7 +32974,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); @@ -33036,7 +33036,7 @@ X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, /// \param [in] MBB The Machine Basic Block that will be modified. void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -33079,7 +33079,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -33239,7 +33239,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -33420,7 +33420,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -33504,7 +33504,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); @@ -33553,7 +33553,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { - const DebugLoc &DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); @@ -33783,7 +33783,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); + 
const DebugLoc &DL = MI.getDebugLoc(); auto TMMImmToTMMReg = [](unsigned Imm) { assert (Imm < 8 && "Illegal tmm index"); @@ -33793,10 +33793,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: case X86::TLS_addr64: - case X86::TLS_addrX32: + case X86::TLS_addrX32: case X86::TLS_base_addr32: case X86::TLS_base_addr64: - case X86::TLS_base_addrX32: + case X86::TLS_base_addrX32: return EmitLoweredTLSAddr(MI, BB); case X86::INDIRECT_THUNK_CALL32: case X86::INDIRECT_THUNK_CALL64: @@ -33952,8 +33952,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: - case X86::VAARG_X32: - return EmitVAARGWithCustomInserter(MI, BB); + case X86::VAARG_X32: + return EmitVAARGWithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: @@ -33977,7 +33977,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::PATCHABLE_EVENT_CALL: case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: - return BB; + return BB; case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -34032,75 +34032,75 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } - case X86::LCMPXCHG16B_NO_RBX: { - const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); - Register BasePtr = TRI->getBaseRegister(); - if (TRI->hasBasePointer(*MF) && - (BasePtr == X86::RBX || BasePtr == X86::EBX)) { - if (!BB->isLiveIn(BasePtr)) - BB->addLiveIn(BasePtr); - // Save RBX into a virtual register. - Register SaveRBX = - MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) - .addReg(X86::RBX); - Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst); - for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) - MIB.add(MI.getOperand(Idx)); - MIB.add(MI.getOperand(X86::AddrNumOperands)); - MIB.addReg(SaveRBX); - } else { - // Simple case, just copy the virtual register to RBX. - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX) - .add(MI.getOperand(X86::AddrNumOperands)); - MachineInstrBuilder MIB = - BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)); - for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) - MIB.add(MI.getOperand(Idx)); - } - MI.eraseFromParent(); + case X86::LCMPXCHG16B_NO_RBX: { + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + Register BasePtr = TRI->getBaseRegister(); + if (TRI->hasBasePointer(*MF) && + (BasePtr == X86::RBX || BasePtr == X86::EBX)) { + if (!BB->isLiveIn(BasePtr)) + BB->addLiveIn(BasePtr); + // Save RBX into a virtual register. + Register SaveRBX = + MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + .addReg(X86::RBX); + Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst); + for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) + MIB.add(MI.getOperand(Idx)); + MIB.add(MI.getOperand(X86::AddrNumOperands)); + MIB.addReg(SaveRBX); + } else { + // Simple case, just copy the virtual register to RBX. 
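The LCMPXCHG16B_NO_RBX expansion in this hunk exists because the instruction hard-codes RBX, which may also be the base pointer. At the source level the operation is simply a 16-byte compare-and-swap; a minimal sketch, assuming a GCC/Clang-style `__int128` and making no promise that it lowers to an inline `lock cmpxchg16b` (that depends on the target and flags such as `-mcx16`):

```cpp
#include <atomic>

// 16-byte compare-and-swap; may compile to CMPXCHG16B or a libatomic call.
bool cas16(std::atomic<__int128> &Slot, __int128 &Expected, __int128 Desired) {
  return Slot.compare_exchange_strong(Expected, Desired);
}
```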
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX) + .add(MI.getOperand(X86::AddrNumOperands)); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)); + for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) + MIB.add(MI.getOperand(Idx)); + } + MI.eraseFromParent(); return BB; - } - case X86::MWAITX: { - const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); - Register BasePtr = TRI->getBaseRegister(); - bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX); - // If no need to save the base pointer, we generate MWAITXrrr, - // else we generate pseudo MWAITX_SAVE_RBX. - if (!IsRBX || !TRI->hasBasePointer(*MF)) { - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI.getOperand(0).getReg()); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI.getOperand(1).getReg()); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX) - .addReg(MI.getOperand(2).getReg()); - BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr)); - MI.eraseFromParent(); - } else { - if (!BB->isLiveIn(BasePtr)) { - BB->addLiveIn(BasePtr); - } - // Parameters can be copied into ECX and EAX but not EBX yet. - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI.getOperand(0).getReg()); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI.getOperand(1).getReg()); - assert(Subtarget.is64Bit() && "Expected 64-bit mode!"); - // Save RBX into a virtual register. - Register SaveRBX = - MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) - .addReg(X86::RBX); - // Generate mwaitx pseudo. - Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); - BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX)) - .addDef(Dst) // Destination tied in with SaveRBX. - .addReg(MI.getOperand(2).getReg()) // input value of EBX. - .addUse(SaveRBX); // Save of base pointer. - MI.eraseFromParent(); - } + } + case X86::MWAITX: { + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + Register BasePtr = TRI->getBaseRegister(); + bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX); + // If no need to save the base pointer, we generate MWAITXrrr, + // else we generate pseudo MWAITX_SAVE_RBX. + if (!IsRBX || !TRI->hasBasePointer(*MF)) { + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI.getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI.getOperand(1).getReg()); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX) + .addReg(MI.getOperand(2).getReg()); + BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr)); + MI.eraseFromParent(); + } else { + if (!BB->isLiveIn(BasePtr)) { + BB->addLiveIn(BasePtr); + } + // Parameters can be copied into ECX and EAX but not EBX yet. + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI.getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI.getOperand(1).getReg()); + assert(Subtarget.is64Bit() && "Expected 64-bit mode!"); + // Save RBX into a virtual register. + Register SaveRBX = + MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) + .addReg(X86::RBX); + // Generate mwaitx pseudo. + Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); + BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX)) + .addDef(Dst) // Destination tied in with SaveRBX. 
+ .addReg(MI.getOperand(2).getReg()) // input value of EBX. + .addUse(SaveRBX); // Save of base pointer. + MI.eraseFromParent(); + } return BB; } case TargetOpcode::PREALLOCATED_SETUP: { @@ -34365,11 +34365,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits Known2; if (!!DemandedLHS) { Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = KnownBits::commonBits(Known, Known2); } if (!!DemandedRHS) { Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = KnownBits::commonBits(Known, Known2); } if (Known.countMinLeadingZeros() < BitWidth) @@ -34412,11 +34412,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); // Only known if known in both the LHS and RHS. - Known = KnownBits::commonBits(Known, Known2); + Known = KnownBits::commonBits(Known, Known2); break; } - case X86ISD::BEXTR: - case X86ISD::BEXTRI: { + case X86ISD::BEXTR: + case X86ISD::BEXTRI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); @@ -34438,28 +34438,28 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } break; } - case X86ISD::PDEP: { - KnownBits Known2; - Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - // Zeros are retained from the mask operand. But not ones. - Known.One.clearAllBits(); - // The result will have at least as many trailing zeros as the non-mask - // operand since bits can only map to the same or higher bit position. - Known.Zero.setLowBits(Known2.countMinTrailingZeros()); - break; - } - case X86ISD::PEXT: { - Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - // The result has as many leading zeros as the number of zeroes in the mask. - unsigned Count = Known.Zero.countPopulation(); - Known.Zero = APInt::getHighBitsSet(BitWidth, Count); - Known.One.clearAllBits(); - break; - } - case X86ISD::VTRUNC: - case X86ISD::VTRUNCS: - case X86ISD::VTRUNCUS: + case X86ISD::PDEP: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + // Zeros are retained from the mask operand. But not ones. + Known.One.clearAllBits(); + // The result will have at least as many trailing zeros as the non-mask + // operand since bits can only map to the same or higher bit position. + Known.Zero.setLowBits(Known2.countMinTrailingZeros()); + break; + } + case X86ISD::PEXT: { + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + // The result has as many leading zeros as the number of zeroes in the mask. + unsigned Count = Known.Zero.countPopulation(); + Known.Zero = APInt::getHighBitsSet(BitWidth, Count); + Known.One.clearAllBits(); + break; + } + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: case X86ISD::CVTP2SI: @@ -34476,7 +34476,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case X86ISD::VMFPROUND: case X86ISD::CVTPS2PH: case X86ISD::MCVTPS2PH: { - // Truncations/Conversions - upper elements are known zero. + // Truncations/Conversions - upper elements are known zero. 
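The known-bits hunks above for X86ISD::PDEP and X86ISD::PEXT follow from the scatter/gather semantics of the two instructions. A standalone software model of those semantics (illustrative code, not the LLVM implementation):

```cpp
#include <cstdint>

// PDEP scatters the low bits of Src to the set bits of Mask; PEXT gathers the
// bits of Src selected by Mask down into the low bits of the result.
uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, SrcBit = 1;
  for (uint64_t M = Mask; M; M &= M - 1, SrcBit <<= 1)
    if (Src & SrcBit)
      Result |= M & -M; // deposit at the lowest remaining mask bit
  return Result;
}

uint64_t pext64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, DstBit = 1;
  for (uint64_t M = Mask; M; M &= M - 1, DstBit <<= 1)
    if (Src & (M & -M))
      Result |= DstBit; // extract into the next low result bit
  return Result;
}
```

These loops make the re-added comments concrete: PDEP can only keep zeros where the mask has zeros, and PEXT's result has at least as many leading zeros as the mask has clear bits.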
EVT SrcVT = Op.getOperand(0).getValueType(); if (SrcVT.isVector()) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); @@ -34554,7 +34554,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, continue; KnownBits Known2 = DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = KnownBits::commonBits(Known, Known2); } } } @@ -34733,18 +34733,18 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); // Match against a VZEXT_MOVL vXi32 zero-extending instruction. - if (MaskEltSize == 32 && Mask[0] == 0) { - if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } - if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && - isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } + if (MaskEltSize == 32 && Mask[0] == 0) { + if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } + if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } } // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction. @@ -34798,17 +34798,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. 
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; @@ -34817,17 +34817,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; @@ -34837,21 +34837,21 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( - MaskVT, Mask, - {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) { + MaskVT, Mask, + {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( - MaskVT, Mask, - {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) { + MaskVT, Mask, + {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; @@ -34933,10 +34933,10 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. 
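The mask patterns matched above ({0, 0}, {0, 0, 2, 2}, {1, 1, 3, 3} and their per-128-bit repeats) are the duplicate-low / duplicate-even / duplicate-odd semantics of MOVDDUP, MOVSLDUP and MOVSHDUP. A 128-bit scalar model for reference (wider forms just repeat this per lane; names are illustrative):

```cpp
#include <array>

std::array<double, 2> movddup(std::array<double, 2> V) { return {V[0], V[0]}; }

std::array<float, 4> movsldup(std::array<float, 4> V) {
  return {V[0], V[0], V[2], V[2]}; // mask {0, 0, 2, 2}
}

std::array<float, 4> movshdup(std::array<float, 4> V) {
  return {V[1], V[1], V[3], V[3]}; // mask {1, 1, 3, 3}
}
```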
- if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 && - ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || - (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4); @@ -35006,31 +35006,31 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { - unsigned NumMaskElts = Mask.size(); + unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1); Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) && - Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { + if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) && + Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MVT::v2f64; return true; } - if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) && + if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MVT::v4f32; @@ -35064,46 +35064,46 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - // Attempt to match against a OR if we're performing a blend shuffle and the - // non-blended source element is zero in each case. 
- if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && - (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { - bool IsBlend = true; - unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); - unsigned NumV2Elts = V2.getValueType().getVectorNumElements(); - unsigned Scale1 = NumV1Elts / NumMaskElts; - unsigned Scale2 = NumV2Elts / NumMaskElts; - APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts); - APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts); - for (unsigned i = 0; i != NumMaskElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - if (M == SM_SentinelZero) { - DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); - DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); - continue; - } - if (M == (int)i) { - DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); - continue; - } - if (M == (int)(i + NumMaskElts)) { - DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); - continue; - } - IsBlend = false; - break; - } - if (IsBlend && - DAG.computeKnownBits(V1, DemandedZeroV1).isZero() && - DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) { - Shuffle = ISD::OR; - SrcVT = DstVT = MaskVT.changeTypeToInteger(); - return true; - } - } - + // Attempt to match against a OR if we're performing a blend shuffle and the + // non-blended source element is zero in each case. + if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && + (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { + bool IsBlend = true; + unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); + unsigned NumV2Elts = V2.getValueType().getVectorNumElements(); + unsigned Scale1 = NumV1Elts / NumMaskElts; + unsigned Scale2 = NumV2Elts / NumMaskElts; + APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts); + APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts); + for (unsigned i = 0; i != NumMaskElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + if (M == SM_SentinelZero) { + DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); + DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); + continue; + } + if (M == (int)i) { + DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); + continue; + } + if (M == (int)(i + NumMaskElts)) { + DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); + continue; + } + IsBlend = false; + break; + } + if (IsBlend && + DAG.computeKnownBits(V1, DemandedZeroV1).isZero() && + DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) { + Shuffle = ISD::OR; + SrcVT = DstVT = MaskVT.changeTypeToInteger(); + return true; + } + } + return false; } @@ -35292,16 +35292,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); - MVT RootVT = Root.getSimpleValueType(); - unsigned RootSizeInBits = RootVT.getSizeInBits(); - unsigned NumRootElts = RootVT.getVectorNumElements(); - - // Canonicalize shuffle input op to the requested type. - // TODO: Support cases where Op is smaller than VT. - auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { - return DAG.getBitcast(VT, Op); - }; - + MVT RootVT = Root.getSimpleValueType(); + unsigned RootSizeInBits = RootVT.getSizeInBits(); + unsigned NumRootElts = RootVT.getVectorNumElements(); + + // Canonicalize shuffle input op to the requested type. + // TODO: Support cases where Op is smaller than VT. + auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { + return DAG.getBitcast(VT, Op); + }; + // Find the inputs that enter the chain. 
Note that multiple uses are OK // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); @@ -35311,8 +35311,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); - assert(VT1.getSizeInBits() == RootSizeInBits && - VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch"); + assert(VT1.getSizeInBits() == RootSizeInBits && + VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch"); SDLoc DL(Root); SDValue Res; @@ -35320,7 +35320,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); - return CanonicalizeShuffleInput(RootVT, V1); + return CanonicalizeShuffleInput(RootVT, V1); } bool OptForSize = DAG.shouldOptForSize(); @@ -35344,9 +35344,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // we can just use the broadcast directly. This works for smaller broadcast // elements as well as they already repeat across each mask element if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) && - (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && - V1.getValueSizeInBits() >= RootSizeInBits) { - return CanonicalizeShuffleInput(RootVT, V1); + (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && + V1.getValueSizeInBits() >= RootSizeInBits) { + return CanonicalizeShuffleInput(RootVT, V1); } // Handle 128/256-bit lane shuffles of 512-bit vectors. @@ -35360,11 +35360,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return SDValue(); // Nothing to do! assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) && "Unexpected lane shuffle"); - Res = CanonicalizeShuffleInput(RootVT, V1); - unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts); + Res = CanonicalizeShuffleInput(RootVT, V1); + unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts); bool UseZero = isAnyZero(BaseMask); Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits); - return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits); + return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits); } // Narrow shuffle mask to v4x128. @@ -35373,8 +35373,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask); // Try to lower to vshuf64x2/vshuf32x4. - auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask, - SDValue V1, SDValue V2, SelectionDAG &DAG) { + auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG) { unsigned PermMask = 0; // Insure elements came from the same Op. 
SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; @@ -35397,8 +35397,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, - CanonicalizeShuffleInput(ShuffleVT, Ops[0]), - CanonicalizeShuffleInput(ShuffleVT, Ops[1]), + CanonicalizeShuffleInput(ShuffleVT, Ops[0]), + CanonicalizeShuffleInput(ShuffleVT, Ops[1]), DAG.getTargetConstant(PermMask, DL, MVT::i8)); }; @@ -35413,9 +35413,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2)); if (!isAnyZero(Mask) && !PreferPERMQ) { - if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) - return SDValue(); // Nothing to do! - MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); + if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) + return SDValue(); // Nothing to do! + MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG)) return DAG.getBitcast(RootVT, V); } @@ -35430,10 +35430,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) return SDValue(); // Nothing to do! assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle"); - Res = CanonicalizeShuffleInput(RootVT, V1); - Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL); - return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG, - DL, 256); + Res = CanonicalizeShuffleInput(RootVT, V1); + Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL); + return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG, + DL, 256); } if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) @@ -35448,9 +35448,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); - return DAG.getNode( - X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1), - DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8)); + return DAG.getNode( + X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1), + DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8)); } if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) @@ -35466,12 +35466,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned PermMask = 0; PermMask |= ((BaseMask[0] & 3) << 0); PermMask |= ((BaseMask[1] & 3) << 4); - SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2; - SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2; - return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT, - CanonicalizeShuffleInput(RootVT, LHS), - CanonicalizeShuffleInput(RootVT, RHS), - DAG.getTargetConstant(PermMask, DL, MVT::i8)); + SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2; + SDValue RHS = isInRange(BaseMask[1], 0, 2) ? 
V1 : V2; + return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT, + CanonicalizeShuffleInput(RootVT, LHS), + CanonicalizeShuffleInput(RootVT, RHS), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } @@ -35533,7 +35533,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) && (!IsMaskedShuffle || NumRootElts == NumMaskElts)) { - if (isUndefOrEqual(Mask, 0)) { + if (isUndefOrEqual(Mask, 0)) { if (V1.getValueType() == MaskVT && V1.getOpcode() == ISD::SCALAR_TO_VECTOR && MayFoldLoad(V1.getOperand(0))) { @@ -35546,7 +35546,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (Subtarget.hasAVX2()) { if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! - Res = CanonicalizeShuffleInput(MaskVT, V1); + Res = CanonicalizeShuffleInput(MaskVT, V1); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } @@ -35561,7 +35561,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); + Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); return DAG.getBitcast(RootVT, Res); } @@ -35573,7 +35573,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - Res = CanonicalizeShuffleInput(ShuffleVT, V1); + Res = CanonicalizeShuffleInput(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); @@ -35584,32 +35584,32 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // from a scalar. // TODO: Handle other insertions here as well? if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && - Subtarget.hasSSE41() && - !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) { - if (MaskEltSizeInBits == 32) { - SDValue SrcV1 = V1, SrcV2 = V2; - if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, - DAG) && - SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { - if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) - return SDValue(); // Nothing to do! - Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, - CanonicalizeShuffleInput(MVT::v4f32, SrcV1), - CanonicalizeShuffleInput(MVT::v4f32, SrcV2), - DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); - return DAG.getBitcast(RootVT, Res); - } - } - if (MaskEltSizeInBits == 64 && - isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) && - V2.getOpcode() == ISD::SCALAR_TO_VECTOR && - V2.getScalarValueSizeInBits() <= 32) { + Subtarget.hasSSE41() && + !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) { + if (MaskEltSizeInBits == 32) { + SDValue SrcV1 = V1, SrcV2 = V2; + if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, + DAG) && + SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) + return SDValue(); // Nothing to do! 
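// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The VPERM2X128 paths above
// build an 8-bit immediate where each nibble describes one 128-bit result
// lane: its low two bits pick a lane of the concatenated sources, and bit 3
// (0x8 in that nibble) zeroes the lane instead. The standalone C++ below
// models that encoding; the helper and variable names are invented here.
// ---------------------------------------------------------------------------
#include <array>
#include <cstdint>
#include <cstdio>

using Lane = std::array<uint64_t, 2>; // one 128-bit lane as two u64 halves
using Ymm = std::array<Lane, 2>;      // a 256-bit register = two lanes

// Emulate vperm2f128/vperm2i128: imm[1:0] and imm[5:4] select one of the four
// lanes of (a:b); imm[3] and imm[7] force the corresponding result lane to 0.
static Ymm vperm2x128(const Ymm &a, const Ymm &b, uint8_t imm) {
  const Lane lanes[4] = {a[0], a[1], b[0], b[1]};
  Ymm dst{};
  dst[0] = (imm & 0x08) ? Lane{0, 0} : lanes[imm & 0x03];
  dst[1] = (imm & 0x80) ? Lane{0, 0} : lanes[(imm >> 4) & 0x03];
  return dst;
}

int main() {
  Ymm v1 = {Lane{1, 2}, Lane{3, 4}};
  Ymm unused{}; // the second operand is undef in the unary case above
  // BaseMask = {1, zero}: low result lane = upper lane of v1, high lane = 0,
  // i.e. imm = (1 << 0) | (0x8 << 4) = 0x81, matching the PermMask formula.
  Ymm r = vperm2x128(v1, unused, 0x81);
  std::printf("%llu %llu %llu %llu\n",
              (unsigned long long)r[0][0], (unsigned long long)r[0][1],
              (unsigned long long)r[1][0], (unsigned long long)r[1][1]);
  return 0; // prints: 3 4 0 0
}
// ------------------------------ [end sketch] -------------------------------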
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, + CanonicalizeShuffleInput(MVT::v4f32, SrcV1), + CanonicalizeShuffleInput(MVT::v4f32, SrcV2), + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + } + if (MaskEltSizeInBits == 64 && + isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) && + V2.getOpcode() == ISD::SCALAR_TO_VECTOR && + V2.getScalarValueSizeInBits() <= 32) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) return SDValue(); // Nothing to do! - PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0); + PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0); Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, - CanonicalizeShuffleInput(MVT::v4f32, V1), - CanonicalizeShuffleInput(MVT::v4f32, V2), + CanonicalizeShuffleInput(MVT::v4f32, V1), + CanonicalizeShuffleInput(MVT::v4f32, V2), DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -35623,8 +35623,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); - NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2); + NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); + NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2); return DAG.getBitcast(RootVT, Res); } @@ -35637,8 +35637,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1); - NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2); + NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1); + NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); @@ -35655,7 +35655,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Zeroable)) { if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! - V1 = CanonicalizeShuffleInput(IntMaskVT, V1); + V1 = CanonicalizeShuffleInput(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); @@ -35665,8 +35665,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! - V1 = CanonicalizeShuffleInput(IntMaskVT, V1); - V2 = CanonicalizeShuffleInput(IntMaskVT, V2); + V1 = CanonicalizeShuffleInput(IntMaskVT, V1); + V2 = CanonicalizeShuffleInput(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); @@ -35685,7 +35685,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC; if (Depth == 0 && Root.getOpcode() == Opc) return SDValue(); // Nothing to do! 
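// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The INSERTPS folds above rely
// on the instruction's imm8 layout: bits [7:6] select the source element,
// bits [5:4] select the destination slot, bits [3:0] are a zero mask. With a
// source index of 0 (as in the hard-coded DstIdx=2, SrcIdx=0 case) the shift
// position of SrcIdx is moot. A minimal emulation, assuming that layout:
// ---------------------------------------------------------------------------
#include <array>
#include <cstdio>

using V4F = std::array<float, 4>;

// Emulate INSERTPS dst, src, imm8 (register form).
static V4F insertps(V4F dst, const V4F &src, unsigned imm) {
  unsigned SrcIdx = (imm >> 6) & 0x3;
  unsigned DstIdx = (imm >> 4) & 0x3;
  dst[DstIdx] = src[SrcIdx];
  for (unsigned i = 0; i != 4; ++i) // apply the zero mask last
    if (imm & (1u << i))
      dst[i] = 0.0f;
  return dst;
}

int main() {
  V4F a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
  // Take element 0 of the scalar-like operand and drop it into slot 2.
  V4F r = insertps(a, b, (0u << 6) | (2u << 4));
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 1 2 10 4
  return 0;
}
// ------------------------------ [end sketch] -------------------------------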
- V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); + V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); Res = DAG.getNode(Opc, DL, ShuffleVT, V1); if (ShuffleVT.getSizeInBits() < RootSizeInBits) Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits); @@ -35702,8 +35702,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return SDValue(); // Nothing to do! ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); - V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); - V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2); + V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); + V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2); ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts); Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2); @@ -35720,56 +35720,56 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Depth threshold above which we can efficiently use variable mask shuffles. int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; - // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a - // higher depth before combining them. - bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask); + // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a + // higher depth before combining them. + bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask); bool MaskContainsZeros = isAnyZero(Mask); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. - if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) { - if (Subtarget.hasAVX2() && - (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) { - SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); - Res = CanonicalizeShuffleInput(MaskVT, V1); - Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); - return DAG.getBitcast(RootVT, Res); - } - // AVX512 variants (non-VLX will pad to 512-bit shuffles). - if ((Subtarget.hasAVX512() && - (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || - MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || - (Subtarget.hasBWI() && - (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || - (Subtarget.hasVBMI() && - (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) { - V1 = CanonicalizeShuffleInput(MaskVT, V1); - V2 = DAG.getUNDEF(MaskVT); - Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); - return DAG.getBitcast(RootVT, Res); - } + if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros) { + if (Subtarget.hasAVX2() && + (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) { + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); + Res = CanonicalizeShuffleInput(MaskVT, V1); + Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); + return DAG.getBitcast(RootVT, Res); + } + // AVX512 variants (non-VLX will pad to 512-bit shuffles). 
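// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The single-input lane-crossing
// path just above lowers to X86ISD::VPERMV, i.e. AVX2 vpermd/vpermps: every
// output element is chosen by a per-element index vector, with no 128-bit
// lane restriction. Sketch of those semantics for the 8 x i32 case; the
// function name and test values are invented for the sketch.
// ---------------------------------------------------------------------------
#include <array>
#include <cstdint>
#include <cstdio>

// dst[i] = src[idx[i] & 7]; only the low 3 bits of each index are used.
static std::array<uint32_t, 8> vpermd(const std::array<uint32_t, 8> &idx,
                                      const std::array<uint32_t, 8> &src) {
  std::array<uint32_t, 8> dst{};
  for (int i = 0; i != 8; ++i)
    dst[i] = src[idx[i] & 7];
  return dst;
}

int main() {
  std::array<uint32_t, 8> src = {0, 10, 20, 30, 40, 50, 60, 70};
  std::array<uint32_t, 8> idx = {7, 6, 5, 4, 3, 2, 1, 0}; // crosses lanes
  for (uint32_t v : vpermd(idx, src))
    std::printf("%u ", (unsigned)v); // 70 60 50 40 30 20 10 0
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------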
+ if ((Subtarget.hasAVX512() && + (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || + (Subtarget.hasBWI() && + (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || + (Subtarget.hasVBMI() && + (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) { + V1 = CanonicalizeShuffleInput(MaskVT, V1); + V2 = DAG.getUNDEF(MaskVT); + Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); + return DAG.getBitcast(RootVT, Res); + } } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero - // vector as the second source (non-VLX will pad to 512-bit shuffles). + // vector as the second source (non-VLX will pad to 512-bit shuffles). if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || - MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || - MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 || + MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || + MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || - (Subtarget.hasBWI() && AllowBWIVPERMV3 && - (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || - (Subtarget.hasVBMI() && AllowBWIVPERMV3 && - (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { + (Subtarget.hasBWI() && AllowBWIVPERMV3 && + (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || + (Subtarget.hasVBMI() && AllowBWIVPERMV3 && + (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { // Adjust shuffle mask - replace SM_SentinelZero with second source index. for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; - V1 = CanonicalizeShuffleInput(MaskVT, V1); - V2 = getZeroVector(MaskVT, Subtarget, DAG, DL); - Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); + V1 = CanonicalizeShuffleInput(MaskVT, V1); + V2 = getZeroVector(MaskVT, Subtarget, DAG, DL); + Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } @@ -35780,21 +35780,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG, Subtarget)) return WideShuffle; - // If we have a dual input lane-crossing shuffle then lower to VPERMV3, - // (non-VLX will pad to 512-bit shuffles). + // If we have a dual input lane-crossing shuffle then lower to VPERMV3, + // (non-VLX will pad to 512-bit shuffles). 
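// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The unary+zero VPERMV3 path
// above rewrites every "must be zero" mask element to an index into a second,
// all-zero source (Mask[i] = NumMaskElts + i). The sketch below replays that
// rewrite with plain vectors; kZero stands in for LLVM's SM_SentinelZero and
// the helper names are invented here.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

constexpr int kZero = -2; // stand-in sentinel: "this output element is zero"

// Two-source permute, the shape VPERMV3 implements: indices 0..N-1 read v1,
// indices N..2N-1 read v2.
static std::vector<int> permute2(const std::vector<int> &mask,
                                 const std::vector<int> &v1,
                                 const std::vector<int> &v2) {
  std::vector<int> out(mask.size());
  for (size_t i = 0; i != mask.size(); ++i)
    out[i] = mask[i] < (int)v1.size() ? v1[mask[i]] : v2[mask[i] - v1.size()];
  return out;
}

int main() {
  std::vector<int> v1 = {11, 22, 33, 44, 55, 66, 77, 88};
  std::vector<int> zero(v1.size(), 0);
  std::vector<int> mask = {3, kZero, 1, kZero, 7, 5, kZero, 0};

  // Mirror of the rewrite above: zero sentinels become indices into the
  // all-zero second source at offset NumElts + i.
  int n = (int)v1.size();
  for (int i = 0; i != n; ++i)
    if (mask[i] == kZero)
      mask[i] = n + i;

  for (int v : permute2(mask, v1, zero))
    std::printf("%d ", v); // 44 0 22 0 88 66 0 11
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------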
if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || - MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || - MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 || + MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || + MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || - (Subtarget.hasBWI() && AllowBWIVPERMV3 && - (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || - (Subtarget.hasVBMI() && AllowBWIVPERMV3 && - (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { - V1 = CanonicalizeShuffleInput(MaskVT, V1); - V2 = CanonicalizeShuffleInput(MaskVT, V2); - Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); + (Subtarget.hasBWI() && AllowBWIVPERMV3 && + (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || + (Subtarget.hasVBMI() && AllowBWIVPERMV3 && + (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { + V1 = CanonicalizeShuffleInput(MaskVT, V1); + V2 = CanonicalizeShuffleInput(MaskVT, V2); + Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } return SDValue(); @@ -35820,7 +35820,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, EltBits[i] = AllOnes; } SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); - Res = CanonicalizeShuffleInput(MaskVT, V1); + Res = CanonicalizeShuffleInput(MaskVT, V1); unsigned AndOpcode = MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); @@ -35840,7 +35840,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, VPermIdx.push_back(Idx); } SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); - Res = CanonicalizeShuffleInput(MaskVT, V1); + Res = CanonicalizeShuffleInput(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); return DAG.getBitcast(RootVT, Res); } @@ -35872,8 +35872,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Index = (MaskVT.getScalarSizeInBits() == 64 ? 
Index << 1 : Index); VPerm2Idx.push_back(Index); } - V1 = CanonicalizeShuffleInput(MaskVT, V1); - V2 = CanonicalizeShuffleInput(MaskVT, V2); + V1 = CanonicalizeShuffleInput(MaskVT, V1); + V2 = CanonicalizeShuffleInput(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getTargetConstant(M2ZImm, DL, MVT::i8)); @@ -35907,7 +35907,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); - Res = CanonicalizeShuffleInput(ByteVT, V1); + Res = CanonicalizeShuffleInput(ByteVT, V1); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); return DAG.getBitcast(RootVT, Res); @@ -35937,8 +35937,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::v16i8; - V1 = CanonicalizeShuffleInput(ByteVT, V1); - V2 = CanonicalizeShuffleInput(ByteVT, V2); + V1 = CanonicalizeShuffleInput(ByteVT, V1); + V2 = CanonicalizeShuffleInput(ByteVT, V2); SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); return DAG.getBitcast(RootVT, Res); @@ -35951,22 +35951,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG, Subtarget)) return WideShuffle; - // If we have a dual input shuffle then lower to VPERMV3, - // (non-VLX will pad to 512-bit shuffles) + // If we have a dual input shuffle then lower to VPERMV3, + // (non-VLX will pad to 512-bit shuffles) if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && - (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 || - MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 || - MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 || - MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 || - MaskVT == MVT::v16i32)) || - (Subtarget.hasBWI() && AllowBWIVPERMV3 && - (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || - (Subtarget.hasVBMI() && AllowBWIVPERMV3 && - (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { - V1 = CanonicalizeShuffleInput(MaskVT, V1); - V2 = CanonicalizeShuffleInput(MaskVT, V2); - Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); + (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 || + MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 || + MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 || + MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 || + MaskVT == MVT::v16i32)) || + (Subtarget.hasBWI() && AllowBWIVPERMV3 && + (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || + (Subtarget.hasVBMI() && AllowBWIVPERMV3 && + (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { + V1 = CanonicalizeShuffleInput(MaskVT, V1); + V2 = CanonicalizeShuffleInput(MaskVT, V2); + Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } @@ -35991,16 +35991,16 @@ static SDValue combineX86ShuffleChainWithExtract( if (NumInputs == 0) return SDValue(); - EVT RootVT = Root.getValueType(); - unsigned RootSizeInBits = RootVT.getSizeInBits(); - 
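// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The PSHUFB path above emits a
// control byte of 0x80 for zeroable elements because pshufb zeroes any output
// byte whose control byte has bit 7 set; otherwise the low 4 bits select a
// source byte within the same 128-bit lane. A one-lane emulation:
// ---------------------------------------------------------------------------
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &src,
                                      const std::array<uint8_t, 16> &ctl) {
  std::array<uint8_t, 16> dst{};
  for (int i = 0; i != 16; ++i)
    dst[i] = (ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0F];
  return dst;
}

int main() {
  std::array<uint8_t, 16> src{};
  for (int i = 0; i != 16; ++i)
    src[i] = (uint8_t)(i + 1); // bytes 1..16
  // Reverse the bytes, but force every fourth output byte to zero, the same
  // way the mask builder above emits 0x80 for zeroable elements.
  std::array<uint8_t, 16> ctl{};
  for (int i = 0; i != 16; ++i)
    ctl[i] = (i % 4 == 3) ? 0x80 : (uint8_t)(15 - i);
  for (uint8_t b : pshufb(src, ctl))
    std::printf("%d ", (int)b); // 16 15 14 0 12 11 10 0 8 7 6 0 4 3 2 0
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------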
assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask"); - + EVT RootVT = Root.getValueType(); + unsigned RootSizeInBits = RootVT.getSizeInBits(); + assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask"); + SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end()); SmallVector<unsigned, 4> Offsets(NumInputs, 0); // Peek through subvectors. // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs? - unsigned WideSizeInBits = RootSizeInBits; + unsigned WideSizeInBits = RootSizeInBits; for (unsigned i = 0; i != NumInputs; ++i) { SDValue &Src = WideInputs[i]; unsigned &Offset = Offsets[i]; @@ -36082,149 +36082,149 @@ static SDValue combineX86ShuffleChainWithExtract( return SDValue(); } -// Canonicalize the combined shuffle mask chain with horizontal ops. -// NOTE: This may update the Ops and Mask. -static SDValue canonicalizeShuffleMaskWithHorizOp( - MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask, - unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (Mask.empty() || Ops.empty()) - return SDValue(); - - SmallVector<SDValue> BC; - for (SDValue Op : Ops) - BC.push_back(peekThroughBitcasts(Op)); - - // All ops must be the same horizop + type. - SDValue BC0 = BC[0]; - EVT VT0 = BC0.getValueType(); - unsigned Opcode0 = BC0.getOpcode(); - if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) { - return V.getOpcode() != Opcode0 || V.getValueType() != VT0; - })) - return SDValue(); - - bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || - Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB); - bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS); - if (!isHoriz && !isPack) - return SDValue(); - - int NumElts = VT0.getVectorNumElements(); - int NumLanes = VT0.getSizeInBits() / 128; - int NumEltsPerLane = NumElts / NumLanes; - int NumHalfEltsPerLane = NumEltsPerLane / 2; - - // See if we can remove the shuffle by resorting the HOP chain so that - // the HOP args are pre-shuffled. - // TODO: Generalize to any sized/depth chain. - // TODO: Add support for PACKSS/PACKUS. - if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() && - shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) { - SmallVector<int> ScaledMask; - if (scaleShuffleElements(Mask, 4, ScaledMask)) { - // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand. - auto GetHOpSrc = [&](int M) { - if (M == SM_SentinelUndef) - return DAG.getUNDEF(VT0); - if (M == SM_SentinelZero) - return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL); - SDValue Src0 = BC[M / NumElts]; - SDValue Src1 = Src0.getOperand((M % 4) >= 2); - if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode())) - return Src1.getOperand(M % 2); - return SDValue(); - }; - SDValue M0 = GetHOpSrc(ScaledMask[0]); - SDValue M1 = GetHOpSrc(ScaledMask[1]); - SDValue M2 = GetHOpSrc(ScaledMask[2]); - SDValue M3 = GetHOpSrc(ScaledMask[3]); - if (M0 && M1 && M2 && M3) { - SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1); - SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3); - return DAG.getNode(Opcode0, DL, VT0, LHS, RHS); - } - } - } - - if (2 < Ops.size()) - return SDValue(); - - SDValue BC1 = BC[BC.size() - 1]; - if (Mask.size() == VT0.getVectorNumElements()) { - // Canonicalize binary shuffles of horizontal ops that use the - // same sources to an unary shuffle. - // TODO: Try to perform this fold even if the shuffle remains. 
- if (Ops.size() == 2) { - auto ContainsOps = [](SDValue HOp, SDValue Op) { - return Op == HOp.getOperand(0) || Op == HOp.getOperand(1); - }; - // Commute if all BC0's ops are contained in BC1. - if (ContainsOps(BC1, BC0.getOperand(0)) && - ContainsOps(BC1, BC0.getOperand(1))) { - ShuffleVectorSDNode::commuteMask(Mask); - std::swap(Ops[0], Ops[1]); - std::swap(BC0, BC1); - } - - // If BC1 can be represented by BC0, then convert to unary shuffle. - if (ContainsOps(BC0, BC1.getOperand(0)) && - ContainsOps(BC0, BC1.getOperand(1))) { - for (int &M : Mask) { - if (M < NumElts) // BC0 element or UNDEF/Zero sentinel. - continue; - int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0; - M -= NumElts + (SubLane * NumHalfEltsPerLane); - if (BC1.getOperand(SubLane) != BC0.getOperand(0)) - M += NumHalfEltsPerLane; - } - } - } - - // Canonicalize unary horizontal ops to only refer to lower halves. - for (int i = 0; i != NumElts; ++i) { - int &M = Mask[i]; - if (isUndefOrZero(M)) - continue; - if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) && - (M % NumEltsPerLane) >= NumHalfEltsPerLane) - M -= NumHalfEltsPerLane; - if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) && - (M % NumEltsPerLane) >= NumHalfEltsPerLane) - M -= NumHalfEltsPerLane; - } - } - - // Combine binary shuffle of 2 similar 'Horizontal' instructions into a - // single instruction. Attempt to match a v2X64 repeating shuffle pattern that - // represents the LHS/RHS inputs for the lower/upper halves. - unsigned EltSizeInBits = RootSizeInBits / Mask.size(); - SmallVector<int, 16> TargetMask128, WideMask128; - if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) && - scaleShuffleElements(TargetMask128, 2, WideMask128)) { - assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle"); - bool SingleOp = (Ops.size() == 1); - if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) { - SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1; - SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1; - Lo = Lo.getOperand(WideMask128[0] & 1); - Hi = Hi.getOperand(WideMask128[1] & 1); - if (SingleOp) { - MVT SrcVT = BC0.getOperand(0).getSimpleValueType(); - SDValue Undef = DAG.getUNDEF(SrcVT); - SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL); - Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo); - Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi); - Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo); - Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi); - } - return DAG.getNode(Opcode0, DL, VT0, Lo, Hi); - } - } - - return SDValue(); -} - +// Canonicalize the combined shuffle mask chain with horizontal ops. +// NOTE: This may update the Ops and Mask. +static SDValue canonicalizeShuffleMaskWithHorizOp( + MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask, + unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (Mask.empty() || Ops.empty()) + return SDValue(); + + SmallVector<SDValue> BC; + for (SDValue Op : Ops) + BC.push_back(peekThroughBitcasts(Op)); + + // All ops must be the same horizop + type. 
+ SDValue BC0 = BC[0]; + EVT VT0 = BC0.getValueType(); + unsigned Opcode0 = BC0.getOpcode(); + if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) { + return V.getOpcode() != Opcode0 || V.getValueType() != VT0; + })) + return SDValue(); + + bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || + Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB); + bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS); + if (!isHoriz && !isPack) + return SDValue(); + + int NumElts = VT0.getVectorNumElements(); + int NumLanes = VT0.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + int NumHalfEltsPerLane = NumEltsPerLane / 2; + + // See if we can remove the shuffle by resorting the HOP chain so that + // the HOP args are pre-shuffled. + // TODO: Generalize to any sized/depth chain. + // TODO: Add support for PACKSS/PACKUS. + if (isHoriz && NumEltsPerLane == 4 && VT0.is128BitVector() && + shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget)) { + SmallVector<int> ScaledMask; + if (scaleShuffleElements(Mask, 4, ScaledMask)) { + // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand. + auto GetHOpSrc = [&](int M) { + if (M == SM_SentinelUndef) + return DAG.getUNDEF(VT0); + if (M == SM_SentinelZero) + return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL); + SDValue Src0 = BC[M / NumElts]; + SDValue Src1 = Src0.getOperand((M % 4) >= 2); + if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode())) + return Src1.getOperand(M % 2); + return SDValue(); + }; + SDValue M0 = GetHOpSrc(ScaledMask[0]); + SDValue M1 = GetHOpSrc(ScaledMask[1]); + SDValue M2 = GetHOpSrc(ScaledMask[2]); + SDValue M3 = GetHOpSrc(ScaledMask[3]); + if (M0 && M1 && M2 && M3) { + SDValue LHS = DAG.getNode(Opcode0, DL, VT0, M0, M1); + SDValue RHS = DAG.getNode(Opcode0, DL, VT0, M2, M3); + return DAG.getNode(Opcode0, DL, VT0, LHS, RHS); + } + } + } + + if (2 < Ops.size()) + return SDValue(); + + SDValue BC1 = BC[BC.size() - 1]; + if (Mask.size() == VT0.getVectorNumElements()) { + // Canonicalize binary shuffles of horizontal ops that use the + // same sources to an unary shuffle. + // TODO: Try to perform this fold even if the shuffle remains. + if (Ops.size() == 2) { + auto ContainsOps = [](SDValue HOp, SDValue Op) { + return Op == HOp.getOperand(0) || Op == HOp.getOperand(1); + }; + // Commute if all BC0's ops are contained in BC1. + if (ContainsOps(BC1, BC0.getOperand(0)) && + ContainsOps(BC1, BC0.getOperand(1))) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(Ops[0], Ops[1]); + std::swap(BC0, BC1); + } + + // If BC1 can be represented by BC0, then convert to unary shuffle. + if (ContainsOps(BC0, BC1.getOperand(0)) && + ContainsOps(BC0, BC1.getOperand(1))) { + for (int &M : Mask) { + if (M < NumElts) // BC0 element or UNDEF/Zero sentinel. + continue; + int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0; + M -= NumElts + (SubLane * NumHalfEltsPerLane); + if (BC1.getOperand(SubLane) != BC0.getOperand(0)) + M += NumHalfEltsPerLane; + } + } + } + + // Canonicalize unary horizontal ops to only refer to lower halves. 
+ for (int i = 0; i != NumElts; ++i) { + int &M = Mask[i]; + if (isUndefOrZero(M)) + continue; + if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) && + (M % NumEltsPerLane) >= NumHalfEltsPerLane) + M -= NumHalfEltsPerLane; + if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) && + (M % NumEltsPerLane) >= NumHalfEltsPerLane) + M -= NumHalfEltsPerLane; + } + } + + // Combine binary shuffle of 2 similar 'Horizontal' instructions into a + // single instruction. Attempt to match a v2X64 repeating shuffle pattern that + // represents the LHS/RHS inputs for the lower/upper halves. + unsigned EltSizeInBits = RootSizeInBits / Mask.size(); + SmallVector<int, 16> TargetMask128, WideMask128; + if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) && + scaleShuffleElements(TargetMask128, 2, WideMask128)) { + assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle"); + bool SingleOp = (Ops.size() == 1); + if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) { + SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1; + SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1; + Lo = Lo.getOperand(WideMask128[0] & 1); + Hi = Hi.getOperand(WideMask128[1] & 1); + if (SingleOp) { + MVT SrcVT = BC0.getOperand(0).getSimpleValueType(); + SDValue Undef = DAG.getUNDEF(SrcVT); + SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL); + Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo); + Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi); + Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo); + Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi); + } + return DAG.getNode(Opcode0, DL, VT0, Lo, Hi); + } + } + + return SDValue(); +} + // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. @@ -36316,14 +36316,14 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, return DAG.getBitcast(VT, CstOp); } -namespace llvm { - namespace X86 { - enum { - MaxShuffleCombineDepth = 8 - }; - } -} // namespace llvm - +namespace llvm { + namespace X86 { + enum { + MaxShuffleCombineDepth = 8 + }; + } +} // namespace llvm + /// Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. Once @@ -36356,30 +36356,30 @@ namespace llvm { static SDValue combineX86ShufflesRecursively( ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth, - unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask, - SelectionDAG &DAG, const X86Subtarget &Subtarget) { + unsigned MaxDepth, bool HasVariableMask, bool AllowVariableMask, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(RootMask.size() > 0 && (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && "Illegal shuffle root mask"); - assert(Root.getSimpleValueType().isVector() && - "Shuffles operate on vector types!"); - unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. - if (Depth >= MaxDepth) + if (Depth >= MaxDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. 
SDValue Op = SrcOps[SrcOpIndex]; Op = peekThroughOneUseBitcasts(Op); - EVT VT = Op.getValueType(); - if (!VT.isVector() || !VT.isSimple()) - return SDValue(); // Bail if we hit a non-simple non-vector. + EVT VT = Op.getValueType(); + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); // Bail if we hit a non-simple non-vector. - assert((RootSizeInBits % VT.getSizeInBits()) == 0 && - "Can only combine shuffles upto size of the root op."); + assert((RootSizeInBits % VT.getSizeInBits()) == 0 && + "Can only combine shuffles upto size of the root op."); // Extract target shuffle mask and resolve sentinels and inputs. // TODO - determine Op's demanded elts from RootMask. @@ -36392,32 +36392,32 @@ static SDValue combineX86ShufflesRecursively( OpZero, DAG, Depth, false)) return SDValue(); - // Shuffle inputs must not be larger than the shuffle result. - // TODO: Relax this for single input faux shuffles (trunc/extract_subvector). - if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { - return OpInput.getValueSizeInBits() > VT.getSizeInBits(); + // Shuffle inputs must not be larger than the shuffle result. + // TODO: Relax this for single input faux shuffles (trunc/extract_subvector). + if (llvm::any_of(OpInputs, [VT](SDValue OpInput) { + return OpInput.getValueSizeInBits() > VT.getSizeInBits(); })) return SDValue(); - // If the shuffle result was smaller than the root, we need to adjust the - // mask indices and pad the mask with undefs. - if (RootSizeInBits > VT.getSizeInBits()) { - unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits(); - unsigned OpMaskSize = OpMask.size(); - if (OpInputs.size() > 1) { - unsigned PaddedMaskSize = NumSubVecs * OpMaskSize; - for (int &M : OpMask) { - if (M < 0) - continue; - int EltIdx = M % OpMaskSize; - int OpIdx = M / OpMaskSize; - M = (PaddedMaskSize * OpIdx) + EltIdx; - } - } - OpZero = OpZero.zext(NumSubVecs * OpMaskSize); - OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize); - OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef); - } + // If the shuffle result was smaller than the root, we need to adjust the + // mask indices and pad the mask with undefs. + if (RootSizeInBits > VT.getSizeInBits()) { + unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits(); + unsigned OpMaskSize = OpMask.size(); + if (OpInputs.size() > 1) { + unsigned PaddedMaskSize = NumSubVecs * OpMaskSize; + for (int &M : OpMask) { + if (M < 0) + continue; + int EltIdx = M % OpMaskSize; + int OpIdx = M / OpMaskSize; + M = (PaddedMaskSize * OpIdx) + EltIdx; + } + } + OpZero = OpZero.zext(NumSubVecs * OpMaskSize); + OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize); + OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef); + } SmallVector<int, 64> Mask; SmallVector<SDValue, 16> Ops; @@ -36577,7 +36577,7 @@ static SDValue combineX86ShufflesRecursively( // shuffles to avoid constant pool bloat. // Don't recurse if we already have more source ops than we can combine in // the remaining recursion depth. - if (Ops.size() < (MaxDepth - Depth)) { + if (Ops.size() < (MaxDepth - Depth)) { for (int i = 0, e = Ops.size(); i < e; ++i) { // For empty roots, we need to resolve zeroable elements before combining // them with other shuffles. 
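// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] When the combined shuffle op
// is narrower than the root, the code above re-bases multi-input mask indices
// onto root-width inputs and pads the mask with undef. The sketch replays
// that arithmetic; kUndef stands in for SM_SentinelUndef and the helper name
// is invented here.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // stand-in sentinel for an undefined element

static std::vector<int> padMaskToRoot(std::vector<int> mask, int numInputs,
                                      int numSubVecs) {
  int opMaskSize = (int)mask.size();
  if (numInputs > 1) {
    int paddedMaskSize = numSubVecs * opMaskSize;
    for (int &m : mask) {
      if (m < 0)
        continue;
      int eltIdx = m % opMaskSize; // element within its original input
      int opIdx = m / opMaskSize;  // which input it came from
      m = paddedMaskSize * opIdx + eltIdx; // re-base onto widened inputs
    }
  }
  // Elements the narrow op never defined stay undef.
  mask.insert(mask.end(), (numSubVecs - 1) * opMaskSize, kUndef);
  return mask;
}

int main() {
  // A 4-element, two-input mask living inside a root twice as wide.
  std::vector<int> mask = {0, 5, 2, 7}; // 5 and 7 address the second input
  for (int m : padMaskToRoot(mask, /*numInputs=*/2, /*numSubVecs=*/2))
    std::printf("%d ", m); // 0 9 2 11 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------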
@@ -36589,7 +36589,7 @@ static SDValue combineX86ShufflesRecursively( SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) AllowVar = AllowVariableMask; if (SDValue Res = combineX86ShufflesRecursively( - Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth, + Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth, HasVariableMask, AllowVar, DAG, Subtarget)) return Res; } @@ -36600,24 +36600,24 @@ static SDValue combineX86ShufflesRecursively( Ops, Mask, Root, HasVariableMask, DAG, Subtarget)) return Cst; - // Canonicalize the combined shuffle mask chain with horizontal ops. - // NOTE: This will update the Ops and Mask. - if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( - Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget)) - return DAG.getBitcast(Root.getValueType(), HOp); - - // Widen any subvector shuffle inputs we've collected. - if (any_of(Ops, [RootSizeInBits](SDValue Op) { - return Op.getValueSizeInBits() < RootSizeInBits; - })) { - for (SDValue &Op : Ops) - if (Op.getValueSizeInBits() < RootSizeInBits) - Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op), - RootSizeInBits); - // Reresolve - we might have repeated subvector sources. - resolveTargetShuffleInputsAndMask(Ops, Mask); - } - + // Canonicalize the combined shuffle mask chain with horizontal ops. + // NOTE: This will update the Ops and Mask. + if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( + Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget)) + return DAG.getBitcast(Root.getValueType(), HOp); + + // Widen any subvector shuffle inputs we've collected. + if (any_of(Ops, [RootSizeInBits](SDValue Op) { + return Op.getValueSizeInBits() < RootSizeInBits; + })) { + for (SDValue &Op : Ops) + if (Op.getValueSizeInBits() < RootSizeInBits) + Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op), + RootSizeInBits); + // Reresolve - we might have repeated subvector sources. + resolveTargetShuffleInputsAndMask(Ops, Mask); + } + // We can only combine unary and binary shuffle mask cases. if (Ops.size() <= 2) { // Minor canonicalization of the accumulated shuffle mask to make it easier @@ -36625,10 +36625,10 @@ static SDValue combineX86ShufflesRecursively( // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. - while (Mask.size() > 1) { - SmallVector<int, 64> WidenedMask; - if (!canWidenShuffleElements(Mask, WidenedMask)) - break; + while (Mask.size() > 1) { + SmallVector<int, 64> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + break; Mask = std::move(WidenedMask); } @@ -36655,7 +36655,7 @@ static SDValue combineX86ShufflesRecursively( static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0, - X86::MaxShuffleCombineDepth, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget); } @@ -36889,61 +36889,61 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, return SDValue(); } -/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()). 
-static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, - SelectionDAG &DAG, - const SDLoc &DL) { - assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle"); - - MVT VT = V.getSimpleValueType(); - SDValue Src0 = peekThroughBitcasts(V.getOperand(0)); - SDValue Src1 = peekThroughBitcasts(V.getOperand(1)); - unsigned SrcOpc0 = Src0.getOpcode(); - unsigned SrcOpc1 = Src1.getOpcode(); - EVT SrcVT0 = Src0.getValueType(); - EVT SrcVT1 = Src1.getValueType(); - - if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1)) - return SDValue(); - - switch (SrcOpc0) { - case X86ISD::MOVDDUP: { - SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0)); - SDValue RHS = - DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0)); - SDValue Res = - DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2)); - Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res)); - return DAG.getBitcast(VT, Res); - } - case X86ISD::VPERMILPI: - // TODO: Handle v4f64 permutes with different low/high lane masks. - if (SrcVT0 == MVT::v4f64) { - uint64_t Mask = Src0.getConstantOperandVal(1); - if ((Mask & 0x3) != ((Mask >> 2) & 0x3)) - break; - } - LLVM_FALLTHROUGH; - case X86ISD::VSHLI: - case X86ISD::VSRLI: - case X86ISD::VSRAI: - case X86ISD::PSHUFD: - if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) { - SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0)); - SDValue RHS = - DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0)); - SDValue Res = - DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2)); - Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res), - Src0.getOperand(1)); - return DAG.getBitcast(VT, Res); - } - break; - } - - return SDValue(); -} - +/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()). +static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, + SelectionDAG &DAG, + const SDLoc &DL) { + assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle"); + + MVT VT = V.getSimpleValueType(); + SDValue Src0 = peekThroughBitcasts(V.getOperand(0)); + SDValue Src1 = peekThroughBitcasts(V.getOperand(1)); + unsigned SrcOpc0 = Src0.getOpcode(); + unsigned SrcOpc1 = Src1.getOpcode(); + EVT SrcVT0 = Src0.getValueType(); + EVT SrcVT1 = Src1.getValueType(); + + if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1)) + return SDValue(); + + switch (SrcOpc0) { + case X86ISD::MOVDDUP: { + SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0)); + SDValue RHS = + DAG.getBitcast(VT, Src1.isUndef() ? Src1 : Src1.getOperand(0)); + SDValue Res = + DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2)); + Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res)); + return DAG.getBitcast(VT, Res); + } + case X86ISD::VPERMILPI: + // TODO: Handle v4f64 permutes with different low/high lane masks. + if (SrcVT0 == MVT::v4f64) { + uint64_t Mask = Src0.getConstantOperandVal(1); + if ((Mask & 0x3) != ((Mask >> 2) & 0x3)) + break; + } + LLVM_FALLTHROUGH; + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: + case X86ISD::PSHUFD: + if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) { + SDValue LHS = DAG.getBitcast(VT, Src0.getOperand(0)); + SDValue RHS = + DAG.getBitcast(VT, Src1.isUndef() ? 
Src1 : Src1.getOperand(0)); + SDValue Res = + DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, RHS, V.getOperand(2)); + Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res), + Src0.getOperand(1)); + return DAG.getBitcast(VT, Res); + } + break; + } + + return SDValue(); +} + /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -37016,7 +37016,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, - X86::MaxShuffleCombineDepth, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -37046,8 +37046,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, for (SDNode *User : Src->uses()) if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && Src == User->getOperand(0) && - User->getValueSizeInBits(0).getFixedSize() > - VT.getFixedSizeInBits()) { + User->getValueSizeInBits(0).getFixedSize() > + VT.getFixedSizeInBits()) { return extractSubVector(SDValue(User, 0), 0, DAG, DL, VT.getSizeInBits()); } @@ -37133,8 +37133,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, LN->isSimple()) { unsigned Offset = ShiftAmt / 8; SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), - TypeSize::Fixed(Offset), DL); + SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), + TypeSize::Fixed(Offset), DL); SDValue Ops[] = { LN->getChain(), Ptr }; SDValue BcastLd = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, @@ -37166,16 +37166,16 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } // vbroadcast(vector load X) -> vbroadcast_load - if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 || - SrcVT == MVT::v4i32) && - Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) { + if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 || + SrcVT == MVT::v4i32) && + Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) { LoadSDNode *LN = cast<LoadSDNode>(Src); // Unless the load is volatile or atomic. if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BcastLd = DAG.getMemIntrinsicNode( - X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(), + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(), LN->getPointerInfo(), LN->getOriginalAlign(), LN->getMemOperand()->getFlags()); DCI.CombineTo(N.getNode(), BcastLd); @@ -37262,27 +37262,27 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } } - // Pull subvector inserts into undef through VZEXT_MOVL by making it an - // insert into a zero vector. This helps get VZEXT_MOVL closer to - // scalar_to_vectors where 256/512 are canonicalized to an insert and a - // 128-bit scalar_to_vector. This reduces the number of isel patterns. 
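// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The broadcast-of-shifted-load
// fold above turns a byte-multiple right shift of a loaded value into a
// narrower load at base + ShiftAmt/8. That equivalence holds on little-endian
// targets such as x86, which is the assumption this standalone check relies
// on; buffer contents here are arbitrary test data.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  unsigned char buf[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};

  uint64_t wide;
  std::memcpy(&wide, buf, 8); // the original 64-bit load

  for (unsigned shiftAmt = 0; shiftAmt <= 48; shiftAmt += 8) {
    uint16_t viaShift = (uint16_t)(wide >> shiftAmt); // truncate(load64 >> s)
    uint16_t viaNarrowLoad;
    std::memcpy(&viaNarrowLoad, buf + shiftAmt / 8, 2); // load16 at offset s/8
    std::printf("shift %2u: 0x%04x 0x%04x %s\n", shiftAmt,
                (unsigned)viaShift, (unsigned)viaNarrowLoad,
                viaShift == viaNarrowLoad ? "ok" : "MISMATCH");
  }
  return 0;
}
// ------------------------------ [end sketch] -------------------------------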
- if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) { - SDValue V = peekThroughOneUseBitcasts(N0); - - if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() && - isNullConstant(V.getOperand(2))) { - SDValue In = V.getOperand(1); - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), - In.getValueSizeInBits() / - VT.getScalarSizeInBits()); - In = DAG.getBitcast(SubVT, In); - SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), Movl, - V.getOperand(2)); - } - } - + // Pull subvector inserts into undef through VZEXT_MOVL by making it an + // insert into a zero vector. This helps get VZEXT_MOVL closer to + // scalar_to_vectors where 256/512 are canonicalized to an insert and a + // 128-bit scalar_to_vector. This reduces the number of isel patterns. + if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) { + SDValue V = peekThroughOneUseBitcasts(N0); + + if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() && + isNullConstant(V.getOperand(2))) { + SDValue In = V.getOperand(1); + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + In.getValueSizeInBits() / + VT.getScalarSizeInBits()); + In = DAG.getBitcast(SubVT, In); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), Movl, + V.getOperand(2)); + } + } + return SDValue(); } case X86ISD::BLENDI: { @@ -37324,51 +37324,51 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); } case X86ISD::VPERM2X128: { - // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)). - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() == ISD::BITCAST && - (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) { - EVT SrcVT = LHS.getOperand(0).getValueType(); - if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) { - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT, - DAG.getBitcast(SrcVT, LHS), - DAG.getBitcast(SrcVT, RHS), - N->getOperand(2))); - } - } - - // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()). - if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL)) - return Res; - - // Fold vperm2x128 subvector shuffle with an inner concat pattern. - // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc. - auto FindSubVector128 = [&](unsigned Idx) { - if (Idx > 3) - return SDValue(); - SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1)); - SmallVector<SDValue> SubOps; - if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2) - return SubOps[Idx & 1]; - unsigned NumElts = Src.getValueType().getVectorNumElements(); - if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR && - Src.getOperand(1).getValueSizeInBits() == 128 && - Src.getConstantOperandAPInt(2) == (NumElts / 2)) { - return Src.getOperand(1); - } + // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)). 
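// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The VZEXT_MOVL fold above uses
// the identity: zeroing all but element 0 of "X inserted into an undef wide
// vector at offset 0" gives the same result as inserting movl(X) into an
// all-zero wide vector. The scalar model below treats VZEXT_MOVL as "keep
// element 0, zero the rest", which is this sketch's working assumption.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

static std::vector<int> vzextMovl(std::vector<int> v) {
  for (size_t i = 1; i < v.size(); ++i)
    v[i] = 0;
  return v;
}

int main() {
  std::vector<int> x = {7, 8, 9, 10};                          // narrow payload
  std::vector<int> wideUndef = {7, 8, 9, 10, -1, -1, -1, -1};  // insert into undef

  // movl over the wide insert-into-undef ...
  std::vector<int> a = vzextMovl(wideUndef);

  // ... equals inserting movl(x) into an all-zero wide vector.
  std::vector<int> b(8, 0);
  std::vector<int> mx = vzextMovl(x);
  for (size_t i = 0; i != mx.size(); ++i)
    b[i] = mx[i];

  for (size_t i = 0; i != a.size(); ++i)
    std::printf("%d/%d ", a[i], b[i]); // each pair matches: 7/7 0/0 ...
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------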
+ SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() == ISD::BITCAST && + (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) { + EVT SrcVT = LHS.getOperand(0).getValueType(); + if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) { + return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT, + DAG.getBitcast(SrcVT, LHS), + DAG.getBitcast(SrcVT, RHS), + N->getOperand(2))); + } + } + + // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()). + if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL)) + return Res; + + // Fold vperm2x128 subvector shuffle with an inner concat pattern. + // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc. + auto FindSubVector128 = [&](unsigned Idx) { + if (Idx > 3) + return SDValue(); + SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1)); + SmallVector<SDValue> SubOps; + if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2) + return SubOps[Idx & 1]; + unsigned NumElts = Src.getValueType().getVectorNumElements(); + if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueSizeInBits() == 128 && + Src.getConstantOperandAPInt(2) == (NumElts / 2)) { + return Src.getOperand(1); + } return SDValue(); - }; - unsigned Imm = N.getConstantOperandVal(2); - if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) { - if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) { - MVT SubVT = VT.getHalfNumVectorElementsVT(); - SubLo = DAG.getBitcast(SubVT, SubLo); - SubHi = DAG.getBitcast(SubVT, SubHi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi); - } - } - return SDValue(); + }; + unsigned Imm = N.getConstantOperandVal(2); + if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) { + if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) { + MVT SubVT = VT.getHalfNumVectorElementsVT(); + SubLo = DAG.getBitcast(SubVT, SubLo); + SubHi = DAG.getBitcast(SubVT, SubHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi); + } + } + return SDValue(); } case X86ISD::PSHUFD: case X86ISD::PSHUFLW: @@ -37811,12 +37811,12 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { - // TODO: Can we use getTargetShuffleInputs instead? + // TODO: Can we use getTargetShuffleInputs instead? unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) - if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH) - if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) - return SDValue(); + if (Opcode != X86ISD::UNPCKL && Opcode != X86ISD::UNPCKH) + if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) + return SDValue(); // For a broadcast, peek through an extract element of index 0 to find the // horizontal op: broadcast (ext_vec_elt HOp, 0) @@ -37835,28 +37835,28 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB) return SDValue(); - // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)). - // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)). - // Don't fold if hop(x,y) == hop(z,w). 
- if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) { - SDValue HOp2 = N->getOperand(1); - if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32) - return SDValue(); - if (HOp == HOp2) - return SDValue(); - SDLoc DL(HOp); - unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1; - SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi), - HOp2.getOperand(LoHi)); - // Use SHUFPS for the permute so this will work on SSE3 targets, shuffle - // combining and domain handling will simplify this later on. - EVT ShuffleVT = VT.changeVectorElementType(MVT::f32); - Res = DAG.getBitcast(ShuffleVT, Res); - Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res, - getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG)); - return DAG.getBitcast(VT, Res); - } - + // unpcklo(hop(x,y),hop(z,w)) -> permute(hop(x,z)). + // unpckhi(hop(x,y),hop(z,w)) -> permute(hop(y,w)). + // Don't fold if hop(x,y) == hop(z,w). + if (Opcode == X86ISD::UNPCKL || Opcode == X86ISD::UNPCKH) { + SDValue HOp2 = N->getOperand(1); + if (HOp.getOpcode() != HOp2.getOpcode() || VT.getScalarSizeInBits() != 32) + return SDValue(); + if (HOp == HOp2) + return SDValue(); + SDLoc DL(HOp); + unsigned LoHi = Opcode == X86ISD::UNPCKL ? 0 : 1; + SDValue Res = DAG.getNode(HOp.getOpcode(), DL, VT, HOp.getOperand(LoHi), + HOp2.getOperand(LoHi)); + // Use SHUFPS for the permute so this will work on SSE3 targets, shuffle + // combining and domain handling will simplify this later on. + EVT ShuffleVT = VT.changeVectorElementType(MVT::f32); + Res = DAG.getBitcast(ShuffleVT, Res); + Res = DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res, + getV4X86ShuffleImm8ForMask({0, 2, 1, 3}, DL, DAG)); + return DAG.getBitcast(VT, Res); + } + // 128-bit horizontal math instructions are defined to operate on adjacent // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] @@ -37889,8 +37889,8 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { // replicating low and high halves (and without changing the type/length of // the vector), we don't need the shuffle. if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { - if (Opcode == X86ISD::VBROADCAST && !VT.is128BitVector()) - return SDValue(); + if (Opcode == X86ISD::VBROADCAST && !VT.is128BitVector()) + return SDValue(); if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) { // movddup (hadd X, X) --> hadd X, X // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X @@ -37903,20 +37903,20 @@ static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask(); - + // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, // but this should be tied to whatever horizontal op matching and shuffle // canonicalization are producing. 
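// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The fold above rewrites
// unpcklo(hadd(x,y), hadd(z,w)) as a single hadd(x,z) followed by the
// {0,2,1,3} permute (done with SHUFPS in the patch). The scalar model below
// checks that identity for 4 x i32; function names and values are invented.
// ---------------------------------------------------------------------------
#include <array>
#include <cstdio>

using V4 = std::array<int, 4>;

// HADD on 4 elements: A's pairwise sums in the low half, B's in the high half.
static V4 hadd(const V4 &a, const V4 &b) {
  return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
}
// UNPCKL interleaves the low halves: {A0, B0, A1, B1}.
static V4 unpcklo(const V4 &a, const V4 &b) { return {a[0], b[0], a[1], b[1]}; }
// Single-source permute by an immediate-style mask.
static V4 permute(const V4 &a, const V4 &m) {
  return {a[m[0]], a[m[1]], a[m[2]], a[m[3]]};
}

int main() {
  V4 x = {1, 2, 3, 4}, y = {5, 6, 7, 8}, z = {9, 10, 11, 12}, w = {13, 14, 15, 16};

  V4 lhs = unpcklo(hadd(x, y), hadd(z, w));   // the original pattern
  V4 rhs = permute(hadd(x, z), {0, 2, 1, 3}); // one HADD + one permute

  for (int i = 0; i != 4; ++i)
    std::printf("%d/%d ", lhs[i], rhs[i]); // 3/3 19/19 7/7 23/23
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------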
if (HOp.getValueSizeInBits() == 128 && - (isShuffleEquivalent(Mask, {0, 0}) || - isShuffleEquivalent(Mask, {0, 1, 0, 1}) || - isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) + (isShuffleEquivalent(Mask, {0, 0}) || + isShuffleEquivalent(Mask, {0, 1, 0, 1}) || + isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) return updateHOp(HOp, DAG); if (HOp.getValueSizeInBits() == 256 && - (isShuffleEquivalent(Mask, {0, 0, 2, 2}) || - isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || - isShuffleEquivalent( + (isShuffleEquivalent(Mask, {0, 0, 2, 2}) || + isShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || + isShuffleEquivalent( Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11}))) return updateHOp(HOp, DAG); @@ -37974,34 +37974,34 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG)) return HAddSub; - - // Merge shuffles through binops if its likely we'll be able to merge it - // with other shuffles (as long as they aren't splats). - // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d))) - // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE. - if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) { - unsigned SrcOpcode = N->getOperand(0).getOpcode(); - if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) && - N->isOnlyUserOf(N->getOperand(0).getNode()) && - N->isOnlyUserOf(N->getOperand(1).getNode())) { - SDValue Op00 = N->getOperand(0).getOperand(0); - SDValue Op10 = N->getOperand(1).getOperand(0); - SDValue Op01 = N->getOperand(0).getOperand(1); - SDValue Op11 = N->getOperand(1).getOperand(1); - auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00); - auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10); - auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01); - auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11); - if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) && - ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) { - SDLoc DL(N); - ArrayRef<int> Mask = SVN->getMask(); - SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask); - SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask); - return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS); - } - } - } + + // Merge shuffles through binops if its likely we'll be able to merge it + // with other shuffles (as long as they aren't splats). + // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d))) + // TODO: We might be able to move this to DAGCombiner::visitVECTOR_SHUFFLE. 
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) { + unsigned SrcOpcode = N->getOperand(0).getOpcode(); + if (SrcOpcode == N->getOperand(1).getOpcode() && TLI.isBinOp(SrcOpcode) && + N->isOnlyUserOf(N->getOperand(0).getNode()) && + N->isOnlyUserOf(N->getOperand(1).getNode())) { + SDValue Op00 = N->getOperand(0).getOperand(0); + SDValue Op10 = N->getOperand(1).getOperand(0); + SDValue Op01 = N->getOperand(0).getOperand(1); + SDValue Op11 = N->getOperand(1).getOperand(1); + auto *SVN00 = dyn_cast<ShuffleVectorSDNode>(Op00); + auto *SVN10 = dyn_cast<ShuffleVectorSDNode>(Op10); + auto *SVN01 = dyn_cast<ShuffleVectorSDNode>(Op01); + auto *SVN11 = dyn_cast<ShuffleVectorSDNode>(Op11); + if (((SVN00 && !SVN00->isSplat()) || (SVN10 && !SVN10->isSplat())) && + ((SVN01 && !SVN01->isSplat()) || (SVN11 && !SVN11->isSplat()))) { + SDLoc DL(N); + ArrayRef<int> Mask = SVN->getMask(); + SDValue LHS = DAG.getVectorShuffle(VT, DL, Op00, Op10, Mask); + SDValue RHS = DAG.getVectorShuffle(VT, DL, Op01, Op11, Mask); + return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS); + } + } + } } // Attempt to combine into a vector load/broadcast. @@ -38035,8 +38035,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // TODO - merge this into combineX86ShufflesRecursively. APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); - if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, - DCI)) + if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, + DCI)) return SDValue(N, 0); } @@ -38170,13 +38170,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, Depth + 1)) return true; - - // Aggressively peek through ops to get at the demanded elts. - if (!DemandedElts.isAllOnesValue()) - if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( - Src, DemandedElts, TLO.DAG, Depth + 1)) - return TLO.CombineTo( - Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1))); + + // Aggressively peek through ops to get at the demanded elts. + if (!DemandedElts.isAllOnesValue()) + if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( + Src, DemandedElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1))); break; } case X86ISD::KSHIFTL: { @@ -38365,7 +38365,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) - break; + break; // Don't bother broadcasting if we just need the 0'th element. if (DemandedElts == 1) { if (Src.getValueType() != VT) @@ -38418,62 +38418,62 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( ExtSizeInBits = SizeInBits / 4; switch (Opc) { - // Scalar broadcast. - case X86ISD::VBROADCAST: { + // Scalar broadcast. 
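// ---------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The merge-through-binops code
// above relies on the identity that an element-wise binary op commutes with a
// shuffle applied to both of its operand pairs:
//   shuffle(bop(a,b), bop(c,d), M) == bop(shuffle(a,c,M), shuffle(b,d,M)).
// A scalar check of that identity with addition as the bop; names invented.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

// Two-source shuffle: indices 0..N-1 read a, N..2N-1 read b.
static std::vector<int> shuffle(const std::vector<int> &a,
                                const std::vector<int> &b,
                                const std::vector<int> &mask) {
  std::vector<int> out(mask.size());
  for (size_t i = 0; i != mask.size(); ++i)
    out[i] = mask[i] < (int)a.size() ? a[mask[i]] : b[mask[i] - a.size()];
  return out;
}
// An arbitrary element-wise binary op (addition here).
static std::vector<int> bop(const std::vector<int> &a, const std::vector<int> &b) {
  std::vector<int> out(a.size());
  for (size_t i = 0; i != a.size(); ++i)
    out[i] = a[i] + b[i];
  return out;
}

int main() {
  std::vector<int> a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
  std::vector<int> c = {100, 200, 300, 400}, d = {1000, 2000, 3000, 4000};
  std::vector<int> mask = {6, 1, 4, 3};

  std::vector<int> before = shuffle(bop(a, b), bop(c, d), mask);
  std::vector<int> after = bop(shuffle(a, c, mask), shuffle(b, d, mask));

  for (size_t i = 0; i != mask.size(); ++i)
    std::printf("%d/%d ", before[i], after[i]); // each pair equal
  std::printf("\n");
  return 0;
}
// ------------------------------ [end sketch] -------------------------------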
+ case X86ISD::VBROADCAST: { SDLoc DL(Op); SDValue Src = Op.getOperand(0); if (Src.getValueSizeInBits() > ExtSizeInBits) Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); - EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), - ExtSizeInBits / VT.getScalarSizeInBits()); - SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src); - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, - TLO.DAG, DL, ExtSizeInBits)); - } - case X86ISD::VBROADCAST_LOAD: { - SDLoc DL(Op); - auto *MemIntr = cast<MemIntrinsicSDNode>(Op); - EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), - ExtSizeInBits / VT.getScalarSizeInBits()); - SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other); - SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)}; - SDValue Bcst = TLO.DAG.getMemIntrinsicNode( - X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(), - MemIntr->getMemOperand()); - TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), - Bcst.getValue(1)); - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, + EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), + ExtSizeInBits / VT.getScalarSizeInBits()); + SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src); + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, TLO.DAG, DL, ExtSizeInBits)); } - // Subvector broadcast. - case X86ISD::SUBV_BROADCAST_LOAD: { - auto *MemIntr = cast<MemIntrinsicSDNode>(Op); - EVT MemVT = MemIntr->getMemoryVT(); - if (ExtSizeInBits == MemVT.getStoreSizeInBits()) { - SDLoc DL(Op); - SDValue Ld = - TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(), - MemIntr->getBasePtr(), MemIntr->getMemOperand()); - TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), - Ld.getValue(1)); - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0, - TLO.DAG, DL, ExtSizeInBits)); - } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) { - SDLoc DL(Op); - EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), - ExtSizeInBits / VT.getScalarSizeInBits()); - SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other); - SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)}; - SDValue Bcst = - TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, - Ops, MemVT, MemIntr->getMemOperand()); - TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), - Bcst.getValue(1)); - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, - TLO.DAG, DL, ExtSizeInBits)); - } - break; - } + case X86ISD::VBROADCAST_LOAD: { + SDLoc DL(Op); + auto *MemIntr = cast<MemIntrinsicSDNode>(Op); + EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), + ExtSizeInBits / VT.getScalarSizeInBits()); + SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other); + SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)}; + SDValue Bcst = TLO.DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), + Bcst.getValue(1)); + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, + TLO.DAG, DL, ExtSizeInBits)); + } + // Subvector broadcast. 
+ case X86ISD::SUBV_BROADCAST_LOAD: { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op); + EVT MemVT = MemIntr->getMemoryVT(); + if (ExtSizeInBits == MemVT.getStoreSizeInBits()) { + SDLoc DL(Op); + SDValue Ld = + TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(), + MemIntr->getBasePtr(), MemIntr->getMemOperand()); + TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), + Ld.getValue(1)); + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0, + TLO.DAG, DL, ExtSizeInBits)); + } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) { + SDLoc DL(Op); + EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), + ExtSizeInBits / VT.getScalarSizeInBits()); + SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other); + SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)}; + SDValue Bcst = + TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, + Ops, MemVT, MemIntr->getMemOperand()); + TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), + Bcst.getValue(1)); + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, + TLO.DAG, DL, ExtSizeInBits)); + } + break; + } // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -38525,8 +38525,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::BLENDI: - // Integer ops. - case X86ISD::AVG: + // Integer ops. + case X86ISD::AVG: case X86ISD::PACKSS: case X86ISD::PACKUS: // Horizontal Ops. @@ -38619,22 +38619,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // If we don't demand all elements, then attempt to combine to a simpler // shuffle. - // We need to convert the depth to something combineX86ShufflesRecursively - // can handle - so pretend its Depth == 0 again, and reduce the max depth - // to match. This prevents combineX86ShuffleChain from returning a - // combined shuffle that's the same as the original root, causing an - // infinite loop. - if (!DemandedElts.isAllOnesValue()) { - assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); - + // We need to convert the depth to something combineX86ShufflesRecursively + // can handle - so pretend its Depth == 0 again, and reduce the max depth + // to match. This prevents combineX86ShuffleChain from returning a + // combined shuffle that's the same as the original root, causing an + // infinite loop. + if (!DemandedElts.isAllOnesValue()) { + assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); + SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) DemandedMask[i] = i; SDValue NewShuffle = combineX86ShufflesRecursively( - {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, - /*HasVarMask*/ false, + {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, + /*HasVarMask*/ false, /*AllowVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) return TLO.CombineTo(Op, NewShuffle); @@ -38737,7 +38737,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( // Low bits known zero. Known.Zero.setLowBits(ShAmt); - return false; + return false; } case X86ISD::VSRLI: { unsigned ShAmt = Op.getConstantOperandVal(1); @@ -38756,7 +38756,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( // High bits known zero. 
Known.Zero.setHighBits(ShAmt); - return false; + return false; } case X86ISD::VSRAI: { SDValue Op0 = Op.getOperand(0); @@ -38805,7 +38805,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( // High bits are known one. if (Known.One[BitWidth - ShAmt - 1]) Known.One.setHighBits(ShAmt); - return false; + return false; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { @@ -38871,7 +38871,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( return true; KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); - Known = KnownBits::commonBits(KnownVec, KnownScl); + Known = KnownBits::commonBits(KnownVec, KnownScl); return false; } break; @@ -38951,83 +38951,83 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); return false; } - case X86ISD::BEXTR: - case X86ISD::BEXTRI: { + case X86ISD::BEXTR: + case X86ISD::BEXTRI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Only bottom 16-bits of the control bits are required. if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) { // NOTE: SimplifyDemandedBits won't do this for constants. - uint64_t Val1 = Cst1->getZExtValue(); - uint64_t MaskedVal1 = Val1 & 0xFFFF; - if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) { + uint64_t Val1 = Cst1->getZExtValue(); + uint64_t MaskedVal1 = Val1 & 0xFFFF; + if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) { SDLoc DL(Op); return TLO.CombineTo( Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0, TLO.DAG.getConstant(MaskedVal1, DL, VT))); } - - unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); - unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); - - // If the length is 0, the result is 0. - if (Length == 0) { - Known.setAllZero(); - return false; - } - - if ((Shift + Length) <= BitWidth) { - APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length); - if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1)) - return true; - - Known = Known.extractBits(Length, Shift); - Known = Known.zextOrTrunc(BitWidth); - return false; - } - } else { - assert(Opc == X86ISD::BEXTR && "Unexpected opcode!"); - KnownBits Known1; - APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16)); - if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1)) - return true; - - // If the length is 0, replace with 0. - KnownBits LengthBits = Known1.extractBits(8, 8); - if (LengthBits.isZero()) - return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); - } - - break; - } - case X86ISD::PDEP: { - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - - unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros(); - APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); - - // If the demanded bits has leading zeroes, we don't demand those from the - // mask. - if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) + + unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); + unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); + + // If the length is 0, the result is 0. 
+ if (Length == 0) { + Known.setAllZero(); + return false; + } + + if ((Shift + Length) <= BitWidth) { + APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length); + if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1)) + return true; + + Known = Known.extractBits(Length, Shift); + Known = Known.zextOrTrunc(BitWidth); + return false; + } + } else { + assert(Opc == X86ISD::BEXTR && "Unexpected opcode!"); + KnownBits Known1; + APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16)); + if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1)) + return true; + + // If the length is 0, replace with 0. + KnownBits LengthBits = Known1.extractBits(8, 8); + if (LengthBits.isZero()) + return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); + } + + break; + } + case X86ISD::PDEP: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros(); + APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); + + // If the demanded bits has leading zeroes, we don't demand those from the + // mask. + if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) return true; - // The number of possible 1s in the mask determines the number of LSBs of - // operand 0 used. Undemanded bits from the mask don't matter so filter - // them before counting. - KnownBits Known2; - uint64_t Count = (~Known.Zero & LoMask).countPopulation(); - APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); - if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) - return true; + // The number of possible 1s in the mask determines the number of LSBs of + // operand 0 used. Undemanded bits from the mask don't matter so filter + // them before counting. + KnownBits Known2; + uint64_t Count = (~Known.Zero & LoMask).countPopulation(); + APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); + if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) + return true; - // Zeroes are retained from the mask, but not ones. - Known.One.clearAllBits(); - // The result will have at least as many trailing zeros as the non-mask - // operand since bits can only map to the same or higher bit position. - Known.Zero.setLowBits(Known2.countMinTrailingZeros()); - return false; + // Zeroes are retained from the mask, but not ones. + Known.One.clearAllBits(); + // The result will have at least as many trailing zeros as the non-mask + // operand since bits can only map to the same or higher bit position. + Known.Zero.setLowBits(Known2.countMinTrailingZeros()); + return false; } } @@ -39438,8 +39438,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, // Convert build vector ops to MMX data in the bottom elements. SmallVector<SDValue, 8> Ops; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element. if (Splat) { if (Splat.isUndef()) @@ -39452,16 +39452,16 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, if (NumElts == 8) Splat = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, - DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL, - TLI.getPointerTy(DAG.getDataLayout())), - Splat, Splat); + DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL, + TLI.getPointerTy(DAG.getDataLayout())), + Splat, Splat); // Use PSHUFW to repeat 16-bit elements. unsigned ShufMask = (NumElts > 2 ? 
0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, - DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, - TLI.getPointerTy(DAG.getDataLayout())), + DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, + TLI.getPointerTy(DAG.getDataLayout())), Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); @@ -39477,8 +39477,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd : Intrinsic::x86_mmx_punpcklbw)); - SDValue Intrin = DAG.getTargetConstant( - IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout())); + SDValue Intrin = DAG.getTargetConstant( + IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout())); for (unsigned i = 0; i != NumOps; i += 2) Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin, Ops[i], Ops[i + 1]); @@ -39492,7 +39492,7 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, // a vector/float/double that got truncated/extended/bitcast to/from a scalar // integer. If so, replace the scalar ops with bool vector equivalents back down // the chain. -static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, +static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -39545,10 +39545,10 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, case ISD::SHL: { // If we find a suitable source, a SHL becomes a KSHIFTL. SDValue Src0 = V.getOperand(0); - if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) || - ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI())) - break; - + if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) || + ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI())) + break; + if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1))) if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) return DAG.getNode( @@ -39891,8 +39891,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with // PHMINPOSUW. -static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // Bail without SSE41. if (!Subtarget.hasSSE41()) return SDValue(); @@ -39965,8 +39965,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, } // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. -static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // Bail without SSE2. if (!Subtarget.hasSSE2()) return SDValue(); @@ -40080,8 +40080,8 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, MVT CmpVT = NumElts == 64 ? 
MVT::i64 : MVT::i32; if (BinOp == ISD::XOR) { - // parity -> (PARITY(MOVMSK X)) - SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk); + // parity -> (PARITY(MOVMSK X)) + SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } @@ -40269,12 +40269,12 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, // Handle extract(truncate(x)) for 0'th index. // TODO: Treat this as a faux shuffle? // TODO: When can we use this for general indices? - if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 && - (SrcVT.getSizeInBits() % 128) == 0) { + if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 && + (SrcVT.getSizeInBits() % 128) == 0) { Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); - MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits); - return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src), - Idx); + MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits); + return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src), + Idx); } // Resolve the target shuffle inputs and mask. @@ -40350,7 +40350,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SrcOp = DAG.getBitcast(SrcVT, SrcOp); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, - DAG.getTargetConstant(SrcIdx, dl, MVT::i8)); + DAG.getTargetConstant(SrcIdx, dl, MVT::i8)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); } @@ -40457,8 +40457,8 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { /// Try to convert a vector reduction sequence composed of binops and shuffles /// into horizontal ops. -static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); // We need at least SSE2 to anything here. @@ -40466,8 +40466,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, return SDValue(); ISD::NodeType Opc; - SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, - {ISD::ADD, ISD::MUL, ISD::FADD}, true); + SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, + {ISD::ADD, ISD::MUL, ISD::FADD}, true); if (!Rdx) return SDValue(); @@ -40482,46 +40482,46 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, SDLoc DL(ExtElt); - // vXi8 mul reduction - promote to vXi16 mul reduction. 
- if (Opc == ISD::MUL) { - unsigned NumElts = VecVT.getVectorNumElements(); - if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts)) - return SDValue(); - if (VecVT.getSizeInBits() >= 128) { - EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2); - SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT)); - SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT)); - Lo = DAG.getBitcast(WideVT, Lo); - Hi = DAG.getBitcast(WideVT, Hi); - Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi); - while (Rdx.getValueSizeInBits() > 128) { - std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); - Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi); - } - } else { - if (VecVT == MVT::v4i8) - Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx, - DAG.getUNDEF(MVT::v4i8)); - Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, - DAG.getUNDEF(MVT::v8i8)); - Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8)); - Rdx = DAG.getBitcast(MVT::v8i16, Rdx); - } - if (NumElts >= 8) - Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, - DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, - {4, 5, 6, 7, -1, -1, -1, -1})); - Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, - DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, - {2, 3, -1, -1, -1, -1, -1, -1})); - Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, - DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, - {1, -1, -1, -1, -1, -1, -1, -1})); - Rdx = DAG.getBitcast(MVT::v16i8, Rdx); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); - } - - // vXi8 add reduction - sub 128-bit vector. + // vXi8 mul reduction - promote to vXi16 mul reduction. + if (Opc == ISD::MUL) { + unsigned NumElts = VecVT.getVectorNumElements(); + if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts)) + return SDValue(); + if (VecVT.getSizeInBits() >= 128) { + EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2); + SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT)); + SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT)); + Lo = DAG.getBitcast(WideVT, Lo); + Hi = DAG.getBitcast(WideVT, Hi); + Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi); + while (Rdx.getValueSizeInBits() > 128) { + std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); + Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi); + } + } else { + if (VecVT == MVT::v4i8) + Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx, + DAG.getUNDEF(MVT::v4i8)); + Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx, + DAG.getUNDEF(MVT::v8i8)); + Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8)); + Rdx = DAG.getBitcast(MVT::v8i16, Rdx); + } + if (NumElts >= 8) + Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, + DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, + {4, 5, 6, 7, -1, -1, -1, -1})); + Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, + DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, + {2, 3, -1, -1, -1, -1, -1, -1})); + Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, + DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, + {1, -1, -1, -1, -1, -1, -1, -1})); + Rdx = DAG.getBitcast(MVT::v16i8, Rdx); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); + } + + // vXi8 add reduction - sub 128-bit vector. if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) { if (VecVT == MVT::v4i8) { // Pad with zero. @@ -40552,7 +40552,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, !isPowerOf2_32(VecVT.getVectorNumElements())) return SDValue(); - // vXi8 add reduction - sum lo/hi halves then use PSADBW. 
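The vXi8 add-reduction comment here relies on a property of PSADBW: a sum of absolute differences taken against an all-zero operand is simply the sum of the bytes in each 64-bit group. A minimal standalone sketch of that identity in plain C++ (the helper names `psadbw_lane` and `byte_sum` are illustrative, not part of the LLVM sources):

```cpp
#include <cstddef>
#include <cstdint>

// Scalar model of one 64-bit lane of PSADBW: sum of |a[i] - b[i]| over 8 bytes.
static uint16_t psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
  uint16_t sum = 0;
  for (size_t i = 0; i != 8; ++i)
    sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return sum;
}

// Against an all-zero operand the SAD collapses to a plain byte sum, which is
// exactly the reduction the vXi8 add path wants after summing the halves.
static uint16_t byte_sum(const uint8_t a[8]) {
  const uint8_t zero[8] = {};
  return psadbw_lane(a, zero); // == a[0] + a[1] + ... + a[7], at most 8 * 255
}
```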
+ // vXi8 add reduction - sum lo/hi halves then use PSADBW. if (VT == MVT::i8) { while (Rdx.getValueSizeInBits() > 128) { SDValue Lo, Hi; @@ -40658,7 +40658,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } // TODO - Remove this once we can handle the implicit zero-extension of - // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and + // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and // combineBasicSADPattern. return SDValue(); } @@ -40690,15 +40690,15 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SAD; // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. - if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget)) + if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget)) return Cmp; // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. - if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget)) + if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget)) return MinMax; - // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc.. - if (SDValue V = combineArithReduction(N, DAG, Subtarget)) + // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc.. + if (SDValue V = combineArithReduction(N, DAG, Subtarget)) return V; if (SDValue V = scalarizeExtEltFP(N, DAG)) @@ -40822,7 +40822,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (TValIsAllOnes && FValIsAllZeros) return DAG.getBitcast(VT, Cond); - if (!TLI.isTypeLegal(CondVT)) + if (!TLI.isTypeLegal(CondVT)) return SDValue(); // vselect Cond, 111..., X -> or Cond, X @@ -41145,36 +41145,36 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); } - // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y)) - // by forcing the unselected elements to zero. - // TODO: Can we handle more shuffles with this? - if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() && - LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB && - LHS.hasOneUse() && RHS.hasOneUse()) { - MVT SimpleVT = VT.getSimpleVT(); - bool LHSUnary, RHSUnary; - SmallVector<SDValue, 1> LHSOps, RHSOps; - SmallVector<int, 64> LHSMask, RHSMask, CondMask; - if (createShuffleMaskFromVSELECT(CondMask, Cond) && - getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask, - LHSUnary) && - getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask, - RHSUnary)) { - int NumElts = VT.getVectorNumElements(); - for (int i = 0; i != NumElts; ++i) { - if (CondMask[i] < NumElts) - RHSMask[i] = 0x80; - else - LHSMask[i] = 0x80; - } - LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), - getConstVector(LHSMask, SimpleVT, DAG, DL, true)); - RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0), - getConstVector(RHSMask, SimpleVT, DAG, DL, true)); - return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); - } - } - + // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y)) + // by forcing the unselected elements to zero. + // TODO: Can we handle more shuffles with this? 
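The vselect-of-PSHUFBs fold introduced in this hunk depends on PSHUFB zeroing any lane whose control byte has bit 7 set, so forcing the unselected lane of each shuffle to 0x80 lets a plain OR reproduce the select. A self-contained scalar sketch of that reasoning, reduced to a single 16-byte vector (the names `V16`, `pshufb`, and `select_as_or` are illustrative only):

```cpp
#include <array>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

// Scalar model of 128-bit PSHUFB: a control byte with the top bit set zeroes
// the lane, otherwise its low 4 bits index into the source vector.
static V16 pshufb(const V16 &src, const V16 &ctl) {
  V16 out{};
  for (int i = 0; i != 16; ++i)
    out[i] = (ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0F];
  return out;
}

// vselect(cond, pshufb(x, cx), pshufb(y, cy)) rewritten as an OR: the lane we
// do not want is forced to 0x80 so it contributes zero to the OR.
static V16 select_as_or(const V16 &x, V16 cx, const V16 &y, V16 cy,
                        const std::array<bool, 16> &takeLHS) {
  for (int i = 0; i != 16; ++i)
    (takeLHS[i] ? cy : cx)[i] = 0x80; // zero the unselected side's lane
  V16 lhs = pshufb(x, cx), rhs = pshufb(y, cy);
  V16 res{};
  for (int i = 0; i != 16; ++i)
    res[i] = static_cast<uint8_t>(lhs[i] | rhs[i]);
  return res;
}
```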
+ if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() && + LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB && + LHS.hasOneUse() && RHS.hasOneUse()) { + MVT SimpleVT = VT.getSimpleVT(); + bool LHSUnary, RHSUnary; + SmallVector<SDValue, 1> LHSOps, RHSOps; + SmallVector<int, 64> LHSMask, RHSMask, CondMask; + if (createShuffleMaskFromVSELECT(CondMask, Cond) && + getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask, + LHSUnary) && + getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask, + RHSUnary)) { + int NumElts = VT.getVectorNumElements(); + for (int i = 0; i != NumElts; ++i) { + if (CondMask[i] < NumElts) + RHSMask[i] = 0x80; + else + LHSMask[i] = 0x80; + } + LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), + getConstVector(LHSMask, SimpleVT, DAG, DL, true)); + RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0), + getConstVector(RHSMask, SimpleVT, DAG, DL, true)); + return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); + } + } + // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x<y?x:y but not // x<=y?x:y, because of how they handle negative zero (which can be @@ -41401,12 +41401,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; - // Canonicalize min/max: - // (x > 0) ? x : 0 -> (x >= 0) ? x : 0 - // (x < -1) ? x : -1 -> (x <= -1) ? x : -1 + // Canonicalize min/max: + // (x > 0) ? x : 0 -> (x >= 0) ? x : 0 + // (x < -1) ? x : -1 -> (x <= -1) ? x : -1 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates - // the need for an extra compare against zero. e.g. - // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0 + // the need for an extra compare against zero. e.g. + // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax @@ -41415,27 +41415,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi - // - // We can also canonicalize - // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1 - // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1 - // This allows the use of a test instruction for the compare. + // + // We can also canonicalize + // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1 + // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1 + // This allows the use of a test instruction for the compare. if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse() && - LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { + LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) || - (CC == ISD::SETLT && isAllOnesConstant(RHS))) { - ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE; + if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) || + (CC == ISD::SETLT && isAllOnesConstant(RHS))) { + ISD::CondCode NewCC = CC == ISD::SETGT ? 
ISD::SETGE : ISD::SETLE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } - if (CC == ISD::SETUGT && isOneConstant(RHS)) { - ISD::CondCode NewCC = ISD::SETUGE; - Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), - Cond.getOperand(0), Cond.getOperand(1), NewCC); - return DAG.getSelect(DL, VT, Cond, LHS, RHS); + if (CC == ISD::SETUGT && isOneConstant(RHS)) { + ISD::CondCode NewCC = ISD::SETUGE; + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } @@ -41466,18 +41466,18 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return V; // select(~Cond, X, Y) -> select(Cond, Y, X) - if (CondVT.getScalarType() != MVT::i1) { + if (CondVT.getScalarType() != MVT::i1) { if (SDValue CondNot = IsNOT(Cond, DAG)) return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(CondVT, CondNot), RHS, LHS); - // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit. - if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() && - ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) { - Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT, - DAG.getConstant(0, DL, CondVT), Cond.getOperand(0)); - return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS); - } - } + // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit. + if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() && + ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) { + Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT, + DAG.getConstant(0, DL, CondVT), Cond.getOperand(0)); + return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS); + } + } // Try to optimize vXi1 selects if both operands are either all constants or // bitcasts from scalar integer type. In that case we can convert the operands @@ -43094,116 +43094,116 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); - assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode || - X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode || - X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && - "Unexpected hadd/hsub/pack opcode"); + assert((X86ISD::HADD == Opcode || X86ISD::FHADD == Opcode || + X86ISD::HSUB == Opcode || X86ISD::FHSUB == Opcode || + X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && + "Unexpected hadd/hsub/pack opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - EVT SrcVT = N0.getValueType(); + EVT SrcVT = N0.getValueType(); - // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) - // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for + // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) + // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for // truncation trees that help us avoid lane crossing shuffles. // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. - // TODO: We don't handle vXf64 shuffles yet. + // TODO: We don't handle vXf64 shuffles yet. 
if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.getConstantOperandAPInt(1) == 0 && - N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() && + N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() && N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() && - N0.getOperand(0).getValueType().is256BitVector() && - SrcVT.getScalarSizeInBits() <= 32) { + N0.getOperand(0).getValueType().is256BitVector() && + SrcVT.getScalarSizeInBits() <= 32) { // TODO - support target/faux shuffles. SDValue Vec = peekThroughBitcasts(N0.getOperand(0)); if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) { - // To keep the HOP LHS/RHS coherency, we must be able to scale the unary + // To keep the HOP LHS/RHS coherency, we must be able to scale the unary // shuffle to a vXi64 width - we can probably relax this in the future. SmallVector<int, 4> ShuffleMask; if (SVN->getOperand(1).isUndef() && scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) { SDLoc DL(N); SDValue Lo, Hi; - MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; + MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL); Lo = DAG.getBitcast(N0.getValueType(), Lo); Hi = DAG.getBitcast(N1.getValueType(), Hi); SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); - Res = DAG.getBitcast(ShufVT, Res); - Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); + Res = DAG.getBitcast(ShufVT, Res); + Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); return DAG.getBitcast(VT, Res); } } } - // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)). - // TODO: Merge with binary shuffle folds below. - if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { - int PostShuffle[4] = {0, 1, 2, 3}; - - // If the op is an unary shuffle that can scale to v2x64, - // then we can perform this as a v4x32 post shuffle. - auto AdjustOp = [&](SDValue V, int Offset) { - auto *SVN = dyn_cast<ShuffleVectorSDNode>(V); - SmallVector<int, 2> ScaledMask; - if (!SVN || !SVN->getOperand(1).isUndef() || - !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) || - !N->isOnlyUserOf(V.getNode())) - return SDValue(); - PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0]; - PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1]; - return SVN->getOperand(0); - }; - - SDValue Src0 = AdjustOp(N0, 0); - SDValue Src1 = AdjustOp(N1, 2); - if (Src0 || Src1) { - Src0 = Src0 ? Src0 : N0; - Src1 = Src1 ? Src1 : N1; - SDLoc DL(N); - MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; - SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1); - Res = DAG.getBitcast(ShufVT, Res); - Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle); - return DAG.getBitcast(VT, Res); - } - } - - // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)). + // Attempt to fold HOP(SHUFFLE(X),SHUFFLE(Y)) -> SHUFFLE(HOP(X,Y)). + // TODO: Merge with binary shuffle folds below. + if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { + int PostShuffle[4] = {0, 1, 2, 3}; + + // If the op is an unary shuffle that can scale to v2x64, + // then we can perform this as a v4x32 post shuffle. 
+ auto AdjustOp = [&](SDValue V, int Offset) { + auto *SVN = dyn_cast<ShuffleVectorSDNode>(V); + SmallVector<int, 2> ScaledMask; + if (!SVN || !SVN->getOperand(1).isUndef() || + !scaleShuffleElements(SVN->getMask(), 2, ScaledMask) || + !N->isOnlyUserOf(V.getNode())) + return SDValue(); + PostShuffle[Offset + 0] = ScaledMask[0] < 0 ? -1 : Offset + ScaledMask[0]; + PostShuffle[Offset + 1] = ScaledMask[1] < 0 ? -1 : Offset + ScaledMask[1]; + return SVN->getOperand(0); + }; + + SDValue Src0 = AdjustOp(N0, 0); + SDValue Src1 = AdjustOp(N1, 2); + if (Src0 || Src1) { + Src0 = Src0 ? Src0 : N0; + Src1 = Src1 ? Src1 : N1; + SDLoc DL(N); + MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; + SDValue Res = DAG.getNode(Opcode, DL, VT, Src0, Src1); + Res = DAG.getBitcast(ShufVT, Res); + Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle); + return DAG.getBitcast(VT, Res); + } + } + + // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)). // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles. - if (VT.is256BitVector() && Subtarget.hasInt256()) { - SmallVector<int> Mask0, Mask1; - SmallVector<SDValue> Ops0, Ops1; - if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && - getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && - !Ops0.empty() && !Ops1.empty()) { - SDValue Op00 = Ops0.front(), Op01 = Ops0.back(); - SDValue Op10 = Ops1.front(), Op11 = Ops1.back(); - SmallVector<int, 2> ShuffleMask0, ShuffleMask1; - if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT && - Op11.getValueType() == SrcVT && Op11.getValueType() == SrcVT && - scaleShuffleElements(Mask0, 2, ShuffleMask0) && - scaleShuffleElements(Mask1, 2, ShuffleMask1)) { - if ((Op00 == Op11) && (Op01 == Op10)) { - std::swap(Op10, Op11); - ShuffleVectorSDNode::commuteMask(ShuffleMask1); - } - if ((Op00 == Op10) && (Op01 == Op11)) { - SmallVector<int, 4> ShuffleMask; - ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); - ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); - SDLoc DL(N); - MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; - SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); - Res = DAG.getBitcast(ShufVT, Res); - Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); - return DAG.getBitcast(VT, Res); + if (VT.is256BitVector() && Subtarget.hasInt256()) { + SmallVector<int> Mask0, Mask1; + SmallVector<SDValue> Ops0, Ops1; + if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && + getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && + !Ops0.empty() && !Ops1.empty()) { + SDValue Op00 = Ops0.front(), Op01 = Ops0.back(); + SDValue Op10 = Ops1.front(), Op11 = Ops1.back(); + SmallVector<int, 2> ShuffleMask0, ShuffleMask1; + if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT && + Op11.getValueType() == SrcVT && Op11.getValueType() == SrcVT && + scaleShuffleElements(Mask0, 2, ShuffleMask0) && + scaleShuffleElements(Mask1, 2, ShuffleMask1)) { + if ((Op00 == Op11) && (Op01 == Op10)) { + std::swap(Op10, Op11); + ShuffleVectorSDNode::commuteMask(ShuffleMask1); } + if ((Op00 == Op10) && (Op01 == Op11)) { + SmallVector<int, 4> ShuffleMask; + ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); + ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); + SDLoc DL(N); + MVT ShufVT = VT.isFloatingPoint() ? 
MVT::v4f64 : MVT::v4i64; + SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); + Res = DAG.getBitcast(ShufVT, Res); + Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } } } } @@ -43285,7 +43285,7 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, } // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()). - if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) + if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) return V; // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular @@ -43307,28 +43307,28 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, } } - // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors. - if (VT.is128BitVector()) { - unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Src0, Src1; - if (N0.getOpcode() == ExtOpc && - N0.getOperand(0).getValueType().is64BitVector() && - N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { - Src0 = N0.getOperand(0); - } - if (N1.getOpcode() == ExtOpc && - N1.getOperand(0).getValueType().is64BitVector() && - N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { - Src1 = N1.getOperand(0); - } - if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) { - assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)"); - Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType()); - Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType()); - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1); - } - } - + // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors. + if (VT.is128BitVector()) { + unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + SDValue Src0, Src1; + if (N0.getOpcode() == ExtOpc && + N0.getOperand(0).getValueType().is64BitVector() && + N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { + Src0 = N0.getOperand(0); + } + if (N1.getOpcode() == ExtOpc && + N1.getOperand(0).getValueType().is64BitVector() && + N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { + Src1 = N1.getOperand(0); + } + if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) { + assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)"); + Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType()); + Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType()); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1); + } + } + // Attempt to combine as shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) @@ -43337,20 +43337,20 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() || - X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) && - "Unexpected horizontal add/sub opcode"); - - // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()). - if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) - return V; - - return SDValue(); -} - +static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() || + X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) && + "Unexpected horizontal add/sub opcode"); + + // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()). 
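The shuffle/HOP commutation this hunk factors into combineHorizOpWithShuffle is easiest to see with the lane pairing of a 4 x float horizontal add written out directly. A plain C++ sketch under that simplifying assumption (no DAG nodes, 128-bit width only; `haddps` here is just a scalar model, not the intrinsic):

```cpp
#include <array>

using V4 = std::array<float, 4>;

// Scalar model of HADDPS: adjacent pairs of the first operand fill the low
// half of the result, adjacent pairs of the second operand fill the high half.
static V4 haddps(const V4 &a, const V4 &b) {
  return {a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]};
}

// HOP(SHUFFLE(x), y) -> SHUFFLE(HOP(x, y)) style rewrites stay legal when the
// input shuffle only moves whole adjacent pairs, because every result element
// depends on exactly one such pair: swapping the two pairs of `x` before the
// hadd is the same as swapping result lanes 0 and 1 afterwards.
static V4 swap_pairs_then_hadd(const V4 &x, const V4 &y) {
  V4 xs = {x[2], x[3], x[0], x[1]};
  return haddps(xs, y); // == {x2+x3, x0+x1, y0+y1, y2+y3}
}
```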
+ if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) + return V; + + return SDValue(); +} + static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -44139,7 +44139,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (VT == SrcVecVT.getScalarType() && N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && - llvm::all_of(EltBits, [](const APInt &M) { + llvm::all_of(EltBits, [](const APInt &M) { return M.isNullValue() || M.isAllOnesValue(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); @@ -44158,7 +44158,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, - X86::MaxShuffleCombineDepth, + X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); @@ -44826,13 +44826,13 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); - if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2)) + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2)) return SDValue(); // InScalarVT is the intermediate type in AVG pattern and it should be greater // than the original input type (i8/i16). EVT InScalarVT = InVT.getVectorElementType(); - if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits()) + if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits()) return SDValue(); if (!Subtarget.hasSSE2()) @@ -44860,8 +44860,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, }; // Check if each element of the vector is right-shifted by one. - SDValue LHS = In.getOperand(0); - SDValue RHS = In.getOperand(1); + SDValue LHS = In.getOperand(0); + SDValue RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) return SDValue(); if (LHS.getOpcode() != ISD::ADD) @@ -44877,29 +44877,29 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); }; - auto AVGSplitter = [&](SDValue Op0, SDValue Op1) { - // Pad to a power-of-2 vector, split+apply and extract the original vector. - unsigned NumElemsPow2 = PowerOf2Ceil(NumElems); - EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2); - if (NumElemsPow2 != NumElems) { - SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT)); - SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT)); - for (unsigned i = 0; i != NumElems; ++i) { - SDValue Idx = DAG.getIntPtrConstant(i, DL); - Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx); - Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx); - } - Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0); - Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1); - } - SDValue Res = - SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder); - if (NumElemsPow2 == NumElems) - return Res; - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); - }; - + auto AVGSplitter = [&](SDValue Op0, SDValue Op1) { + // Pad to a power-of-2 vector, split+apply and extract the original vector. 
+ unsigned NumElemsPow2 = PowerOf2Ceil(NumElems); + EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2); + if (NumElemsPow2 != NumElems) { + SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT)); + SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT)); + for (unsigned i = 0; i != NumElems; ++i) { + SDValue Idx = DAG.getIntPtrConstant(i, DL); + Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx); + Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx); + } + Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0); + Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1); + } + SDValue Res = + SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder); + if (NumElemsPow2 == NumElems) + return Res; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + }; + // Take care of the case when one of the operands is a constant vector whose // element is in the range [1, 256]. if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && @@ -44910,7 +44910,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); - return AVGSplitter(Operands[0].getOperand(0), Operands[1]); + return AVGSplitter(Operands[0].getOperand(0), Operands[1]); } // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)). @@ -44957,7 +44957,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, } // The pattern is detected, emit X86ISD::AVG instruction(s). - return AVGSplitter(Operands[0], Operands[1]); + return AVGSplitter(Operands[0], Operands[1]); } return SDValue(); @@ -44990,8 +44990,8 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, unsigned HalfOffset = 16; SDValue Ptr1 = Ld->getBasePtr(); - SDValue Ptr2 = - DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl); + SDValue Ptr2 = + DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2); SDValue Load1 = @@ -45025,29 +45025,29 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, } } - // If we also broadcast this as a subvector to a wider type, then just extract - // the lowest subvector. - if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() && - (RegVT.is128BitVector() || RegVT.is256BitVector())) { - SDValue Ptr = Ld->getBasePtr(); - SDValue Chain = Ld->getChain(); - for (SDNode *User : Ptr->uses()) { - if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && - cast<MemIntrinsicSDNode>(User)->getChain() == Chain && - cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == - MemVT.getSizeInBits() && - !User->hasAnyUseOfValue(1) && - User->getValueSizeInBits(0).getFixedSize() > - RegVT.getFixedSizeInBits()) { - SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), - RegVT.getSizeInBits()); - Extract = DAG.getBitcast(RegVT, Extract); - return DCI.CombineTo(N, Extract, SDValue(User, 1)); - } - } - } - + // If we also broadcast this as a subvector to a wider type, then just extract + // the lowest subvector. 
+ if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() && + (RegVT.is128BitVector() || RegVT.is256BitVector())) { + SDValue Ptr = Ld->getBasePtr(); + SDValue Chain = Ld->getChain(); + for (SDNode *User : Ptr->uses()) { + if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && + cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && + cast<MemIntrinsicSDNode>(User)->getChain() == Chain && + cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == + MemVT.getSizeInBits() && + !User->hasAnyUseOfValue(1) && + User->getValueSizeInBits(0).getFixedSize() > + RegVT.getFixedSizeInBits()) { + SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), + RegVT.getSizeInBits()); + Extract = DAG.getBitcast(RegVT, Extract); + return DCI.CombineTo(N, Extract, SDValue(User, 1)); + } + } + } + // Cast ptr32 and ptr64 pointers to the default address space before a load. unsigned AddrSpace = Ld->getAddressSpace(); if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || @@ -45089,7 +45089,7 @@ static int getOneTrueElt(SDValue V) { auto *ConstNode = dyn_cast<ConstantSDNode>(Op); if (!ConstNode) return -1; - if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) { + if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) { // If we already found a one, this is too many. if (TrueIndex >= 0) return -1; @@ -45105,8 +45105,8 @@ static int getOneTrueElt(SDValue V) { /// scalar element, and the alignment for the scalar memory access. static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, - SDValue &Index, Align &Alignment, - unsigned &Offset) { + SDValue &Index, Align &Alignment, + unsigned &Offset) { int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); if (TrueMaskElt < 0) return false; @@ -45114,17 +45114,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, // Get the address of the one scalar element that is specified by the mask // using the appropriate offset from the base pointer. EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); - Offset = 0; + Offset = 0; Addr = MaskedOp->getBasePtr(); if (TrueMaskElt != 0) { - Offset = TrueMaskElt * EltVT.getStoreSize(); - Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset), - SDLoc(MaskedOp)); + Offset = TrueMaskElt * EltVT.getStoreSize(); + Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset), + SDLoc(MaskedOp)); } Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); - Alignment = commonAlignment(MaskedOp->getOriginalAlign(), - EltVT.getStoreSize()); + Alignment = commonAlignment(MaskedOp->getOriginalAlign(), + EltVT.getStoreSize()); return true; } @@ -45134,17 +45134,17 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { assert(ML->isUnindexed() && "Unexpected indexed masked load!"); // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. 
SDValue Addr, VecIndex; - Align Alignment; - unsigned Offset; - if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset)) + Align Alignment; + unsigned Offset; + if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset)) return SDValue(); // Load the one scalar element that is specified by the mask using the @@ -45152,25 +45152,25 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, SDLoc DL(ML); EVT VT = ML->getValueType(0); EVT EltVT = VT.getVectorElementType(); - - EVT CastVT = VT; - if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { - EltVT = MVT::f64; - CastVT = - EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements()); - } - + + EVT CastVT = VT; + if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { + EltVT = MVT::f64; + CastVT = + EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements()); + } + SDValue Load = - DAG.getLoad(EltVT, DL, ML->getChain(), Addr, - ML->getPointerInfo().getWithOffset(Offset), + DAG.getLoad(EltVT, DL, ML->getChain(), Addr, + ML->getPointerInfo().getWithOffset(Offset), Alignment, ML->getMemOperand()->getFlags()); - SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru()); - + SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru()); + // Insert the loaded element into the appropriate place in the vector. - SDValue Insert = - DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex); - Insert = DAG.getBitcast(VT, Insert); + SDValue Insert = + DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex); + Insert = DAG.getBitcast(VT, Insert); return DCI.CombineTo(ML, Insert, Load.getValue(1), true); } @@ -45233,8 +45233,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return SDValue(); if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { - if (SDValue ScalarLoad = - reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget)) + if (SDValue ScalarLoad = + reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget)) return ScalarLoad; // TODO: Do some AVX512 subsets benefit from this transform? @@ -45271,35 +45271,35 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, /// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; - Align Alignment; - unsigned Offset; - if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset)) + Align Alignment; + unsigned Offset; + if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset)) return SDValue(); // Extract the one scalar element that is actually being stored. 
SDLoc DL(MS); - SDValue Value = MS->getValue(); - EVT VT = Value.getValueType(); + SDValue Value = MS->getValue(); + EVT VT = Value.getValueType(); EVT EltVT = VT.getVectorElementType(); - if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { - EltVT = MVT::f64; - EVT CastVT = - EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements()); - Value = DAG.getBitcast(CastVT, Value); - } - SDValue Extract = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex); + if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { + EltVT = MVT::f64; + EVT CastVT = + EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements()); + Value = DAG.getBitcast(CastVT, Value); + } + SDValue Extract = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex); // Store that element at the appropriate offset from the base pointer. - return DAG.getStore(MS->getChain(), DL, Extract, Addr, - MS->getPointerInfo().getWithOffset(Offset), + return DAG.getStore(MS->getChain(), DL, Extract, Addr, + MS->getPointerInfo().getWithOffset(Offset), Alignment, MS->getMemOperand()->getFlags()); } @@ -45317,7 +45317,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, if (Mst->isTruncatingStore()) return SDValue(); - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) return ScalarStore; // If the mask value has been legalized to a non-boolean vector, try to @@ -45378,21 +45378,21 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() && StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR && StoredVal.getOperand(0).getValueType() == MVT::i8) { - SDValue Val = StoredVal.getOperand(0); - // We must store zeros to the unused bits. - Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1); - return DAG.getStore(St->getChain(), dl, Val, + SDValue Val = StoredVal.getOperand(0); + // We must store zeros to the unused bits. + Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1); + return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } // Widen v2i1/v4i1 stores to v8i1. - if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && + if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && Subtarget.hasAVX512()) { unsigned NumConcats = 8 / VT.getVectorNumElements(); - // We must store zeros to the unused bits. - SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT)); + // We must store zeros to the unused bits. + SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT)); Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), @@ -45414,7 +45414,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, Hi = combinevXi1ConstantToInteger(Hi, DAG); SDValue Ptr0 = St->getBasePtr(); - SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), @@ -45493,36 +45493,36 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, VT, St->getMemOperand(), DAG); } - // Try to fold a extract_element(VTRUNC) pattern into a truncating store. 
- if (!St->isTruncatingStore() && StoredVal.hasOneUse()) { - auto IsExtractedElement = [](SDValue V) { - if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse()) - V = V.getOperand(0); - unsigned Opc = V.getOpcode(); - if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) { - if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1))) - return V.getOperand(0); - } - return SDValue(); - }; - if (SDValue Extract = IsExtractedElement(StoredVal)) { - SDValue Trunc = peekThroughOneUseBitcasts(Extract); - if (Trunc.getOpcode() == X86ISD::VTRUNC) { - SDValue Src = Trunc.getOperand(0); - MVT DstVT = Trunc.getSimpleValueType(); - MVT SrcVT = Src.getSimpleValueType(); - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts; - MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts); - if (NumTruncBits == VT.getSizeInBits() && - TLI.isTruncStoreLegal(SrcVT, TruncVT)) { - return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(), - TruncVT, St->getMemOperand()); - } - } - } - } - + // Try to fold a extract_element(VTRUNC) pattern into a truncating store. + if (!St->isTruncatingStore() && StoredVal.hasOneUse()) { + auto IsExtractedElement = [](SDValue V) { + if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse()) + V = V.getOperand(0); + unsigned Opc = V.getOpcode(); + if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) { + if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1))) + return V.getOperand(0); + } + return SDValue(); + }; + if (SDValue Extract = IsExtractedElement(StoredVal)) { + SDValue Trunc = peekThroughOneUseBitcasts(Extract); + if (Trunc.getOpcode() == X86ISD::VTRUNC) { + SDValue Src = Trunc.getOperand(0); + MVT DstVT = Trunc.getSimpleValueType(); + MVT SrcVT = Src.getSimpleValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts; + MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts); + if (NumTruncBits == VT.getSizeInBits() && + TLI.isTruncStoreLegal(SrcVT, TruncVT)) { + return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(), + TruncVT, St->getMemOperand()); + } + } + } + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. @@ -45665,9 +45665,9 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. -static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, - SelectionDAG &DAG, const X86Subtarget &Subtarget, - bool IsCommutative, +static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + bool IsCommutative, SmallVectorImpl<int> &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) @@ -45815,39 +45815,39 @@ static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, } } - SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. - SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. + SDValue NewRHS = B.getNode() ? 
B : A; // If B is 'UNDEF', use A for it. bool IsIdentityPostShuffle = isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); if (IsIdentityPostShuffle) PostShuffleMask.clear(); - // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split). - if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() && - isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask)) - return false; - - // If the source nodes are already used in HorizOps then always accept this. - // Shuffle folding should merge these back together. - bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) { - return User->getOpcode() == HOpcode && User->getValueType(0) == VT; - }); - bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) { - return User->getOpcode() == HOpcode && User->getValueType(0) == VT; - }); - bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS; - + // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split). + if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() && + isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask)) + return false; + + // If the source nodes are already used in HorizOps then always accept this. + // Shuffle folding should merge these back together. + bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) { + return User->getOpcode() == HOpcode && User->getValueType(0) == VT; + }); + bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) { + return User->getOpcode() == HOpcode && User->getValueType(0) == VT; + }); + bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS; + // Assume a SingleSource HOP if we only shuffle one input and don't need to // shuffle the result. - if (!ForceHorizOp && - !shouldUseHorizontalOp(NewLHS == NewRHS && + if (!ForceHorizOp && + !shouldUseHorizontalOp(NewLHS == NewRHS && (NumShuffles < 2 || !IsIdentityPostShuffle), DAG, Subtarget)) return false; - LHS = DAG.getBitcast(VT, NewLHS); - RHS = DAG.getBitcast(VT, NewRHS); + LHS = DAG.getBitcast(VT, NewLHS); + RHS = DAG.getBitcast(VT, NewRHS); return true; } @@ -45865,8 +45865,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, SmallVector<int, 8> PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd, - PostShuffleMask)) { + isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd, + PostShuffleMask)) { SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); if (!PostShuffleMask.empty()) HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, @@ -46011,7 +46011,7 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, EVT OutSVT = OutVT.getVectorElementType(); EVT InSVT = InVT.getVectorElementType(); - if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) && + if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) && (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && NumElems >= 8)) return SDValue(); @@ -46073,13 +46073,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, // there's no harm in trying pack. if (Subtarget.hasAVX512() && !(!Subtarget.useAVX512Regs() && VT.is256BitVector() && - InVT.is512BitVector())) { - // PACK should still be worth it for 128-bit vectors if the sources were - // originally concatenated from subvectors. 
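[Editorial aside, not part of the diff] For readers unfamiliar with the HADD/HSUB nodes being formed by isHorizontalBinOp/combineFaddFsub above: a 128-bit horizontal add pairs adjacent elements of each operand, taking the low result lanes from the first operand and the high result lanes from the second. A reference sketch of the 4 x float case (the real combine works on SDNodes, not arrays; names here are illustrative):

#include <array>
#include <cassert>

// Reference semantics of an HADDPS-style horizontal add for 4 x float.
std::array<float, 4> hadd4(const std::array<float, 4> &A,
                           const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3],   // adjacent pairs of the first operand
          B[0] + B[1], B[2] + B[3]};  // adjacent pairs of the second operand
}

int main() {
  std::array<float, 4> A = {1, 2, 3, 4}, B = {10, 20, 30, 40};
  std::array<float, 4> R = hadd4(A, B);
  assert(R[0] == 3 && R[1] == 7 && R[2] == 30 && R[3] == 70);
  return 0;
}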
- SmallVector<SDValue> ConcatOps; - if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps)) + InVT.is512BitVector())) { + // PACK should still be worth it for 128-bit vectors if the sources were + // originally concatenated from subvectors. + SmallVector<SDValue> ConcatOps; + if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps)) return SDValue(); - } + } unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; @@ -46101,23 +46101,23 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits()) return SDValue(); - unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits; - if (NumSignBits > MinSignBits) + unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits; + if (NumSignBits > MinSignBits) return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); - // If we have a srl that only generates signbits that we will discard in - // the truncation then we can use PACKSS by converting the srl to a sra. - // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it. - if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode())) - if (const APInt *ShAmt = DAG.getValidShiftAmountConstant( - In, APInt::getAllOnesValue(VT.getVectorNumElements()))) { - if (*ShAmt == MinSignBits) { - SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops()); - return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG, - Subtarget); - } - } - + // If we have a srl that only generates signbits that we will discard in + // the truncation then we can use PACKSS by converting the srl to a sra. + // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it. + if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode())) + if (const APInt *ShAmt = DAG.getValidShiftAmountConstant( + In, APInt::getAllOnesValue(VT.getVectorNumElements()))) { + if (*ShAmt == MinSignBits) { + SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops()); + return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG, + Subtarget); + } + } + return SDValue(); } @@ -47466,14 +47466,14 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; - if (VT.isVector()) { + if (VT.isVector()) { if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; - if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) - return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); - } - + if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) + return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); + } + if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; @@ -47492,19 +47492,19 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if (!TLI.isTypeLegal(VT)) return SDValue(); - SDValue A = N->getOperand(IsStrict ? 1 : 0); - SDValue B = N->getOperand(IsStrict ? 2 : 1); - SDValue C = N->getOperand(IsStrict ? 3 : 2); - - // If the operation allows fast-math and the target does not support FMA, - // split this into mul+add to avoid libcall(s). - SDNodeFlags Flags = N->getFlags(); - if (!IsStrict && Flags.hasAllowReassociation() && - TLI.isOperationExpand(ISD::FMA, VT)) { - SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags); - return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags); - } - + SDValue A = N->getOperand(IsStrict ? 
1 : 0); + SDValue B = N->getOperand(IsStrict ? 2 : 1); + SDValue C = N->getOperand(IsStrict ? 3 : 2); + + // If the operation allows fast-math and the target does not support FMA, + // split this into mul+add to avoid libcall(s). + SDNodeFlags Flags = N->getFlags(); + if (!IsStrict && Flags.hasAllowReassociation() && + TLI.isOperationExpand(ISD::FMA, VT)) { + SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags); + return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags); + } + EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); @@ -47931,7 +47931,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); - // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results + // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results // with scalar comparisons. if (SDValue NotSrc = IsNOT(Src, DAG)) { SDLoc DL(N); @@ -47942,17 +47942,17 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, DAG.getConstant(NotMask, DL, VT)); } - // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk - // results with scalar comparisons. - if (Src.getOpcode() == X86ISD::PCMPGT && - ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) { - SDLoc DL(N); - APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); - return DAG.getNode(ISD::XOR, DL, VT, - DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)), - DAG.getConstant(NotMask, DL, VT)); - } - + // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk + // results with scalar comparisons. + if (Src.getOpcode() == X86ISD::PCMPGT && + ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) { + SDLoc DL(N); + APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); + return DAG.getNode(ISD::XOR, DL, VT, + DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)), + DAG.getConstant(NotMask, DL, VT)); + } + // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getAllOnesValue(NumBits)); @@ -47990,8 +47990,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL, Ops, Gather->getMemOperand(), - Gather->getIndexType(), - Gather->getExtensionType()); + Gather->getIndexType(), + Gather->getExtensionType()); } auto *Scatter = cast<MaskedScatterSDNode>(GorS); SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), @@ -47999,8 +47999,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL, Ops, Scatter->getMemOperand(), - Scatter->getIndexType(), - Scatter->isTruncatingStore()); + Scatter->getIndexType(), + Scatter->isTruncatingStore()); } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, @@ -48995,18 +48995,18 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); bool IsAdd = N->getOpcode() == ISD::ADD; - auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB; + auto HorizOpcode = IsAdd ? 
X86ISD::HADD : X86ISD::HSUB; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); SmallVector<int, 8> PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd, - PostShuffleMask)) { - auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops); + isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd, + PostShuffleMask)) { + auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops); }; SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); @@ -49071,11 +49071,11 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, if (!VT.isVector()) return SDValue(); - // PSUBUS is supported, starting from SSE2. + // PSUBUS is supported, starting from SSE2. EVT EltVT = VT.getVectorElementType(); - if (!(Subtarget.hasSSE2() && - (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 || - VT == MVT::v8i64 || VT == MVT::v16i32))) + if (!(Subtarget.hasSSE2() && + (EltVT == MVT::i8 || EltVT == MVT::i16 || VT == MVT::v8i32 || + VT == MVT::v8i64 || VT == MVT::v16i32))) return SDValue(); SDValue SubusLHS, SubusRHS; @@ -49111,9 +49111,9 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue MinLHS = Op1.getOperand(0).getOperand(0); SDValue MinRHS = Op1.getOperand(0).getOperand(1); EVT TruncVT = Op1.getOperand(0).getValueType(); - if (!(Subtarget.hasSSE2() && - (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 || - TruncVT == MVT::v16i32))) + if (!(Subtarget.hasSSE2() && + (TruncVT == MVT::v8i32 || TruncVT == MVT::v8i64 || + TruncVT == MVT::v16i32))) return SDValue(); SDValue OpToSaturate; if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && @@ -49151,7 +49151,7 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, // values, or first 48 bits for 64 bit values. KnownBits Known = DAG.computeKnownBits(SubusLHS); unsigned NumZeros = Known.countMinLeadingZeros(); - if (NumZeros < (VT.getScalarSizeInBits() - 16)) + if (NumZeros < (VT.getScalarSizeInBits() - 16)) return SDValue(); EVT ExtType = SubusLHS.getValueType(); @@ -49252,46 +49252,46 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); // Repeated subvectors. - if (IsSplat && - (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) { - // If this broadcast is inserted into both halves, use a larger broadcast. - if (Op0.getOpcode() == X86ISD::VBROADCAST) + if (IsSplat && + (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) { + // If this broadcast is inserted into both halves, use a larger broadcast. + if (Op0.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); - // If this scalar/subvector broadcast_load is inserted into both halves, use - // a larger broadcast_load. Update other uses to use an extracted subvector. - if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD || - Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { + // If this scalar/subvector broadcast_load is inserted into both halves, use + // a larger broadcast_load. Update other uses to use an extracted subvector. 
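[Editorial aside, not part of the diff] The PSUBUS formation in combineSubToSubus above rests on the identity that unsigned saturating subtraction equals a - umin(a, b), which is the kind of pattern the combine recognizes. A scalar sketch of that equivalence, exhaustively checked over a small range (helper names invented for the illustration):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Reference semantics of PSUBUS (unsigned saturating subtract) on one lane.
uint16_t subus(uint16_t A, uint16_t B) { return A > B ? uint16_t(A - B) : 0; }

// The equivalent non-saturating form: a - umin(a, b).
uint16_t subViaUMin(uint16_t A, uint16_t B) {
  return uint16_t(A - std::min(A, B));
}

int main() {
  for (uint32_t A = 0; A < 256; ++A)
    for (uint32_t B = 0; B < 256; ++B)
      assert(subus(uint16_t(A), uint16_t(B)) ==
             subViaUMin(uint16_t(A), uint16_t(B)));
  return 0;
}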
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD || + Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { auto *MemIntr = cast<MemIntrinsicSDNode>(Op0); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()}; - SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops, - MemIntr->getMemoryVT(), - MemIntr->getMemOperand()); + SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); DAG.ReplaceAllUsesOfValueWith( Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits())); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); return BcastLd; } - // If this is a simple subvector load repeated across multiple lanes, then - // broadcast the load. Update other uses to use an extracted subvector. - if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) { - if (Ld->isSimple() && !Ld->isNonTemporal() && - Ld->getExtensionType() == ISD::NON_EXTLOAD) { - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; - SDValue BcastLd = - DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, - Ld->getMemoryVT(), Ld->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith( - Op0, - extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits())); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1)); - return BcastLd; - } - } - + // If this is a simple subvector load repeated across multiple lanes, then + // broadcast the load. Update other uses to use an extracted subvector. + if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) { + if (Ld->isSimple() && !Ld->isNonTemporal() && + Ld->getExtensionType() == ISD::NON_EXTLOAD) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, + Ld->getMemoryVT(), Ld->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith( + Op0, + extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits())); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1)); + return BcastLd; + } + } + // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0)))) @@ -49369,38 +49369,38 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return DAG.getBitcast(VT, Res); } break; - case X86ISD::VPERMV3: - if (!IsSplat && NumOps == 2 && VT.is512BitVector()) { - MVT OpVT = Op0.getSimpleValueType(); - int NumSrcElts = OpVT.getVectorNumElements(); - SmallVector<int, 64> ConcatMask; - for (unsigned i = 0; i != NumOps; ++i) { - bool IsUnary; - SmallVector<int, 64> SubMask; - SmallVector<SDValue, 2> SubOps; - if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps, - SubMask, IsUnary)) - break; - for (int M : SubMask) { - if (0 <= M) { - M += M < NumSrcElts ? 
0 : NumSrcElts; - M += i * NumSrcElts; - } - ConcatMask.push_back(M); - } - } - if (ConcatMask.size() == (NumOps * NumSrcElts)) { - SDValue Src0 = concatSubVectors(Ops[0].getOperand(0), - Ops[1].getOperand(0), DAG, DL); - SDValue Src1 = concatSubVectors(Ops[0].getOperand(2), - Ops[1].getOperand(2), DAG, DL); - MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts); - SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true); - return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1); - } - } - break; + case X86ISD::VPERMV3: + if (!IsSplat && NumOps == 2 && VT.is512BitVector()) { + MVT OpVT = Op0.getSimpleValueType(); + int NumSrcElts = OpVT.getVectorNumElements(); + SmallVector<int, 64> ConcatMask; + for (unsigned i = 0; i != NumOps; ++i) { + bool IsUnary; + SmallVector<int, 64> SubMask; + SmallVector<SDValue, 2> SubOps; + if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps, + SubMask, IsUnary)) + break; + for (int M : SubMask) { + if (0 <= M) { + M += M < NumSrcElts ? 0 : NumSrcElts; + M += i * NumSrcElts; + } + ConcatMask.push_back(M); + } + } + if (ConcatMask.size() == (NumOps * NumSrcElts)) { + SDValue Src0 = concatSubVectors(Ops[0].getOperand(0), + Ops[1].getOperand(0), DAG, DL); + SDValue Src1 = concatSubVectors(Ops[0].getOperand(2), + Ops[1].getOperand(2), DAG, DL); + MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts); + SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true); + return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1); + } + } + break; case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: @@ -49433,33 +49433,33 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getOperand(1)); } break; - case ISD::AND: - case ISD::OR: - case ISD::XOR: - case X86ISD::ANDNP: - // TODO: Add 256-bit support. - if (!IsSplat && VT.is512BitVector()) { - SmallVector<SDValue, 2> LHS, RHS; - for (unsigned i = 0; i != NumOps; ++i) { - LHS.push_back(Ops[i].getOperand(0)); - RHS.push_back(Ops[i].getOperand(1)); - } - MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); - SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), - NumOps * SrcVT.getVectorNumElements()); - return DAG.getNode(Op0.getOpcode(), DL, VT, - DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS), - DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); - } - break; - case X86ISD::HADD: - case X86ISD::HSUB: - case X86ISD::FHADD: - case X86ISD::FHSUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case X86ISD::ANDNP: + // TODO: Add 256-bit support. 
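[Editorial aside, not part of the diff] The VPERMV3 case above concatenates two narrower two-source permutes into one wide permute by rebasing every mask index into the concatenated index space, via "M += M < NumSrcElts ? 0 : NumSrcElts; M += i * NumSrcElts". A stand-alone model of just that remapping, with N elements per original source (function name and lane counts chosen for the illustration):

#include <cassert>

// Rebase a mask index M of permute Op (0 = low half, 1 = high half), which
// picks from that permute's own two N-element sources, into the index space of
// one wide permute whose sources are Src0 = concat(op0.src0, op1.src0) and
// Src1 = concat(op0.src1, op1.src1), each 2*N elements wide.
int rebaseIndex(int M, unsigned Op, unsigned N) {
  M += (M < int(N)) ? 0 : int(N); // a source-1 index gains N: wide Src1 starts at 2*N
  M += int(Op * N);               // the high-half permute's lanes sit N further in
  return M;
}

int main() {
  const unsigned N = 4;
  // op1 picking element 2 of its first source lands at Src0[4 + 2]:
  assert(rebaseIndex(2, 1, N) == 6);
  // op1 picking element 1 of its second source (local index N + 1 = 5)
  // lands at Src1[4 + 1], i.e. wide index 2*N + 5 = 13:
  assert(rebaseIndex(5, 1, N) == 13);
  return 0;
}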
+ if (!IsSplat && VT.is512BitVector()) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); + SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), + NumOps * SrcVT.getVectorNumElements()); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); + } + break; + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: case X86ISD::PACKSS: case X86ISD::PACKUS: - if (!IsSplat && VT.is256BitVector() && - (VT.isFloatingPoint() || Subtarget.hasInt256())) { + if (!IsSplat && VT.is256BitVector() && + (VT.isFloatingPoint() || Subtarget.hasInt256())) { SmallVector<SDValue, 2> LHS, RHS; for (unsigned i = 0; i != NumOps; ++i) { LHS.push_back(Ops[i].getOperand(0)); @@ -49494,20 +49494,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } - // Fold subvector loads into one. - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) { - bool Fast; - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *FirstLd->getMemOperand(), &Fast) && - Fast) { - if (SDValue Ld = - EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) - return Ld; - } - } - + // Fold subvector loads into one. + // If needed, look through bitcasts to get to the load. + if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) { + bool Fast; + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + *FirstLd->getMemOperand(), &Fast) && + Fast) { + if (SDValue Ld = + EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) + return Ld; + } + } + return SDValue(); } @@ -49579,8 +49579,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ins = SubVec.getOperand(0); if (isNullConstant(Ins.getOperand(2)) && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && - Ins.getOperand(1).getValueSizeInBits().getFixedSize() <= - SubVecVT.getFixedSizeInBits()) + Ins.getOperand(1).getValueSizeInBits().getFixedSize() <= + SubVecVT.getFixedSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); @@ -49733,14 +49733,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, unsigned IdxVal = N->getConstantOperandVal(1); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); - unsigned SizeInBits = VT.getSizeInBits(); - unsigned InSizeInBits = InVecVT.getSizeInBits(); + unsigned SizeInBits = VT.getSizeInBits(); + unsigned InSizeInBits = InVecVT.getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) && - InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { - auto isConcatenatedNot = [](SDValue V) { + InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { + auto isConcatenatedNot = [](SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) return false; @@ -49783,7 +49783,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && InVec.hasOneUse() && 
isNullConstant(InVec.getOperand(2)) && ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && - InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { + InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { SDLoc DL(N); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), @@ -49795,20 +49795,20 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // SimplifyDemandedVectorElts do more simplifications. if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST || InVec.getOpcode() == X86ISD::VBROADCAST_LOAD)) - return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); + return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); - // If we're extracting a broadcasted subvector, just use the lowest subvector. - if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT) - return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); + // If we're extracting a broadcasted subvector, just use the lowest subvector. + if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && + cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT) + return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); // Attempt to extract from the source of a shuffle vector. - if ((InSizeInBits % SizeInBits) == 0 && + if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % VT.getVectorNumElements()) == 0) { SmallVector<int, 32> ShuffleMask; SmallVector<int, 32> ScaledMask; SmallVector<SDValue, 2> ShuffleInputs; - unsigned NumSubVecs = InSizeInBits / SizeInBits; + unsigned NumSubVecs = InSizeInBits / SizeInBits; // Decode the shuffle mask and scale it so its shuffling subvectors. if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { @@ -49818,18 +49818,18 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (ScaledMask[SubVecIdx] == SM_SentinelZero) return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; - if (Src.getValueSizeInBits() == InSizeInBits) { + if (Src.getValueSizeInBits() == InSizeInBits) { unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, - SDLoc(N), SizeInBits); + SDLoc(N), SizeInBits); } } } // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. - unsigned InOpcode = InVec.getOpcode(); + unsigned InOpcode = InVec.getOpcode(); if (IdxVal == 0 && InVec.hasOneUse()) { if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). 
@@ -49854,14 +49854,14 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || InOpcode == ISD::SIGN_EXTEND || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && - (SizeInBits == 128 || SizeInBits == 256) && - InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) { - SDLoc DL(N); - SDValue Ext = InVec.getOperand(0); - if (Ext.getValueSizeInBits() > SizeInBits) - Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits); + (SizeInBits == 128 || SizeInBits == 256) && + InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) { + SDLoc DL(N); + SDValue Ext = InVec.getOperand(0); + if (Ext.getValueSizeInBits() > SizeInBits) + Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits); unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode); - return DAG.getNode(ExtOp, DL, VT, Ext); + return DAG.getNode(ExtOp, DL, VT, Ext); } if (InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && @@ -49873,27 +49873,27 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } - if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && - (VT.is128BitVector() || VT.is256BitVector())) { - SDLoc DL(N); - SDValue InVecSrc = InVec.getOperand(0); - unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits; - SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); - return DAG.getNode(InOpcode, DL, VT, Ext); - } - } - - // Always split vXi64 logical shifts where we're extracting the upper 32-bits - // as this is very likely to fold into a shuffle/truncation. - if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) && - InVecVT.getScalarSizeInBits() == 64 && - InVec.getConstantOperandAPInt(1) == 32) { - SDLoc DL(N); - SDValue Ext = - extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); - return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1)); - } - + if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && + (VT.is128BitVector() || VT.is256BitVector())) { + SDLoc DL(N); + SDValue InVecSrc = InVec.getOperand(0); + unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits; + SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext); + } + } + + // Always split vXi64 logical shifts where we're extracting the upper 32-bits + // as this is very likely to fold into a shuffle/truncation. + if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) && + InVecVT.getScalarSizeInBits() == 64 && + InVec.getConstantOperandAPInt(1) == 32) { + SDLoc DL(N); + SDValue Ext = + extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); + return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1)); + } + return SDValue(); } @@ -49975,7 +49975,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, // If the input is an extend_invec and the SimplifyDemandedBits call didn't // convert it to any_extend_invec, due to the LegalOperations check, do the // conversion directly to a vector shuffle manually. This exposes combine - // opportunities missed by combineEXTEND_VECTOR_INREG not calling + // opportunities missed by combineEXTEND_VECTOR_INREG not calling // combineX86ShufflesRecursively on SSE4.1 targets. // FIXME: This is basically a hack around several other issues related to // ANY_EXTEND_VECTOR_INREG. 
@@ -50003,13 +50003,13 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); - unsigned Opcode = N->getOpcode(); - unsigned InOpcode = In.getOpcode(); + unsigned Opcode = N->getOpcode(); + unsigned InOpcode = In.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Try to merge vector loads and extend_inreg to an extload. @@ -50018,7 +50018,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, auto *Ld = cast<LoadSDNode>(In); if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); - ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG + ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = @@ -50026,7 +50026,7 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), + Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; @@ -50034,23 +50034,23 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, } } - // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X). - if (Opcode == InOpcode) - return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0)); - - // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0)) - // -> EXTEND_VECTOR_INREG(X). - // TODO: Handle non-zero subvector indices. - if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 && - In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) && - In.getOperand(0).getOperand(0).getValueSizeInBits() == - In.getValueSizeInBits()) - return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0)); - + // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X). + if (Opcode == InOpcode) + return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0)); + + // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0)) + // -> EXTEND_VECTOR_INREG(X). + // TODO: Handle non-zero subvector indices. + if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 && + In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) && + In.getOperand(0).getOperand(0).getValueSizeInBits() == + In.getValueSizeInBits()) + return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0)); + // Attempt to combine as a shuffle. - // TODO: General ZERO_EXTEND_VECTOR_INREG support. - if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG || - (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) { + // TODO: General ZERO_EXTEND_VECTOR_INREG support. 
+ if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG || + (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) { SDValue Op(N, 0); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) @@ -50171,15 +50171,15 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); } -// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract -// from. Limit this to cases where the loads have the same input chain and the -// output chains are unused. This avoids any memory ordering issues. -static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { - assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD || - N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) && - "Unknown broadcast load type"); - +// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract +// from. Limit this to cases where the loads have the same input chain and the +// output chains are unused. This avoids any memory ordering issues. +static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD || + N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) && + "Unknown broadcast load type"); + // Only do this if the chain result is unused. if (N->hasAnyUseOfValue(1)) return SDValue(); @@ -50194,13 +50194,13 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, // Look at other users of our base pointer and try to find a wider broadcast. // The input chain and the size of the memory VT must match. for (SDNode *User : Ptr->uses()) - if (User != N && User->getOpcode() == N->getOpcode() && + if (User != N && User->getOpcode() == N->getOpcode() && cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && cast<MemIntrinsicSDNode>(User)->getChain() == Chain && cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() && !User->hasAnyUseOfValue(1) && - User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) { + User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) { SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), VT.getSizeInBits()); Extract = DAG.getBitcast(VT, Extract); @@ -50271,17 +50271,17 @@ static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { - unsigned NumBits = N->getSimpleValueType(0).getSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(SDValue(N, 0), - APInt::getAllOnesValue(NumBits), DCI)) - return SDValue(N, 0); - - return SDValue(); -} - +static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + unsigned NumBits = N->getSimpleValueType(0).getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), + APInt::getAllOnesValue(NumBits), DCI)) + return SDValue(N, 0); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -50318,8 +50318,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); 
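[Editorial aside, not part of the diff] combinePDEP above simply runs SimplifyDemandedBits over the node. The reason that is fruitful: PDEP can only set result bits at positions where its mask is set, and it only consumes the low popcount(mask) bits of its source, so much of both values is often provably irrelevant. A scalar reference model of the instruction itself (not of the combine; function name invented here):

#include <cassert>
#include <cstdint>

// Reference semantics of PDEP: deposit the low bits of Src, in order, into the
// positions of the set bits of Mask; every other result bit is zero.
uint32_t pdep32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t Bit = 1; Mask != 0; Bit <<= 1) {
    if (Mask & Bit) {
      if (Src & 1)
        Result |= Bit;
      Src >>= 1;
      Mask &= ~Bit;
    }
  }
  return Result;
}

int main() {
  // Deposit 0b101 into the set positions (bits 1, 3, 4) of 0b11010.
  assert(pdep32(0b101, 0b11010) == 0b10010);
  // No result bit can fall outside the mask.
  assert((pdep32(0xFFFFFFFFu, 0x0F0Fu) & ~0x0F0Fu) == 0);
  return 0;
}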
- case X86ISD::BEXTR: - case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget); + case X86ISD::BEXTR: + case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); @@ -50364,17 +50364,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: - case ISD::ZERO_EXTEND_VECTOR_INREG: - return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget); + case ISD::ZERO_EXTEND_VECTOR_INREG: + return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); - case X86ISD::HADD: - case X86ISD::HSUB: - case X86ISD::FHADD: - case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget); + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget); case X86ISD::VSHL: case X86ISD::VSRA: case X86ISD::VSRL: @@ -50451,10 +50451,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); - case X86ISD::VBROADCAST_LOAD: - case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); + case X86ISD::VBROADCAST_LOAD: + case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); - case X86ISD::PDEP: return combinePDEP(N, DAG, DCI); + case X86ISD::PDEP: return combinePDEP(N, DAG, DCI); } return SDValue(); @@ -50743,7 +50743,7 @@ static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { .Case("{@ccnl}", X86::COND_GE) .Case("{@ccnle}", X86::COND_G) .Case("{@ccno}", X86::COND_NO) - .Case("{@ccnp}", X86::COND_NP) + .Case("{@ccnp}", X86::COND_NP) .Case("{@ccns}", X86::COND_NS) .Case("{@cco}", X86::COND_O) .Case("{@ccp}", X86::COND_P) @@ -50979,8 +50979,8 @@ LowerXConstraint(EVT ConstraintVT) const { // Lower @cc targets via setcc. SDValue X86TargetLowering::LowerAsmOutputForConstraint( - SDValue &Chain, SDValue &Flag, const SDLoc &DL, - const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { + SDValue &Chain, SDValue &Flag, const SDLoc &DL, + const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); if (Cond == X86::COND_INVALID) return SDValue(); @@ -51416,26 +51416,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Not found as a standard register? if (!Res.second) { - // Only match x87 registers if the VT is one SelectionDAGBuilder can convert - // to/from f80. 
- if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) { - // Map st(0) -> st(7) -> ST0 - if (Constraint.size() == 7 && Constraint[0] == '{' && - tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && - Constraint[3] == '(' && - (Constraint[4] >= '0' && Constraint[4] <= '7') && - Constraint[5] == ')' && Constraint[6] == '}') { - // st(7) is not allocatable and thus not a member of RFP80. Return - // singleton class in cases where we have a reference to it. - if (Constraint[4] == '7') - return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); - return std::make_pair(X86::FP0 + Constraint[4] - '0', - &X86::RFP80RegClass); - } - - // GCC allows "st(0)" to be called just plain "st". - if (StringRef("{st}").equals_lower(Constraint)) - return std::make_pair(X86::FP0, &X86::RFP80RegClass); + // Only match x87 registers if the VT is one SelectionDAGBuilder can convert + // to/from f80. + if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) { + // Map st(0) -> st(7) -> ST0 + if (Constraint.size() == 7 && Constraint[0] == '{' && + tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && + Constraint[3] == '(' && + (Constraint[4] >= '0' && Constraint[4] <= '7') && + Constraint[5] == ')' && Constraint[6] == '}') { + // st(7) is not allocatable and thus not a member of RFP80. Return + // singleton class in cases where we have a reference to it. + if (Constraint[4] == '7') + return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); + return std::make_pair(X86::FP0 + Constraint[4] - '0', + &X86::RFP80RegClass); + } + + // GCC allows "st(0)" to be called just plain "st". + if (StringRef("{st}").equals_lower(Constraint)) + return std::make_pair(X86::FP0, &X86::RFP80RegClass); } // flags -> EFLAGS @@ -51443,8 +51443,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); // dirflag -> DF - // Only allow for clobber. - if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other) + // Only allow for clobber. + if (StringRef("{dirflag}").equals_lower(Constraint) && VT == MVT::Other) return std::make_pair(X86::DF, &X86::DFCCRRegClass); // fpsr -> FPSW @@ -51718,10 +51718,10 @@ X86TargetLowering::getStackProbeSize(MachineFunction &MF) const { .getAsInteger(0, StackProbeSize); return StackProbeSize; } - -Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { - if (ML->isInnermost() && - ExperimentalPrefInnermostLoopAlignment.getNumOccurrences()) - return Align(1ULL << ExperimentalPrefInnermostLoopAlignment); - return TargetLowering::getPrefLoopAlignment(); -} + +Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + if (ML->isInnermost() && + ExperimentalPrefInnermostLoopAlignment.getNumOccurrences()) + return Align(1ULL << ExperimentalPrefInnermostLoopAlignment); + return TargetLowering::getPrefLoopAlignment(); +} diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h index 76c83b7df9..8c2249a18f 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86ISelLowering.h @@ -384,10 +384,10 @@ namespace llvm { /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, - // Vector mask comparison generating mask bits for FP values. - CMPMM, - // Vector mask comparison with SAE for FP values. - CMPMM_SAE, + // Vector mask comparison generating mask bits for FP values. 
+ CMPMM, + // Vector mask comparison with SAE for FP values. + CMPMM_SAE, // Arithmetic operations with FLAGS results. ADD, @@ -402,7 +402,7 @@ namespace llvm { // Bit field extract. BEXTR, - BEXTRI, + BEXTRI, // Zero High Bits Starting with Specified Bit Position. BZHI, @@ -709,9 +709,9 @@ namespace llvm { // For avx512-vp2intersect VP2INTERSECT, - // User level interrupts - testui - TESTUI, - + // User level interrupts - testui + TESTUI, + /// X86 strict FP compare instructions. STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPS, @@ -751,9 +751,9 @@ namespace llvm { STRICT_CVTPS2PH, STRICT_CVTPH2PS, - // WARNING: Only add nodes here if they are stric FP nodes. Non-memory and - // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE. - + // WARNING: Only add nodes here if they are stric FP nodes. Non-memory and + // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE. + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, @@ -774,12 +774,12 @@ namespace llvm { // extract_vector_elt, store. VEXTRACT_STORE, - // scalar broadcast from memory. + // scalar broadcast from memory. VBROADCAST_LOAD, - // subvector broadcast from memory. - SUBV_BROADCAST_LOAD, - + // subvector broadcast from memory. + SUBV_BROADCAST_LOAD, + // Store FP control world into i16 memory. FNSTCW16m, @@ -815,10 +815,10 @@ namespace llvm { /// specifies the type to store as. FST, - /// These instructions grab the address of the next argument + /// These instructions grab the address of the next argument /// from a va_list. (reads and modifies the va_list in memory) VAARG_64, - VAARG_X32, + VAARG_X32, // Vector truncating store with unsigned/signed saturation VTRUNCSTOREUS, @@ -831,16 +831,16 @@ namespace llvm { MGATHER, MSCATTER, - // Key locker nodes that produce flags. - AESENC128KL, - AESDEC128KL, - AESENC256KL, - AESDEC256KL, - AESENCWIDE128KL, - AESDECWIDE128KL, - AESENCWIDE256KL, - AESDECWIDE256KL, - + // Key locker nodes that produce flags. + AESENC128KL, + AESDEC128KL, + AESENC256KL, + AESDEC256KL, + AESENCWIDE128KL, + AESDECWIDE128KL, + AESENCWIDE256KL, + AESDECWIDE256KL, + // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all // opcodes will be thought as target memory ops! @@ -855,7 +855,7 @@ namespace llvm { /// Returns true of the given offset can be /// fit into displacement field of the instruction. bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, - bool hasSymbolicDisplacement); + bool hasSymbolicDisplacement); /// Determines whether the callee is required to pop its /// own arguments. Callee pop is necessary to support tail calls. @@ -1128,8 +1128,8 @@ namespace llvm { } /// Handle Lowering flag assembly outputs. 
- SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, - const SDLoc &DL, + SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, + const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override; @@ -1408,8 +1408,8 @@ namespace llvm { SDValue Addr, SelectionDAG &DAG) const override; - Align getPrefLoopAlignment(MachineLoop *ML) const override; - + Align getPrefLoopAlignment(MachineLoop *ML) const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -1501,7 +1501,7 @@ namespace llvm { SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; @@ -1583,7 +1583,7 @@ namespace llvm { // Utility function to emit the low-level va_arg code for X86-64. MachineBasicBlock * - EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const; + EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const; /// Utility function to emit the xmm reg save portion of va_start. MachineBasicBlock * @@ -1699,7 +1699,7 @@ namespace llvm { }; /// Generate unpacklo/unpackhi shuffle mask. - void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo, + void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo, bool Unary); /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation diff --git a/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp index 85410c54a4..bba0e8d30d 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -28,7 +28,7 @@ using namespace llvm; #define DEBUG_TYPE "x86-indirect-branch-tracking" -cl::opt<bool> IndirectBranchTracking( +cl::opt<bool> IndirectBranchTracking( "x86-indirect-branch-tracking", cl::init(false), cl::Hidden, cl::desc("Enable X86 indirect branch tracking pass.")); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp index 004e6fa5eb..1c25934ded 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InsertPrefetch.cpp @@ -214,10 +214,10 @@ bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true); MachineInstrBuilder MIB(MF, PFetch); - static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && - X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && - X86::AddrSegmentReg == 4, - "Unexpected change in X86 operand offset order."); + static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && + X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && + X86::AddrSegmentReg == 4, + "Unexpected change in X86 operand offset order."); // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc. 
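[Editorial aside, not part of the diff] The static_assert above pins the canonical layout of an X86 memory operand: base, scale, index, displacement, segment at operand offsets 0 through 4. What those components mean, as a tiny address-computation sketch (segment handling omitted; function name invented for the illustration):

#include <cassert>
#include <cstdint>

// An x86 memory operand addresses: base + index * scale + displacement.
// (The fifth component, the segment register, is ignored here.)
uint64_t effectiveAddress(uint64_t Base, uint64_t Scale, uint64_t Index,
                          int64_t Disp) {
  return Base + Index * Scale + uint64_t(Disp);
}

int main() {
  // e.g. the AT&T operand "8(%rdi,%rcx,4)" with %rdi = 0x1000 and %rcx = 3:
  assert(effectiveAddress(0x1000, 4, 3, 8) == 0x1014);
  return 0;
}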
// FIXME(mtrofin): consider adding a: diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp index 56d2709f59..fe9bcfd0fd 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InsertWait.cpp @@ -115,7 +115,7 @@ bool WaitInsert::runOnMachineFunction(MachineFunction &MF) { return false; const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - const X86InstrInfo *TII = ST.getInstrInfo(); + const X86InstrInfo *TII = ST.getInstrInfo(); bool Changed = false; for (MachineBasicBlock &MBB : MF) { diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp index c4150ed528..c6388617c6 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -1,2017 +1,2017 @@ -//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements a TargetTransformInfo analysis pass specific to the -/// X86 target machine. It uses the target's detailed information to provide -/// more precise answers to certain TTI queries, while letting the target -/// independent and default TTI implementations handle the rest. -/// -//===----------------------------------------------------------------------===// - -#include "X86TargetTransformInfo.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsX86.h" -#include "llvm/Support/KnownBits.h" -#include "llvm/Transforms/InstCombine/InstCombiner.h" - -using namespace llvm; - -#define DEBUG_TYPE "x86tti" - -/// Return a constant boolean vector that has true elements in all positions -/// where the input constant data vector has an element with the sign bit set. -static Constant *getNegativeIsTrueBoolVec(Constant *V) { - VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); - V = ConstantExpr::getBitCast(V, IntTy); - V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy), - V); - return V; -} - -/// Convert the x86 XMM integer vector mask to a vector of bools based on -/// each element's most significant bit (the sign bit). -static Value *getBoolVecFromMask(Value *Mask) { - // Fold Constant Mask. - if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) - return getNegativeIsTrueBoolVec(ConstantMask); - - // Mask was extended from a boolean vector. - Value *ExtMask; - if (PatternMatch::match( - Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && - ExtMask->getType()->isIntOrIntVectorTy(1)) - return ExtMask; - - return nullptr; -} - -// TODO: If the x86 backend knew how to convert a bool vector mask back to an -// XMM register mask efficiently, we could transform all x86 masked intrinsics -// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. -static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { - Value *Ptr = II.getOperand(0); - Value *Mask = II.getOperand(1); - Constant *ZeroVec = Constant::getNullValue(II.getType()); - - // Zero Mask - masked load instruction creates a zero vector. 
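[Editorial aside, not part of the diff] getNegativeIsTrueBoolVec/getBoolVecFromMask above encode the x86 masked-intrinsic convention that only each lane's most significant bit matters: a lane is "true" exactly when the element is negative as a signed integer. A scalar sketch of that convention (plain arrays, invented helper name; illustration only):

#include <array>
#include <cassert>
#include <cstdint>

// x86 masked load/store intrinsics test only the sign bit of each lane, so
// interpreting an i32 mask lane as a bool means "lane < 0".
std::array<bool, 4> maskToBools(const std::array<int32_t, 4> &Mask) {
  std::array<bool, 4> R{};
  for (size_t I = 0; I != 4; ++I)
    R[I] = (Mask[I] & 0x80000000u) != 0; // equivalently: Mask[I] < 0
  return R;
}

int main() {
  std::array<int32_t, 4> Mask = {-1, 0, INT32_MIN, 0x7FFFFFFF};
  std::array<bool, 4> B = maskToBools(Mask);
  assert(B[0] && !B[1] && B[2] && !B[3]);
  return 0;
}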
- if (isa<ConstantAggregateZero>(Mask)) - return IC.replaceInstUsesWith(II, ZeroVec); - - // The mask is constant or extended from a bool vector. Convert this x86 - // intrinsic to the LLVM intrinsic to allow target-independent optimizations. - if (Value *BoolMask = getBoolVecFromMask(Mask)) { - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - // The pass-through vector for an x86 masked load is a zero vector. - CallInst *NewMaskedLoad = - IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); - return IC.replaceInstUsesWith(II, NewMaskedLoad); - } - - return nullptr; -} - -// TODO: If the x86 backend knew how to convert a bool vector mask back to an -// XMM register mask efficiently, we could transform all x86 masked intrinsics -// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. -static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { - Value *Ptr = II.getOperand(0); - Value *Mask = II.getOperand(1); - Value *Vec = II.getOperand(2); - - // Zero Mask - this masked store instruction does nothing. - if (isa<ConstantAggregateZero>(Mask)) { - IC.eraseInstFromFunction(II); - return true; - } - - // The SSE2 version is too weird (eg, unaligned but non-temporal) to do - // anything else at this level. - if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) - return false; - - // The mask is constant or extended from a bool vector. Convert this x86 - // intrinsic to the LLVM intrinsic to allow target-independent optimizations. - if (Value *BoolMask = getBoolVecFromMask(Mask)) { - unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); - - // 'Replace uses' doesn't work for stores. Erase the original masked store. 
- IC.eraseInstFromFunction(II); - return true; - } - - return false; -} - -static Value *simplifyX86immShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - bool IsImm = false; - - switch (II.getIntrinsicID()) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast<FixedVectorType>(Vec->getType()); - auto SVT = VT->getElementType(); - auto AmtVT = Amt->getType(); - unsigned VWidth = VT->getNumElements(); - unsigned BitWidth = SVT->getPrimitiveSizeInBits(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. If its guaranteed to be out of range, logical shifts combine - // to zero and arithmetic shifts are clamped to (BitWidth - 1). 
- if (IsImm) { - assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); - KnownBits KnownAmtBits = - llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); - if (KnownAmtBits.getMaxValue().ult(BitWidth)) { - Amt = Builder.CreateZExtOrTrunc(Amt, SVT); - Amt = Builder.CreateVectorSplat(VWidth, Amt); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - if (KnownAmtBits.getMinValue().uge(BitWidth)) { - if (LogicalShift) - return ConstantAggregateZero::get(VT); - Amt = ConstantInt::get(SVT, BitWidth - 1); - return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); - } - } else { - // Ensure the first element has an in-range value and the rest of the - // elements in the bottom 64 bits are zero. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast<VectorType>(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); - APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); - APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); - KnownBits KnownLowerBits = llvm::computeKnownBits( - Amt, DemandedLower, II.getModule()->getDataLayout()); - KnownBits KnownUpperBits = llvm::computeKnownBits( - Amt, DemandedUpper, II.getModule()->getDataLayout()); - if (KnownLowerBits.getMaxValue().ult(BitWidth) && - (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { - SmallVector<int, 16> ZeroSplat(VWidth, 0); - Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - } - - // Simplify if count is constant vector. - auto CDV = dyn_cast<ConstantDataVector>(Amt); - if (!CDV) - return nullptr; - - // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector - // operand to compute the shift amount. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast<VectorType>(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - - // Concatenate the sub-elements to create the 64-bit value. - APInt Count(64, 0); - for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { - unsigned SubEltIdx = (NumSubElts - 1) - i; - auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); - Count <<= BitWidth; - Count |= SubElt->getValue().zextOrTrunc(64); - } - - // If shift-by-zero then just return the original value. - if (Count.isNullValue()) - return Vec; - - // Handle cases when Shift >= BitWidth. - if (Count.uge(BitWidth)) { - // If LogicalShift - just return zero. - if (LogicalShift) - return ConstantAggregateZero::get(VT); - - // If ArithmeticShift - clamp Shift to (BitWidth - 1). - Count = APInt(64, BitWidth - 1); - } - - // Get a constant vector of the same type as the first operand. - auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); - auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. -// Unlike the generic IR shifts, the intrinsics have defined behaviour for out -// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
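Editorial aside, not part of the patch (the definition of simplifyX86varShift that the comment above documents continues in the hunk just below): a minimal standalone C++ sketch of the per-lane semantics being relied on, using illustrative names of my own (refSrlv, refSllv, refSrav). For these intrinsics an out-of-range amount is well defined: logical shifts produce zero, and arithmetic shifts behave as if the amount were clamped to BitWidth - 1, i.e. a splat of the sign bit.

#include <cstdint>
#include <cstdio>

// Reference model of one 32-bit lane (names are illustrative only).
static uint32_t refSrlv(uint32_t Elt, uint32_t Amt) {
  return Amt >= 32 ? 0u : Elt >> Amt;      // PSRLV lane: logical, out of range -> 0
}
static uint32_t refSllv(uint32_t Elt, uint32_t Amt) {
  return Amt >= 32 ? 0u : Elt << Amt;      // PSLLV lane: logical, out of range -> 0
}
static int32_t refSrav(int32_t Elt, uint32_t Amt) {
  // Clamp to 31, splatting the sign bit (assumes the usual arithmetic shift on signed int).
  return Elt >> (Amt >= 32 ? 31 : Amt);
}

int main() {
  std::printf("%u %u %d\n", refSrlv(0x80000000u, 33), refSllv(1u, 40),
              refSrav(INT32_MIN, 99)); // prints: 0 0 -1
  return 0;
}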
-static Value *simplifyX86varShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - - switch (II.getIntrinsicID()) { - default: - llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast<FixedVectorType>(II.getType()); - auto SVT = VT->getElementType(); - int NumElts = VT->getNumElements(); - int BitWidth = SVT->getIntegerBitWidth(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. - APInt UpperBits = - APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); - if (llvm::MaskedValueIsZero(Amt, UpperBits, - II.getModule()->getDataLayout())) { - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - - // Simplify if all shift amounts are constant/undef. - auto *CShift = dyn_cast<Constant>(Amt); - if (!CShift) - return nullptr; - - // Collect each element's shift amount. - // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. - bool AnyOutOfRange = false; - SmallVector<int, 8> ShiftAmts; - for (int I = 0; I < NumElts; ++I) { - auto *CElt = CShift->getAggregateElement(I); - if (isa_and_nonnull<UndefValue>(CElt)) { - ShiftAmts.push_back(-1); - continue; - } - - auto *COp = dyn_cast_or_null<ConstantInt>(CElt); - if (!COp) - return nullptr; - - // Handle out of range shifts. - // If LogicalShift - set to BitWidth (special case). - // If ArithmeticShift - set to (BitWidth - 1) (sign splat). - APInt ShiftVal = COp->getValue(); - if (ShiftVal.uge(BitWidth)) { - AnyOutOfRange = LogicalShift; - ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); - continue; - } - - ShiftAmts.push_back((int)ShiftVal.getZExtValue()); - } - - // If all elements out of range or UNDEF, return vector of zeros/undefs. - // ArithmeticShift should only hit this if they are all UNDEF. 
- auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; - if (llvm::all_of(ShiftAmts, OutOfRange)) { - SmallVector<Constant *, 8> ConstantVec; - for (int Idx : ShiftAmts) { - if (Idx < 0) { - ConstantVec.push_back(UndefValue::get(SVT)); - } else { - assert(LogicalShift && "Logical shift expected"); - ConstantVec.push_back(ConstantInt::getNullValue(SVT)); - } - } - return ConstantVector::get(ConstantVec); - } - - // We can't handle only some out of range values with generic logical shifts. - if (AnyOutOfRange) - return nullptr; - - // Build the shift amount constant vector. - SmallVector<Constant *, 8> ShiftVecAmts; - for (int Idx : ShiftAmts) { - if (Idx < 0) - ShiftVecAmts.push_back(UndefValue::get(SVT)); - else - ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); - } - auto ShiftVec = ConstantVector::get(ShiftVecAmts); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -static Value *simplifyX86pack(IntrinsicInst &II, - InstCombiner::BuilderTy &Builder, bool IsSigned) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Type *ResTy = II.getType(); - - // Fast all undef handling. - if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) - return UndefValue::get(ResTy); - - auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); - unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; - unsigned NumSrcElts = ArgTy->getNumElements(); - assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && - "Unexpected packing types"); - - unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; - unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); - unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); - assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && - "Unexpected packing types"); - - // Constant folding. - if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) - return nullptr; - - // Clamp Values - signed/unsigned both use signed clamp values, but they - // differ on the min/max values. - APInt MinValue, MaxValue; - if (IsSigned) { - // PACKSS: Truncate signed value with signed saturation. - // Source values less than dst minint are saturated to minint. - // Source values greater than dst maxint are saturated to maxint. - MinValue = - APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - MaxValue = - APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - } else { - // PACKUS: Truncate signed value with unsigned saturation. - // Source values less than zero are saturated to zero. - // Source values greater than dst maxuint are saturated to maxuint. - MinValue = APInt::getNullValue(SrcScalarSizeInBits); - MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); - } - - auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); - auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); - - // Shuffle clamped args together at the lane level. 
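Editorial aside, not part of the patch (the lane-level shuffle that the last comment introduces continues in the hunk just below): a small standalone C++ model of the saturation that the two select/clamp pairs above implement, shown for the i32 to i16 case; packssdwLane and packusdwLane are illustrative names of my own.

#include <cstdint>
#include <cstdio>

// PACKSSDW clamps each i32 to the signed i16 range, PACKUSDW to the unsigned
// i16 range; both then truncate.
static int16_t packssdwLane(int32_t V) {
  if (V < INT16_MIN) V = INT16_MIN;
  if (V > INT16_MAX) V = INT16_MAX;
  return static_cast<int16_t>(V);
}
static uint16_t packusdwLane(int32_t V) {
  if (V < 0) V = 0;
  if (V > UINT16_MAX) V = UINT16_MAX;
  return static_cast<uint16_t>(V);
}

int main() {
  std::printf("%d %d %d %d\n", packssdwLane(70000), packssdwLane(-70000),
              (int)packusdwLane(70000), (int)packusdwLane(-5)); // prints: 32767 -32768 65535 0
  return 0;
}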
- SmallVector<int, 32> PackMask; - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); - } - auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); - - // Truncate to dst size. - return Builder.CreateTrunc(Shuffle, ResTy); -} - -static Value *simplifyX86movmsk(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *Arg = II.getArgOperand(0); - Type *ResTy = II.getType(); - - // movmsk(undef) -> zero as we must ensure the upper bits are zero. - if (isa<UndefValue>(Arg)) - return Constant::getNullValue(ResTy); - - auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) - return nullptr; - - // Expand MOVMSK to compare/bitcast/zext: - // e.g. PMOVMSKB(v16i8 x): - // %cmp = icmp slt <16 x i8> %x, zeroinitializer - // %int = bitcast <16 x i1> %cmp to i16 - // %res = zext i16 %int to i32 - unsigned NumElts = ArgTy->getNumElements(); - Type *IntegerVecTy = VectorType::getInteger(ArgTy); - Type *IntegerTy = Builder.getIntNTy(NumElts); - - Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); - Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); - Res = Builder.CreateBitCast(Res, IntegerTy); - Res = Builder.CreateZExtOrTrunc(Res, ResTy); - return Res; -} - -static Value *simplifyX86addcarry(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *CarryIn = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - Value *Op2 = II.getArgOperand(2); - Type *RetTy = II.getType(); - Type *OpTy = Op1->getType(); - assert(RetTy->getStructElementType(0)->isIntegerTy(8) && - RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && - "Unexpected types for x86 addcarry"); - - // If carry-in is zero, this is just an unsigned add with overflow. - if (match(CarryIn, PatternMatch::m_ZeroInt())) { - Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, - {Op1, Op2}); - // The types have to be adjusted to match the x86 call types. - Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); - Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), - Builder.getInt8Ty()); - Value *Res = UndefValue::get(RetTy); - Res = Builder.CreateInsertValue(Res, UAddOV, 0); - return Builder.CreateInsertValue(Res, UAddResult, 1); - } - - return nullptr; -} - -static Value *simplifyX86insertps(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); - if (!CInt) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - - // The immediate permute control byte looks like this: - // [3:0] - zero mask for each 32-bit lane - // [5:4] - select one 32-bit destination lane - // [7:6] - select one 32-bit source lane - - uint8_t Imm = CInt->getZExtValue(); - uint8_t ZMask = Imm & 0xf; - uint8_t DestLane = (Imm >> 4) & 0x3; - uint8_t SourceLane = (Imm >> 6) & 0x3; - - ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); - - // If all zero mask bits are set, this was just a weird way to - // generate a zero vector. - if (ZMask == 0xf) - return ZeroVector; - - // Initialize by passing all of the first source bits through. 
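Editorial aside, not part of the patch (the shuffle-mask initialization that the last comment introduces continues in the hunk just below): a tiny worked decode of the INSERTPS control byte layout described above; the example value 0x9A and the variable names are mine, chosen only for illustration.

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t Imm = 0x9A;                     // 0b10'01'1010, an arbitrary example
  uint8_t ZMask = Imm & 0xf;              // [3:0] = 0b1010: zero lanes 1 and 3
  uint8_t DestLane = (Imm >> 4) & 0x3;    // [5:4] = 1: write destination lane 1
  uint8_t SourceLane = (Imm >> 6) & 0x3;  // [7:6] = 2: read source lane 2
  std::printf("zmask=0x%X dest=%u src=%u\n", (unsigned)ZMask, (unsigned)DestLane,
              (unsigned)SourceLane); // prints: zmask=0xA dest=1 src=2
  return 0;
}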
- int ShuffleMask[4] = {0, 1, 2, 3}; - - // We may replace the second operand with the zero vector. - Value *V1 = II.getArgOperand(1); - - if (ZMask) { - // If the zero mask is being used with a single input or the zero mask - // overrides the destination lane, this is a shuffle with the zero vector. - if ((II.getArgOperand(0) == II.getArgOperand(1)) || - (ZMask & (1 << DestLane))) { - V1 = ZeroVector; - // We may still move 32-bits of the first source vector from one lane - // to another. - ShuffleMask[DestLane] = SourceLane; - // The zero mask may override the previous insert operation. - for (unsigned i = 0; i < 4; ++i) - if ((ZMask >> i) & 0x1) - ShuffleMask[i] = i + 4; - } else { - // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? - return nullptr; - } - } else { - // Replace the selected destination lane with the selected source lane. - ShuffleMask[DestLane] = SourceLane + 4; - } - - return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); -} - -/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding -/// or conversion to a shuffle vector. -static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, - ConstantInt *CILength, ConstantInt *CIIndex, - InstCombiner::BuilderTy &Builder) { - auto LowConstantHighUndef = [&](uint64_t Val) { - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - }; - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast<Constant>(Op0); - ConstantInt *CI0 = - C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) - : nullptr; - - // Attempt to constant fold. - if (CILength && CIIndex) { - // From AMD documentation: "The bit index and field length are each six - // bits in length other bits of the field are ignored." - APInt APIndex = CIIndex->getValue().zextOrTrunc(6); - APInt APLength = CILength->getValue().zextOrTrunc(6); - - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize EXTRQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. 
- Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector<int, 16> ShuffleMask; - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + Index); - for (int i = Length; i != 8; ++i) - ShuffleMask.push_back(i + 16); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector( - Builder.CreateBitCast(Op0, ShufTy), - ConstantAggregateZero::get(ShufTy), ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // Constant Fold - shift Index'th bit to lowest position and mask off - // Length bits. - if (CI0) { - APInt Elt = CI0->getValue(); - Elt.lshrInPlace(Index); - Elt = Elt.zextOrTrunc(Length); - return LowConstantHighUndef(Elt.getZExtValue()); - } - - // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { - Value *Args[] = {Op0, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); - return Builder.CreateCall(F, Args); - } - } - - // Constant Fold - extraction from zero is always {zero, undef}. - if (CI0 && CI0->isZero()) - return LowConstantHighUndef(0); - - return nullptr; -} - -/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant -/// folding or conversion to a shuffle vector. -static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, - APInt APLength, APInt APIndex, - InstCombiner::BuilderTy &Builder) { - // From AMD documentation: "The bit index and field length are each six bits - // in length other bits of the field are ignored." - APIndex = APIndex.zextOrTrunc(6); - APLength = APLength.zextOrTrunc(6); - - // Attempt to constant fold. - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize INSERTQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. - Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector<int, 16> ShuffleMask; - for (int i = 0; i != (int)Index; ++i) - ShuffleMask.push_back(i); - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + 16); - for (int i = Index + Length; i != 8; ++i) - ShuffleMask.push_back(i); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), - Builder.CreateBitCast(Op1, ShufTy), - ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast<Constant>(Op0); - Constant *C1 = dyn_cast<Constant>(Op1); - ConstantInt *CI00 = - C0 ? 
dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CI10 = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) - : nullptr; - - // Constant Fold - insert bottom Length bits starting at the Index'th bit. - if (CI00 && CI10) { - APInt V00 = CI00->getValue(); - APInt V10 = CI10->getValue(); - APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); - V00 = V00 & ~Mask; - V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); - APInt Val = V00 | V10; - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - } - - // If we were an INSERTQ call, we'll save demanded elements if we convert to - // INSERTQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - Constant *CILength = ConstantInt::get(IntTy8, Length, false); - Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); - - Value *Args[] = {Op0, Op1, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return Builder.CreateCall(F, Args); - } - - return nullptr; -} - -/// Attempt to convert pshufb* to shufflevector if the mask is constant. -static Value *simplifyX86pshufb(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && - "Unexpected number of elements in shuffle mask!"); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[64]; - - // Each byte in the shuffle control mask forms an index to permute the - // corresponding byte in the destination operand. - for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) - return nullptr; - - if (isa<UndefValue>(COp)) { - Indexes[I] = -1; - continue; - } - - int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); - - // If the most significant bit (bit[7]) of each byte of the shuffle - // control mask is set, then zero is written in the result byte. - // The zero vector is in the right-hand side of the resulting - // shufflevector. - - // The value of each index for the high 128-bit lane is the least - // significant 4 bits of the respective shuffle control byte. - Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - auto V2 = Constant::getNullValue(VecTy); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. -static Value *simplifyX86vpermilvar(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - bool IsPD = VecTy->getScalarType()->isDoubleTy(); - unsigned NumLaneElts = IsPD ? 2 : 4; - assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[16]; - - // The intrinsics only read one or two bits, clear the rest. 
- for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) - return nullptr; - - if (isa<UndefValue>(COp)) { - Indexes[I] = -1; - continue; - } - - APInt Index = cast<ConstantInt>(COp)->getValue(); - Index = Index.zextOrTrunc(32).getLoBits(2); - - // The PD variants uses bit 1 to select per-lane element index, so - // shift down to convert to generic shuffle mask index. - if (IsPD) - Index.lshrInPlace(1); - - // The _256 variants are a bit trickier since the mask bits always index - // into the corresponding 128 half. In order to convert to a generic - // shuffle, we have to make that explicit. - Index += APInt(32, (I / NumLaneElts) * NumLaneElts); - - Indexes[I] = Index.getZExtValue(); - } - - auto V1 = II.getArgOperand(0); - return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. -static Value *simplifyX86vpermv(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *V = dyn_cast<Constant>(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast<FixedVectorType>(II.getType()); - unsigned Size = VecTy->getNumElements(); - assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && - "Unexpected shuffle mask size"); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[64]; - - for (unsigned I = 0; I < Size; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) - return nullptr; - - if (isa<UndefValue>(COp)) { - Indexes[I] = -1; - continue; - } - - uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); - Index &= Size - 1; - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size)); -} - -Optional<Instruction *> -X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { - auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, - unsigned DemandedWidth) { - APInt UndefElts(Width, 0); - APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); - return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); - }; - - Intrinsic::ID IID = II.getIntrinsicID(); - switch (IID) { - case Intrinsic::x86_bmi_bextr_32: - case Intrinsic::x86_bmi_bextr_64: - case Intrinsic::x86_tbm_bextri_u32: - case Intrinsic::x86_tbm_bextri_u64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - uint64_t Shift = C->getZExtValue(); - uint64_t Length = (Shift >> 8) & 0xff; - Shift &= 0xff; - unsigned BitWidth = II.getType()->getIntegerBitWidth(); - // If the length is 0 or the shift is out of range, replace with zero. - if (Length == 0 || Shift >= BitWidth) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Result = InC->getZExtValue() >> Shift; - if (Length > BitWidth) - Length = BitWidth; - Result &= maskTrailingOnes<uint64_t>(Length); - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we - // are only masking bits that a shift already cleared? 
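Editorial aside, not part of the patch (the closing brace and break of this case follow in the hunk just below): a standalone C++ model of the BEXTR constant fold performed above, for the 32-bit form; refBextr32 is an illustrative name of my own. The control word carries the start bit in bits [7:0] and the field length in bits [15:8]; a zero length or an out-of-range start yields zero, and the length is capped at the operand width.

#include <cstdint>
#include <cstdio>

static uint32_t refBextr32(uint32_t Src, uint32_t Ctrl) {
  uint32_t Shift = Ctrl & 0xff;          // start bit
  uint32_t Length = (Ctrl >> 8) & 0xff;  // field length
  if (Length == 0 || Shift >= 32)
    return 0;                            // mirrors the "replace with zero" fold
  if (Length > 32)
    Length = 32;                         // mirrors the Length = BitWidth clamp
  uint64_t Mask = (1ull << Length) - 1;  // maskTrailingOnes(Length)
  return static_cast<uint32_t>((Src >> Shift) & Mask);
}

int main() {
  // Extract an 8-bit field starting at bit 4 of 0xABCD: expect 0xBC.
  std::printf("0x%X\n", refBextr32(0xABCDu, (8u << 8) | 4u));
  return 0;
}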
- } - break; - - case Intrinsic::x86_bmi_bzhi_32: - case Intrinsic::x86_bmi_bzhi_64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - uint64_t Index = C->getZExtValue() & 0xff; - unsigned BitWidth = II.getType()->getIntegerBitWidth(); - if (Index >= BitWidth) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - if (Index == 0) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Result = InC->getZExtValue(); - Result &= maskTrailingOnes<uint64_t>(Index); - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - // TODO should we convert this to an AND if the RHS is constant? - } - break; - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: - if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - - if (MaskC->getValue().isShiftedMask()) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); - Value *Input = II.getArgOperand(0); - Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); - Value *Shifted = IC.Builder.CreateLShr(Masked, - ConstantInt::get(II.getType(), - ShiftAmount)); - return IC.replaceInstUsesWith(II, Shifted); - } - - - if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToSet = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToTest = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToSet <<= 1; - // Clear lowest set bit. - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - if (MaskC->getValue().isShiftedMask()) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. - unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); - Value *Input = II.getArgOperand(0); - Value *Shifted = IC.Builder.CreateShl(Input, - ConstantInt::get(II.getType(), - ShiftAmount)); - Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); - return IC.replaceInstUsesWith(II, Masked); - } - - if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToTest = 1; - - while (Mask) { - // Isolate lowest set bit. 
- uint64_t BitToSet = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToTest <<= 1; - // Clear lowest set bit; - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; - - case Intrinsic::x86_sse_cvtss2si: - case Intrinsic::x86_sse_cvtss2si64: - case Intrinsic::x86_sse_cvttss2si: - case Intrinsic::x86_sse_cvttss2si64: - case Intrinsic::x86_sse2_cvtsd2si: - case Intrinsic::x86_sse2_cvtsd2si64: - case Intrinsic::x86_sse2_cvttsd2si: - case Intrinsic::x86_sse2_cvttsd2si64: - case Intrinsic::x86_avx512_vcvtss2si32: - case Intrinsic::x86_avx512_vcvtss2si64: - case Intrinsic::x86_avx512_vcvtss2usi32: - case Intrinsic::x86_avx512_vcvtss2usi64: - case Intrinsic::x86_avx512_vcvtsd2si32: - case Intrinsic::x86_avx512_vcvtsd2si64: - case Intrinsic::x86_avx512_vcvtsd2usi32: - case Intrinsic::x86_avx512_vcvtsd2usi64: - case Intrinsic::x86_avx512_cvttss2si: - case Intrinsic::x86_avx512_cvttss2si64: - case Intrinsic::x86_avx512_cvttss2usi: - case Intrinsic::x86_avx512_cvttss2usi64: - case Intrinsic::x86_avx512_cvttsd2si: - case Intrinsic::x86_avx512_cvttsd2si64: - case Intrinsic::x86_avx512_cvttsd2usi: - case Intrinsic::x86_avx512_cvttsd2usi64: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. - Value *Arg = II.getArgOperand(0); - unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { - return IC.replaceOperand(II, 0, V); - } - break; - } - - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx2_pmovmskb: - if (Value *V = simplifyX86movmsk(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomineq_sd: - case Intrinsic::x86_avx512_vcomi_ss: - case Intrinsic::x86_avx512_vcomi_sd: - case Intrinsic::x86_avx512_mask_cmp_ss: - case Intrinsic::x86_avx512_mask_cmp_sd: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. 
- bool MadeChange = false; - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - if (MadeChange) { - return &II; - } - break; - } - - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - case Intrinsic::x86_avx512_div_pd_512: - case Intrinsic::x86_avx512_mul_pd_512: - case Intrinsic::x86_avx512_sub_pd_512: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { - if (R->getValue() == 4) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - - Value *V; - switch (IID) { - default: - llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - V = IC.Builder.CreateFAdd(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_sub_pd_512: - V = IC.Builder.CreateFSub(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_mul_pd_512: - V = IC.Builder.CreateFMul(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_div_pd_512: - V = IC.Builder.CreateFDiv(Arg0, Arg1); - break; - } - - return IC.replaceInstUsesWith(II, V); - } - } - break; - - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { - if (R->getValue() == 4) { - // Extract the element as scalars. - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); - Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); - - Value *V; - switch (IID) { - default: - llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - V = IC.Builder.CreateFAdd(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - V = IC.Builder.CreateFSub(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - V = IC.Builder.CreateFMul(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - V = IC.Builder.CreateFDiv(LHS, RHS); - break; - } - - // Handle the masking aspect of the intrinsic. - Value *Mask = II.getArgOperand(3); - auto *C = dyn_cast<ConstantInt>(Mask); - // We don't need a select if we know the mask bit is a 1. - if (!C || !C->getValue()[0]) { - // Cast the mask to an i1 vector and then extract the lowest element. 
- auto *MaskTy = FixedVectorType::get( - IC.Builder.getInt1Ty(), - cast<IntegerType>(Mask->getType())->getBitWidth()); - Mask = IC.Builder.CreateBitCast(Mask, MaskTy); - Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); - // Extract the lowest element from the passthru operand. - Value *Passthru = - IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); - V = IC.Builder.CreateSelect(Mask, V, Passthru); - } - - // Insert the result back into the original argument 0. - V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); - - return IC.replaceInstUsesWith(II, V); - } - } - break; - - // Constant fold ashr( <A x Bi>, Ci ). - // Constant fold lshr( <A x Bi>, Ci ). - // Constant fold shl( <A x Bi>, Ci ). - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - if (Value *V = simplifyX86immShift(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: { - if (Value *V = simplifyX86immShift(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - - // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector - // operand to compute the shift amount. 
- Value *Arg1 = II.getArgOperand(1); - assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && - "Unexpected packed shift size"); - unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); - - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { - return IC.replaceOperand(II, 1, V); - } - break; - } - - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - if (Value *V = simplifyX86varShift(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - if (Value *V = simplifyX86pack(II, IC.Builder, true)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: - if (Value *V = simplifyX86pack(II, IC.Builder, false)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_pclmulqdq: - case Intrinsic::x86_pclmulqdq_256: - case Intrinsic::x86_pclmulqdq_512: { - if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { - unsigned Imm = C->getZExtValue(); - - bool MadeChange = false; - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - unsigned VWidth = - cast<FixedVectorType>(Arg0->getType())->getNumElements(); - - APInt UndefElts1(VWidth, 0); - APInt DemandedElts1 = - APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); - if (Value *V = - IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - - APInt UndefElts2(VWidth, 0); - APInt DemandedElts2 = - APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); - if (Value *V = - IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - - // If either input elements are undef, the result is zero. 
- if (DemandedElts1.isSubsetOf(UndefElts1) || - DemandedElts2.isSubsetOf(UndefElts2)) { - return IC.replaceInstUsesWith(II, - ConstantAggregateZero::get(II.getType())); - } - - if (MadeChange) { - return &II; - } - } - break; - } - - case Intrinsic::x86_sse41_insertps: - if (Value *V = simplifyX86insertps(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_sse4a_extrq: { - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 16 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast<Constant>(Op1); - ConstantInt *CILength = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CIIndex = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or EXTRQI call. - if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - - // EXTRQ only uses the lowest 64-bits of the first 128-bit vector - // operands and the lowest 16-bits of the second. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - if (MadeChange) { - return &II; - } - break; - } - - case Intrinsic::x86_sse4a_extrqi: { - // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining - // bits of the lower 64-bits. The upper 64-bits are undefined. - Value *Op0 = II.getArgOperand(0); - unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); - ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); - - // Attempt to simplify to a constant or shuffle vector. - if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - - // EXTRQI only uses the lowest 64-bits of the first 128-bit vector - // operand. - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { - return IC.replaceOperand(II, 0, V); - } - break; - } - - case Intrinsic::x86_sse4a_insertq: { - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast<Constant>(Op1); - ConstantInt *CI11 = - C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
- if (CI11) { - const APInt &V11 = CI11->getValue(); - APInt Len = V11.zextOrTrunc(6); - APInt Idx = V11.lshr(8).zextOrTrunc(6); - if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - } - - // INSERTQ only uses the lowest 64-bits of the first 128-bit vector - // operand. - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { - return IC.replaceOperand(II, 0, V); - } - break; - } - - case Intrinsic::x86_sse4a_insertqi: { - // INSERTQI: Extract lowest Length bits from lower half of second source and - // insert over first source starting at Index bit. The upper 64-bits are - // undefined. - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 2 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); - ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); - - // Attempt to simplify to a constant or shuffle vector. - if (CILength && CIIndex) { - APInt Len = CILength->getValue().zextOrTrunc(6); - APInt Idx = CIIndex->getValue().zextOrTrunc(6); - if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - } - - // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector - // operands. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - IC.replaceOperand(II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { - IC.replaceOperand(II, 1, V); - MadeChange = true; - } - if (MadeChange) { - return &II; - } - break; - } - - case Intrinsic::x86_sse41_pblendvb: - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_avx_blendv_ps_256: - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx2_pblendvb: { - // fold (blend A, A, Mask) -> A - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - Value *Mask = II.getArgOperand(2); - if (Op0 == Op1) { - return IC.replaceInstUsesWith(II, Op0); - } - - // Zero Mask - select 1st argument. - if (isa<ConstantAggregateZero>(Mask)) { - return IC.replaceInstUsesWith(II, Op0); - } - - // Constant Mask - select 1st/2nd argument lane based on top bit of mask. - if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { - Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); - return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); - } - - // Convert to a vector select if we can bypass casts and find a boolean - // vector condition value. 
- Value *BoolVec; - Mask = InstCombiner::peekThroughBitcast(Mask); - if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && - BoolVec->getType()->isVectorTy() && - BoolVec->getType()->getScalarSizeInBits() == 1) { - assert(Mask->getType()->getPrimitiveSizeInBits() == - II.getType()->getPrimitiveSizeInBits() && - "Not expecting mask and operands with different sizes"); - - unsigned NumMaskElts = - cast<FixedVectorType>(Mask->getType())->getNumElements(); - unsigned NumOperandElts = - cast<FixedVectorType>(II.getType())->getNumElements(); - if (NumMaskElts == NumOperandElts) { - return SelectInst::Create(BoolVec, Op1, Op0); - } - - // If the mask has less elements than the operands, each mask bit maps to - // multiple elements of the operands. Bitcast back and forth. - if (NumMaskElts < NumOperandElts) { - Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); - Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); - Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); - return new BitCastInst(Sel, II.getType()); - } - } - - break; - } - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - if (Value *V = simplifyX86pshufb(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - case Intrinsic::x86_avx512_permvar_df_256: - case Intrinsic::x86_avx512_permvar_df_512: - case Intrinsic::x86_avx512_permvar_di_256: - case Intrinsic::x86_avx512_permvar_di_512: - case Intrinsic::x86_avx512_permvar_hi_128: - case Intrinsic::x86_avx512_permvar_hi_256: - case Intrinsic::x86_avx512_permvar_hi_512: - case Intrinsic::x86_avx512_permvar_qi_128: - case Intrinsic::x86_avx512_permvar_qi_256: - case Intrinsic::x86_avx512_permvar_qi_512: - case Intrinsic::x86_avx512_permvar_sf_512: - case Intrinsic::x86_avx512_permvar_si_512: - if (Value *V = simplifyX86vpermv(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - case Intrinsic::x86_avx_maskload_ps: - case Intrinsic::x86_avx_maskload_pd: - case Intrinsic::x86_avx_maskload_ps_256: - case Intrinsic::x86_avx_maskload_pd_256: - case Intrinsic::x86_avx2_maskload_d: - case Intrinsic::x86_avx2_maskload_q: - case Intrinsic::x86_avx2_maskload_d_256: - case Intrinsic::x86_avx2_maskload_q_256: - if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { - return I; - } - break; - - case Intrinsic::x86_sse2_maskmov_dqu: - case Intrinsic::x86_avx_maskstore_ps: - case Intrinsic::x86_avx_maskstore_pd: - case Intrinsic::x86_avx_maskstore_ps_256: - case Intrinsic::x86_avx_maskstore_pd_256: - case Intrinsic::x86_avx2_maskstore_d: - case Intrinsic::x86_avx2_maskstore_q: - case Intrinsic::x86_avx2_maskstore_d_256: - case Intrinsic::x86_avx2_maskstore_q_256: - if (simplifyX86MaskedStore(II, IC)) { - return nullptr; - } - break; - - case Intrinsic::x86_addcarry_32: - case Intrinsic::x86_addcarry_64: - if (Value *V = simplifyX86addcarry(II, IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - - default: - break; - } - return None; -} - -Optional<Value *> 
X86TTIImpl::simplifyDemandedUseBitsIntrinsic( - InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, - bool &KnownBitsComputed) const { - switch (II.getIntrinsicID()) { - default: - break; - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx2_pmovmskb: { - // MOVMSK copies the vector elements' sign bits to the low bits - // and zeros the high bits. - unsigned ArgWidth; - if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { - ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. - } else { - auto Arg = II.getArgOperand(0); - auto ArgType = cast<FixedVectorType>(Arg->getType()); - ArgWidth = ArgType->getNumElements(); - } - - // If we don't need any of low bits then return zero, - // we know that DemandedMask is non-zero already. - APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); - Type *VTy = II.getType(); - if (DemandedElts.isNullValue()) { - return ConstantInt::getNullValue(VTy); - } - - // We know that the upper bits are set to zero. - Known.Zero.setBitsFrom(ArgWidth); - KnownBitsComputed = true; - break; - } - } - return None; -} - -Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( - InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, - APInt &UndefElts2, APInt &UndefElts3, - std::function<void(Instruction *, unsigned, APInt, APInt &)> - simplifyAndSetOp) const { - unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); - switch (II.getIntrinsicID()) { - default: - break; - case Intrinsic::x86_xop_vfrcz_ss: - case Intrinsic::x86_xop_vfrcz_sd: - // The instructions for these intrinsics are speced to zero upper bits not - // pass them through like other scalar intrinsics. So we shouldn't just - // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. - // Instead we should return a zero vector. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return ConstantAggregateZero::get(II.getType()); - } - - // Only the lower element is used. - DemandedElts = 1; - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // Only the lower element is undefined. The high elements are zero. - UndefElts = UndefElts[0]; - break; - - // Unary scalar-as-vector operations that work column-wise. - case Intrinsic::x86_sse_rcp_ss: - case Intrinsic::x86_sse_rsqrt_ss: - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions - // checks). - break; - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0. The low element is a function of both - // operands. - case Intrinsic::x86_sse_min_ss: - case Intrinsic::x86_sse_max_ss: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse2_min_sd: - case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_sse2_cmp_sd: { - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - - // Only lower element is used for operand 1. 
- DemandedElts = 1; - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - - // Lower element is undefined if both lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0]) - UndefElts.clearBit(0); - - break; - } - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element comes from operand 1. - case Intrinsic::x86_sse41_round_ss: - case Intrinsic::x86_sse41_round_sd: { - // Don't use the low element of operand 0. - APInt DemandedElts2 = DemandedElts; - DemandedElts2.clearBit(0); - simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - - // Take the high undef elements from operand 0 and take the lower element - // from operand 1. - UndefElts.clearBit(0); - UndefElts |= UndefElts2[0]; - break; - } - - // Three input scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element is a function of all - // three inputs. - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_max_ss_round: - case Intrinsic::x86_avx512_mask_min_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - case Intrinsic::x86_avx512_mask_max_sd_round: - case Intrinsic::x86_avx512_mask_min_sd_round: - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - IC.addToWorklist(&II); - return II.getArgOperand(0); - } - - // Only lower element is used for operand 1 and 2. - DemandedElts = 1; - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); - - // Lower element is undefined if all three lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0] || !UndefElts3[0]) - UndefElts.clearBit(0); - break; - - // TODO: Add fmaddsub support? - case Intrinsic::x86_sse3_addsub_pd: - case Intrinsic::x86_sse3_addsub_ps: - case Intrinsic::x86_avx_addsub_pd_256: - case Intrinsic::x86_avx_addsub_ps_256: { - // If none of the even or none of the odd lanes are required, turn this - // into a generic FP math instruction. - APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); - APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); - bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); - bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); - if (IsSubOnly || IsAddOnly) { - assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); - IRBuilderBase::InsertPointGuard Guard(IC.Builder); - IC.Builder.SetInsertPoint(&II); - Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); - return IC.Builder.CreateBinOp( - IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1); - } - - simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); - UndefElts &= UndefElts2; - break; - } - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: { - auto *Ty0 = II.getArgOperand(0)->getType(); - unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); - assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); - - unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; - unsigned VWidthPerLane = VWidth / NumLanes; - unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; - - // Per lane, pack the elements of the first input and then the second. - // e.g. - // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) - // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) - for (int OpNum = 0; OpNum != 2; ++OpNum) { - APInt OpDemandedElts(InnerVWidth, 0); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - unsigned LaneIdx = Lane * VWidthPerLane; - for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { - unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; - if (DemandedElts[Idx]) - OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); - } - } - - // Demand elements from the operand. - APInt OpUndefElts(InnerVWidth, 0); - simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); - - // Pack the operand's UNDEF elements, one lane at a time. - OpUndefElts = OpUndefElts.zext(VWidth); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); - LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); - LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); - UndefElts |= LaneElts; - } - } - break; - } - - // PSHUFB - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - // PERMILVAR - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - // PERMV - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: { - simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); - break; - } - - // SSE4A instructions leave the upper 64-bits of the 128-bit result - // in an undefined state. - case Intrinsic::x86_sse4a_extrq: - case Intrinsic::x86_sse4a_extrqi: - case Intrinsic::x86_sse4a_insertq: - case Intrinsic::x86_sse4a_insertqi: - UndefElts.setHighBits(VWidth / 2); - break; - } - return None; -} +//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. 
It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "X86TargetTransformInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86tti" + +/// Return a constant boolean vector that has true elements in all positions +/// where the input constant data vector has an element with the sign bit set. +static Constant *getNegativeIsTrueBoolVec(Constant *V) { + VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); + V = ConstantExpr::getBitCast(V, IntTy); + V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy), + V); + return V; +} + +/// Convert the x86 XMM integer vector mask to a vector of bools based on +/// each element's most significant bit (the sign bit). +static Value *getBoolVecFromMask(Value *Mask) { + // Fold Constant Mask. + if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) + return getNegativeIsTrueBoolVec(ConstantMask); + + // Mask was extended from a boolean vector. + Value *ExtMask; + if (PatternMatch::match( + Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && + ExtMask->getType()->isIntOrIntVectorTy(1)) + return ExtMask; + + return nullptr; +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Constant *ZeroVec = Constant::getNullValue(II.getType()); + + // Zero Mask - masked load instruction creates a zero vector. + if (isa<ConstantAggregateZero>(Mask)) + return IC.replaceInstUsesWith(II, ZeroVec); + + // The mask is constant or extended from a bool vector. Convert this x86 + // intrinsic to the LLVM intrinsic to allow target-independent optimizations. + if (Value *BoolMask = getBoolVecFromMask(Mask)) { + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // The pass-through vector for an x86 masked load is a zero vector. + CallInst *NewMaskedLoad = + IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); + return IC.replaceInstUsesWith(II, NewMaskedLoad); + } + + return nullptr; +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Value *Vec = II.getOperand(2); + + // Zero Mask - this masked store instruction does nothing. 
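
For reference, the maskload/maskstore family keys each lane off the most significant bit of its mask element, which is why getBoolVecFromMask() only has to look through a sign-extended i1 vector or at a constant's sign bits, and why a zero vector is the natural pass-through for the converted llvm.masked.load. A standalone scalar model of that per-lane rule (plain C++, not part of the patch; the 4 x i32 shape and the function names are illustrative):

  #include <cstddef>
  #include <cstdint>

  // Reference model of a 4 x i32 maskload: a lane is loaded only when the
  // sign bit of its mask element is set; all other lanes become zero.
  void maskload_4xi32(const int32_t *ptr, const int32_t mask[4], int32_t out[4]) {
    for (size_t i = 0; i < 4; ++i)
      out[i] = (mask[i] < 0) ? ptr[i] : 0; // mask[i] < 0  <=>  sign bit set
  }

  // Reference model of the matching maskstore: lanes whose sign bit is clear
  // leave memory untouched.
  void maskstore_4xi32(int32_t *ptr, const int32_t mask[4], const int32_t val[4]) {
    for (size_t i = 0; i < 4; ++i)
      if (mask[i] < 0)
        ptr[i] = val[i];
  }
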
+ if (isa<ConstantAggregateZero>(Mask)) { + IC.eraseInstFromFunction(II); + return true; + } + + // The SSE2 version is too weird (eg, unaligned but non-temporal) to do + // anything else at this level. + if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) + return false; + + // The mask is constant or extended from a bool vector. Convert this x86 + // intrinsic to the LLVM intrinsic to allow target-independent optimizations. + if (Value *BoolMask = getBoolVecFromMask(Mask)) { + unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); + + // 'Replace uses' doesn't work for stores. Erase the original masked store. + IC.eraseInstFromFunction(II); + return true; + } + + return false; +} + +static Value *simplifyX86immShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + bool IsImm = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) 
&& "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast<FixedVectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + auto AmtVT = Amt->getType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. If its guaranteed to be out of range, logical shifts combine + // to zero and arithmetic shifts are clamped to (BitWidth - 1). + if (IsImm) { + assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); + KnownBits KnownAmtBits = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmtBits.getMaxValue().ult(BitWidth)) { + Amt = Builder.CreateZExtOrTrunc(Amt, SVT); + Amt = Builder.CreateVectorSplat(VWidth, Amt); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + if (KnownAmtBits.getMinValue().uge(BitWidth)) { + if (LogicalShift) + return ConstantAggregateZero::get(VT); + Amt = ConstantInt::get(SVT, BitWidth - 1); + return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); + } + } else { + // Ensure the first element has an in-range value and the rest of the + // elements in the bottom 64 bits are zero. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast<VectorType>(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); + APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); + APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); + KnownBits KnownLowerBits = llvm::computeKnownBits( + Amt, DemandedLower, II.getModule()->getDataLayout()); + KnownBits KnownUpperBits = llvm::computeKnownBits( + Amt, DemandedUpper, II.getModule()->getDataLayout()); + if (KnownLowerBits.getMaxValue().ult(BitWidth) && + (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { + SmallVector<int, 16> ZeroSplat(VWidth, 0); + Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + } + + // Simplify if count is constant vector. + auto CDV = dyn_cast<ConstantDataVector>(Amt); + if (!CDV) + return nullptr; + + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast<VectorType>(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + + // Concatenate the sub-elements to create the 64-bit value. + APInt Count(64, 0); + for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count <<= BitWidth; + Count |= SubElt->getValue().zextOrTrunc(64); + } + + // If shift-by-zero then just return the original value. + if (Count.isNullValue()) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. 
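
The shift-by-scalar forms handled here read their count from the entire low 64 bits of the second operand and give defined results for out-of-range counts: logical shifts produce zero and the arithmetic shift acts like a shift by BitWidth - 1, which is exactly what the Count extraction and clamping above reproduce. A scalar model of one 16-bit lane (standalone C++, illustrative names; it assumes the usual arithmetic behaviour when right-shifting a signed value):

  #include <cstdint>

  // One w-lane of PSRLW/PSLLW/PSRAW given the 64-bit count from the xmm operand.
  uint16_t psrlw_lane(uint16_t v, uint64_t count) {
    return count >= 16 ? 0 : (uint16_t)(v >> count); // logical: zero when out of range
  }
  uint16_t psllw_lane(uint16_t v, uint64_t count) {
    return count >= 16 ? 0 : (uint16_t)(v << count);
  }
  int16_t psraw_lane(int16_t v, uint64_t count) {
    if (count >= 16)
      count = 15;                    // arithmetic: clamp, i.e. splat the sign bit
    return (int16_t)(v >> count);
  }
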
+ auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. +// Unlike the generic IR shifts, the intrinsics have defined behaviour for out +// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). +static Value *simplifyX86varShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast<FixedVectorType>(II.getType()); + auto SVT = VT->getElementType(); + int NumElts = VT->getNumElements(); + int BitWidth = SVT->getIntegerBitWidth(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. + APInt UpperBits = + APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); + if (llvm::MaskedValueIsZero(Amt, UpperBits, + II.getModule()->getDataLayout())) { + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + + // Simplify if all shift amounts are constant/undef. + auto *CShift = dyn_cast<Constant>(Amt); + if (!CShift) + return nullptr; + + // Collect each element's shift amount. + // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. + bool AnyOutOfRange = false; + SmallVector<int, 8> ShiftAmts; + for (int I = 0; I < NumElts; ++I) { + auto *CElt = CShift->getAggregateElement(I); + if (isa_and_nonnull<UndefValue>(CElt)) { + ShiftAmts.push_back(-1); + continue; + } + + auto *COp = dyn_cast_or_null<ConstantInt>(CElt); + if (!COp) + return nullptr; + + // Handle out of range shifts. + // If LogicalShift - set to BitWidth (special case). + // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 
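
Unlike the generic IR shifts, where an out-of-range amount is poison, these per-element AVX2/AVX-512 shifts define it, and that is what the special cases being collected here encode: the logical variants produce zero, the arithmetic variant clamps to BitWidth - 1 and therefore splats the sign bit. One 32-bit lane in standalone C++ (illustrative names, signed right shift assumed arithmetic):

  #include <cstdint>

  uint32_t vpsrlvd_lane(uint32_t v, uint32_t amt) { return amt >= 32 ? 0 : (v >> amt); }
  uint32_t vpsllvd_lane(uint32_t v, uint32_t amt) { return amt >= 32 ? 0 : (v << amt); }
  int32_t  vpsravd_lane(int32_t v, uint32_t amt)  { return v >> (amt >= 32 ? 31 : amt); }
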
+ APInt ShiftVal = COp->getValue(); + if (ShiftVal.uge(BitWidth)) { + AnyOutOfRange = LogicalShift; + ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); + continue; + } + + ShiftAmts.push_back((int)ShiftVal.getZExtValue()); + } + + // If all elements out of range or UNDEF, return vector of zeros/undefs. + // ArithmeticShift should only hit this if they are all UNDEF. + auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; + if (llvm::all_of(ShiftAmts, OutOfRange)) { + SmallVector<Constant *, 8> ConstantVec; + for (int Idx : ShiftAmts) { + if (Idx < 0) { + ConstantVec.push_back(UndefValue::get(SVT)); + } else { + assert(LogicalShift && "Logical shift expected"); + ConstantVec.push_back(ConstantInt::getNullValue(SVT)); + } + } + return ConstantVector::get(ConstantVec); + } + + // We can't handle only some out of range values with generic logical shifts. + if (AnyOutOfRange) + return nullptr; + + // Build the shift amount constant vector. + SmallVector<Constant *, 8> ShiftVecAmts; + for (int Idx : ShiftAmts) { + if (Idx < 0) + ShiftVecAmts.push_back(UndefValue::get(SVT)); + else + ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); + } + auto ShiftVec = ConstantVector::get(ShiftVecAmts); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *simplifyX86pack(IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + + // Fast all undef handling. + if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) + return UndefValue::get(ResTy); + + auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); + unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; + unsigned NumSrcElts = ArgTy->getNumElements(); + assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && + "Unexpected packing types"); + + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); + unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); + assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && + "Unexpected packing types"); + + // Constant folding. + if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) + return nullptr; + + // Clamp Values - signed/unsigned both use signed clamp values, but they + // differ on the min/max values. + APInt MinValue, MaxValue; + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + MinValue = + APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + MaxValue = + APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. 
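
The MinValue/MaxValue pair being built here encodes the usual pack saturation rules applied before the truncating shuffle: PACKSS clamps to the signed range of the narrower type, and PACKUS clamps a signed input into its unsigned range. Per lane, for the word-to-byte case, in standalone C++ (illustrative names):

  #include <cstdint>

  // PACKSSWB lane: i16 -> i8 with signed saturation.
  int8_t packss_lane(int16_t v) {
    return v < -128 ? -128 : (v > 127 ? 127 : (int8_t)v);
  }

  // PACKUSWB lane: i16 -> u8 with unsigned saturation of a signed input
  // (negative values clamp to 0, values above 255 clamp to 255).
  uint8_t packus_lane(int16_t v) {
    return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
  }
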
+ MinValue = APInt::getNullValue(SrcScalarSizeInBits); + MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); + } + + auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); + auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); + + // Shuffle clamped args together at the lane level. + SmallVector<int, 32> PackMask; + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); + } + auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); + + // Truncate to dst size. + return Builder.CreateTrunc(Shuffle, ResTy); +} + +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *Arg = II.getArgOperand(0); + Type *ResTy = II.getType(); + + // movmsk(undef) -> zero as we must ensure the upper bits are zero. + if (isa<UndefValue>(Arg)) + return Constant::getNullValue(ResTy); + + auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); + // We can't easily peek through x86_mmx types. + if (!ArgTy) + return nullptr; + + // Expand MOVMSK to compare/bitcast/zext: + // e.g. PMOVMSKB(v16i8 x): + // %cmp = icmp slt <16 x i8> %x, zeroinitializer + // %int = bitcast <16 x i1> %cmp to i16 + // %res = zext i16 %int to i32 + unsigned NumElts = ArgTy->getNumElements(); + Type *IntegerVecTy = VectorType::getInteger(ArgTy); + Type *IntegerTy = Builder.getIntNTy(NumElts); + + Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); + Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Res = Builder.CreateBitCast(Res, IntegerTy); + Res = Builder.CreateZExtOrTrunc(Res, ResTy); + return Res; +} + +static Value *simplifyX86addcarry(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *CarryIn = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Op2 = II.getArgOperand(2); + Type *RetTy = II.getType(); + Type *OpTy = Op1->getType(); + assert(RetTy->getStructElementType(0)->isIntegerTy(8) && + RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && + "Unexpected types for x86 addcarry"); + + // If carry-in is zero, this is just an unsigned add with overflow. + if (match(CarryIn, PatternMatch::m_ZeroInt())) { + Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, + {Op1, Op2}); + // The types have to be adjusted to match the x86 call types. 
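
The icmp/bitcast/zext expansion spelled out in the MOVMSK comment above matches the architectural behaviour: gather the sign bit of every element into the low bits of the scalar result and zero everything above them. A standalone reference for the 16-byte PMOVMSKB case (illustrative name):

  #include <cstddef>
  #include <cstdint>

  // Bit i of the result is the sign bit of byte i; bits 16..31 are zero.
  uint32_t pmovmskb_128(const int8_t bytes[16]) {
    uint32_t result = 0;
    for (size_t i = 0; i < 16; ++i)
      if (bytes[i] < 0)
        result |= 1u << i;
    return result;
  }
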
+ Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); + Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), + Builder.getInt8Ty()); + Value *Res = UndefValue::get(RetTy); + Res = Builder.CreateInsertValue(Res, UAddOV, 0); + return Builder.CreateInsertValue(Res, UAddResult, 1); + } + + return nullptr; +} + +static Value *simplifyX86insertps(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); + if (!CInt) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); + + // The immediate permute control byte looks like this: + // [3:0] - zero mask for each 32-bit lane + // [5:4] - select one 32-bit destination lane + // [7:6] - select one 32-bit source lane + + uint8_t Imm = CInt->getZExtValue(); + uint8_t ZMask = Imm & 0xf; + uint8_t DestLane = (Imm >> 4) & 0x3; + uint8_t SourceLane = (Imm >> 6) & 0x3; + + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // If all zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (ZMask == 0xf) + return ZeroVector; + + // Initialize by passing all of the first source bits through. + int ShuffleMask[4] = {0, 1, 2, 3}; + + // We may replace the second operand with the zero vector. + Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. + if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? + return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); +} + +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 
64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector<int, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + Index); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back(i + 16); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt.lshrInPlace(Index); + Elt = Elt.zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->isZero()) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. 
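
As the quoted AMD wording says, EXTRQ/EXTRQI take 6-bit index and length fields, a length of zero means 64, and index + length > 64 is undefined; the constant fold below simply shifts the low quadword right and masks off the length. The INSERTQ/INSERTQI fold that follows is the mirror image, writing the low length bits of the second source into the first at the given index. Both over the low quadword only, in standalone C++ (illustrative names; the upper half of the destination is architecturally undefined):

  #include <cstdint>

  static uint64_t low_bits(unsigned length) {
    // A length field of 0 is defined as "length of 64".
    return (length & 63) == 0 ? ~0ULL : ((1ULL << (length & 63)) - 1);
  }

  // EXTRQI on the low quadword: extract 'length' bits starting at bit 'index'.
  uint64_t extrq_low(uint64_t src, unsigned index, unsigned length) {
    return (src >> (index & 63)) & low_bits(length);
  }

  // INSERTQI on the low quadword: insert the low 'length' bits of 'field'
  // into 'dst' starting at bit 'index'.
  uint64_t insertq_low(uint64_t dst, uint64_t field, unsigned index, unsigned length) {
    uint64_t mask = low_bits(length) << (index & 63);
    return (dst & ~mask) | ((field << (index & 63)) & mask);
  }
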
+ Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector<int, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + +/// Attempt to convert pshufb* to shufflevector if the mask is constant. +static Value *simplifyX86pshufb(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && + "Unexpected number of elements in shuffle mask!"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = -1; + continue; + } + + int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); + + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + Index = ((Index < 0) ? 
NumElts : Index & 0x0F) + (I & 0xF0); + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + auto V2 = Constant::getNullValue(VecTy); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. +static Value *simplifyX86vpermilvar(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + bool IsPD = VecTy->getScalarType()->isDoubleTy(); + unsigned NumLaneElts = IsPD ? 2 : 4; + assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[16]; + + // The intrinsics only read one or two bits, clear the rest. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = -1; + continue; + } + + APInt Index = cast<ConstantInt>(COp)->getValue(); + Index = Index.zextOrTrunc(32).getLoBits(2); + + // The PD variants uses bit 1 to select per-lane element index, so + // shift down to convert to generic shuffle mask index. + if (IsPD) + Index.lshrInPlace(1); + + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + Index += APInt(32, (I / NumLaneElts) * NumLaneElts); + + Indexes[I] = Index.getZExtValue(); + } + + auto V1 = II.getArgOperand(0); + return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. +static Value *simplifyX86vpermv(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *V = dyn_cast<Constant>(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast<FixedVectorType>(II.getType()); + unsigned Size = VecTy->getNumElements(); + assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && + "Unexpected shuffle mask size"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + for (unsigned I = 0; I < Size; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return nullptr; + + if (isa<UndefValue>(COp)) { + Indexes[I] = -1; + continue; + } + + uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); + Index &= Size - 1; + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size)); +} + +Optional<Instruction *> +X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, + unsigned DemandedWidth) { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + case Intrinsic::x86_bmi_bextr_32: + case Intrinsic::x86_bmi_bextr_64: + case Intrinsic::x86_tbm_bextri_u32: + case Intrinsic::x86_tbm_bextri_u64: + // If the RHS is a constant we can try some simplifications. 
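
The shuffle-mask construction for PSHUFB above decodes the control bytes the way the instruction does: bit 7 of a control byte zeroes the destination byte, otherwise its low four bits pick a source byte from the same 16-byte lane. VPERMILVAR is handled analogously, with two index bits per element for the PS forms and bit 1 for the PD forms, which the code shifts down. One PSHUFB lane in standalone C++ (illustrative name):

  #include <cstddef>
  #include <cstdint>

  // Per 16-byte lane: control bit 7 selects zero, otherwise the low 4 control
  // bits select a source byte from the same lane.
  void pshufb_lane(const uint8_t src[16], const uint8_t ctrl[16], uint8_t out[16]) {
    for (size_t i = 0; i < 16; ++i)
      out[i] = (ctrl[i] & 0x80) ? 0 : src[ctrl[i] & 0x0F];
  }
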
+ if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + uint64_t Shift = C->getZExtValue(); + uint64_t Length = (Shift >> 8) & 0xff; + Shift &= 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + // If the length is 0 or the shift is out of range, replace with zero. + if (Length == 0 || Shift >= BitWidth) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue() >> Shift; + if (Length > BitWidth) + Length = BitWidth; + Result &= maskTrailingOnes<uint64_t>(Length); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we + // are only masking bits that a shift already cleared? + } + break; + + case Intrinsic::x86_bmi_bzhi_32: + case Intrinsic::x86_bmi_bzhi_64: + // If the RHS is a constant we can try some simplifications. + if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + uint64_t Index = C->getZExtValue() & 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + if (Index >= BitWidth) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + if (Index == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue(); + Result &= maskTrailingOnes<uint64_t>(Index); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we convert this to an AND if the RHS is constant? + } + break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + + if (MaskC->getValue().isShiftedMask()) { + // any single contingous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); + Value *Input = II.getArgOperand(0); + Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); + Value *Shifted = IC.Builder.CreateLShr(Masked, + ConstantInt::get(II.getType(), + ShiftAmount)); + return IC.replaceInstUsesWith(II, Shifted); + } + + + if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. 
+ Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + if (MaskC->getValue().isShiftedMask()) { + // any single contingous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + unsigned ShiftAmount = MaskC->getValue().countTrailingZeros(); + Value *Input = II.getArgOperand(0); + Value *Shifted = IC.Builder.CreateShl(Input, + ConstantInt::get(II.getType(), + ShiftAmount)); + Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); + return IC.replaceInstUsesWith(II, Masked); + } + + if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_avx512_vcvtss2si32: + case Intrinsic::x86_avx512_vcvtss2si64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtsd2si32: + case Intrinsic::x86_avx512_vcvtsd2si64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_cvttss2si: + case Intrinsic::x86_avx512_cvttss2si64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttsd2si: + case Intrinsic::x86_avx512_cvttsd2si64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. 
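
The constant folds above for BEXTR, BZHI, PEXT and PDEP are direct transliterations of the instruction definitions (BZHI is simply src & ((1 << index) - 1) for an in-range index), and when the mask is a single contiguous run of ones PEXT collapses to (src & mask) >> tz while PDEP collapses to (src << tz) & mask, which is the straight-line IR the isShiftedMask() paths emit. Standalone C++ reference models (illustrative names):

  #include <cstdint>

  // BEXTR: start in control bits 7:0, length in bits 15:8; a zero length or a
  // start past the operand width yields zero.
  uint64_t bextr64(uint64_t src, uint32_t ctrl) {
    unsigned start = ctrl & 0xff, len = (ctrl >> 8) & 0xff;
    if (len == 0 || start >= 64)
      return 0;
    uint64_t v = src >> start;
    return len >= 64 ? v : (v & ((1ULL << len) - 1));
  }

  // PEXT: gather the source bits selected by 'mask' into the low result bits.
  uint64_t pext64(uint64_t src, uint64_t mask) {
    uint64_t result = 0, bit_to_set = 1;
    while (mask) {
      uint64_t lowest = mask & -mask;   // isolate lowest set mask bit
      if (src & lowest)
        result |= bit_to_set;
      bit_to_set <<= 1;
      mask &= mask - 1;                 // clear lowest set mask bit
    }
    return result;
  }

  // PDEP: scatter the low source bits to the positions selected by 'mask'.
  uint64_t pdep64(uint64_t src, uint64_t mask) {
    uint64_t result = 0, bit_to_test = 1;
    while (mask) {
      uint64_t lowest = mask & -mask;
      if (src & bit_to_test)
        result |= lowest;
      bit_to_test <<= 1;
      mask &= mask - 1;
    }
    return result;
  }
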
+ Value *Arg = II.getArgOperand(0); + unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx2_pmovmskb: + if (Value *V = simplifyX86movmsk(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomineq_sd: + case Intrinsic::x86_avx512_vcomi_ss: + case Intrinsic::x86_avx512_vcomi_sd: + case Intrinsic::x86_avx512_mask_cmp_ss: + case Intrinsic::x86_avx512_mask_cmp_sd: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + case Intrinsic::x86_avx512_div_pd_512: + case Intrinsic::x86_avx512_mul_pd_512: + case Intrinsic::x86_avx512_sub_pd_512: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. 
+ if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + if (R->getValue() == 4) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + V = IC.Builder.CreateFAdd(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_sub_pd_512: + V = IC.Builder.CreateFSub(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_mul_pd_512: + V = IC.Builder.CreateFMul(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_div_pd_512: + V = IC.Builder.CreateFDiv(Arg0, Arg1); + break; + } + + return IC.replaceInstUsesWith(II, V); + } + } + break; + + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { + if (R->getValue() == 4) { + // Extract the element as scalars. + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + V = IC.Builder.CreateFAdd(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + V = IC.Builder.CreateFSub(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + V = IC.Builder.CreateFMul(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + V = IC.Builder.CreateFDiv(LHS, RHS); + break; + } + + // Handle the masking aspect of the intrinsic. + Value *Mask = II.getArgOperand(3); + auto *C = dyn_cast<ConstantInt>(Mask); + // We don't need a select if we know the mask bit is a 1. + if (!C || !C->getValue()[0]) { + // Cast the mask to an i1 vector and then extract the lowest element. + auto *MaskTy = FixedVectorType::get( + IC.Builder.getInt1Ty(), + cast<IntegerType>(Mask->getType())->getBitWidth()); + Mask = IC.Builder.CreateBitCast(Mask, MaskTy); + Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); + // Extract the lowest element from the passthru operand. + Value *Passthru = + IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); + V = IC.Builder.CreateSelect(Mask, V, Passthru); + } + + // Insert the result back into the original argument 0. + V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); + + return IC.replaceInstUsesWith(II, V); + } + } + break; + + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). 
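
A rounding operand of 4 is CUR_DIRECTION, i.e. round according to MXCSR, so the packed 512-bit forms above are plain IEEE arithmetic and can be rewritten as generic IR; the masked scalar forms additionally blend element 0 with the pass-through under bit 0 of the k-mask, while the remaining elements come from the first source. A scalar sketch of that masking rule (standalone C++, illustrative name, no rounding-mode plumbing):

  #include <cstdint>

  // Masked scalar add (vaddss with a k-mask, CUR_DIRECTION rounding): only
  // element 0 is computed; a clear mask bit selects the pass-through element.
  void masked_addss(const float a[4], const float b[4], const float passthru[4],
                    uint8_t kmask, float out[4]) {
    out[0] = (kmask & 1) ? (a[0] + b[0]) : passthru[0];
    for (int i = 1; i < 4; ++i)
      out[i] = a[i];   // upper elements pass through from the first source
  }
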
+ case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + if (Value *V = simplifyX86immShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: { + if (Value *V = simplifyX86immShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. 
+ Value *Arg1 = II.getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + return IC.replaceOperand(II, 1, V); + } + break; + } + + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + if (Value *V = simplifyX86varShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, true)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, false)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: { + if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + unsigned Imm = C->getZExtValue(); + + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = + cast<FixedVectorType>(Arg0->getType())->getNumElements(); + + APInt UndefElts1(VWidth, 0); + APInt DemandedElts1 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + + APInt UndefElts2(VWidth, 0); + APInt DemandedElts2 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + + // If either input elements are undef, the result is zero. 
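
PCLMULQDQ forms the carry-less (GF(2) polynomial) product of one quadword from each source, selected by bits 0 and 4 of the immediate, which is why the demanded-elements masks above are keyed on (Imm & 0x01) and (Imm & 0x10). A standalone sketch of the carry-less multiply itself, low 64 bits of the 128-bit product only so it fits in one integer type (illustrative name):

  #include <cstdint>

  // XOR a shifted copy of 'a' into the accumulator for every set bit of 'b'.
  uint64_t clmul_lo(uint64_t a, uint64_t b) {
    uint64_t result = 0;
    for (unsigned i = 0; i < 64; ++i)
      if (b & (1ULL << i))
        result ^= a << i;
    return result;
  }
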
+ if (DemandedElts1.isSubsetOf(UndefElts1) || + DemandedElts2.isSubsetOf(UndefElts2)) { + return IC.replaceInstUsesWith(II, + ConstantAggregateZero::get(II.getType())); + } + + if (MadeChange) { + return &II; + } + } + break; + } + + case Intrinsic::x86_sse41_insertps: + if (Value *V = simplifyX86insertps(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II.getArgOperand(0); + unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
+ if (CI11) { + const APInt &V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertqi: { + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // fold (blend A, A, Mask) -> A + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Mask = II.getArgOperand(2); + if (Op0 == Op1) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. + if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { + Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); + } + + // Convert to a vector select if we can bypass casts and find a boolean + // vector condition value. 
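A lane-wise sketch of what these blend intrinsics compute (invented helper; not code from this tree): every result lane is taken from the second operand when the most significant bit of the matching mask lane is set, otherwise from the first, which is why a constant mask, or a mask that is a sign-extended boolean vector, can be rewritten as a plain select.

#include <cstdint>
#include <cstdio>

// Per-lane model of (v)blendvps / (v)pblendvb: select on the mask's sign bit.
static void blendvps(const float *op0, const float *op1,
                     const uint32_t *mask, float *out, int lanes) {
  for (int i = 0; i < lanes; ++i)
    out[i] = (mask[i] & 0x80000000u) ? op1[i] : op0[i];
}

int main() {
  float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, r[4];
  uint32_t m[4] = {0x80000000u, 0u, 0xffffffffu, 0x7fffffffu};
  blendvps(a, b, m, r, 4);
  for (float v : r) printf("%g ", v); // 10 2 30 4
  printf("\n");
  return 0;
}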
+ Value *BoolVec; + Mask = InstCombiner::peekThroughBitcast(Mask); + if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && + BoolVec->getType()->isVectorTy() && + BoolVec->getType()->getScalarSizeInBits() == 1) { + assert(Mask->getType()->getPrimitiveSizeInBits() == + II.getType()->getPrimitiveSizeInBits() && + "Not expecting mask and operands with different sizes"); + + unsigned NumMaskElts = + cast<FixedVectorType>(Mask->getType())->getNumElements(); + unsigned NumOperandElts = + cast<FixedVectorType>(II.getType())->getNumElements(); + if (NumMaskElts == NumOperandElts) { + return SelectInst::Create(BoolVec, Op1, Op0); + } + + // If the mask has less elements than the operands, each mask bit maps to + // multiple elements of the operands. Bitcast back and forth. + if (NumMaskElts < NumOperandElts) { + Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); + Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); + Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); + return new BitCastInst(Sel, II.getType()); + } + } + + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + if (Value *V = simplifyX86pshufb(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + case Intrinsic::x86_avx512_permvar_df_256: + case Intrinsic::x86_avx512_permvar_df_512: + case Intrinsic::x86_avx512_permvar_di_256: + case Intrinsic::x86_avx512_permvar_di_512: + case Intrinsic::x86_avx512_permvar_hi_128: + case Intrinsic::x86_avx512_permvar_hi_256: + case Intrinsic::x86_avx512_permvar_hi_512: + case Intrinsic::x86_avx512_permvar_qi_128: + case Intrinsic::x86_avx512_permvar_qi_256: + case Intrinsic::x86_avx512_permvar_qi_512: + case Intrinsic::x86_avx512_permvar_sf_512: + case Intrinsic::x86_avx512_permvar_si_512: + if (Value *V = simplifyX86vpermv(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: + if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { + return I; + } + break; + + case Intrinsic::x86_sse2_maskmov_dqu: + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: + if (simplifyX86MaskedStore(II, IC)) { + return nullptr; + } + break; + + case Intrinsic::x86_addcarry_32: + case Intrinsic::x86_addcarry_64: + if (Value *V = simplifyX86addcarry(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + default: + break; + } + return None; +} + +Optional<Value *> 
X86TTIImpl::simplifyDemandedUseBitsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx2_pmovmskb: { + // MOVMSK copies the vector elements' sign bits to the low bits + // and zeros the high bits. + unsigned ArgWidth; + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { + ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. + } else { + auto Arg = II.getArgOperand(0); + auto ArgType = cast<FixedVectorType>(Arg->getType()); + ArgWidth = ArgType->getNumElements(); + } + + // If we don't need any of low bits then return zero, + // we know that DemandedMask is non-zero already. + APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); + Type *VTy = II.getType(); + if (DemandedElts.isNullValue()) { + return ConstantInt::getNullValue(VTy); + } + + // We know that the upper bits are set to zero. + Known.Zero.setBitsFrom(ArgWidth); + KnownBitsComputed = true; + break; + } + } + return None; +} + +Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function<void(Instruction *, unsigned, APInt, APInt &)> + simplifyAndSetOp) const { + unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + // The instructions for these intrinsics are speced to zero upper bits not + // pass them through like other scalar intrinsics. So we shouldn't just + // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. + // Instead we should return a zero vector. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return ConstantAggregateZero::get(II.getType()); + } + + // Only the lower element is used. + DemandedElts = 1; + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // Only the lower element is undefined. The high elements are zero. + UndefElts = UndefElts[0]; + break; + + // Unary scalar-as-vector operations that work column-wise. + case Intrinsic::x86_sse_rcp_ss: + case Intrinsic::x86_sse_rsqrt_ss: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions + // checks). + break; + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0. The low element is a function of both + // operands. + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse2_cmp_sd: { + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. 
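A sketch of the scalar-as-vector ("ss"/"sd") lane behaviour assumed here (invented helper; not code from this tree; NaN and signed-zero corner cases of the real instruction are ignored): lane 0 of the result depends on both inputs while the upper lanes are copied from the first operand, so only the low element of the second operand is ever read.

#include <cstdio>

// Lane model of an SSE scalar op such as minss: lane 0 is computed,
// lanes 1..3 are passed through from the first operand.
static void min_ss(const float a[4], const float b[4], float out[4]) {
  out[0] = (a[0] < b[0]) ? a[0] : b[0];
  out[1] = a[1];
  out[2] = a[2];
  out[3] = a[3];
}

int main() {
  float a[4] = {5, 6, 7, 8}, b[4] = {1, -100, -100, -100}, r[4];
  min_ss(a, b, r);
  printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 1 6 7 8
  return 0;
}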
+ DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Lower element is undefined if both lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0]) + UndefElts.clearBit(0); + + break; + } + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element comes from operand 1. + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // Don't use the low element of operand 0. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(0); + simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Take the high undef elements from operand 0 and take the lower element + // from operand 1. + UndefElts.clearBit(0); + UndefElts |= UndefElts2[0]; + break; + } + + // Three input scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element is a function of all + // three inputs. + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1 and 2. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); + + // Lower element is undefined if all three lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); + break; + + // TODO: Add fmaddsub support? + case Intrinsic::x86_sse3_addsub_pd: + case Intrinsic::x86_sse3_addsub_ps: + case Intrinsic::x86_avx_addsub_pd_256: + case Intrinsic::x86_avx_addsub_ps_256: { + // If none of the even or none of the odd lanes are required, turn this + // into a generic FP math instruction. + APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); + APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); + bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); + bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); + if (IsSubOnly || IsAddOnly) { + assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); + IRBuilderBase::InsertPointGuard Guard(IC.Builder); + IC.Builder.SetInsertPoint(&II); + Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); + return IC.Builder.CreateBinOp( + IsSubOnly ? 
Instruction::FSub : Instruction::FAdd, Arg0, Arg1); + } + + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + UndefElts &= UndefElts2; + break; + } + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: { + auto *Ty0 = II.getArgOperand(0)->getType(); + unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); + assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); + + unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; + unsigned VWidthPerLane = VWidth / NumLanes; + unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; + + // Per lane, pack the elements of the first input and then the second. + // e.g. + // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) + // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) + for (int OpNum = 0; OpNum != 2; ++OpNum) { + APInt OpDemandedElts(InnerVWidth, 0); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned LaneIdx = Lane * VWidthPerLane; + for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { + unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; + if (DemandedElts[Idx]) + OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); + } + } + + // Demand elements from the operand. + APInt OpUndefElts(InnerVWidth, 0); + simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); + + // Pack the operand's UNDEF elements, one lane at a time. + OpUndefElts = OpUndefElts.zext(VWidth); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); + LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); + LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); + UndefElts |= LaneElts; + } + } + break; + } + + // PSHUFB + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + // PERMILVAR + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + // PERMV + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: { + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); + break; + } + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. 
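An illustrative one-lane model of packssdw (not code from this tree), showing the element order the demanded-elements loop above walks: each 128-bit lane of the result holds the saturated elements of the first operand's lane followed by those of the second operand's lane, and the wider forms simply repeat this per lane.

#include <cstdint>
#include <cstdio>

// Saturate a signed 32-bit value to the signed 16-bit range.
static int16_t sat16(int32_t v) {
  if (v > INT16_MAX) return INT16_MAX;
  if (v < INT16_MIN) return INT16_MIN;
  return (int16_t)v;
}

// One 128-bit lane of packssdw: first input's lane, then second input's lane.
static void packssdw_lane(const int32_t x[4], const int32_t y[4], int16_t out[8]) {
  for (int i = 0; i < 4; ++i) out[i] = sat16(x[i]);
  for (int i = 0; i < 4; ++i) out[4 + i] = sat16(y[i]);
}

int main() {
  int32_t x[4] = {1, 70000, -70000, -1};
  int32_t y[4] = {2, 3, 4, 5};
  int16_t r[8];
  packssdw_lane(x, y, r);
  for (int16_t v : r) printf("%d ", v); // 1 32767 -32768 -1 2 3 4 5
  printf("\n");
  return 0;
}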
+ case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts.setHighBits(VWidth / 2); + break; + } + return None; +} diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td index e4f3290cab..e9ec53476c 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrAMX.td @@ -16,21 +16,21 @@ let Predicates = [HasAMXTILE, In64BitMode] in { let SchedRW = [WriteSystem] in { - let hasSideEffects = 1, - Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + let hasSideEffects = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "ldtilecfg\t$src", [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS; - let hasSideEffects = 1 in + let hasSideEffects = 1 in def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "sttilecfg\t$src", [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD; - let mayLoad = 1 in + let mayLoad = 1 in def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloadd\t{$src, $dst|$dst, $src}", []>, VEX, T8XD; - let mayLoad = 1 in + let mayLoad = 1 in def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloaddt1\t{$src, $dst|$dst, $src}", []>, @@ -38,7 +38,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in { let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS; - let mayStore = 1 in + let mayStore = 1 in def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), (ins sibmem:$dst, TILE:$src), "tilestored\t{$src, $dst|$dst, $src}", []>, @@ -47,25 +47,25 @@ let Predicates = [HasAMXTILE, In64BitMode] in { "tilezero\t$dst", []>, VEX, T8XD; - // Pseduo instruction for RA. - let hasSideEffects = 1, mayLoad = 1, - Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in - def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; - - let hasSideEffects = 1, mayStore = 1 in - def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; - - def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, - opaquemem:$src3, - TILECFG:$cfg), []>; - def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, - GR16:$src2, opaquemem:$src3, - TILE:$src4, TILECFG:$cfg), []>; - def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, - TILECFG:$cfg), []>; - + // Pseduo instruction for RA. + let hasSideEffects = 1, mayLoad = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; + + let hasSideEffects = 1, mayStore = 1 in + def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; + + def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3, + TILECFG:$cfg), []>; + def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, + GR16:$src2, opaquemem:$src3, + TILE:$src4, TILECFG:$cfg), []>; + def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, + TILECFG:$cfg), []>; + let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
// To be translated to the actual instructions in X86ISelLowering.cpp @@ -74,7 +74,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in { sibmem:$src2), []>; def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), - [(int_x86_tilezero timm:$src)]>; + [(int_x86_tilezero timm:$src)]>; } } // SchedRW } // HasAMXTILE @@ -100,31 +100,31 @@ let Predicates = [HasAMXINT8, In64BitMode] in { VEX_4V, T8PS; } - // Pseduo instruction for RA. - let Constraints = "$src4 = $dst" in - def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; - + // Pseduo instruction for RA. + let Constraints = "$src4 = $dst" in + def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; + let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbssd timm:$src1, - timm:$src2, timm:$src3)]>; + [(int_x86_tdpbssd timm:$src1, + timm:$src2, timm:$src3)]>; def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbsud timm:$src1, - timm:$src2, timm:$src3)]>; + [(int_x86_tdpbsud timm:$src1, + timm:$src2, timm:$src3)]>; def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbusd timm:$src1, - timm:$src2, timm:$src3)]>; + [(int_x86_tdpbusd timm:$src1, + timm:$src2, timm:$src3)]>; def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbuud timm:$src1, - timm:$src2, timm:$src3)]>; + [(int_x86_tdpbuud timm:$src1, + timm:$src2, timm:$src3)]>; } } } // HasAMXTILE @@ -142,8 +142,8 @@ let Predicates = [HasAMXBF16, In64BitMode] in { // To be translated to the actual instructions in X86ISelLowering.cpp def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbf16ps timm:$src1, - timm:$src2, timm:$src3)]>; + [(int_x86_tdpbf16ps timm:$src1, + timm:$src2, timm:$src3)]>; } } } // HasAMXTILE, HasAMXBF16 diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td index 19012797ae..654dcc1b39 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrAVX512.td @@ -1123,10 +1123,10 @@ defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, EXTRACT_get_vextract256_imm, [HasAVX512]>; // vextractps - extract 32 bits from XMM -def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32orGR64:$dst), +def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32orGR64:$dst), (ins VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX, VEX_WIG, Sched<[WriteVecExtract]>; def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs), @@ -1414,12 +1414,12 @@ defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, - X86VectorVTInfo _Dst, - X86VectorVTInfo _Src> { + SDPatternOperator OpNode, + X86VectorVTInfo _Dst, + 
X86VectorVTInfo _Src> { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", - (_Dst.VT (OpNode addr:$src))>, + (_Dst.VT (OpNode addr:$src))>, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } @@ -1428,14 +1428,14 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, // the unmasked patterns so that we only use the DQ instructions when masking // is requested. multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, - X86VectorVTInfo _Dst, - X86VectorVTInfo _Src> { + SDPatternOperator OpNode, + X86VectorVTInfo _Dst, + X86VectorVTInfo _Src> { let hasSideEffects = 0, mayLoad = 1 in defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", (null_frag), - (_Dst.VT (OpNode addr:$src))>, + (_Dst.VT (OpNode addr:$src))>, Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } @@ -1445,194 +1445,194 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, // defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", - X86SubVBroadcastld128, v16i32_info, v4i32x_info>, + X86SubVBroadcastld128, v16i32_info, v4i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT4>; defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", - X86SubVBroadcastld128, v16f32_info, v4f32x_info>, + X86SubVBroadcastld128, v16f32_info, v4f32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT4>; defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", - X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W, + X86SubVBroadcastld256, v8i64_info, v4i64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", - X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W, + X86SubVBroadcastld256, v8f64_info, v4f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; let Predicates = [HasAVX512] in { -def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)), +def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTF64X4rm addr:$src)>; -def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTF64X4rm addr:$src)>; -def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)), - (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)), +def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)), + (VBROADCASTF64X4rm addr:$src)>; +def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)), +def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)), +def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTI64X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)), + (VBROADCASTI64X4rm addr:$src)>; -def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF32X4rm addr:$src)>; -def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTF32X4rm addr:$src)>; -def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v16i32 
(X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTI32X4rm addr:$src)>; // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), (v16f32 immAllZerosV)), (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), (v16i32 immAllZerosV)), (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), (v8f64 immAllZerosV)), (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), (v8i64 immAllZerosV)), (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } let Predicates = [HasVLX] in { defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", - X86SubVBroadcastld128, v8i32x_info, v4i32x_info>, + X86SubVBroadcastld128, v8i32x_info, v4i32x_info>, EVEX_V256, EVEX_CD8<32, CD8VT4>; defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", - X86SubVBroadcastld128, v8f32x_info, v4f32x_info>, + X86SubVBroadcastld128, v8f32x_info, v4f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VT4>; -def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF32X4Z256rm addr:$src)>; -def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTF32X4Z256rm addr:$src)>; -def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), - (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF32X4Z256rm addr:$src)>; +def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4Z256rm addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4Z256rm addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTI32X4Z256rm addr:$src)>; // 
Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))), + (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))), (v8f32 immAllZerosV)), (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))), + (bc_v8f32 (v4f64 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))), + (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))), (v8i32 immAllZerosV)), (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))), + (bc_v8i32 (v4i64 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; } let Predicates = [HasVLX, HasDQI] in { defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X, + X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X, + X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), + (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), (v4f64 immAllZerosV)), (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), + (bc_v4f64 (v8f32 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), + (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), (v4i64 immAllZerosV)), (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), + (bc_v4i64 (v8i32 (X86SubVBroadcastld128 addr:$src))), VR256X:$src0), (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; } let Predicates = [HasDQI] in { defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W, + X86SubVBroadcastld128, v8i64_info, v2i64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8", - X86SubVBroadcastld256, v16i32_info, v8i32x_info>, + X86SubVBroadcastld256, v16i32_info, v8i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W, + X86SubVBroadcastld128, v8f64_info, v2f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", - X86SubVBroadcastld256, v16f32_info, v8f32x_info>, + X86SubVBroadcastld256, v16f32_info, v8f32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; // Patterns for selects of bitcasted operations. 
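A sketch of the data movement behind these subvector-broadcast patterns (invented helper; not code from this tree): a 128-bit (or 256-bit) block loaded from memory is repeated across every such lane of the wider destination; the surrounding patterns mostly differ in element type, destination width, and the feature set (AVX-512 / VLX / DQI) they require.

#include <cstdio>

// Conceptual model of a vbroadcastf32x4-style subvector broadcast:
// a 4-element block is repeated across the whole destination.
static void broadcast_f32x4(const float src[4], float dst[], int dst_elts) {
  for (int i = 0; i < dst_elts; ++i)
    dst[i] = src[i % 4];
}

int main() {
  float block[4] = {1, 2, 3, 4};
  float zmm[16]; // a 512-bit destination holds the block four times
  broadcast_f32x4(block, zmm, 16);
  for (float v : zmm) printf("%g ", v);
  printf("\n");
  return 0;
}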
def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), (v16f32 immAllZerosV)), (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), + (bc_v16f32 (v8f64 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), (v16i32 immAllZerosV)), (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), + (bc_v16i32 (v8i64 (X86SubVBroadcastld256 addr:$src))), VR512:$src0), (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), (v8f64 immAllZerosV)), (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), + (bc_v8f64 (v16f32 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), (v8i64 immAllZerosV)), (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; def : Pat<(vselect_mask VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), + (bc_v8i64 (v16i32 (X86SubVBroadcastld128 addr:$src))), VR512:$src0), (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } @@ -2531,71 +2531,71 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; - - // Patterns for mask intrinsics. - def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, - (_.KVT immAllOnesV)), - (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>; - - def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1, - _.RC:$src2, timm:$cc)>; - - def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc, - (_.KVT immAllOnesV)), - (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>; - - def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc, - _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, - addr:$src2, timm:$cc)>; - - def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc, - (_.KVT immAllOnesV)), - (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>; - - def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc, - _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, - addr:$src2, timm:$cc)>; - - // Patterns for mask intrinsics with loads in other operand. 
- def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, - (_.KVT immAllOnesV)), - (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, - (X86cmpm_imm_commute timm:$cc))>; - - def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, - _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, - _.RC:$src1, addr:$src2, - (X86cmpm_imm_commute timm:$cc))>; - - def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, - (_.KVT immAllOnesV)), - (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, - (X86cmpm_imm_commute timm:$cc))>; - - def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, - _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, - _.RC:$src1, addr:$src2, - (X86cmpm_imm_commute timm:$cc))>; + + // Patterns for mask intrinsics. + def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, + (_.KVT immAllOnesV)), + (!cast<Instruction>(Name#_.ZSuffix#"rri") _.RC:$src1, _.RC:$src2, timm:$cc)>; + + def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix#"rrik") _.KRCWM:$mask, _.RC:$src1, + _.RC:$src2, timm:$cc)>; + + def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc, + (_.KVT immAllOnesV)), + (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, timm:$cc)>; + + def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc, + _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, + addr:$src2, timm:$cc)>; + + def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc, + (_.KVT immAllOnesV)), + (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, timm:$cc)>; + + def : Pat<(X86cmpmm (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc, + _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, + addr:$src2, timm:$cc)>; + + // Patterns for mask intrinsics with loads in other operand. + def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, + (_.KVT immAllOnesV)), + (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, + (X86cmpm_imm_commute timm:$cc))>; + + def : Pat<(X86cmpmm (_.VT (_.LdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, + _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + (X86cmpm_imm_commute timm:$cc))>; + + def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, + (_.KVT immAllOnesV)), + (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, + (X86cmpm_imm_commute timm:$cc))>; + + def : Pat<(X86cmpmm (_.VT (_.BroadcastLdFrag addr:$src2)), (_.VT _.RC:$src1), timm:$cc, + _.KRCWM:$mask), + (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + (X86cmpm_imm_commute timm:$cc))>; } multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { // comparison code form (VCMP[EQ/LT/LE/...] 
let Uses = [MXCSR] in - defm rrib : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), - (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc), + defm rrib : AVX512_maskable_custom_cmp<0xC2, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1", "$src1, $src2, {sae}, $cc", - [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1), - (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))], - [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1), - (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>, + [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1), + (_.VT _.RC:$src2), timm:$cc, (_.KVT immAllOnesV)))], + [(set _.KRC:$dst, (X86cmpmmSAE (_.VT _.RC:$src1), + (_.VT _.RC:$src2), timm:$cc, _.KRCWM:$mask))]>, EVEX_B, Sched<[sched]>; } @@ -2855,8 +2855,8 @@ def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>; def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>; -def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>; +def : Pat<(i8 (trunc (i16 (bitconvert (v16i1 VK16:$src))))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_8bit)>; def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>; @@ -2937,9 +2937,9 @@ let Predicates = [HasAVX512] in { def : Pat<(insert_subvector (v16i1 immAllZerosV), (v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)), - (KMOVWkr (AND32ri8 - (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), - (i32 1)))>; + (KMOVWkr (AND32ri8 + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), + (i32 1)))>; } // Mask unary operation @@ -6504,8 +6504,8 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512vl_f64_info, "PD">, VEX_W; } -defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma, - fma, X86FmaddRnd>; +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma, + fma, X86FmaddRnd>; defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub, X86Fmsub, X86FmsubRnd>; defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, @@ -6595,8 +6595,8 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512vl_f64_info, "PD">, VEX_W; } -defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma, - fma, X86FmaddRnd>; +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma, + fma, X86FmaddRnd>; defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub, X86Fmsub, X86FmsubRnd>; defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, @@ -6687,8 +6687,8 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512vl_f64_info, "PD">, VEX_W; } -defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma, - fma, X86FmaddRnd>; +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma, + fma, X86FmaddRnd>; defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub, X86Fmsub, X86FmsubRnd>; defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, @@ -6790,7 +6790,7 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", any_fma, X86FmaddRnd>; +defm VFMADD : 
avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", any_fma, X86FmaddRnd>; defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>; defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>; defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>; @@ -6998,7 +6998,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp, } } -defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD", +defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD", "SS", X86Movss, v4f32x_info, fp32imm0>; defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB", "SS", X86Movss, v4f32x_info, fp32imm0>; @@ -7007,7 +7007,7 @@ defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMA defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS", X86Movss, v4f32x_info, fp32imm0>; -defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD", +defm : avx512_scalar_fma_patterns<any_fma, fma, X86FmaddRnd, "VFMADD", "SD", X86Movsd, v2f64x_info, fp64imm0>; defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB", "SD", X86Movsd, v2f64x_info, fp64imm0>; @@ -7540,7 +7540,7 @@ multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86FoldableSchedWrite sched, X86VectorVTInfo _src, X86VectorVTInfo _dst> { - let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in { + let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in { defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>, avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD; @@ -7551,7 +7551,7 @@ multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched, X86VectorVTInfo _src, X86VectorVTInfo _dst> { - let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in { + let Predicates = [HasAVX512], ExeDomain = SSEPackedSingle in { defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode, sched>, avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeSAE, sched>, EVEX_CD8<32, CD8VT1>, XS; @@ -10879,7 +10879,7 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, def mr : AVX512Ii8<opc, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))), + [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))), addr:$dst)]>, EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>; } @@ -10890,7 +10890,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { (ins _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, - (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>, + (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>, EVEX, TAPD, Sched<[WriteVecExtract]>; defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; @@ -10903,7 +10903,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { (ins _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, - (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>, + (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>, EVEX, PD, Sched<[WriteVecExtract]>; let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in @@ 
-10943,13 +10943,13 @@ defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, PatFrag LdFrag, - SDPatternOperator immoperator> { + X86VectorVTInfo _, PatFrag LdFrag, + SDPatternOperator immoperator> { def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, - (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } @@ -10960,10 +10960,10 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, - (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V, + (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V, Sched<[WriteVecInsert]>; - defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>; + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>; } } @@ -10978,7 +10978,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, EVEX_4V, TAPD, Sched<[WriteVecInsert]>; defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, - _.ScalarLdFrag, imm>, TAPD; + _.ScalarLdFrag, imm>, TAPD; } } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td index e83e1e74ff..37768af345 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrArithmetic.td @@ -1182,15 +1182,15 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, X86sub_flag, sub, 0, 1, 0>; } -// Version of XOR8rr_NOREX that use GR8_NOREX. This is used by the handling of -// __builtin_parity where the last step xors an h-register with an l-register. -let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst", - Defs = [EFLAGS], isCommutable = 1 in -def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst), - (ins GR8_NOREX:$src1, GR8_NOREX:$src2), - "xor{b}\t{$src2, $dst|$dst, $src2}", []>, - Sched<[WriteALU]>; - +// Version of XOR8rr_NOREX that use GR8_NOREX. This is used by the handling of +// __builtin_parity where the last step xors an h-register with an l-register. +let isCodeGenOnly = 1, hasSideEffects = 0, Constraints = "$src1 = $dst", + Defs = [EFLAGS], isCommutable = 1 in +def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst), + (ins GR8_NOREX:$src1, GR8_NOREX:$src2), + "xor{b}\t{$src2, $dst|$dst, $src2}", []>, + Sched<[WriteALU]>; + // Arithmetic. 
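A small illustration (not code from this tree) of the xor-folding identity behind the __builtin_parity lowering mentioned above: the parity of a 16-bit value equals the parity of the xor of its two bytes, which is the h-register/l-register xor the comment refers to, and PF then reflects the parity of that final byte.

#include <cstdint>
#include <cstdio>

// Parity is invariant under xor-folding: fold the high byte into the low
// byte, then count bits in the remaining 8-bit value.
static int parity16(uint16_t v) {
  uint8_t folded = (uint8_t)(v >> 8) ^ (uint8_t)v; // the h/l-register xor
  int p = 0;
  for (int i = 0; i < 8; ++i)
    p ^= (folded >> i) & 1;
  return p; // 1 if v has an odd number of set bits
}

int main() {
  printf("%d %d\n", parity16(0x0101), parity16(0x0103)); // 0 1
  return 0;
}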
defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, 1, 0>; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td index dc6361aecc..49ab46522c 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrCompiler.td @@ -73,32 +73,32 @@ let usesCustomInserter = 1, Defs = [EFLAGS] in { def VASTART_SAVE_XMM_REGS : I<0, Pseudo, (outs), (ins GR8:$al, - i32imm:$regsavefi, i32imm:$offset, + i32imm:$regsavefi, i32imm:$offset, variable_ops), "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", [(X86vastart_save_xmm_regs GR8:$al, - timm:$regsavefi, - timm:$offset), + timm:$regsavefi, + timm:$offset), (implicit EFLAGS)]>; -// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the -// va_list, and place the address of the next argument into a register. -let Defs = [EFLAGS] in { +// The VAARG_64 and VAARG_X32 pseudo-instructions take the address of the +// va_list, and place the address of the next argument into a register. +let Defs = [EFLAGS] in { def VAARG_64 : I<0, Pseudo, (outs GR64:$dst), (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), "#VAARG_64 $dst, $ap, $size, $mode, $align", [(set GR64:$dst, - (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)), - (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>; -def VAARG_X32 : I<0, Pseudo, - (outs GR32:$dst), - (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), - "#VAARG_X32 $dst, $ap, $size, $mode, $align", - [(set GR32:$dst, - (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)), - (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>; -} + (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)), + (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>; +def VAARG_X32 : I<0, Pseudo, + (outs GR32:$dst), + (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), + "#VAARG_X32 $dst, $ap, $size, $mode, $align", + [(set GR32:$dst, + (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)), + (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>; +} // When using segmented stacks these are lowered into instructions which first // check if the current stacklet has enough free memory. 
If it does, memory is @@ -474,19 +474,19 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, - Requires<[In64BitMode, IsLP64]>; + Requires<[In64BitMode, IsLP64]>; def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_base_addr64", [(X86tlsbaseaddr tls64baseaddr:$sym)]>, - Requires<[In64BitMode, IsLP64]>; -def TLS_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym), - "# TLS_addrX32", - [(X86tlsaddr tls32addr:$sym)]>, - Requires<[In64BitMode, NotLP64]>; -def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym), - "# TLS_base_addrX32", - [(X86tlsbaseaddr tls32baseaddr:$sym)]>, - Requires<[In64BitMode, NotLP64]>; + Requires<[In64BitMode, IsLP64]>; +def TLS_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addrX32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[In64BitMode, NotLP64]>; +def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addrX32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[In64BitMode, NotLP64]>; } // Darwin TLS Support @@ -847,21 +847,21 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in { } let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], - Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], - isCodeGenOnly = 1, usesCustomInserter = 1 in { -def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), - "cmpxchg8b\t$ptr", - [(X86cas8 addr:$ptr)]>, TB, LOCK; -} - -let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], - isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { -def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), - "cmpxchg16b\t$ptr", - []>, TB, LOCK; + Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, usesCustomInserter = 1 in { +def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), + "cmpxchg8b\t$ptr", + [(X86cas8 addr:$ptr)]>, TB, LOCK; } +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { +def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), + "cmpxchg16b\t$ptr", + []>, TB, LOCK; +} + // This pseudo must be used when the frame uses RBX as // the base pointer. Indeed, in such situation RBX is a reserved // register and the register allocator will ignore any use/def of @@ -869,64 +869,64 @@ def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), // RBX that will happen when setting the arguments for the instrucion. // // Unlike the actual related instruction, we mark that this one -// defines RBX (instead of using RBX). +// defines RBX (instead of using RBX). // The rationale is that we will define RBX during the expansion of -// the pseudo. The argument feeding RBX is rbx_input. +// the pseudo. The argument feeding RBX is rbx_input. // -// The additional argument, $rbx_save, is a temporary register used to +// The additional argument, $rbx_save, is a temporary register used to // save the value of RBX across the actual instruction. // -// To make sure the register assigned to $rbx_save does not interfere with +// To make sure the register assigned to $rbx_save does not interfere with // the definition of the actual instruction, we use a definition $dst which // is tied to $rbx_save. 
That way, the live-range of $rbx_save spans across // the instruction and we are sure we will have a valid register to restore // the value of RBX. -let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], - isCodeGenOnly = 1, isPseudo = 1, - mayLoad = 1, mayStore = 1, hasSideEffects = 0, - Constraints = "$rbx_save = $dst" in { -def LCMPXCHG16B_SAVE_RBX : - I<0, Pseudo, (outs GR64:$dst), - (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>; -} - -// Pseudo instruction that doesn't read/write RBX. Will be turned into either -// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter. -let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX], - Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], - isCodeGenOnly = 1, isPseudo = 1, - mayLoad = 1, mayStore = 1, hasSideEffects = 0, +let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, isPseudo = 1, + mayLoad = 1, mayStore = 1, hasSideEffects = 0, + Constraints = "$rbx_save = $dst" in { +def LCMPXCHG16B_SAVE_RBX : + I<0, Pseudo, (outs GR64:$dst), + (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>; +} + +// Pseudo instruction that doesn't read/write RBX. Will be turned into either +// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter. +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX], + Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW], + isCodeGenOnly = 1, isPseudo = 1, + mayLoad = 1, mayStore = 1, hasSideEffects = 0, usesCustomInserter = 1 in { -def LCMPXCHG16B_NO_RBX : - I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "", - [(X86cas16 addr:$ptr, GR64:$rbx_input)]>; +def LCMPXCHG16B_NO_RBX : + I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "", + [(X86cas16 addr:$ptr, GR64:$rbx_input)]>; } -// This pseudo must be used when the frame uses RBX/EBX as -// the base pointer. -// cf comment for LCMPXCHG16B_SAVE_RBX. -let Defs = [EBX], Uses = [ECX, EAX], - Predicates = [HasMWAITX], SchedRW = [WriteSystem], - isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst" in { -def MWAITX_SAVE_RBX : - I<0, Pseudo, (outs GR64:$dst), - (ins GR32:$ebx_input, GR64:$rbx_save), - "mwaitx", - []>; +// This pseudo must be used when the frame uses RBX/EBX as +// the base pointer. +// cf comment for LCMPXCHG16B_SAVE_RBX. +let Defs = [EBX], Uses = [ECX, EAX], + Predicates = [HasMWAITX], SchedRW = [WriteSystem], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst" in { +def MWAITX_SAVE_RBX : + I<0, Pseudo, (outs GR64:$dst), + (ins GR32:$ebx_input, GR64:$rbx_save), + "mwaitx", + []>; } -// Pseudo mwaitx instruction to use for custom insertion. -let Predicates = [HasMWAITX], SchedRW = [WriteSystem], - isCodeGenOnly = 1, isPseudo = 1, +// Pseudo mwaitx instruction to use for custom insertion. 
+let Predicates = [HasMWAITX], SchedRW = [WriteSystem], + isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in { -def MWAITX : - I<0, Pseudo, (outs), (ins GR32:$ecx, GR32:$eax, GR32:$ebx), - "mwaitx", - [(int_x86_mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>; +def MWAITX : + I<0, Pseudo, (outs), (ins GR32:$ecx, GR32:$eax, GR32:$ebx), + "mwaitx", + [(int_x86_mwaitx GR32:$ecx, GR32:$eax, GR32:$ebx)]>; } - + defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>; // Atomic exchange and add @@ -1213,49 +1213,49 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), return true; }]>; -def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), - (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>, +def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>, Requires<[Not64BitMode, NotUseIndirectThunkCalls]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. -def : Pat<(X86tcret (load addr:$dst), timm:$off), - (TCRETURNmi addr:$dst, timm:$off)>, +def : Pat<(X86tcret (load addr:$dst), timm:$off), + (TCRETURNmi addr:$dst, timm:$off)>, Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>; -def : Pat<(X86tcret (i32 tglobaladdr:$dst), timm:$off), - (TCRETURNdi tglobaladdr:$dst, timm:$off)>, +def : Pat<(X86tcret (i32 tglobaladdr:$dst), timm:$off), + (TCRETURNdi tglobaladdr:$dst, timm:$off)>, Requires<[NotLP64]>; -def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off), - (TCRETURNdi texternalsym:$dst, timm:$off)>, +def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off), + (TCRETURNdi texternalsym:$dst, timm:$off)>, Requires<[NotLP64]>; -def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), - (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>, +def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>, Requires<[In64BitMode, NotUseIndirectThunkCalls]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. -def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off), - (TCRETURNmi64 addr:$dst, timm:$off)>, +def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off), + (TCRETURNmi64 addr:$dst, timm:$off)>, Requires<[In64BitMode, NotUseIndirectThunkCalls]>; -def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), - (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>, +def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), + (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>, Requires<[In64BitMode, UseIndirectThunkCalls]>; -def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), - (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>, +def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), + (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, timm:$off)>, Requires<[Not64BitMode, UseIndirectThunkCalls]>; -def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off), - (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>, +def : Pat<(X86tcret (i64 tglobaladdr:$dst), timm:$off), + (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>, Requires<[IsLP64]>; -def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off), - (TCRETURNdi64 texternalsym:$dst, timm:$off)>, +def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off), + (TCRETURNdi64 texternalsym:$dst, timm:$off)>, Requires<[IsLP64]>; // Normal calls, with various flavors of addresses. 
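Editorial note on the X86InstrCompiler.td hunks above: the comment block around LCMPXCHG16B_SAVE_RBX explains why the pseudo carries a $rbx_save operand tied to $dst, namely so that a scratch register stays live across cmpxchg16b while RBX is reserved as the frame base pointer. The sketch below only illustrates that save/feed/restore idea and is not code from this commit; the helper name, the pass it would live in, and the exact operand indices are assumptions, and in the in-tree backend the work is split between the custom inserter (see LCMPXCHG16B_NO_RBX above) and the later expansion of the pseudo.

// Hypothetical helper: how an LCMPXCHG16B_SAVE_RBX-style pseudo could be
// lowered. Assumed operand layout, mirroring the (outs)/(ins) lists above:
//   0: $dst (tied to $rbx_save), 1..5: address operands of $ptr,
//   6: $rbx_input, 7: $rbx_save.
#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

static void expandCmpXchg16bSaveRbx(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    const X86InstrInfo *TII) {
  const DebugLoc &DL = MI->getDebugLoc();
  const unsigned NumAddrOps = 5; // base, scale, index, displacement, segment
  Register RbxInput = MI->getOperand(1 + NumAddrOps).getReg(); // $rbx_input
  Register RbxSave = MI->getOperand(2 + NumAddrOps).getReg();  // $rbx_save
  // Preserve the base pointer, then feed the real operand into RBX.
  BuildMI(MBB, MI, DL, TII->get(X86::MOV64rr), RbxSave).addReg(X86::RBX);
  BuildMI(MBB, MI, DL, TII->get(X86::MOV64rr), X86::RBX).addReg(RbxInput);
  // Emit the real cmpxchg16b, forwarding the five address operands of $ptr.
  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII->get(X86::LCMPXCHG16B));
  for (unsigned I = 0; I != NumAddrOps; ++I)
    MIB.add(MI->getOperand(1 + I));
  // Restore RBX once the instruction has executed and drop the pseudo.
  BuildMI(MBB, MI, DL, TII->get(X86::MOV64rr), X86::RBX)
      .addReg(RbxSave, RegState::Kill);
  MI->eraseFromParent();
}

Because $rbx_save is tied to the pseudo's result, the register allocator keeps it live across the whole sequence, which is exactly the point the comment above is making.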
@@ -1344,18 +1344,18 @@ def : Pat<(i32 (anyext_sdiv GR8:$src)), (MOVSX32rr8 GR8:$src)>; // Any instruction that defines a 32-bit result leaves the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying -// anything about the upper 32 bits, they're probably just qualifying a -// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit -// operation will zero-extend up to 64 bits. +// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying +// anything about the upper 32 bits, they're probably just qualifying a +// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit +// operation will zero-extend up to 64 bits. def def32 : PatLeaf<(i32 GR32:$src), [{ return N->getOpcode() != ISD::TRUNCATE && N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && N->getOpcode() != ISD::CopyFromReg && N->getOpcode() != ISD::AssertSext && - N->getOpcode() != ISD::AssertZext && - N->getOpcode() != ISD::AssertAlign && - N->getOpcode() != ISD::FREEZE; + N->getOpcode() != ISD::AssertZext && + N->getOpcode() != ISD::AssertAlign && + N->getOpcode() != ISD::FREEZE; }]>; // In the case of a 32-bit def that is known to implicitly zero-extend, @@ -1732,16 +1732,16 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>, Requires<[In64BitMode]>; -// Special pattern to catch the last step of __builtin_parity handling. Our -// goal is to use an xor of an h-register with the corresponding l-register. -// The above patterns would handle this on non 64-bit targets, but for 64-bit -// we need to be more careful. We're using a NOREX instruction here in case -// register allocation fails to keep the two registers together. So we need to -// make sure we can't accidentally mix R8-R15 with an h-register. -def : Pat<(X86xor_flag (i8 (trunc GR32:$src)), - (i8 (trunc (srl_su GR32:$src, (i8 8))))), - (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit), - (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; +// Special pattern to catch the last step of __builtin_parity handling. Our +// goal is to use an xor of an h-register with the corresponding l-register. +// The above patterns would handle this on non 64-bit targets, but for 64-bit +// we need to be more careful. We're using a NOREX instruction here in case +// register allocation fails to keep the two registers together. So we need to +// make sure we can't accidentally mix R8-R15 with an h-register. 
+def : Pat<(X86xor_flag (i8 (trunc GR32:$src)), + (i8 (trunc (srl_su GR32:$src, (i8 8))))), + (XOR8rr_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit), + (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; // (shl x, 1) ==> (add x, x) // Note that if x is undef (immediate or otherwise), we could theoretically diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td index f9be3a7832..bfaa75c0ce 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFMA.td @@ -123,7 +123,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS", - loadv4f32, loadv8f32, any_fma, v4f32, v8f32, + loadv4f32, loadv8f32, any_fma, v4f32, v8f32, SchedWriteFMA>; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32, @@ -138,7 +138,7 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD", - loadv2f64, loadv4f64, any_fma, v2f64, + loadv2f64, loadv4f64, any_fma, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD", loadv2f64, loadv4f64, X86any_Fmsub, v2f64, @@ -319,7 +319,7 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, VR128, sdmem, sched>, VEX_W; } -defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma, +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", any_fma, SchedWriteFMA.Scl>, VEX_LIG; defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub, SchedWriteFMA.Scl>, VEX_LIG; @@ -372,12 +372,12 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, } } -defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<any_fma, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; -defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<any_fma, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; @@ -538,7 +538,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { let ExeDomain = SSEPackedSingle in { // Scalar Instructions - defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32, + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, any_fma, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, SchedWriteFMA.Scl>; @@ -555,7 +555,7 @@ let ExeDomain = SSEPackedSingle in { fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; // Packed Instructions - defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32, + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", any_fma, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32, 
loadv4f32, loadv8f32, SchedWriteFMA>; @@ -571,7 +571,7 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { // Scalar Instructions - defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64, + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, any_fma, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; @@ -588,7 +588,7 @@ let ExeDomain = SSEPackedDouble in { fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; // Packed Instructions - defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64, + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", any_fma, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; @@ -629,12 +629,12 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name, } } -defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<any_fma, "VFMADDSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>; -defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<any_fma, "VFMADDSD4", v2f64, f64, FR64, loadf64>; defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>; defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>; defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td index 961b4e5903..cfbfe39f88 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFPStack.td @@ -392,13 +392,13 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; let SchedRW = [WriteMicrocoded] in { let Defs = [FPSW, FPCW], mayLoad = 1 in { -def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins anymem:$src), "fldenv\t$src">; -def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins anymem:$src), "frstor\t$src">; +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins anymem:$src), "fldenv\t$src">; +def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins anymem:$src), "frstor\t$src">; } let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in { -def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins anymem:$dst), "fnstenv\t$dst">; -def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins anymem:$dst), "fnsave\t$dst">; +def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins anymem:$dst), "fnstenv\t$dst">; +def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins anymem:$dst), "fnsave\t$dst">; } let Uses = [FPSW], mayStore = 1 in diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp index 17fe7f0bd3..1e3fb7f227 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFoldTables.cpp @@ -300,13 +300,13 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, - { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | 
TB_NO_REVERSE }, { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, @@ -359,8 +359,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::VEXTRACTI64x4Zrr, X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, { X86::VEXTRACTPSrr, X86::VEXTRACTPSmr, TB_FOLDED_STORE }, - { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, - { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOV64toSDZrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOV64toSDrr, X86::MOV64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -371,8 +371,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE }, - { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVDI2SSZrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVDI2SSrr, X86::MOV32mr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, @@ -3748,26 +3748,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 }, { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 }, { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 }, - { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 }, + { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 }, { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 }, { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 }, { X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 }, - { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 }, - { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 }, + { X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 }, + { X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 }, { X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 }, { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 }, { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 }, - { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 }, - { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 }, + { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 }, + { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 }, { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 }, { X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 }, { X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 }, - { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 }, - { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 }, + { X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 }, + { X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 }, { X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 }, { 
X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 }, { X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 }, - { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 }, + { X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 }, { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td index 686b19fc0a..23cec4e363 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFormats.td @@ -216,7 +216,7 @@ class T8XS : T8 { Prefix OpPrefix = XS; } class TAPS : TA { Prefix OpPrefix = PS; } class TAPD : TA { Prefix OpPrefix = PD; } class TAXD : TA { Prefix OpPrefix = XD; } -class TAXS : TA { Prefix OpPrefix = XS; } +class TAXS : TA { Prefix OpPrefix = XS; } class VEX { Encoding OpEnc = EncVEX; } class VEX_W { bit HasVEX_W = 1; } class VEX_WIG { bit IgnoresVEX_W = 1; } @@ -264,9 +264,9 @@ class NotMemoryFoldable { bit isMemoryFoldable = 0; } // Prevent EVEX->VEX conversion from considering this instruction. class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; } -// Force the instruction to use VEX encoding. -class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; } - +// Force the instruction to use VEX encoding. +class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; } + class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, Domain d = GenericDomain> : Instruction { @@ -351,7 +351,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction? bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion. - bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding. + bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding. // TSFlags layout should be kept in sync with X86BaseInfo.h. 
let TSFlags{6-0} = FormBits; @@ -380,7 +380,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{51-45} = CD8_Scale; let TSFlags{52} = hasEVEX_RC; let TSFlags{53} = hasNoTrackPrefix; - let TSFlags{54} = ExplicitVEXPrefix; + let TSFlags{54} = ExplicitVEXPrefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td index 777c5a158b..0d9595128f 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -87,16 +87,16 @@ def X86multishift : SDNode<"X86ISD::MULTISHIFT", SDTCisSameAs<1,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, - SDTCisVT<2, i8>]>>; + SDTCisVT<2, i8>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, - SDTCisVT<2, i8>]>>; + SDTCisVT<2, i8>]>>; def X86pinsrb : SDNode<"X86ISD::PINSRB", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, - SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>; + SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>; def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, - SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>; + SDTCisVT<2, i32>, SDTCisVT<3, i8>]>>; def X86insertps : SDNode<"X86ISD::INSERTPS", SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>; @@ -109,8 +109,8 @@ def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -209,21 +209,21 @@ def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; -def X86MaskCmpMaskCC : - SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, - SDTCisVec<1>, SDTCisSameAs<2, 1>, - SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>; +def X86MaskCmpMaskCC : + SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisSameAs<4, 0>]>; def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -def X86cmpmm : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>; +def X86cmpmm : SDNode<"X86ISD::CMPMM", X86MaskCmpMaskCC>; def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>; def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3), [(X86strict_cmpm node:$src1, node:$src2, node:$src3), (X86cmpm node:$src1, node:$src2, node:$src3)]>; -def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>; +def X86cmpmmSAE : SDNode<"X86ISD::CMPMM_SAE", X86MaskCmpMaskCC>; def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>; @@ -961,16 +961,16 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src), return 
cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; }]>; -def X86SubVBroadcastld128 : PatFrag<(ops node:$src), - (X86SubVBroadcastld node:$src), [{ - return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16; -}]>; - -def X86SubVBroadcastld256 : PatFrag<(ops node:$src), - (X86SubVBroadcastld node:$src), [{ - return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32; -}]>; - +def X86SubVBroadcastld128 : PatFrag<(ops node:$src), + (X86SubVBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16; +}]>; + +def X86SubVBroadcastld256 : PatFrag<(ops node:$src), + (X86SubVBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32; +}]>; + // Scalar SSE intrinsic fragments to match several different types of loads. // Used by scalar SSE intrinsic instructions which have 128 bit types, but // only load a single element. diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp index d9bab14f0c..283ca4164f 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.cpp @@ -28,7 +28,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" -#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" @@ -947,9 +947,9 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, } /// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. -static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { +static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { // Don't waste compile time scanning use-def chains of physregs. - if (!BaseReg.isVirtual()) + if (!BaseReg.isVirtual()) return false; bool isPICBase = false; for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), @@ -1127,8 +1127,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); - if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != - MachineBasicBlock::LQR_Dead) { + if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != + MachineBasicBlock::LQR_Dead) { // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side // effects. int Value; @@ -1206,7 +1206,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, isKill = Src.isKill(); assert(!Src.isUndef() && "Undef op doesn't need optimization"); - if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) + if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) return false; return true; @@ -1214,7 +1214,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // This is for an LEA64_32r and incoming registers are 32-bit. One way or // another we need to add 64-bit registers to the final MI. - if (SrcReg.isPhysical()) { + if (SrcReg.isPhysical()) { ImplicitOp = Src; ImplicitOp.setImplicit(); @@ -1409,8 +1409,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. 
- if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass( - Src.getReg(), &X86::GR64_NOSPRegClass)) + if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass( + Src.getReg(), &X86::GR64_NOSPRegClass)) return nullptr; NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) @@ -2566,10 +2566,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); - case X86::VPDPWSSDYrr: - case X86::VPDPWSSDrr: - case X86::VPDPWSSDSYrr: - case X86::VPDPWSSDSrr: + case X86::VPDPWSSDYrr: + case X86::VPDPWSSDrr: + case X86::VPDPWSSDSYrr: + case X86::VPDPWSSDSrr: case X86::VPDPWSSDZ128r: case X86::VPDPWSSDZ128rk: case X86::VPDPWSSDZ128rkz: @@ -3530,10 +3530,10 @@ X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { return None; } -static unsigned getLoadStoreRegOpcode(Register Reg, +static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, - bool IsStackAligned, - const X86Subtarget &STI, bool load) { + bool IsStackAligned, + const X86Subtarget &STI, bool load) { bool HasAVX = STI.hasAVX(); bool HasAVX512 = STI.hasAVX512(); bool HasVLX = STI.hasVLX(); @@ -3606,7 +3606,7 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 16: { if (X86::VR128XRegClass.hasSubClassEq(RC)) { // If stack is realigned we can use aligned stores. - if (IsStackAligned) + if (IsStackAligned) return load ? (HasVLX ? X86::VMOVAPSZ128rm : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX : @@ -3638,7 +3638,7 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 32: assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. - if (IsStackAligned) + if (IsStackAligned) return load ? (HasVLX ? X86::VMOVAPSZ256rm : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX : @@ -3657,80 +3657,80 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 64: assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); assert(STI.hasAVX512() && "Using 512-bit register requires AVX512"); - if (IsStackAligned) + if (IsStackAligned) return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } } -Optional<ExtAddrMode> -X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, - const TargetRegisterInfo *TRI) const { - const MCInstrDesc &Desc = MemI.getDesc(); - int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); - if (MemRefBegin < 0) - return None; - - MemRefBegin += X86II::getOperandBias(Desc); - - auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg); - if (!BaseOp.isReg()) // Can be an MO_FrameIndex - return None; - - const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp); - // Displacement can be symbolic - if (!DispMO.isImm()) - return None; - - ExtAddrMode AM; - AM.BaseReg = BaseOp.getReg(); - AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg(); - AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm(); - AM.Displacement = DispMO.getImm(); - return AM; -} - -bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, - const Register Reg, - int64_t &ImmVal) const { - if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri) - return false; - // Mov Src can be a global address. 
- if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg) - return false; - ImmVal = MI.getOperand(1).getImm(); - return true; -} - -bool X86InstrInfo::preservesZeroValueInReg( - const MachineInstr *MI, const Register NullValueReg, - const TargetRegisterInfo *TRI) const { - if (!MI->modifiesRegister(NullValueReg, TRI)) - return true; - switch (MI->getOpcode()) { - // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax - // X. - case X86::SHR64ri: - case X86::SHR32ri: - case X86::SHL64ri: - case X86::SHL32ri: - assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() && - "expected for shift opcode!"); - return MI->getOperand(0).getReg() == NullValueReg && - MI->getOperand(1).getReg() == NullValueReg; - // Zero extend of a sub-reg of NullValueReg into itself does not change the - // null value. - case X86::MOV32rr: - return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) { - return TRI->isSubRegisterEq(NullValueReg, MO.getReg()); - }); - default: - return false; - } - llvm_unreachable("Should be handled above!"); -} - +Optional<ExtAddrMode> +X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, + const TargetRegisterInfo *TRI) const { + const MCInstrDesc &Desc = MemI.getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemRefBegin < 0) + return None; + + MemRefBegin += X86II::getOperandBias(Desc); + + auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg); + if (!BaseOp.isReg()) // Can be an MO_FrameIndex + return None; + + const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp); + // Displacement can be symbolic + if (!DispMO.isImm()) + return None; + + ExtAddrMode AM; + AM.BaseReg = BaseOp.getReg(); + AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg(); + AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm(); + AM.Displacement = DispMO.getImm(); + return AM; +} + +bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI, + const Register Reg, + int64_t &ImmVal) const { + if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri) + return false; + // Mov Src can be a global address. + if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg) + return false; + ImmVal = MI.getOperand(1).getImm(); + return true; +} + +bool X86InstrInfo::preservesZeroValueInReg( + const MachineInstr *MI, const Register NullValueReg, + const TargetRegisterInfo *TRI) const { + if (!MI->modifiesRegister(NullValueReg, TRI)) + return true; + switch (MI->getOpcode()) { + // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax + // X. + case X86::SHR64ri: + case X86::SHR32ri: + case X86::SHL64ri: + case X86::SHL32ri: + assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() && + "expected for shift opcode!"); + return MI->getOperand(0).getReg() == NullValueReg && + MI->getOperand(1).getReg() == NullValueReg; + // Zero extend of a sub-reg of NullValueReg into itself does not change the + // null value. 
+ case X86::MOV32rr: + return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) { + return TRI->isSubRegisterEq(NullValueReg, MO.getReg()); + }); + default: + return false; + } + llvm_unreachable("Should be handled above!"); +} + bool X86InstrInfo::getMemOperandsWithOffsetWidth( const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, @@ -3775,17 +3775,17 @@ bool X86InstrInfo::getMemOperandsWithOffsetWidth( return true; } -static unsigned getStoreRegOpcode(Register SrcReg, +static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, - bool IsStackAligned, + bool IsStackAligned, const X86Subtarget &STI) { - return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false); + return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false); } -static unsigned getLoadRegOpcode(Register DestReg, +static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, - bool IsStackAligned, const X86Subtarget &STI) { - return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true); + bool IsStackAligned, const X86Subtarget &STI) { + return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, @@ -3796,31 +3796,31 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const MachineFunction &MF = *MBB.getParent(); assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); - if (RC->getID() == X86::TILERegClassID) { - unsigned Opc = X86::TILESTORED; - // tilestored %tmm, (%sp, %idx) - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); - Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); - BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); - MachineInstr *NewMI = - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) - .addReg(SrcReg, getKillRegState(isKill)); - MachineOperand &MO = NewMI->getOperand(2); - MO.setReg(VirtReg); - MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PSTTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) - .addReg(SrcReg, getKillRegState(isKill)); - } else { - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); - bool isAligned = - (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) - .addReg(SrcReg, getKillRegState(isKill)); - } + if (RC->getID() == X86::TILERegClassID) { + unsigned Opc = X86::TILESTORED; + // tilestored %tmm, (%sp, %idx) + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); + Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); + MachineInstr *NewMI = + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); + MachineOperand &MO = NewMI->getOperand(2); + MO.setReg(VirtReg); + MO.setIsKill(true); + } else if (RC->getID() == X86::TILECFGRegClassID) { + unsigned Opc = X86::PSTTILECFG; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); + } else { + unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + bool isAligned = + 
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); + } } void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, @@ -3828,32 +3828,32 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - if (RC->getID() == X86::TILERegClassID) { - unsigned Opc = X86::TILELOADD; - // tileloadd (%sp, %idx), %tmm - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); - Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); - MachineInstr *NewMI = - BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); - NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), - FrameIdx); - MachineOperand &MO = NewMI->getOperand(3); - MO.setReg(VirtReg); - MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PLDTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), - FrameIdx); - } else { - const MachineFunction &MF = *MBB.getParent(); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); - bool isAligned = - (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || - RI.canRealignStack(MF); - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), - FrameIdx); - } + if (RC->getID() == X86::TILERegClassID) { + unsigned Opc = X86::TILELOADD; + // tileloadd (%sp, %idx), %tmm + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); + Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + MachineInstr *NewMI = + BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); + NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); + MachineOperand &MO = NewMI->getOperand(3); + MO.setReg(VirtReg); + MO.setIsKill(true); + } else if (RC->getID() == X86::TILECFGRegClassID) { + unsigned Opc = X86::PLDTILECFG; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); + } else { + const MachineFunction &MF = *MBB.getParent(); + unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); + } } bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, @@ -4416,7 +4416,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, /// instructions in-between do not load or store, and have no side effects. MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, - Register &FoldAsLoadDefReg, + Register &FoldAsLoadDefReg, MachineInstr *&DefMI) const { // Check whether we can move DefMI here. 
DefMI = MRI->getVRegDef(FoldAsLoadDefReg); @@ -4479,8 +4479,8 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, /// %k4 = K_SET1 /// to: /// %k4 = KXNORrr %k0, %k0 -static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, - Register Reg) { +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, + Register Reg) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); MIB->setDesc(Desc); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); @@ -4926,7 +4926,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // If MI is marked as reading Reg, the partial register update is wanted. const MachineOperand &MO = MI.getOperand(0); Register Reg = MO.getReg(); - if (Reg.isVirtual()) { + if (Reg.isVirtual()) { if (MO.readsReg() || MI.readsVirtualRegister(Reg)) return 0; } else { @@ -5224,12 +5224,12 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, /// Like getPartialRegUpdateClearance, this makes a strong assumption that the /// high bits that are passed-through are not live. unsigned -X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, +X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - const MachineOperand &MO = MI.getOperand(OpNum); - if (Register::isPhysicalRegister(MO.getReg()) && - hasUndefRegUpdate(MI.getOpcode(), OpNum)) - return UndefRegClearance; + const MachineOperand &MO = MI.getOperand(OpNum); + if (Register::isPhysicalRegister(MO.getReg()) && + hasUndefRegUpdate(MI.getOpcode(), OpNum)) + return UndefRegClearance; return 0; } @@ -5311,7 +5311,7 @@ static void updateOperandRegConstraints(MachineFunction &MF, if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual()) + if (!Reg.isVirtual()) continue; auto *NewRC = MRI.constrainRegClass( @@ -5562,10 +5562,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (I != nullptr) { unsigned Opcode = I->DstOp; - bool FoldedLoad = - isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0; - bool FoldedStore = - isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE); + bool FoldedLoad = + isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0; + bool FoldedStore = + isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE); MaybeAlign MinAlign = decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT); if (MinAlign && Alignment < *MinAlign) @@ -5576,25 +5576,25 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - // Check if it's safe to fold the load. If the size of the object is - // narrower than the load width, then it's not. - // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int. - if (FoldedLoad && Size < RCSize) { + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int. + if (FoldedLoad && Size < RCSize) { // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is // due to live interval analysis remat'ing a load from stack slot. 
- if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) - return nullptr; + if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) + return nullptr; if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; } - // For stores, make sure the size of the object is equal to the size of - // the store. If the object is larger, the extra bits would be garbage. If - // the object is smaller we might overwrite another object or fault. - if (FoldedStore && Size != RCSize) - return nullptr; + // For stores, make sure the size of the object is equal to the size of + // the store. If the object is larger, the extra bits would be garbage. If + // the object is smaller we might overwrite another object or fault. + if (FoldedStore && Size != RCSize) + return nullptr; } if (isTwoAddrFold) @@ -5607,7 +5607,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // value and zero-extend the top bits. Change the destination register // to a 32-bit one. Register DstReg = NewMI->getOperand(0).getReg(); - if (DstReg.isPhysical()) + if (DstReg.isPhysical()) NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); @@ -6464,7 +6464,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, } if (Load) BeforeOps.push_back(SDValue(Load, 0)); - llvm::append_range(BeforeOps, AfterOps); + llvm::append_range(BeforeOps, AfterOps); // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (Opc) { default: break; @@ -6782,18 +6782,18 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } -bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - - // ENDBR instructions should not be scheduled around. - unsigned Opcode = MI.getOpcode(); - if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32) - return true; - - return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); -} - +bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + + // ENDBR instructions should not be scheduled around. + unsigned Opcode = MI.getOpcode(); + if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32) + return true; + + return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); +} + bool X86InstrInfo:: reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); @@ -6824,7 +6824,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { "X86-64 PIC uses RIP relative addressing"); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); - Register GlobalBaseReg = X86FI->getGlobalBaseReg(); + Register GlobalBaseReg = X86FI->getGlobalBaseReg(); if (GlobalBaseReg != 0) return GlobalBaseReg; @@ -8380,7 +8380,7 @@ describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, // If the described register is a sub-register of the destination register, // then pick out the source register's corresponding sub-register. 
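// (Editorial illustration, not part of this change: for a register-to-register
// move such as $rax = MOV64rr $rbx, asking about EAX resolves through the
// sub_32bit index, so the loaded value is described in terms of EBX.)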
if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) { - Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx); + Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx); return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); } @@ -8644,7 +8644,7 @@ namespace { return false; X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - Register GlobalBaseReg = X86FI->getGlobalBaseReg(); + Register GlobalBaseReg = X86FI->getGlobalBaseReg(); // If we didn't need a GlobalBaseReg, don't insert code. if (GlobalBaseReg == 0) @@ -8657,7 +8657,7 @@ namespace { MachineRegisterInfo &RegInfo = MF.getRegInfo(); const X86InstrInfo *TII = STI.getInstrInfo(); - Register PC; + Register PC; if (STI.isPICStyleGOT()) PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); else @@ -8727,7 +8727,7 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} // namespace +} // namespace char CGBR::ID = 0; FunctionPass* diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h index d7d2370c6f..6b0f1a9f14 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.h @@ -317,17 +317,17 @@ public: SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - Optional<ExtAddrMode> - getAddrModeFromMemoryOp(const MachineInstr &MemI, - const TargetRegisterInfo *TRI) const override; - - bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, - int64_t &ImmVal) const override; - - bool preservesZeroValueInReg(const MachineInstr *MI, - const Register NullValueReg, - const TargetRegisterInfo *TRI) const override; - + Optional<ExtAddrMode> + getAddrModeFromMemoryOp(const MachineInstr &MemI, + const TargetRegisterInfo *TRI) const override; + + bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, + int64_t &ImmVal) const override; + + bool preservesZeroValueInReg(const MachineInstr *MI, + const Register NullValueReg, + const TargetRegisterInfo *TRI) const override; + bool getMemOperandsWithOffsetWidth( const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset, @@ -420,13 +420,13 @@ public: bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; - /// isSchedulingBoundary - Overrides the isSchedulingBoundary from - /// Codegen/TargetInstrInfo.cpp to make it capable of identifying ENDBR - /// intructions and prevent it from being re-scheduled. - bool isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const override; - + /// isSchedulingBoundary - Overrides the isSchedulingBoundary from + /// Codegen/TargetInstrInfo.cpp to make it capable of identifying ENDBR + /// intructions and prevent it from being re-scheduled. + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads /// should be scheduled togther. 
On some targets if two loads are loading from @@ -470,7 +470,7 @@ public: unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; - unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, + unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; @@ -525,7 +525,7 @@ public: /// the machine instruction generated due to folding. MachineInstr *optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, - Register &FoldAsLoadDefReg, + Register &FoldAsLoadDefReg, MachineInstr *&DefMI) const override; std::pair<unsigned, unsigned> diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td index b006d1d9aa..94e85df086 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrInfo.td @@ -69,8 +69,8 @@ def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, SDTCisVT<2, i8>]>; -def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; -def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>; +def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>; def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, @@ -94,11 +94,11 @@ def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>; -def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, - SDTCisPtrTy<1>, - SDTCisVT<2, i32>, - SDTCisVT<3, i8>, - SDTCisVT<4, i32>]>; +def SDT_X86VAARG : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; @@ -127,11 +127,11 @@ def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; def SDT_X86ENQCMD : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisSameAs<1, 2>]>; -def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>, - SDTCisVT<1, i32>, - SDTCisVT<2, v2i64>, - SDTCisPtrTy<3>]>; - +def SDT_X86AESENCDECKL : SDTypeProfile<2, 2, [SDTCisVT<0, v2i64>, + SDTCisVT<1, i32>, + SDTCisVT<2, v2i64>, + SDTCisPtrTy<3>]>; + def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, [SDNPHasChain,SDNPSideEffect]>; def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, @@ -169,10 +169,10 @@ def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru, def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair, +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair, +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; @@ -186,13 +186,13 @@ def X86vastart_save_xmm_regs : SDT_X86VASTART_SAVE_XMM_REGS, [SDNPHasChain, SDNPVariadic]>; def X86vaarg64 : - SDNode<"X86ISD::VAARG_64", SDT_X86VAARG, - [SDNPHasChain, SDNPMayLoad, SDNPMayStore, - 
SDNPMemOperand]>; -def X86vaargx32 : - SDNode<"X86ISD::VAARG_X32", SDT_X86VAARG, + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +def X86vaargx32 : + SDNode<"X86ISD::VAARG_X32", SDT_X86VAARG, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; def X86callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, [SDNPHasChain, SDNPOutGlue]>; @@ -280,7 +280,7 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags, SDNPMemOperand]>; def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; -def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>; +def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>; def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>; @@ -320,23 +320,23 @@ def X86enqcmd : SDNode<"X86ISD::ENQCMD", SDT_X86ENQCMD, [SDNPHasChain, SDNPSideEffect]>; def X86enqcmds : SDNode<"X86ISD::ENQCMDS", SDT_X86ENQCMD, [SDNPHasChain, SDNPSideEffect]>; -def X86testui : SDNode<"X86ISD::TESTUI", - SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, - [SDNPHasChain, SDNPSideEffect]>; - -def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL, - [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, - SDNPMemOperand]>; -def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL, - [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, - SDNPMemOperand]>; -def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL, - [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, - SDNPMemOperand]>; -def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL, - [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, - SDNPMemOperand]>; - +def X86testui : SDNode<"X86ISD::TESTUI", + SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86aesenc128kl : SDNode<"X86ISD::AESENC128KL", SDT_X86AESENCDECKL, + [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, + SDNPMemOperand]>; +def X86aesdec128kl : SDNode<"X86ISD::AESDEC128KL", SDT_X86AESENCDECKL, + [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, + SDNPMemOperand]>; +def X86aesenc256kl : SDNode<"X86ISD::AESENC256KL", SDT_X86AESENCDECKL, + [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, + SDNPMemOperand]>; +def X86aesdec256kl : SDNode<"X86ISD::AESDEC256KL", SDT_X86AESENCDECKL, + [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, + SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // X86 Operand Definitions. 
// @@ -914,8 +914,8 @@ def PKU : Predicate<"Subtarget->hasPKU()">; def HasVNNI : Predicate<"Subtarget->hasVNNI()">; def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; def HasBF16 : Predicate<"Subtarget->hasBF16()">; -def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; -def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; +def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; +def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; def HasBITALG : Predicate<"Subtarget->hasBITALG()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; @@ -979,15 +979,15 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">; -def HasKL : Predicate<"Subtarget->hasKL()">; -def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">; -def HasHRESET : Predicate<"Subtarget->hasHRESET()">; +def HasKL : Predicate<"Subtarget->hasKL()">; +def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">; +def HasHRESET : Predicate<"Subtarget->hasHRESET()">; def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">; def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">; def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; -def HasUINTR : Predicate<"Subtarget->hasUINTR()">; +def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, @@ -1035,7 +1035,7 @@ def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasERMSB : Predicate<"Subtarget->hasERMSB()">; -def HasFSRM : Predicate<"Subtarget->hasFSRM()">; +def HasFSRM : Predicate<"Subtarget->hasFSRM()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">; def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">; @@ -1073,7 +1073,7 @@ def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; -def i64timmSExt32 : TImmLeaf<i64, [{ return isInt<32>(Imm); }]>; +def i64timmSExt32 : TImmLeaf<i64, [{ return isInt<32>(Imm); }]>; def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{ return isSExtAbsoluteSymbolRef(8, N); @@ -2679,11 +2679,11 @@ let Predicates = [HasBMI2] in { // let Predicates = [HasTBM], Defs = [EFLAGS] in { -multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, - SDNode OpNode, Operand immtype, - SDPatternOperator immoperator, - X86FoldableSchedWrite Sched> { +multiclass tbm_bextri<bits<8> opc, RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + SDNode OpNode, Operand immtype, + SDPatternOperator immoperator, + X86FoldableSchedWrite Sched> { def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), !strconcat(OpcodeStr, "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), @@ -2697,12 +2697,12 @@ multiclass tbm_bextri<bits<8> opc, RegisterClass 
RC, string OpcodeStr, XOP, XOPA, Sched<[Sched.Folded]>; } -defm BEXTRI32 : tbm_bextri<0x10, GR32, "bextr{l}", i32mem, loadi32, - X86bextri, i32imm, timm, WriteBEXTR>; +defm BEXTRI32 : tbm_bextri<0x10, GR32, "bextr{l}", i32mem, loadi32, + X86bextri, i32imm, timm, WriteBEXTR>; let ImmT = Imm32S in -defm BEXTRI64 : tbm_bextri<0x10, GR64, "bextr{q}", i64mem, loadi64, - X86bextri, i64i32imm, - i64timmSExt32, WriteBEXTR>, VEX_W; +defm BEXTRI64 : tbm_bextri<0x10, GR64, "bextr{q}", i64mem, loadi64, + X86bextri, i64i32imm, + i64timmSExt32, WriteBEXTR>, VEX_W; multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, RegisterClass RC, string OpcodeStr, @@ -2808,7 +2808,7 @@ let SchedRW = [ WriteSystem ] in { let Uses = [ ECX, EAX, EBX ] in { def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", - []>, TB, Requires<[ HasMWAITX ]>; + []>, TB, Requires<[ HasMWAITX ]>; } } // SchedRW @@ -2925,41 +2925,41 @@ def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>; def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// -// INVLPGB Instruction -// OPCODE 0F 01 FE -// -let SchedRW = [WriteSystem] in { - let Uses = [EAX, EDX] in - def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins), - "invlpgb}", []>, - PS, Requires<[Not64BitMode]>; - let Uses = [RAX, EDX] in - def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins), - "invlpgb", []>, - PS, Requires<[In64BitMode]>; -} // SchedRW - -def : InstAlias<"invlpgb\t{%eax, %edx|eax, edx}", (INVLPGB32)>, Requires<[Not64BitMode]>; -def : InstAlias<"invlpgb\t{%rax, %edx|rax, edx}", (INVLPGB64)>, Requires<[In64BitMode]>; - -//===----------------------------------------------------------------------===// -// TLBSYNC Instruction -// OPCODE 0F 01 FF -// -let SchedRW = [WriteSystem] in { - def TLBSYNC : I<0x01, MRM_FF, (outs), (ins), - "tlbsync", []>, - PS, Requires<[]>; -} // SchedRW - -//===----------------------------------------------------------------------===// -// HRESET Instruction -// -let Uses = [EAX], SchedRW = [WriteSystem] in - def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>, - Requires<[HasHRESET]>, TAXS; - -//===----------------------------------------------------------------------===// +// INVLPGB Instruction +// OPCODE 0F 01 FE +// +let SchedRW = [WriteSystem] in { + let Uses = [EAX, EDX] in + def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins), + "invlpgb}", []>, + PS, Requires<[Not64BitMode]>; + let Uses = [RAX, EDX] in + def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins), + "invlpgb", []>, + PS, Requires<[In64BitMode]>; +} // SchedRW + +def : InstAlias<"invlpgb\t{%eax, %edx|eax, edx}", (INVLPGB32)>, Requires<[Not64BitMode]>; +def : InstAlias<"invlpgb\t{%rax, %edx|rax, edx}", (INVLPGB64)>, Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// TLBSYNC Instruction +// OPCODE 0F 01 FF +// +let SchedRW = [WriteSystem] in { + def TLBSYNC : I<0x01, MRM_FF, (outs), (ins), + "tlbsync", []>, + PS, Requires<[]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// HRESET Instruction +// +let Uses = [EAX], SchedRW = [WriteSystem] in + def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>, + Requires<[HasHRESET]>, TAXS; + +//===----------------------------------------------------------------------===// // SERIALIZE Instruction // def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize", @@ 
-2977,25 +2977,25 @@ let Predicates = [HasTSXLDTRK] in { } //===----------------------------------------------------------------------===// -// UINTR Instructions -// -let Predicates = [HasUINTR, In64BitMode] in { - def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret", - []>, XS; - def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui", - [(int_x86_clui)]>, XS; - def STUI : I<0x01, MRM_EF, (outs), (ins), "stui", - [(int_x86_stui)]>, XS; - - def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg", - [(int_x86_senduipi GR64:$arg)]>, XS; - - let Defs = [EFLAGS] in - def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui", - [(set EFLAGS, (X86testui))]>, XS; -} - -//===----------------------------------------------------------------------===// +// UINTR Instructions +// +let Predicates = [HasUINTR, In64BitMode] in { + def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret", + []>, XS; + def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui", + [(int_x86_clui)]>, XS; + def STUI : I<0x01, MRM_EF, (outs), (ins), "stui", + [(int_x86_stui)]>, XS; + + def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg", + [(int_x86_senduipi GR64:$arg)]>, XS; + + let Defs = [EFLAGS] in + def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui", + [(set EFLAGS, (X86testui))]>, XS; +} + +//===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. //===----------------------------------------------------------------------===// @@ -3154,16 +3154,16 @@ include "X86InstrMPX.td" include "X86InstrVMX.td" include "X86InstrSVM.td" -include "X86InstrSNP.td" +include "X86InstrSNP.td" include "X86InstrTSX.td" include "X86InstrSGX.td" -include "X86InstrTDX.td" - -// Key Locker instructions -include "X86InstrKL.td" - +include "X86InstrTDX.td" + +// Key Locker instructions +include "X86InstrKL.td" + // AMX instructions include "X86InstrAMX.td" diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td index b91e563a15..6f2fbb56e1 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrKL.td @@ -1,86 +1,86 @@ -//===---------------------------*-tablegen-*-------------------------------===// -//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the instructions that make up the Intel key locker -// instruction set. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Key Locker instructions - -let SchedRW = [WriteSystem], Predicates = [HasKL] in { - let Uses = [XMM0, EAX], Defs = [EFLAGS] in { - def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "loadiwkey\t{$src2, $src1|$src1, $src2}", - [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS, - NotMemoryFoldable; - } - - let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in { - def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS, - NotMemoryFoldable; - } - - let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in { - def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS, - NotMemoryFoldable; - } - - let Constraints = "$src1 = $dst", - Defs = [EFLAGS] in { - def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), - "aesenc128kl\t{$src2, $src1|$src1, $src2}", - [(set VR128:$dst, EFLAGS, - (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS, - NotMemoryFoldable; - - def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), - "aesdec128kl\t{$src2, $src1|$src1, $src2}", - [(set VR128:$dst, EFLAGS, - (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS, - NotMemoryFoldable; - - def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), - "aesenc256kl\t{$src2, $src1|$src1, $src2}", - [(set VR128:$dst, EFLAGS, - (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS, - NotMemoryFoldable; - - def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), - "aesdec256kl\t{$src2, $src1|$src1, $src2}", - [(set VR128:$dst, EFLAGS, - (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS, - NotMemoryFoldable; - } - -} // SchedRW, Predicates - -let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in { - let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7], - Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7], - mayLoad = 1 in { - def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src), - "aesencwide128kl\t$src", []>, T8XS, - NotMemoryFoldable; - def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src), - "aesdecwide128kl\t$src", []>, T8XS, - NotMemoryFoldable; - def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src), - "aesencwide256kl\t$src", []>, T8XS, - NotMemoryFoldable; - def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src), - "aesdecwide256kl\t$src", []>, T8XS, - NotMemoryFoldable; - } - -} // SchedRW, Predicates +//===---------------------------*-tablegen-*-------------------------------===// +//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel key locker +// instruction set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Key Locker instructions + +let SchedRW = [WriteSystem], Predicates = [HasKL] in { + let Uses = [XMM0, EAX], Defs = [EFLAGS] in { + def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "loadiwkey\t{$src2, $src1|$src1, $src2}", + [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS, + NotMemoryFoldable; + } + + let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in { + def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS, + NotMemoryFoldable; + } + + let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in { + def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS, + NotMemoryFoldable; + } + + let Constraints = "$src1 = $dst", + Defs = [EFLAGS] in { + def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), + "aesenc128kl\t{$src2, $src1|$src1, $src2}", + [(set VR128:$dst, EFLAGS, + (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS, + NotMemoryFoldable; + + def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), + "aesdec128kl\t{$src2, $src1|$src1, $src2}", + [(set VR128:$dst, EFLAGS, + (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS, + NotMemoryFoldable; + + def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), + "aesenc256kl\t{$src2, $src1|$src1, $src2}", + [(set VR128:$dst, EFLAGS, + (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS, + NotMemoryFoldable; + + def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), + "aesdec256kl\t{$src2, $src1|$src1, $src2}", + [(set VR128:$dst, EFLAGS, + (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS, + NotMemoryFoldable; + } + +} // SchedRW, Predicates + +let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in { + let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7], + Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7], + mayLoad = 1 in { + def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src), + "aesencwide128kl\t$src", []>, T8XS, + NotMemoryFoldable; + def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src), + "aesdecwide128kl\t$src", []>, T8XS, + NotMemoryFoldable; + def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src), + "aesencwide256kl\t$src", []>, T8XS, + NotMemoryFoldable; + def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src), + "aesdecwide256kl\t$src", []>, T8XS, + NotMemoryFoldable; + } + +} // SchedRW, Predicates diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td index bb3e6df3bf..cb18a9b59a 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrMMX.td @@ -472,7 +472,7 @@ defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, SchedWriteVarShuffle.MMX>; -let Predicates = [HasMMX, HasSSE1] in { +let Predicates = [HasMMX, HasSSE1] in { def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -486,7 +486,7 @@ def MMX_PSHUFWmi : 
MMXIi8<0x70, MRMSrcMem, (int_x86_sse_pshuf_w (load_mmx addr:$src1), timm:$src2))]>, Sched<[SchedWriteShuffle.MMX.Folded]>; -} +} // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td index de59f3fe27..be95d70282 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSNP.td @@ -1,47 +1,47 @@ -//===-- X86InstrSNP.td - SNP Instruction Set Extension -----*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the instructions that make up the AMD Secure Nested -// Paging (SNP) instruction set. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// SNP instructions - -let SchedRW = [WriteSystem] in { -// F3 0F 01 FF -let Uses = [RAX] in -def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS, - Requires<[In64BitMode]>; - -// F2 0F 01 FF -let Uses = [RAX] in -def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>, - XD, Requires<[In64BitMode]>; - -let Uses = [EAX] in -def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>, - XD, Requires<[Not64BitMode]>; - -// F2 0F 01 FE -let Uses = [RAX] in -def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD, - Requires<[In64BitMode]>; - -// F3 0F 01 FE -let Uses = [RAX] in -def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS, - Requires<[In64BitMode]>; -} // SchedRW - -def : InstAlias<"psmash\t{%rax|rax}", (PSMASH)>, Requires<[In64BitMode]>; -def : InstAlias<"pvalidate\t{%rax|rax}", (PVALIDATE64)>, Requires<[In64BitMode]>; -def : InstAlias<"pvalidate\t{%eax|eax}", (PVALIDATE32)>, Requires<[Not64BitMode]>; -def : InstAlias<"rmpupdate\t{%rax|rax}", (RMPUPDATE)>, Requires<[In64BitMode]>; -def : InstAlias<"rmpadjust\t{%rax|rax}", (RMPADJUST)>, Requires<[In64BitMode]>; +//===-- X86InstrSNP.td - SNP Instruction Set Extension -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the AMD Secure Nested +// Paging (SNP) instruction set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SNP instructions + +let SchedRW = [WriteSystem] in { +// F3 0F 01 FF +let Uses = [RAX] in +def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS, + Requires<[In64BitMode]>; + +// F2 0F 01 FF +let Uses = [RAX] in +def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>, + XD, Requires<[In64BitMode]>; + +let Uses = [EAX] in +def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>, + XD, Requires<[Not64BitMode]>; + +// F2 0F 01 FE +let Uses = [RAX] in +def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD, + Requires<[In64BitMode]>; + +// F3 0F 01 FE +let Uses = [RAX] in +def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS, + Requires<[In64BitMode]>; +} // SchedRW + +def : InstAlias<"psmash\t{%rax|rax}", (PSMASH)>, Requires<[In64BitMode]>; +def : InstAlias<"pvalidate\t{%rax|rax}", (PVALIDATE64)>, Requires<[In64BitMode]>; +def : InstAlias<"pvalidate\t{%eax|eax}", (PVALIDATE32)>, Requires<[Not64BitMode]>; +def : InstAlias<"rmpupdate\t{%rax|rax}", (RMPUPDATE)>, Requires<[In64BitMode]>; +def : InstAlias<"rmpadjust\t{%rax|rax}", (RMPADJUST)>, Requires<[In64BitMode]>; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td index a185a2007b..29ac01b143 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSSE.td @@ -1242,8 +1242,8 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", /// SSE 2 Only // Convert scalar double to scalar single -let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX], - ExeDomain = SSEPackedSingle in { +let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX], + ExeDomain = SSEPackedSingle in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -1261,7 +1261,7 @@ def : Pat<(f32 (any_fpround FR64:$src)), (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, Requires<[UseAVX]>; -let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in { +let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in { def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (any_fpround FR64:$src))]>, @@ -1273,7 +1273,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC; } -let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in { +let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in { def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1307,7 +1307,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, // Convert scalar single to scalar double // SSE2 instructions with XS prefix -let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -1327,7 +1327,7 @@ def : Pat<(f64 (any_fpextend FR32:$src)), def : Pat<(any_fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; -let isCodeGenOnly = 1, ExeDomain = 
SSEPackedSingle in { +let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in { def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (any_fpextend FR32:$src))]>, @@ -1339,8 +1339,8 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC; } // isCodeGenOnly = 1 -let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, - ExeDomain = SSEPackedSingle in { +let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, + ExeDomain = SSEPackedSingle in { def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -3778,7 +3778,7 @@ let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { VEX_4V, VEX_WIG; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, VEX_WIG; + VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { @@ -3794,7 +3794,7 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { VEX_4V, VEX_L, VEX_WIG; defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, VEX_WIG; + VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -3930,7 +3930,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>, + (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>, Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, @@ -3940,7 +3940,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), - timm:$src3))]>, + timm:$src3))]>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } @@ -3950,13 +3950,13 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), - timm:$src2))]>, + timm:$src2))]>, PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), - timm:$src2))]>, + timm:$src2))]>, Sched<[WriteVecExtract]>; // Insert @@ -4756,7 +4756,7 @@ let isCommutable = 0 in { SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, load, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; @@ -4802,7 +4802,7 @@ let isCommutable = 0 in { SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, load, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, @@ -5153,14 +5153,14 
@@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), - timm:$src2))]>, + timm:$src2))]>, Sched<[WriteVecExtract]>; let hasSideEffects = 0, mayStore = 1 in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))), + [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))), addr:$dst)]>, Sched<[WriteVecExtractSt]>; } @@ -5184,7 +5184,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { (ins i16mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))), + [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))), addr:$dst)]>, Sched<[WriteVecExtractSt]>; } @@ -5274,7 +5274,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>, + (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>, Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i8mem:$src2, u8imm:$src3), @@ -5283,7 +5283,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } @@ -6503,7 +6503,7 @@ multiclass pcmpistrm_SS42AI<string asm> { let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG; + defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG; defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; } @@ -6521,7 +6521,7 @@ multiclass SS42AI_pcmpestrm<string asm> { let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG; + defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG; defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; } @@ -6539,7 +6539,7 @@ multiclass SS42AI_pcmpistri<string asm> { let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG; + defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG; defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; } @@ -6557,7 +6557,7 @@ multiclass SS42AI_pcmpestri<string asm> { let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in - defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG; + defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG; defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; } @@ -7016,19 +7016,19 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; let Predicates = [HasAVX, NoVLX] in { -def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), +def : 
Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; // NOTE: We're using FP instructions here, but execution domain fixing can // convert to integer when profitable. -def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; -def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; -def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), +def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF128 addr:$src)>; } @@ -7164,68 +7164,68 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", WriteFMaskMove64, WriteFMaskMove64Y>; //===----------------------------------------------------------------------===// -// AVX_VNNI -//===----------------------------------------------------------------------===// -let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in -multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit IsCommutable> { - let isCommutable = IsCommutable in - def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), - !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, - VR128:$src2, VR128:$src3)))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; - - def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i128mem:$src3), - !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2, - (loadv4i32 addr:$src3))))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; - - let isCommutable = IsCommutable in - def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3), - !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, - VR256:$src2, VR256:$src3)))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; - - def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i256mem:$src3), - !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2, - (loadv8i32 addr:$src3))))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; -} - -defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix; -defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix; -defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix; -defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix; - -def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs), - (X86vpmaddwd node:$lhs, node:$rhs), [{ - return N->hasOneUse(); -}]>; - -let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in { - def : Pat<(v8i32 (add VR256:$src1, - (X86vpmaddwd_su VR256:$src2, VR256:$src3))), - (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>; - def : Pat<(v8i32 (add VR256:$src1, - (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))), - (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>; - def : Pat<(v4i32 (add VR128:$src1, - (X86vpmaddwd_su VR128:$src2, VR128:$src3))), - (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>; - def : Pat<(v4i32 (add VR128:$src1, - (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))), - 
(VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>; -} - -//===----------------------------------------------------------------------===// +// AVX_VNNI +//===----------------------------------------------------------------------===// +let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in +multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable> { + let isCommutable = IsCommutable in + def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, + VR128:$src2, VR128:$src3)))]>, + VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + + def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2, + (loadv4i32 addr:$src3))))]>, + VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + + let isCommutable = IsCommutable in + def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, + VR256:$src2, VR256:$src3)))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; + + def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2, + (loadv8i32 addr:$src3))))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>; +} + +defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix; +defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix; +defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix; +defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix; + +def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs), + (X86vpmaddwd node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in { + def : Pat<(v8i32 (add VR256:$src1, + (X86vpmaddwd_su VR256:$src2, VR256:$src3))), + (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>; + def : Pat<(v8i32 (add VR256:$src1, + (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))), + (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>; + def : Pat<(v4i32 (add VR128:$src1, + (X86vpmaddwd_su VR128:$src2, VR128:$src3))), + (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (add VR128:$src1, + (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))), + (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>; +} + +//===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values // @@ -7287,12 +7287,12 @@ let ExeDomain = SSEPackedSingle in { let isCommutable = 1 in def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), - "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, - VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, + VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), - "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, 
$src3}", []>, - VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, + VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; } // Immediate transform to help with commuting. @@ -7300,27 +7300,27 @@ def Perm2XCommuteImm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); }]>; -multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> { - def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), - (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>; - def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))), - (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>; - // Pattern with load in other operand. - def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))), - (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, - (Perm2XCommuteImm timm:$imm))>; -} - +multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> { + def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), + (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>; + def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))), + (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>; + // Pattern with load in other operand. + def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))), + (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, + (Perm2XCommuteImm timm:$imm))>; +} + let Predicates = [HasAVX] in { - defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>; - defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>; + defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>; + defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>; } let Predicates = [HasAVX1Only] in { - defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>; - defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>; - defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>; - defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>; + defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>; + defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>; + defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>; + defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>; } //===----------------------------------------------------------------------===// @@ -7689,24 +7689,24 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, WriteFShuffle256, f256mem>, VEX_W; //===----------------------------------------------------------------------===// -// VPERM2I128 - Permute Integer vector Values in 128-bit chunks +// VPERM2I128 - Permute Integer vector Values in 128-bit chunks // let isCommutable = 1 in def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), - "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, - Sched<[WriteShuffle256]>, VEX_4V, VEX_L; + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, + Sched<[WriteShuffle256]>, VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), - "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 
Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; -let Predicates = [HasAVX2] in { - defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>; - defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>; - defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>; - defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>; -} +let Predicates = [HasAVX2] in { + defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>; + defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>; + defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>; + defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>; +} //===----------------------------------------------------------------------===// // VINSERTI128 - Insert packed integer values diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td index d8f70b016c..2bc32910a6 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSVM.td @@ -26,47 +26,47 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; // 0F 01 DE let Uses = [EAX] in -def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit", []>, TB; +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit", []>, TB; // 0F 01 D8 let Uses = [EAX] in -def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB, +def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB, Requires<[Not64BitMode]>; let Uses = [RAX] in -def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB, +def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun", []>, TB, Requires<[In64BitMode]>; // 0F 01 DA let Uses = [EAX] in -def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB, +def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB, Requires<[Not64BitMode]>; let Uses = [RAX] in -def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB, +def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload", []>, TB, Requires<[In64BitMode]>; // 0F 01 DB let Uses = [EAX] in -def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB, +def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB, Requires<[Not64BitMode]>; let Uses = [RAX] in -def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB, +def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave", []>, TB, Requires<[In64BitMode]>; // 0F 01 DF let Uses = [EAX, ECX] in def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), - "invlpga", []>, TB, Requires<[Not64BitMode]>; + "invlpga", []>, TB, Requires<[Not64BitMode]>; let Uses = [RAX, ECX] in def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), - "invlpga", []>, TB, Requires<[In64BitMode]>; + "invlpga", []>, TB, Requires<[In64BitMode]>; } // SchedRW - -def : InstAlias<"skinit\t{%eax|eax}", (SKINIT), 0>; -def : InstAlias<"vmrun\t{%eax|eax}", (VMRUN32), 0>, Requires<[Not64BitMode]>; -def : InstAlias<"vmrun\t{%rax|rax}", (VMRUN64), 0>, Requires<[In64BitMode]>; -def : InstAlias<"vmload\t{%eax|eax}", (VMLOAD32), 0>, Requires<[Not64BitMode]>; -def : InstAlias<"vmload\t{%rax|rax}", (VMLOAD64), 0>, Requires<[In64BitMode]>; -def : InstAlias<"vmsave\t{%eax|eax}", (VMSAVE32), 0>, Requires<[Not64BitMode]>; -def : InstAlias<"vmsave\t{%rax|rax}", (VMSAVE64), 0>, Requires<[In64BitMode]>; -def : InstAlias<"invlpga\t{%eax, %ecx|eax, ecx}", (INVLPGA32), 0>, Requires<[Not64BitMode]>; -def : InstAlias<"invlpga\t{%rax, %ecx|rax, ecx}", (INVLPGA64), 0>, Requires<[In64BitMode]>; + +def : InstAlias<"skinit\t{%eax|eax}", (SKINIT), 0>; +def : InstAlias<"vmrun\t{%eax|eax}", 
(VMRUN32), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"vmrun\t{%rax|rax}", (VMRUN64), 0>, Requires<[In64BitMode]>; +def : InstAlias<"vmload\t{%eax|eax}", (VMLOAD32), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"vmload\t{%rax|rax}", (VMLOAD64), 0>, Requires<[In64BitMode]>; +def : InstAlias<"vmsave\t{%eax|eax}", (VMSAVE32), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"vmsave\t{%rax|rax}", (VMSAVE64), 0>, Requires<[In64BitMode]>; +def : InstAlias<"invlpga\t{%eax, %ecx|eax, ecx}", (INVLPGA32), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"invlpga\t{%rax, %ecx|rax, ecx}", (INVLPGA64), 0>, Requires<[In64BitMode]>; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td index eb8740896e..3b105cedb3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrSystem.td @@ -49,7 +49,7 @@ let Uses = [EFLAGS] in def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; } // SchedRW -def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>; +def UBSAN_UD1 : PseudoI<(outs), (ins i32imm:$kind), [(ubsantrap (i32 timm:$kind))]>; // The long form of "int $3" turns into int3 as a size optimization. // FIXME: This doesn't work because InstAlias can't match immediate constants. //def : InstAlias<"int\t$3", (INT3)>; @@ -172,17 +172,17 @@ def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>; } // SchedRW //===----------------------------------------------------------------------===// -// Address-size override prefixes. -// - -let SchedRW = [WriteNop] in { -def ADDR16_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr16", []>, - Requires<[In32BitMode]>; -def ADDR32_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr32", []>, - Requires<[In64BitMode]>; -} // SchedRW - -//===----------------------------------------------------------------------===// +// Address-size override prefixes. +// + +let SchedRW = [WriteNop] in { +def ADDR16_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr16", []>, + Requires<[In32BitMode]>; +def ADDR32_PREFIX : I<0x67, PrefixByte, (outs), (ins), "addr32", []>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// // Moves to and from segment registers. // @@ -459,7 +459,7 @@ let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in // Cache instructions let SchedRW = [WriteSystem] in { def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; -def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS; // wbnoinvd is like wbinvd, except without invalidation // encoding: like wbinvd + an 0xF3 prefix diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td b/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td index 8d7cd60820..e21028c8a3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstrTDX.td @@ -1,39 +1,39 @@ -//===- X86InstrTDX.td - TDX Instruction Set Extension -*- tablegen -*===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the instructions that make up the Intel TDX instruction -// set. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TDX instructions - -// 64-bit only instructions -let SchedRW = [WriteSystem], Predicates = [In64BitMode] in { -// SEAMCALL - Call to SEAM VMX-root Operation Module -def SEAMCALL : I<0x01, MRM_CF, (outs), (ins), - "seamcall", []>, PD; - -// SEAMRET - Return to Legacy VMX-root Operation -def SEAMRET : I<0x01, MRM_CD, (outs), (ins), - "seamret", []>, PD; - -// SEAMOPS - SEAM Operations -def SEAMOPS : I<0x01, MRM_CE, (outs), (ins), - "seamops", []>, PD; - -} // SchedRW - -// common instructions -let SchedRW = [WriteSystem] in { -// TDCALL - Call SEAM Module Functions -def TDCALL : I<0x01, MRM_CC, (outs), (ins), - "tdcall", []>, PD; - -} // SchedRW +//===- X86InstrTDX.td - TDX Instruction Set Extension -*- tablegen -*===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel TDX instruction +// set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TDX instructions + +// 64-bit only instructions +let SchedRW = [WriteSystem], Predicates = [In64BitMode] in { +// SEAMCALL - Call to SEAM VMX-root Operation Module +def SEAMCALL : I<0x01, MRM_CF, (outs), (ins), + "seamcall", []>, PD; + +// SEAMRET - Return to Legacy VMX-root Operation +def SEAMRET : I<0x01, MRM_CD, (outs), (ins), + "seamret", []>, PD; + +// SEAMOPS - SEAM Operations +def SEAMOPS : I<0x01, MRM_CE, (outs), (ins), + "seamops", []>, PD; + +} // SchedRW + +// common instructions +let SchedRW = [WriteSystem] in { +// TDCALL - Call SEAM Module Functions +def TDCALL : I<0x01, MRM_CC, (outs), (ins), + "tdcall", []>, PD; + +} // SchedRW diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp b/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp index ff53171303..8a3091af28 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InstructionSelector.cpp @@ -214,8 +214,8 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) { return SubIdx; } -static const TargetRegisterClass *getRegClassFromGRPhysReg(Register Reg) { - assert(Reg.isPhysical()); +static const TargetRegisterClass *getRegClassFromGRPhysReg(Register Reg) { + assert(Reg.isPhysical()); if (X86::GR64RegClass.contains(Reg)) return &X86::GR64RegClass; if (X86::GR32RegClass.contains(Reg)) @@ -239,7 +239,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - if (DstReg.isPhysical()) { + if (DstReg.isPhysical()) { assert(I.isCopy() && "Generic operators do not allow physical registers"); if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID && @@ -266,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } - assert((!SrcReg.isPhysical() || I.isCopy()) && + assert((!SrcReg.isPhysical() || I.isCopy()) && "No phys reg on generic operators"); assert((DstSize == SrcSize || // Copies are a mean to setup initial types, the 
number of // bits may not exactly match. - (SrcReg.isPhysical() && + (SrcReg.isPhysical() && DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) && "Copy with different width?!"); @@ -280,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRegBank.getID() == X86::GPRRegBankID && DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize && - SrcReg.isPhysical()) { + SrcReg.isPhysical()) { // Change the physical register to performe truncate. const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg); @@ -479,7 +479,7 @@ static void X86SelectAddress(const MachineInstr &I, "unsupported type."); if (I.getOpcode() == TargetOpcode::G_PTR_ADD) { - if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) { + if (auto COff = getConstantVRegSExtVal(I.getOperand(2).getReg(), MRI)) { int64_t Imm = *COff; if (isInt<32>(Imm)) { // Check for displacement overflow. AM.Disp = static_cast<int32_t>(Imm); @@ -780,18 +780,18 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); - assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(16)) && - "8=>16 Zext is handled by tablegen"); + assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(16)) && + "8=>16 Zext is handled by tablegen"); assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(32)) && "8=>32 Zext is handled by tablegen"); assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(32)) && "16=>32 Zext is handled by tablegen"); - assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(64)) && - "8=>64 Zext is handled by tablegen"); - assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(64)) && - "16=>64 Zext is handled by tablegen"); - assert(!(SrcTy == LLT::scalar(32) && DstTy == LLT::scalar(64)) && - "32=>64 Zext is handled by tablegen"); + assert(!(SrcTy == LLT::scalar(8) && DstTy == LLT::scalar(64)) && + "8=>64 Zext is handled by tablegen"); + assert(!(SrcTy == LLT::scalar(16) && DstTy == LLT::scalar(64)) && + "16=>64 Zext is handled by tablegen"); + assert(!(SrcTy == LLT::scalar(32) && DstTy == LLT::scalar(64)) && + "32=>64 Zext is handled by tablegen"); if (SrcTy != LLT::scalar(1)) return false; @@ -808,17 +808,17 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, else return false; - Register DefReg = SrcReg; + Register DefReg = SrcReg; if (DstTy != LLT::scalar(8)) { - Register ImpDefReg = - MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI)); - BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg); - + Register ImpDefReg = + MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI)); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::IMPLICIT_DEF), ImpDefReg); + DefReg = MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI)); BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(TargetOpcode::INSERT_SUBREG), DefReg) - .addReg(ImpDefReg) + TII.get(TargetOpcode::INSERT_SUBREG), DefReg) + .addReg(ImpDefReg) .addReg(SrcReg) .addImm(X86::sub_8bit); } @@ -1559,9 +1559,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, }}, // i64 }; - auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) { - return El.SizeInBits == RegTy.getSizeInBits(); - }); + auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) { + return El.SizeInBits == RegTy.getSizeInBits(); + }); if (OpEntryIt == std::end(OpTable)) return false; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp 
b/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp index 95655dd472..83829e0b82 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86InterleavedAccess.cpp @@ -44,8 +44,8 @@ namespace { /// E.g. A group of interleaving access loads (Factor = 2; accessing every /// other element) /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr -/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6> -/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7> +/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6> +/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7> class X86InterleavedAccessGroup { /// Reference to the wide-load instruction of an interleaved access /// group. @@ -211,7 +211,7 @@ void X86InterleavedAccessGroup::decompose( VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); } // Generate N loads of T type. - assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) && + assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) && "VecBaseTy's size must be a multiple of 8"); const Align FirstAlignment = LI->getAlign(); const Align SubsequentAlignment = commonAlignment( @@ -295,7 +295,7 @@ static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix, if (VecElems == 16) { for (unsigned i = 0; i < Stride; i++) - TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf); + TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf); return; } @@ -576,7 +576,7 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3( // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 for (int i = 0; i < 3; i++) - Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf); + Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf); // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2 // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5 @@ -598,8 +598,8 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3( // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7 // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7 - Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3); - TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2); + Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3); + TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2); TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec; TransposedMatrix[2] = VecElems == 8 ? 
TempVec : Vec[2]; } @@ -656,8 +656,8 @@ void X86InterleavedAccessGroup::interleave8bitStride3( // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4 // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 - Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2); - Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3); + Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2); + Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3); Vec[2] = InVec[2]; // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2 diff --git a/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h index 72ab3e9cf7..233ac1a3e3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86IntrinsicsInfo.h @@ -22,7 +22,7 @@ namespace llvm { enum IntrinsicType : uint16_t { CVTNEPS2BF16_MASK, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8, INTR_TYPE_3OP_IMM8, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI, CVTPD2PS_MASK, @@ -458,12 +458,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FADDS, X86ISD::FADDS_RND), X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK, X86ISD::FADDS, X86ISD::FADDS_RND), - X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), - X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, @@ -882,12 +882,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0), - X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_pternlog_d_512, 
INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_pternlog_q_128, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_pternlog_q_256, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_pternlog_q_512, INTR_TYPE_4OP_IMM8, X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), @@ -1098,7 +1098,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), - X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0), + X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -1108,8 +1108,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), - X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0), - X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTRI, 0), + X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0), + X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTRI, 0), X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), @@ -1132,10 +1132,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP, X86ISD::GF2P8MULB, 0), - X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), - X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), - X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), - X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP_IMM8, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0), X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp index 1b371ac2a1..9f51fa4f4c 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86LegalizerInfo.cpp @@ -70,11 +70,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizerInfoAVX512DQ(); setLegalizerInfoAVX512BW(); - getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) - .scalarize(0) - .minScalar(0, LLT::scalar(32)) - .libcall(); - + getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) + .scalarize(0) + .minScalar(0, LLT::scalar(32)) + .libcall(); + 
setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1); for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR}) setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1); @@ -86,8 +86,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizeScalarToDifferentSizeStrategy( G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest); - getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); - + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + computeTables(); verify(*STI.getInstrInfo()); } @@ -155,11 +155,11 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { .legalFor({{s8, s8}, {s16, s8}, {s32, s8}}) .clampScalar(0, s8, s32) .clampScalar(1, s8, s8); - - // Comparison - getActionDefinitionsBuilder(G_ICMP) - .legalForCartesianProduct({s8}, {s8, s16, s32, p0}) - .clampScalar(0, s8, s8); + + // Comparison + getActionDefinitionsBuilder(G_ICMP) + .legalForCartesianProduct({s8}, {s8, s16, s32, p0}) + .clampScalar(0, s8, s8); } // Control-flow @@ -246,9 +246,9 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { .widenScalarToNextPow2(1); // Comparison - getActionDefinitionsBuilder(G_ICMP) - .legalForCartesianProduct({s8}, {s8, s16, s32, s64, p0}) - .clampScalar(0, s8, s8); + getActionDefinitionsBuilder(G_ICMP) + .legalForCartesianProduct({s8}, {s8, s16, s32, s64, p0}) + .clampScalar(0, s8, s8); getActionDefinitionsBuilder(G_FCMP) .legalForCartesianProduct({s8}, {s32, s64}) diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 810fee052b..e419f45405 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -42,7 +42,7 @@ #include "X86TargetMachine.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -105,9 +105,9 @@ static cl::opt<bool> EmitDotVerify( cl::init(false), cl::Hidden); static llvm::sys::DynamicLibrary OptimizeDL; -typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize, - unsigned int *Edges, int *EdgeValues, - int *CutEdges /* out */, unsigned int EdgesSize); +typedef int (*OptimizeCutT)(unsigned int *Nodes, unsigned int NodesSize, + unsigned int *Edges, int *EdgeValues, + int *CutEdges /* out */, unsigned int EdgesSize); static OptimizeCutT OptimizeCut = nullptr; namespace { @@ -149,8 +149,8 @@ public: private: using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>; - using Edge = MachineGadgetGraph::Edge; - using Node = MachineGadgetGraph::Node; + using Edge = MachineGadgetGraph::Edge; + using Node = MachineGadgetGraph::Node; using EdgeSet = MachineGadgetGraph::EdgeSet; using NodeSet = MachineGadgetGraph::NodeSet; @@ -164,8 +164,8 @@ private: const MachineDominanceFrontier &MDF) const; int hardenLoadsWithPlugin(MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const; - int hardenLoadsWithHeuristic(MachineFunction &MF, - std::unique_ptr<MachineGadgetGraph> Graph) const; + int hardenLoadsWithHeuristic(MachineFunction &MF, + std::unique_ptr<MachineGadgetGraph> Graph) const; int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */, NodeSet &ElimNodes /* in, out */) const; @@ -198,7 +198,7 @@ struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits { using 
ChildIteratorType = typename Traits::ChildIteratorType; using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType; - DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} + DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} std::string getNodeLabel(NodeRef Node, GraphType *) { if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel) @@ -243,7 +243,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( AU.setPreservesCFG(); } -static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF, +static void writeGadgetGraph(raw_ostream &OS, MachineFunction &MF, MachineGadgetGraph *G) { WriteGraph(OS, G, /*ShortNames*/ false, "Speculative gadgets for \"" + MF.getName() + "\" function"); @@ -279,7 +279,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( return false; // didn't find any gadgets if (EmitDotVerify) { - writeGadgetGraph(outs(), MF, Graph.get()); + writeGadgetGraph(outs(), MF, Graph.get()); return false; } @@ -292,7 +292,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( raw_fd_ostream FileOut(FileName, FileError); if (FileError) errs() << FileError.message(); - writeGadgetGraph(FileOut, MF, Graph.get()); + writeGadgetGraph(FileOut, MF, Graph.get()); FileOut.close(); LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n"); if (EmitDotOnly) @@ -313,7 +313,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( } FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph)); } else { // Use the default greedy heuristic - FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph)); + FencesInserted = hardenLoadsWithHeuristic(MF, std::move(Graph)); } if (FencesInserted > 0) @@ -367,7 +367,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( // Use RDF to find all the uses of `Def` rdf::NodeSet Uses; - RegisterRef DefReg = Def.Addr->getRegRef(DFG); + RegisterRef DefReg = Def.Addr->getRegRef(DFG); for (auto UseID : L.getAllReachedUses(DefReg, Def)) { auto Use = DFG.addr<UseNode *>(UseID); if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node @@ -540,17 +540,17 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( // Returns the number of remaining gadget edges that could not be eliminated int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( - MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */, - NodeSet &ElimNodes /* in, out */) const { + MachineGadgetGraph &G, EdgeSet &ElimEdges /* in, out */, + NodeSet &ElimNodes /* in, out */) const { if (G.NumFences > 0) { // Eliminate fences and CFG edges that ingress and egress the fence, as // they are trivially mitigated. - for (const Edge &E : G.edges()) { - const Node *Dest = E.getDest(); + for (const Edge &E : G.edges()) { + const Node *Dest = E.getDest(); if (isFence(Dest->getValue())) { ElimNodes.insert(*Dest); ElimEdges.insert(E); - for (const Edge &DE : Dest->edges()) + for (const Edge &DE : Dest->edges()) ElimEdges.insert(DE); } } @@ -558,28 +558,28 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( // Find and eliminate gadget edges that have been mitigated. 
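For orientation in the elimMitigatedEdgesAndNodes hunk above: the reachability walk that follows decides whether a gadget edge is already mitigated by checking whether its sink is still CFG-reachable from its source once the eliminated (fence-adjacent) edges are ignored. A minimal standalone model of that check is sketched below; the Edge/Graph types here are hypothetical plain-integer stand-ins, not the pass's MachineGadgetGraph/ImmutableGraph classes.

#include <unordered_set>
#include <vector>

// Hypothetical simplified types; the real pass walks MachineGadgetGraph nodes.
struct Edge { int Dst; bool IsCFG; };
using Graph = std::vector<std::vector<Edge>>; // adjacency list, nodes are ints

// A gadget with source Src and sink Sink is still unmitigated if Sink remains
// CFG-reachable from Src after the already-eliminated edges are skipped.
static bool sinkStillReachable(const Graph &G, int Src, int Sink,
                               const std::unordered_set<const Edge *> &Elim) {
  std::vector<int> Worklist{Src};
  std::unordered_set<int> Visited;
  while (!Worklist.empty()) {
    int N = Worklist.back();
    Worklist.pop_back();
    for (const Edge &E : G[N]) {
      if (!E.IsCFG || Elim.count(&E) || Visited.count(E.Dst))
        continue;
      if (E.Dst == Sink)
        return true;
      Visited.insert(E.Dst);
      Worklist.push_back(E.Dst);
    }
  }
  return false;
}

Gadgets whose sinks come back unreachable are counted as mitigated; the rest remain and drive the cutting step further down in the hunk.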
int MitigatedGadgets = 0, RemainingGadgets = 0; - NodeSet ReachableNodes{G}; - for (const Node &RootN : G.nodes()) { + NodeSet ReachableNodes{G}; + for (const Node &RootN : G.nodes()) { if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge)) continue; // skip this node if it isn't a gadget source // Find all of the nodes that are CFG-reachable from RootN using DFS ReachableNodes.clear(); - std::function<void(const Node *, bool)> FindReachableNodes = - [&](const Node *N, bool FirstNode) { - if (!FirstNode) - ReachableNodes.insert(*N); - for (const Edge &E : N->edges()) { - const Node *Dest = E.getDest(); - if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) && - !ReachableNodes.contains(*Dest)) - FindReachableNodes(Dest, false); - } - }; + std::function<void(const Node *, bool)> FindReachableNodes = + [&](const Node *N, bool FirstNode) { + if (!FirstNode) + ReachableNodes.insert(*N); + for (const Edge &E : N->edges()) { + const Node *Dest = E.getDest(); + if (MachineGadgetGraph::isCFGEdge(E) && !ElimEdges.contains(E) && + !ReachableNodes.contains(*Dest)) + FindReachableNodes(Dest, false); + } + }; FindReachableNodes(&RootN, true); // Any gadget whose sink is unreachable has been mitigated - for (const Edge &E : RootN.edges()) { + for (const Edge &E : RootN.edges()) { if (MachineGadgetGraph::isGadgetEdge(E)) { if (ReachableNodes.contains(*E.getDest())) { // This gadget's sink is reachable @@ -597,8 +597,8 @@ int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes( std::unique_ptr<MachineGadgetGraph> X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges( std::unique_ptr<MachineGadgetGraph> Graph) const { - NodeSet ElimNodes{*Graph}; - EdgeSet ElimEdges{*Graph}; + NodeSet ElimNodes{*Graph}; + EdgeSet ElimEdges{*Graph}; int RemainingGadgets = elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes); if (ElimEdges.empty() && ElimNodes.empty()) { @@ -629,11 +629,11 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin( auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size()); auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size()); auto EdgeValues = std::make_unique<int[]>(Graph->edges_size()); - for (const Node &N : Graph->nodes()) { + for (const Node &N : Graph->nodes()) { Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin()); } Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node - for (const Edge &E : Graph->edges()) { + for (const Edge &E : Graph->edges()) { Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest()); EdgeValues[Graph->getEdgeIndex(E)] = E.getValue(); } @@ -650,67 +650,67 @@ int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin( LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n"); LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n"); - Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges); + Graph = GraphBuilder::trim(*Graph, NodeSet{*Graph}, CutEdges); } while (true); return FencesInserted; } -int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic( +int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithHeuristic( MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const { - // If `MF` does not have any fences, then no gadgets would have been - // mitigated at this point. - if (Graph->NumFences > 0) { - LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); - Graph = trimMitigatedEdges(std::move(Graph)); - LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... 
Done\n"); - } - + // If `MF` does not have any fences, then no gadgets would have been + // mitigated at this point. + if (Graph->NumFences > 0) { + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n"); + Graph = trimMitigatedEdges(std::move(Graph)); + LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n"); + } + if (Graph->NumGadgets == 0) return 0; LLVM_DEBUG(dbgs() << "Cutting edges...\n"); - EdgeSet CutEdges{*Graph}; - - // Begin by collecting all ingress CFG edges for each node - DenseMap<const Node *, SmallVector<const Edge *, 2>> IngressEdgeMap; - for (const Edge &E : Graph->edges()) - if (MachineGadgetGraph::isCFGEdge(E)) - IngressEdgeMap[E.getDest()].push_back(&E); - - // For each gadget edge, make cuts that guarantee the gadget will be - // mitigated. A computationally efficient way to achieve this is to either: - // (a) cut all egress CFG edges from the gadget source, or - // (b) cut all ingress CFG edges to the gadget sink. - // - // Moreover, the algorithm tries not to make a cut into a loop by preferring - // to make a (b)-type cut if the gadget source resides at a greater loop depth - // than the gadget sink, or an (a)-type cut otherwise. - for (const Node &N : Graph->nodes()) { - for (const Edge &E : N.edges()) { - if (!MachineGadgetGraph::isGadgetEdge(E)) + EdgeSet CutEdges{*Graph}; + + // Begin by collecting all ingress CFG edges for each node + DenseMap<const Node *, SmallVector<const Edge *, 2>> IngressEdgeMap; + for (const Edge &E : Graph->edges()) + if (MachineGadgetGraph::isCFGEdge(E)) + IngressEdgeMap[E.getDest()].push_back(&E); + + // For each gadget edge, make cuts that guarantee the gadget will be + // mitigated. A computationally efficient way to achieve this is to either: + // (a) cut all egress CFG edges from the gadget source, or + // (b) cut all ingress CFG edges to the gadget sink. + // + // Moreover, the algorithm tries not to make a cut into a loop by preferring + // to make a (b)-type cut if the gadget source resides at a greater loop depth + // than the gadget sink, or an (a)-type cut otherwise. + for (const Node &N : Graph->nodes()) { + for (const Edge &E : N.edges()) { + if (!MachineGadgetGraph::isGadgetEdge(E)) continue; - SmallVector<const Edge *, 2> EgressEdges; - SmallVector<const Edge *, 2> &IngressEdges = IngressEdgeMap[E.getDest()]; - for (const Edge &EgressEdge : N.edges()) - if (MachineGadgetGraph::isCFGEdge(EgressEdge)) - EgressEdges.push_back(&EgressEdge); - - int EgressCutCost = 0, IngressCutCost = 0; - for (const Edge *EgressEdge : EgressEdges) - if (!CutEdges.contains(*EgressEdge)) - EgressCutCost += EgressEdge->getValue(); - for (const Edge *IngressEdge : IngressEdges) - if (!CutEdges.contains(*IngressEdge)) - IngressCutCost += IngressEdge->getValue(); - - auto &EdgesToCut = - IngressCutCost < EgressCutCost ? IngressEdges : EgressEdges; - for (const Edge *E : EdgesToCut) - CutEdges.insert(*E); + SmallVector<const Edge *, 2> EgressEdges; + SmallVector<const Edge *, 2> &IngressEdges = IngressEdgeMap[E.getDest()]; + for (const Edge &EgressEdge : N.edges()) + if (MachineGadgetGraph::isCFGEdge(EgressEdge)) + EgressEdges.push_back(&EgressEdge); + + int EgressCutCost = 0, IngressCutCost = 0; + for (const Edge *EgressEdge : EgressEdges) + if (!CutEdges.contains(*EgressEdge)) + EgressCutCost += EgressEdge->getValue(); + for (const Edge *IngressEdge : IngressEdges) + if (!CutEdges.contains(*IngressEdge)) + IngressCutCost += IngressEdge->getValue(); + + auto &EdgesToCut = + IngressCutCost < EgressCutCost ? 
IngressEdges : EgressEdges; + for (const Edge *E : EdgesToCut) + CutEdges.insert(*E); } - } + } LLVM_DEBUG(dbgs() << "Cutting edges... Done\n"); LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n"); @@ -726,8 +726,8 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences( MachineFunction &MF, MachineGadgetGraph &G, EdgeSet &CutEdges /* in, out */) const { int FencesInserted = 0; - for (const Node &N : G.nodes()) { - for (const Edge &E : N.edges()) { + for (const Node &N : G.nodes()) { + for (const Edge &E : N.edges()) { if (CutEdges.contains(E)) { MachineInstr *MI = N.getValue(), *Prev; MachineBasicBlock *MBB; // Insert an LFENCE in this MBB @@ -743,7 +743,7 @@ int X86LoadValueInjectionLoadHardeningPass::insertFences( Prev = MI->getPrevNode(); // Remove all egress CFG edges from this branch because the inserted // LFENCE prevents gadgets from crossing the branch. - for (const Edge &E : N.edges()) { + for (const Edge &E : N.edges()) { if (MachineGadgetGraph::isCFGEdge(E)) CutEdges.insert(E); } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp index 7b6276c1d8..4562c1aee2 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp @@ -75,35 +75,35 @@ bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction( bool Modified = false; for (auto &MBB : MF) { - for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) { - if (MBBI->getOpcode() != X86::RETQ) - continue; - - unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI); - if (ClobberReg != X86::NoRegister) { - BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::POP64r)) - .addReg(ClobberReg, RegState::Define) - .setMIFlag(MachineInstr::FrameDestroy); - BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE)); - BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::JMP64r)) - .addReg(ClobberReg); - MBB.erase(MBBI); - } else { - // In case there is no available scratch register, we can still read - // from RSP to assert that RSP points to a valid page. The write to RSP - // is also helpful because it verifies that the stack's write - // permissions are intact. - MachineInstr *Fence = - BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE)); - addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)), - X86::RSP, false, 0) - .addImm(0) - ->addRegisterDead(X86::EFLAGS, TRI); - } - - ++NumFences; - Modified = true; - break; + for (auto MBBI = MBB.begin(); MBBI != MBB.end(); ++MBBI) { + if (MBBI->getOpcode() != X86::RETQ) + continue; + + unsigned ClobberReg = TRI->findDeadCallerSavedReg(MBB, MBBI); + if (ClobberReg != X86::NoRegister) { + BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::POP64r)) + .addReg(ClobberReg, RegState::Define) + .setMIFlag(MachineInstr::FrameDestroy); + BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::JMP64r)) + .addReg(ClobberReg); + MBB.erase(MBBI); + } else { + // In case there is no available scratch register, we can still read + // from RSP to assert that RSP points to a valid page. The write to RSP + // is also helpful because it verifies that the stack's write + // permissions are intact. 
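Back in the hardenLoadsWithHeuristic hunk above, the cut selection reduces to a single comparison: sum the weights of the not-yet-cut egress edges at the gadget source and of the not-yet-cut ingress edges at the gadget sink, then cut whichever side is cheaper. A minimal sketch of that decision with hypothetical plain types (not the pass's EdgeSet/Node classes):

#include <unordered_set>
#include <vector>

struct CFGEdge { int Value; }; // hypothetical stand-in for a weighted CFG edge

// Cut the cheaper of the two candidate sides; edges already cut cost nothing,
// which is what biases later gadgets toward reusing earlier cuts.
static void cutCheaperSide(const std::vector<CFGEdge *> &EgressEdges,
                           const std::vector<CFGEdge *> &IngressEdges,
                           std::unordered_set<CFGEdge *> &CutEdges) {
  auto Cost = [&](const std::vector<CFGEdge *> &Side) {
    int C = 0;
    for (CFGEdge *E : Side)
      if (!CutEdges.count(E))
        C += E->Value;
    return C;
  };
  const auto &ToCut =
      Cost(IngressEdges) < Cost(EgressEdges) ? IngressEdges : EgressEdges;
  for (CFGEdge *E : ToCut)
    CutEdges.insert(E);
}

(The no-scratch-register fallback of the ret-hardening hunk continues directly below.)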
+ MachineInstr *Fence = + BuildMI(MBB, MBBI, DebugLoc(), TII->get(X86::LFENCE)); + addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)), + X86::RSP, false, 0) + .addImm(0) + ->addRegisterDead(X86::EFLAGS, TRI); + } + + ++NumFences; + Modified = true; + break; } } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp b/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp index 85166decd8..4c7eb5862f 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86LowerAMXType.cpp @@ -1,351 +1,351 @@ -//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file Pass to transform <256 x i32> load/store -/// <256 x i32> is bitcasted to x86_amx on X86, and AMX instruction set only -/// provides simple operation on x86_amx. The basic elementwise operation -/// is not supported by AMX. Since x86_amx is bitcasted from vector <256 x i32> -/// and only AMX intrinsics can operate on the type, we need transform -/// load/store <256 x i32> instruction to AMX load/store. If the bitcast can -/// not be combined with load/store, we transform the bitcast to amx load/store -/// and <256 x i32> store/load. -// -//===----------------------------------------------------------------------===// -// -#include "X86.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsX86.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" - -using namespace llvm; -using namespace PatternMatch; - -#define DEBUG_TYPE "lower-amx-type" - -static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) { - Function &F = *BB->getParent(); - Module *M = BB->getModule(); - const DataLayout &DL = M->getDataLayout(); - - Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false); - LLVMContext &Ctx = Builder.getContext(); - auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx)); - unsigned AllocaAS = DL.getAllocaAddrSpace(); - AllocaInst *AllocaRes = - new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front()); - AllocaRes->setAlignment(AllocaAlignment); - return AllocaRes; -} - -static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) { - Value *Row = nullptr, *Col = nullptr; - switch (II->getIntrinsicID()) { - default: - llvm_unreachable("Expect amx intrinsics"); - case Intrinsic::x86_tileloadd64_internal: - case Intrinsic::x86_tilestored64_internal: { - Row = II->getArgOperand(0); - Col = II->getArgOperand(1); - break; - } - // a * b + c - // The shape depends on which operand. 
- case Intrinsic::x86_tdpbssd_internal: { - switch (OpNo) { - case 3: - Row = II->getArgOperand(0); - Col = II->getArgOperand(1); - break; - case 4: - Row = II->getArgOperand(0); - Col = II->getArgOperand(2); - break; - case 5: - Row = II->getArgOperand(2); - Col = II->getArgOperand(1); - break; - } - break; - } - } - - return std::make_pair(Row, Col); -} - -// %src = load <256 x i32>, <256 x i32>* %addr, align 64 -// %2 = bitcast <256 x i32> %src to x86_amx -// --> -// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, -// i8* %addr, i64 %stride64) -static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { - Value *Row = nullptr, *Col = nullptr; - Use &U = *(Bitcast->use_begin()); - unsigned OpNo = U.getOperandNo(); - auto *II = cast<IntrinsicInst>(U.getUser()); - std::tie(Row, Col) = getShape(II, OpNo); - IRBuilder<> Builder(Bitcast); - // Use the maximun column as stride. - Value *Stride = Builder.getInt64(64); - Value *I8Ptr = - Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy()); - std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; - - Value *NewInst = - Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args); - Bitcast->replaceAllUsesWith(NewInst); -} - -// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr, -// %stride); -// %13 = bitcast x86_amx %src to <256 x i32> -// store <256 x i32> %13, <256 x i32>* %addr, align 64 -// --> -// call void @llvm.x86.tilestored64.internal(%row, %col, %addr, -// %stride64, %13) -static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) { - - Value *Tile = Bitcast->getOperand(0); - auto *II = cast<IntrinsicInst>(Tile); - // Tile is output from AMX intrinsic. The first operand of the - // intrinsic is row, the second operand of the intrinsic is column. - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); - IRBuilder<> Builder(ST); - // Use the maximum column as stride. It must be the same with load - // stride. - Value *Stride = Builder.getInt64(64); - Value *I8Ptr = - Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy()); - std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile}; - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); - if (Bitcast->hasOneUse()) - return; - // %13 = bitcast x86_amx %src to <256 x i32> - // store <256 x i32> %13, <256 x i32>* %addr, align 64 - // %add = <256 x i32> %13, <256 x i32> %src2 - // --> - // %13 = bitcast x86_amx %src to <256 x i32> - // call void @llvm.x86.tilestored64.internal(%row, %col, %addr, - // %stride64, %13) - // %14 = load <256 x i32>, %addr - // %add = <256 x i32> %14, <256 x i32> %src2 - Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1)); - Bitcast->replaceAllUsesWith(Vec); -} - -// transform bitcast to <store, load> instructions. 
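The transformBitcast fallback that follows spills through a 64-byte-aligned stack slot: store the <256 x i32> value, then reload it as x86_amx via tileloadd64 (or the mirror image for the amx-to-vector direction). As a condensed view of the vector-to-amx half, using only calls that already appear in this file (CreateAllocaInst, getShape results, CreateIntrinsic) and assuming the file's includes; this is an illustration for readers skimming the diff, not a drop-in replacement:

// Condensed sketch of the <256 x i32> -> x86_amx direction of transformBitcast.
// Assumes Row/Col were already obtained via getShape() on the intrinsic user.
static Value *spillVectorToAMX(IRBuilder<> &Builder, BitCastInst *Bitcast,
                               Value *Row, Value *Col) {
  AllocaInst *Slot = CreateAllocaInst(Builder, Bitcast->getParent());
  Builder.CreateStore(Bitcast->getOperand(0), Slot);  // spill the vector
  Value *I8Ptr = Builder.CreateBitCast(Slot, Builder.getInt8PtrTy());
  Value *Stride = Builder.getInt64(64);               // max column stride
  std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride};
  return Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None,
                                 Args);               // reload as x86_amx
}

The full function in the hunk below additionally handles the opposite direction (tilestored64 followed by a vector load) and bails out when the bitcast's user is not an AMX intrinsic.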
-static bool transformBitcast(BitCastInst *Bitcast) { - IRBuilder<> Builder(Bitcast); - AllocaInst *AllocaAddr; - Value *I8Ptr, *Stride; - auto *Src = Bitcast->getOperand(0); - - auto Prepare = [&]() { - AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent()); - I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy()); - Stride = Builder.getInt64(64); - }; - - if (Bitcast->getType()->isX86_AMXTy()) { - // %2 = bitcast <256 x i32> %src to x86_amx - // --> - // %addr = alloca <256 x i32>, align 64 - // store <256 x i32> %src, <256 x i32>* %addr, align 64 - // %addr2 = bitcast <256 x i32>* to i8* - // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, - // i8* %addr2, - // i64 64) - Use &U = *(Bitcast->use_begin()); - unsigned OpNo = U.getOperandNo(); - auto *II = dyn_cast<IntrinsicInst>(U.getUser()); - if (!II) - return false; // May be bitcast from x86amx to <256 x i32>. - Prepare(); - Builder.CreateStore(Src, AllocaAddr); - // TODO we can pick an constant operand for the shape. - Value *Row = nullptr, *Col = nullptr; - std::tie(Row, Col) = getShape(II, OpNo); - std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; - Value *NewInst = Builder.CreateIntrinsic( - Intrinsic::x86_tileloadd64_internal, None, Args); - Bitcast->replaceAllUsesWith(NewInst); - } else { - // %2 = bitcast x86_amx %src to <256 x i32> - // --> - // %addr = alloca <256 x i32>, align 64 - // %addr2 = bitcast <256 x i32>* to i8* - // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, - // i8* %addr2, i64 %stride) - // %2 = load <256 x i32>, <256 x i32>* %addr, align 64 - auto *II = dyn_cast<IntrinsicInst>(Src); - if (!II) - return false; // May be bitcast from <256 x i32> to x86amx. - Prepare(); - Value *Row = II->getOperand(0); - Value *Col = II->getOperand(1); - std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src}; - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); - Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr); - Bitcast->replaceAllUsesWith(NewInst); - } - - return true; -} - -namespace { -class X86LowerAMXType { - Function &Func; - -public: - X86LowerAMXType(Function &F) : Func(F) {} - bool visit(); -}; - -bool X86LowerAMXType::visit() { - SmallVector<Instruction *, 8> DeadInsts; - - for (BasicBlock *BB : post_order(&Func)) { - for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend(); - II != IE;) { - Instruction &Inst = *II++; - auto *Bitcast = dyn_cast<BitCastInst>(&Inst); - if (!Bitcast) - continue; - - Value *Src = Bitcast->getOperand(0); - if (Bitcast->getType()->isX86_AMXTy()) { - if (Bitcast->user_empty()) { - DeadInsts.push_back(Bitcast); - continue; - } - LoadInst *LD = dyn_cast<LoadInst>(Src); - if (!LD) { - if (transformBitcast(Bitcast)) - DeadInsts.push_back(Bitcast); - continue; - } - // If load has mutli-user, duplicate a vector load. - // %src = load <256 x i32>, <256 x i32>* %addr, align 64 - // %2 = bitcast <256 x i32> %src to x86_amx - // %add = add <256 x i32> %src, <256 x i32> %src2 - // --> - // %src = load <256 x i32>, <256 x i32>* %addr, align 64 - // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, - // i8* %addr, i64 %stride64) - // %add = add <256 x i32> %src, <256 x i32> %src2 - - // If load has one user, the load will be eliminated in DAG ISel. 
- // %src = load <256 x i32>, <256 x i32>* %addr, align 64 - // %2 = bitcast <256 x i32> %src to x86_amx - // --> - // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, - // i8* %addr, i64 %stride64) - combineLoadBitcast(LD, Bitcast); - DeadInsts.push_back(Bitcast); - if (LD->hasOneUse()) - DeadInsts.push_back(LD); - } else if (Src->getType()->isX86_AMXTy()) { - if (Bitcast->user_empty()) { - DeadInsts.push_back(Bitcast); - continue; - } - StoreInst *ST = nullptr; - for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end(); - UI != UE;) { - Value *I = (UI++)->getUser(); - ST = dyn_cast<StoreInst>(I); - if (ST) - break; - } - if (!ST) { - if (transformBitcast(Bitcast)) - DeadInsts.push_back(Bitcast); - continue; - } - // If bitcast (%13) has one use, combine bitcast and store to amx store. - // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr, - // %stride); - // %13 = bitcast x86_amx %src to <256 x i32> - // store <256 x i32> %13, <256 x i32>* %addr, align 64 - // --> - // call void @llvm.x86.tilestored64.internal(%row, %col, %addr, - // %stride64, %13) - // - // If bitcast (%13) has multi-use, transform as below. - // %13 = bitcast x86_amx %src to <256 x i32> - // store <256 x i32> %13, <256 x i32>* %addr, align 64 - // %add = <256 x i32> %13, <256 x i32> %src2 - // --> - // %13 = bitcast x86_amx %src to <256 x i32> - // call void @llvm.x86.tilestored64.internal(%row, %col, %addr, - // %stride64, %13) - // %14 = load <256 x i32>, %addr - // %add = <256 x i32> %14, <256 x i32> %src2 - // - combineBitcastStore(Bitcast, ST); - // Delete user first. - DeadInsts.push_back(ST); - DeadInsts.push_back(Bitcast); - } - } - } - - bool C = !DeadInsts.empty(); - - for (auto *Inst : DeadInsts) - Inst->eraseFromParent(); - - return C; -} -} // anonymous namespace - -namespace { - -class X86LowerAMXTypeLegacyPass : public FunctionPass { -public: - static char ID; - - X86LowerAMXTypeLegacyPass() : FunctionPass(ID) { - initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - X86LowerAMXType LAT(F); - bool C = LAT.visit(); - return C; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - } -}; - -} // anonymous namespace - -static const char PassName[] = "Lower AMX type for load/store"; -char X86LowerAMXTypeLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, - false) -INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, - false) - -FunctionPass *llvm::createX86LowerAMXTypePass() { - return new X86LowerAMXTypeLegacyPass(); -} +//===- llvm/CodeGen/TileShapeInfo.h - ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to transform <256 x i32> load/store +/// <256 x i32> is bitcasted to x86_amx on X86, and AMX instruction set only +/// provides simple operation on x86_amx. The basic elementwise operation +/// is not supported by AMX. Since x86_amx is bitcasted from vector <256 x i32> +/// and only AMX intrinsics can operate on the type, we need transform +/// load/store <256 x i32> instruction to AMX load/store. 
If the bitcast can +/// not be combined with load/store, we transform the bitcast to amx load/store +/// and <256 x i32> store/load. +// +//===----------------------------------------------------------------------===// +// +#include "X86.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "lower-amx-type" + +static AllocaInst *CreateAllocaInst(IRBuilder<> &Builder, BasicBlock *BB) { + Function &F = *BB->getParent(); + Module *M = BB->getModule(); + const DataLayout &DL = M->getDataLayout(); + + Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false); + LLVMContext &Ctx = Builder.getContext(); + auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx)); + unsigned AllocaAS = DL.getAllocaAddrSpace(); + AllocaInst *AllocaRes = + new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front()); + AllocaRes->setAlignment(AllocaAlignment); + return AllocaRes; +} + +static std::pair<Value *, Value *> getShape(IntrinsicInst *II, unsigned OpNo) { + Value *Row = nullptr, *Col = nullptr; + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Expect amx intrinsics"); + case Intrinsic::x86_tileloadd64_internal: + case Intrinsic::x86_tilestored64_internal: { + Row = II->getArgOperand(0); + Col = II->getArgOperand(1); + break; + } + // a * b + c + // The shape depends on which operand. + case Intrinsic::x86_tdpbssd_internal: { + switch (OpNo) { + case 3: + Row = II->getArgOperand(0); + Col = II->getArgOperand(1); + break; + case 4: + Row = II->getArgOperand(0); + Col = II->getArgOperand(2); + break; + case 5: + Row = II->getArgOperand(2); + Col = II->getArgOperand(1); + break; + } + break; + } + } + + return std::make_pair(Row, Col); +} + +// %src = load <256 x i32>, <256 x i32>* %addr, align 64 +// %2 = bitcast <256 x i32> %src to x86_amx +// --> +// %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, +// i8* %addr, i64 %stride64) +static void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { + Value *Row = nullptr, *Col = nullptr; + Use &U = *(Bitcast->use_begin()); + unsigned OpNo = U.getOperandNo(); + auto *II = cast<IntrinsicInst>(U.getUser()); + std::tie(Row, Col) = getShape(II, OpNo); + IRBuilder<> Builder(Bitcast); + // Use the maximun column as stride. 
+ Value *Stride = Builder.getInt64(64); + Value *I8Ptr = + Builder.CreateBitCast(LD->getOperand(0), Builder.getInt8PtrTy()); + std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; + + Value *NewInst = + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, None, Args); + Bitcast->replaceAllUsesWith(NewInst); +} + +// %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr, +// %stride); +// %13 = bitcast x86_amx %src to <256 x i32> +// store <256 x i32> %13, <256 x i32>* %addr, align 64 +// --> +// call void @llvm.x86.tilestored64.internal(%row, %col, %addr, +// %stride64, %13) +static void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) { + + Value *Tile = Bitcast->getOperand(0); + auto *II = cast<IntrinsicInst>(Tile); + // Tile is output from AMX intrinsic. The first operand of the + // intrinsic is row, the second operand of the intrinsic is column. + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + IRBuilder<> Builder(ST); + // Use the maximum column as stride. It must be the same with load + // stride. + Value *Stride = Builder.getInt64(64); + Value *I8Ptr = + Builder.CreateBitCast(ST->getOperand(1), Builder.getInt8PtrTy()); + std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Tile}; + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); + if (Bitcast->hasOneUse()) + return; + // %13 = bitcast x86_amx %src to <256 x i32> + // store <256 x i32> %13, <256 x i32>* %addr, align 64 + // %add = <256 x i32> %13, <256 x i32> %src2 + // --> + // %13 = bitcast x86_amx %src to <256 x i32> + // call void @llvm.x86.tilestored64.internal(%row, %col, %addr, + // %stride64, %13) + // %14 = load <256 x i32>, %addr + // %add = <256 x i32> %14, <256 x i32> %src2 + Value *Vec = Builder.CreateLoad(Bitcast->getType(), ST->getOperand(1)); + Bitcast->replaceAllUsesWith(Vec); +} + +// transform bitcast to <store, load> instructions. +static bool transformBitcast(BitCastInst *Bitcast) { + IRBuilder<> Builder(Bitcast); + AllocaInst *AllocaAddr; + Value *I8Ptr, *Stride; + auto *Src = Bitcast->getOperand(0); + + auto Prepare = [&]() { + AllocaAddr = CreateAllocaInst(Builder, Bitcast->getParent()); + I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy()); + Stride = Builder.getInt64(64); + }; + + if (Bitcast->getType()->isX86_AMXTy()) { + // %2 = bitcast <256 x i32> %src to x86_amx + // --> + // %addr = alloca <256 x i32>, align 64 + // store <256 x i32> %src, <256 x i32>* %addr, align 64 + // %addr2 = bitcast <256 x i32>* to i8* + // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, + // i8* %addr2, + // i64 64) + Use &U = *(Bitcast->use_begin()); + unsigned OpNo = U.getOperandNo(); + auto *II = dyn_cast<IntrinsicInst>(U.getUser()); + if (!II) + return false; // May be bitcast from x86amx to <256 x i32>. + Prepare(); + Builder.CreateStore(Src, AllocaAddr); + // TODO we can pick an constant operand for the shape. 
+ Value *Row = nullptr, *Col = nullptr; + std::tie(Row, Col) = getShape(II, OpNo); + std::array<Value *, 4> Args = {Row, Col, I8Ptr, Stride}; + Value *NewInst = Builder.CreateIntrinsic( + Intrinsic::x86_tileloadd64_internal, None, Args); + Bitcast->replaceAllUsesWith(NewInst); + } else { + // %2 = bitcast x86_amx %src to <256 x i32> + // --> + // %addr = alloca <256 x i32>, align 64 + // %addr2 = bitcast <256 x i32>* to i8* + // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, + // i8* %addr2, i64 %stride) + // %2 = load <256 x i32>, <256 x i32>* %addr, align 64 + auto *II = dyn_cast<IntrinsicInst>(Src); + if (!II) + return false; // May be bitcast from <256 x i32> to x86amx. + Prepare(); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + std::array<Value *, 5> Args = {Row, Col, I8Ptr, Stride, Src}; + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); + Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr); + Bitcast->replaceAllUsesWith(NewInst); + } + + return true; +} + +namespace { +class X86LowerAMXType { + Function &Func; + +public: + X86LowerAMXType(Function &F) : Func(F) {} + bool visit(); +}; + +bool X86LowerAMXType::visit() { + SmallVector<Instruction *, 8> DeadInsts; + + for (BasicBlock *BB : post_order(&Func)) { + for (BasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend(); + II != IE;) { + Instruction &Inst = *II++; + auto *Bitcast = dyn_cast<BitCastInst>(&Inst); + if (!Bitcast) + continue; + + Value *Src = Bitcast->getOperand(0); + if (Bitcast->getType()->isX86_AMXTy()) { + if (Bitcast->user_empty()) { + DeadInsts.push_back(Bitcast); + continue; + } + LoadInst *LD = dyn_cast<LoadInst>(Src); + if (!LD) { + if (transformBitcast(Bitcast)) + DeadInsts.push_back(Bitcast); + continue; + } + // If load has mutli-user, duplicate a vector load. + // %src = load <256 x i32>, <256 x i32>* %addr, align 64 + // %2 = bitcast <256 x i32> %src to x86_amx + // %add = add <256 x i32> %src, <256 x i32> %src2 + // --> + // %src = load <256 x i32>, <256 x i32>* %addr, align 64 + // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, + // i8* %addr, i64 %stride64) + // %add = add <256 x i32> %src, <256 x i32> %src2 + + // If load has one user, the load will be eliminated in DAG ISel. + // %src = load <256 x i32>, <256 x i32>* %addr, align 64 + // %2 = bitcast <256 x i32> %src to x86_amx + // --> + // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, + // i8* %addr, i64 %stride64) + combineLoadBitcast(LD, Bitcast); + DeadInsts.push_back(Bitcast); + if (LD->hasOneUse()) + DeadInsts.push_back(LD); + } else if (Src->getType()->isX86_AMXTy()) { + if (Bitcast->user_empty()) { + DeadInsts.push_back(Bitcast); + continue; + } + StoreInst *ST = nullptr; + for (auto UI = Bitcast->use_begin(), UE = Bitcast->use_end(); + UI != UE;) { + Value *I = (UI++)->getUser(); + ST = dyn_cast<StoreInst>(I); + if (ST) + break; + } + if (!ST) { + if (transformBitcast(Bitcast)) + DeadInsts.push_back(Bitcast); + continue; + } + // If bitcast (%13) has one use, combine bitcast and store to amx store. + // %src = call x86_amx @llvm.x86.tileloadd64.internal(%row, %col, %addr, + // %stride); + // %13 = bitcast x86_amx %src to <256 x i32> + // store <256 x i32> %13, <256 x i32>* %addr, align 64 + // --> + // call void @llvm.x86.tilestored64.internal(%row, %col, %addr, + // %stride64, %13) + // + // If bitcast (%13) has multi-use, transform as below. 
+ // %13 = bitcast x86_amx %src to <256 x i32> + // store <256 x i32> %13, <256 x i32>* %addr, align 64 + // %add = <256 x i32> %13, <256 x i32> %src2 + // --> + // %13 = bitcast x86_amx %src to <256 x i32> + // call void @llvm.x86.tilestored64.internal(%row, %col, %addr, + // %stride64, %13) + // %14 = load <256 x i32>, %addr + // %add = <256 x i32> %14, <256 x i32> %src2 + // + combineBitcastStore(Bitcast, ST); + // Delete user first. + DeadInsts.push_back(ST); + DeadInsts.push_back(Bitcast); + } + } + } + + bool C = !DeadInsts.empty(); + + for (auto *Inst : DeadInsts) + Inst->eraseFromParent(); + + return C; +} +} // anonymous namespace + +namespace { + +class X86LowerAMXTypeLegacyPass : public FunctionPass { +public: + static char ID; + + X86LowerAMXTypeLegacyPass() : FunctionPass(ID) { + initializeX86LowerAMXTypeLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + X86LowerAMXType LAT(F); + bool C = LAT.visit(); + return C; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; + +} // anonymous namespace + +static const char PassName[] = "Lower AMX type for load/store"; +char X86LowerAMXTypeLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, + false) +INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, + false) + +FunctionPass *llvm::createX86LowerAMXTypePass() { + return new X86LowerAMXTypeLegacyPass(); +} diff --git a/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp b/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp index 89fa3ae3a3..6de5c84db3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86MCInstLower.cpp @@ -977,24 +977,24 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI) { NoAutoPaddingScope NoPadScope(*OutStreamer); - bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 && - MI.getOpcode() != X86::TLS_base_addr32; - bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 || - MI.getOpcode() == X86::TLS_base_addr64; + bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 && + MI.getOpcode() != X86::TLS_base_addr32; + bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 || + MI.getOpcode() == X86::TLS_base_addr64; MCContext &Ctx = OutStreamer->getContext(); MCSymbolRefExpr::VariantKind SRVK; switch (MI.getOpcode()) { case X86::TLS_addr32: case X86::TLS_addr64: - case X86::TLS_addrX32: + case X86::TLS_addrX32: SRVK = MCSymbolRefExpr::VK_TLSGD; break; case X86::TLS_base_addr32: SRVK = MCSymbolRefExpr::VK_TLSLDM; break; case X86::TLS_base_addr64: - case X86::TLS_base_addrX32: + case X86::TLS_base_addrX32: SRVK = MCSymbolRefExpr::VK_TLSLD; break; default: @@ -1014,7 +1014,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, if (Is64Bits) { bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD; - if (NeedsPadding && Is64BitsLP64) + if (NeedsPadding && Is64BitsLP64) EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); EmitAndCountInstruction(MCInstBuilder(X86::LEA64r) .addReg(X86::RDI) @@ -1087,26 +1087,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, /// bytes. Return the size of nop emitted. static unsigned emitNop(MCStreamer &OS, unsigned NumBytes, const X86Subtarget *Subtarget) { - // Determine the longest nop which can be efficiently decoded for the given - // target cpu. 
15-bytes is the longest single NOP instruction, but some - // platforms can't decode the longest forms efficiently. - unsigned MaxNopLength = 1; - if (Subtarget->is64Bit()) { - // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the - // IndexReg/BaseReg below need to be updated. - if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP)) - MaxNopLength = 7; - else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP)) - MaxNopLength = 15; - else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP)) - MaxNopLength = 11; - else - MaxNopLength = 10; - } if (Subtarget->is32Bit()) - MaxNopLength = 2; - + // Determine the longest nop which can be efficiently decoded for the given + // target cpu. 15-bytes is the longest single NOP instruction, but some + // platforms can't decode the longest forms efficiently. + unsigned MaxNopLength = 1; + if (Subtarget->is64Bit()) { + // FIXME: We can use NOOPL on 32-bit targets with FeatureNOPL, but the + // IndexReg/BaseReg below need to be updated. + if (Subtarget->hasFeature(X86::FeatureFast7ByteNOP)) + MaxNopLength = 7; + else if (Subtarget->hasFeature(X86::FeatureFast15ByteNOP)) + MaxNopLength = 15; + else if (Subtarget->hasFeature(X86::FeatureFast11ByteNOP)) + MaxNopLength = 11; + else + MaxNopLength = 10; + } if (Subtarget->is32Bit()) + MaxNopLength = 2; + // Cap a single nop emission at the profitable value for the target - NumBytes = std::min(NumBytes, MaxNopLength); + NumBytes = std::min(NumBytes, MaxNopLength); unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; @@ -1334,7 +1334,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, MCInst MCI; MCI.setOpcode(Opcode); - for (auto &MO : drop_begin(MI.operands(), 2)) + for (auto &MO : drop_begin(MI.operands(), 2)) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) MCI.addOperand(MaybeOperand.getValue()); @@ -1710,7 +1710,7 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, unsigned OpCode = MI.getOperand(0).getImm(); MCInst Ret; Ret.setOpcode(OpCode); - for (auto &MO : drop_begin(MI.operands())) + for (auto &MO : drop_begin(MI.operands())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) Ret.addOperand(MaybeOperand.getValue()); OutStreamer->emitInstruction(Ret, getSubtargetInfo()); @@ -1749,7 +1749,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, // Before emitting the instruction, add a comment to indicate that this is // indeed a tail call. 
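For the emitNop hunk above, the nop-length selection can be read in isolation as the following helper; the subtarget feature queries are reduced to plain booleans here purely for illustration, and the trailing 32-bit check is kept as a separate if after the 64-bit block, exactly as in the hunk (the two modes are mutually exclusive, so the result is the same either way):

// Longest NOP the target decodes cheaply; mirrors the emitNop logic above.
static unsigned maxNopLength(bool Is64Bit, bool Is32Bit, bool Fast7ByteNOP,
                             bool Fast11ByteNOP, bool Fast15ByteNOP) {
  unsigned MaxNopLength = 1;
  if (Is64Bit) {
    if (Fast7ByteNOP)
      MaxNopLength = 7;
    else if (Fast15ByteNOP)
      MaxNopLength = 15;
    else if (Fast11ByteNOP)
      MaxNopLength = 11;
    else
      MaxNopLength = 10;
  }
  if (Is32Bit)
    MaxNopLength = 2;
  return MaxNopLength;
}

The caller then caps the requested padding at this value, emitting multiple nops when more bytes are needed. (The PATCHABLE_TAIL_CALL hunk resumes immediately below.)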
OutStreamer->AddComment("TAILCALL"); - for (auto &MO : drop_begin(MI.operands())) + for (auto &MO : drop_begin(MI.operands())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) TC.addOperand(MaybeOperand.getValue()); OutStreamer->emitInstruction(TC, getSubtargetInfo()); @@ -1784,7 +1784,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI, if (ConstantEntry.isMachineConstantPoolEntry()) return nullptr; - return ConstantEntry.Val.ConstVal; + return ConstantEntry.Val.ConstVal; } static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx, @@ -2446,10 +2446,10 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { case X86::TLS_addr32: case X86::TLS_addr64: - case X86::TLS_addrX32: + case X86::TLS_addrX32: case X86::TLS_base_addr32: case X86::TLS_base_addr64: - case X86::TLS_base_addrX32: + case X86::TLS_base_addrX32: return LowerTlsAddr(MCInstLowering, *MI); case X86::MOVPC32r: { @@ -2598,15 +2598,15 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { } return; } - case X86::UBSAN_UD1: - EmitAndCountInstruction(MCInstBuilder(X86::UD1Lm) - .addReg(X86::EAX) - .addReg(X86::EAX) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(MI->getOperand(0).getImm()) - .addReg(X86::NoRegister)); - return; + case X86::UBSAN_UD1: + EmitAndCountInstruction(MCInstBuilder(X86::UD1Lm) + .addReg(X86::EAX) + .addReg(X86::EAX) + .addImm(1) + .addReg(X86::NoRegister) + .addImm(MI->getOperand(0).getImm()) + .addReg(X86::NoRegister)); + return; } MCInst TmpInst; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp b/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp index babd923e74..db4203d314 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86PartialReduction.cpp @@ -392,7 +392,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { break; // Push incoming values to the worklist. - append_range(Worklist, PN->incoming_values()); + append_range(Worklist, PN->incoming_values()); continue; } @@ -401,7 +401,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { if (BO->getOpcode() == Instruction::Add) { // Simple case. Single use, just push its operands to the worklist. if (BO->hasNUses(BO == Root ? 2 : 1)) { - append_range(Worklist, BO->operands()); + append_range(Worklist, BO->operands()); continue; } @@ -424,7 +424,7 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { continue; // The phi forms a loop with this Add, push its operands. - append_range(Worklist, BO->operands()); + append_range(Worklist, BO->operands()); } } } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp b/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp index 05ee6c6c83..b2f6d0604d 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86PreTileConfig.cpp @@ -1,265 +1,265 @@ -//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file Pass to pre-config the shape of AMX register -/// AMX register need to be configured before use. 
The shape of AMX register -/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions. -/// The pldtilecfg is to config tile registers. It should dominator all AMX -/// instructions. The pldtilecfg produce a virtual cfg register and the cfg -/// register is used by all AMX instructions. -/// This pass is to find the common dominator of all AMX instructions and -/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg -/// produces is inserted as the last operand of each AMX instruction. We use -/// this scheme to model the def-use relationship between AMX config instruction -/// and other AMX instructions. Below is an example. -/// -/// ----B1---- -/// / \ -/// / \ -/// B2 B3 -/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV -/// -/// is transformed to -/// -/// B1 -/// %25:tilecfg = PLDTILECFG -/// / \ -/// / \ -/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25 -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrBuilder.h" -#include "X86RegisterInfo.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TileShapeInfo.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "tile-pre-config" - -namespace { - -class X86PreTileConfig : public MachineFunctionPass { - // context - MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - MachineDominatorTree *DomTree = nullptr; - MachineRegisterInfo *MRI = nullptr; - - MachineInstr *getTileConfigPoint(); - -public: - X86PreTileConfig() : MachineFunctionPass(ID) {} - - /// Return the pass name. - StringRef getPassName() const override { - return "Tile Register Pre-configure"; - } - - /// X86PreTileConfig analysis usage. - void getAnalysisUsage(AnalysisUsage &AU) const override; - - /// Perform register allocation. - bool runOnMachineFunction(MachineFunction &mf) override; - - static char ID; -}; - -} // end anonymous namespace - -char X86PreTileConfig::ID = 0; - -INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", - "Tile Register Configure", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", - "Tile Register Configure", false, false) - -void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<MachineDominatorTree>(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, - const TargetInstrInfo *TII, - MachineRegisterInfo *MRI, - const X86Subtarget *ST) { - auto *MBB = MI->getParent(); - - // FIXME: AMX should assume AVX512 enabled. - if (ST->hasAVX512()) { - // Zero stack slot. 
- Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); - BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm) - .addReg(Zmm, RegState::Undef) - .addReg(Zmm, RegState::Undef); - addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)), - FrameIdx) - .addReg(Zmm); - } - - // build psuedo ldtilecfg - Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); - - addFrameReference( - BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); - - return VReg; -} - -static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - llvm_unreachable("Unexpected machine instruction on tile"); - case X86::PTILELOADDV: - case X86::PTDPBSSDV: - case X86::PTILEZEROV: - MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1)); - MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2)); - ShapeT Shape(&MO1, &MO2, MRI); - return Shape; - } -} - -MachineInstr *X86PreTileConfig::getTileConfigPoint() { - DenseMap<Register, ShapeT> PhysShapeInfo; - MachineBasicBlock *MBB = nullptr; - DenseSet<const MachineInstr *> MIs; - for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - Register VirtReg = Register::index2VirtReg(i); - if (MRI->reg_nodbg_empty(VirtReg)) - continue; - const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - if (RC.getID() != X86::TILERegClassID) - continue; - - // Find the common dominator for all MI that define tile register. - for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { - if (MO.isUndef()) - continue; - const auto *MI = MO.getParent(); - // PHI or IMPLICIT_DEF instructiion. - // There must be a input tile before PHI instruction. - if (MI->isTransient()) - continue; - if (!MBB) - MBB = const_cast<MachineBasicBlock *>(MI->getParent()); - MBB = DomTree->findNearestCommonDominator( - MBB, const_cast<MachineBasicBlock *>(MI->getParent())); - - // Collect the instructions that define shape. - ShapeT Shape = getShape(*MI, MRI); - std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(), - Shape.getCol()}; - for (auto *ShapeMO : ShapeMOs) { - Register ShapeReg = ShapeMO->getReg(); - for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) { - const auto *ShapeMI = MO.getParent(); - MIs.insert(ShapeMI); - } - } - } - } - if (!MBB) - return nullptr; - // This pass is before the pass of eliminating PHI node, so it - // is in SSA form. - assert(MRI->isSSA() && "Not SSA form in pre-tile config"); - // Shape def should dominate tile config MBB. - // def s s1 s2 - // / \ \ / - // / \ \ / - // conf s3=phi(s1,s2) - // | - // c - // - for (const auto *MI : MIs) { - const MachineBasicBlock *ShapeMBB = MI->getParent(); - if (DomTree->dominates(ShapeMBB, MBB)) - continue; - if (MI->isMoveImmediate()) - continue; - report_fatal_error(MF->getName() + ": Failed to config tile register, " - "please define the shape earlier"); - } - - // ldtilecfg should be inserted after the MI that define the shape. - MachineBasicBlock::reverse_instr_iterator I, E; - for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) { - auto *MI = &*I; - if (MIs.count(MI) && (!MI->isMoveImmediate())) - break; - } - MachineBasicBlock::iterator MII; - if (I == E) - MII = MBB->getFirstNonPHI(); - else { - MII = MachineBasicBlock::iterator(&*I); - MII++; - } - return &*MII; -} - -static void addTileCFGUse(MachineFunction &MF, Register CFG) { - for (MachineBasicBlock &MBB : MF) { - - // Traverse the basic block. 
- for (MachineInstr &MI : MBB) { - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - break; - case X86::PTILELOADDV: - case X86::PTILESTOREDV: - case X86::PTDPBSSDV: - case X86::PTILEZEROV: - unsigned NumOperands = MI.getNumOperands(); - MI.RemoveOperand(NumOperands - 1); - MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); - break; - } - } - } -} - -bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - MRI = &mf.getRegInfo(); - ST = &mf.getSubtarget<X86Subtarget>(); - TRI = ST->getRegisterInfo(); - TII = mf.getSubtarget().getInstrInfo(); - DomTree = &getAnalysis<MachineDominatorTree>(); - - MachineInstr *MI = getTileConfigPoint(); - if (!MI) - return false; - unsigned Size = ST->getTileConfigSize(); - Align Alignment = ST->getTileConfigAlignment(); - int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); - Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); - addTileCFGUse(mf, CFG); - return true; -} - -FunctionPass *llvm::createX86PreTileConfigPass() { - return new X86PreTileConfig(); -} +//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to pre-config the shape of AMX register +/// AMX register need to be configured before use. The shape of AMX register +/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions. +/// The pldtilecfg is to config tile registers. It should dominator all AMX +/// instructions. The pldtilecfg produce a virtual cfg register and the cfg +/// register is used by all AMX instructions. +/// This pass is to find the common dominator of all AMX instructions and +/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg +/// produces is inserted as the last operand of each AMX instruction. We use +/// this scheme to model the def-use relationship between AMX config instruction +/// and other AMX instructions. Below is an example. +/// +/// ----B1---- +/// / \ +/// / \ +/// B2 B3 +/// %1:tile = PTILELOADDV %2:tile = PTILELOADDV +/// +/// is transformed to +/// +/// B1 +/// %25:tilecfg = PLDTILECFG +/// / \ +/// / \ +/// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25 +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TileShapeInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "tile-pre-config" + +namespace { + +class X86PreTileConfig : public MachineFunctionPass { + // context + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + MachineDominatorTree *DomTree = nullptr; + MachineRegisterInfo *MRI = nullptr; + + MachineInstr *getTileConfigPoint(); + +public: + X86PreTileConfig() : MachineFunctionPass(ID) {} + + /// Return the pass name. 
+ StringRef getPassName() const override { + return "Tile Register Pre-configure"; + } + + /// X86PreTileConfig analysis usage. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Perform register allocation. + bool runOnMachineFunction(MachineFunction &mf) override; + + static char ID; +}; + +} // end anonymous namespace + +char X86PreTileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", + "Tile Register Configure", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", + "Tile Register Configure", false, false) + +void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, + const TargetInstrInfo *TII, + MachineRegisterInfo *MRI, + const X86Subtarget *ST) { + auto *MBB = MI->getParent(); + + // FIXME: AMX should assume AVX512 enabled. + if (ST->hasAVX512()) { + // Zero stack slot. + Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); + BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm) + .addReg(Zmm, RegState::Undef) + .addReg(Zmm, RegState::Undef); + addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)), + FrameIdx) + .addReg(Zmm); + } + + // build psuedo ldtilecfg + Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); + + addFrameReference( + BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); + + return VReg; +} + +static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + llvm_unreachable("Unexpected machine instruction on tile"); + case X86::PTILELOADDV: + case X86::PTDPBSSDV: + case X86::PTILEZEROV: + MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1)); + MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2)); + ShapeT Shape(&MO1, &MO2, MRI); + return Shape; + } +} + +MachineInstr *X86PreTileConfig::getTileConfigPoint() { + DenseMap<Register, ShapeT> PhysShapeInfo; + MachineBasicBlock *MBB = nullptr; + DenseSet<const MachineInstr *> MIs; + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + Register VirtReg = Register::index2VirtReg(i); + if (MRI->reg_nodbg_empty(VirtReg)) + continue; + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + if (RC.getID() != X86::TILERegClassID) + continue; + + // Find the common dominator for all MI that define tile register. + for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { + if (MO.isUndef()) + continue; + const auto *MI = MO.getParent(); + // PHI or IMPLICIT_DEF instructiion. + // There must be a input tile before PHI instruction. + if (MI->isTransient()) + continue; + if (!MBB) + MBB = const_cast<MachineBasicBlock *>(MI->getParent()); + MBB = DomTree->findNearestCommonDominator( + MBB, const_cast<MachineBasicBlock *>(MI->getParent())); + + // Collect the instructions that define shape. 
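      // Descriptive note on the step below: getShape() maps each AMX pseudo
      // back to its row/column shape operands (machine operands 1 and 2).
      // The loop that follows records every instruction defining those shape
      // registers, so the ldtilecfg insertion point chosen later can sit
      // after the last shape definition in the common-dominator block
      // (simple move-immediates excepted).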
+ ShapeT Shape = getShape(*MI, MRI); + std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(), + Shape.getCol()}; + for (auto *ShapeMO : ShapeMOs) { + Register ShapeReg = ShapeMO->getReg(); + for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) { + const auto *ShapeMI = MO.getParent(); + MIs.insert(ShapeMI); + } + } + } + } + if (!MBB) + return nullptr; + // This pass is before the pass of eliminating PHI node, so it + // is in SSA form. + assert(MRI->isSSA() && "Not SSA form in pre-tile config"); + // Shape def should dominate tile config MBB. + // def s s1 s2 + // / \ \ / + // / \ \ / + // conf s3=phi(s1,s2) + // | + // c + // + for (const auto *MI : MIs) { + const MachineBasicBlock *ShapeMBB = MI->getParent(); + if (DomTree->dominates(ShapeMBB, MBB)) + continue; + if (MI->isMoveImmediate()) + continue; + report_fatal_error(MF->getName() + ": Failed to config tile register, " + "please define the shape earlier"); + } + + // ldtilecfg should be inserted after the MI that define the shape. + MachineBasicBlock::reverse_instr_iterator I, E; + for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) { + auto *MI = &*I; + if (MIs.count(MI) && (!MI->isMoveImmediate())) + break; + } + MachineBasicBlock::iterator MII; + if (I == E) + MII = MBB->getFirstNonPHI(); + else { + MII = MachineBasicBlock::iterator(&*I); + MII++; + } + return &*MII; +} + +static void addTileCFGUse(MachineFunction &MF, Register CFG) { + for (MachineBasicBlock &MBB : MF) { + + // Traverse the basic block. + for (MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + case X86::PTILELOADDV: + case X86::PTILESTOREDV: + case X86::PTDPBSSDV: + case X86::PTILEZEROV: + unsigned NumOperands = MI.getNumOperands(); + MI.RemoveOperand(NumOperands - 1); + MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); + break; + } + } + } +} + +bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + MRI = &mf.getRegInfo(); + ST = &mf.getSubtarget<X86Subtarget>(); + TRI = ST->getRegisterInfo(); + TII = mf.getSubtarget().getInstrInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + + MachineInstr *MI = getTileConfigPoint(); + if (!MI) + return false; + unsigned Size = ST->getTileConfigSize(); + Align Alignment = ST->getTileConfigAlignment(); + int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); + Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); + addTileCFGUse(mf, CFG); + return true; +} + +FunctionPass *llvm::createX86PreTileConfigPass() { + return new X86PreTileConfig(); +} diff --git a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp index d90b4e7bdc..eb919a0146 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.cpp @@ -18,8 +18,8 @@ #include "X86Subtarget.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -726,12 +726,12 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert((!needsStackRealignment(MF) || MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && "Return instruction can only reference SP relative frame objects"); - FIOffset = - TFI->getFrameIndexReferenceSP(MF, 
FrameIndex, BasePtr, 0).getFixed(); + FIOffset = + TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed(); } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) { FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr); } else { - FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed(); + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed(); } // LOCAL_ESCAPE uses a single offset, with no register. It only works in the @@ -786,55 +786,55 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } -unsigned X86RegisterInfo::findDeadCallerSavedReg( - MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const { - const MachineFunction *MF = MBB.getParent(); - if (MF->callsEHReturn()) - return 0; - - const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF); - - if (MBBI == MBB.end()) - return 0; - - switch (MBBI->getOpcode()) { - default: - return 0; - case TargetOpcode::PATCHABLE_RET: - case X86::RET: - case X86::RETL: - case X86::RETQ: - case X86::RETIL: - case X86::RETIQ: - case X86::TCRETURNdi: - case X86::TCRETURNri: - case X86::TCRETURNmi: - case X86::TCRETURNdi64: - case X86::TCRETURNri64: - case X86::TCRETURNmi64: - case X86::EH_RETURN: - case X86::EH_RETURN64: { - SmallSet<uint16_t, 8> Uses; - for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { - MachineOperand &MO = MBBI->getOperand(I); - if (!MO.isReg() || MO.isDef()) - continue; - Register Reg = MO.getReg(); - if (!Reg) - continue; - for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) - Uses.insert(*AI); - } - - for (auto CS : AvailableRegs) - if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP) - return CS; - } - } - - return 0; -} - +unsigned X86RegisterInfo::findDeadCallerSavedReg( + MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const { + const MachineFunction *MF = MBB.getParent(); + if (MF->callsEHReturn()) + return 0; + + const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF); + + if (MBBI == MBB.end()) + return 0; + + switch (MBBI->getOpcode()) { + default: + return 0; + case TargetOpcode::PATCHABLE_RET: + case X86::RET: + case X86::RETL: + case X86::RETQ: + case X86::RETIL: + case X86::RETIQ: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<uint16_t, 8> Uses; + for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { + MachineOperand &MO = MBBI->getOperand(I); + if (!MO.isReg() || MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (!Reg) + continue; + for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) + Uses.insert(*AI); + } + + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP) + return CS; + } + } + + return 0; +} + Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const X86FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) ? 
FramePtr : StackPtr; @@ -857,79 +857,79 @@ X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const { StackReg = getX86SubSuperRegister(StackReg, 32); return StackReg; } - -static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, - const MachineRegisterInfo *MRI) { - if (VRM->hasShape(VirtReg)) - return VRM->getShape(VirtReg); - - const MachineOperand &Def = *MRI->def_begin(VirtReg); - MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent()); - unsigned OpCode = MI->getOpcode(); - switch (OpCode) { - default: - llvm_unreachable("Unexpected machine instruction on tile register!"); - break; - // We only collect the tile shape that is defined. - case X86::PTILELOADDV: - case X86::PTDPBSSDV: - case X86::PTILEZEROV: - MachineOperand &MO1 = MI->getOperand(1); - MachineOperand &MO2 = MI->getOperand(2); - ShapeT Shape(&MO1, &MO2, MRI); - VRM->assignVirt2Shape(VirtReg, Shape); - return Shape; - } -} - -bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, - ArrayRef<MCPhysReg> Order, - SmallVectorImpl<MCPhysReg> &Hints, - const MachineFunction &MF, - const VirtRegMap *VRM, - const LiveRegMatrix *Matrix) const { - const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( - VirtReg, Order, Hints, MF, VRM, Matrix); - - if (RC.getID() != X86::TILERegClassID) - return BaseImplRetVal; - - ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI); - auto AddHint = [&](MCPhysReg PhysReg) { - Register VReg = Matrix->getOneVReg(PhysReg); - if (VReg == MCRegister::NoRegister) { // Not allocated yet - Hints.push_back(PhysReg); - return; - } - ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI); - if (PhysShape == VirtShape) - Hints.push_back(PhysReg); - }; - - SmallSet<MCPhysReg, 4> CopyHints; - CopyHints.insert(Hints.begin(), Hints.end()); - Hints.clear(); - for (auto Hint : CopyHints) { - if (RC.contains(Hint) && !MRI->isReserved(Hint)) - AddHint(Hint); - } - for (MCPhysReg PhysReg : Order) { - if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) && - !MRI->isReserved(PhysReg)) - AddHint(PhysReg); - } - -#define DEBUG_TYPE "tile-hint" - LLVM_DEBUG({ - dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n"; - for (auto Hint : Hints) { - dbgs() << "tmm" << Hint << ","; - } - dbgs() << "\n"; - }); -#undef DEBUG_TYPE - - return true; -} + +static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, + const MachineRegisterInfo *MRI) { + if (VRM->hasShape(VirtReg)) + return VRM->getShape(VirtReg); + + const MachineOperand &Def = *MRI->def_begin(VirtReg); + MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent()); + unsigned OpCode = MI->getOpcode(); + switch (OpCode) { + default: + llvm_unreachable("Unexpected machine instruction on tile register!"); + break; + // We only collect the tile shape that is defined. 
+ case X86::PTILELOADDV: + case X86::PTDPBSSDV: + case X86::PTILEZEROV: + MachineOperand &MO1 = MI->getOperand(1); + MachineOperand &MO2 = MI->getOperand(2); + ShapeT Shape(&MO1, &MO2, MRI); + VRM->assignVirt2Shape(VirtReg, Shape); + return Shape; + } +} + +bool X86RegisterInfo::getRegAllocationHints(Register VirtReg, + ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints( + VirtReg, Order, Hints, MF, VRM, Matrix); + + if (RC.getID() != X86::TILERegClassID) + return BaseImplRetVal; + + ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI); + auto AddHint = [&](MCPhysReg PhysReg) { + Register VReg = Matrix->getOneVReg(PhysReg); + if (VReg == MCRegister::NoRegister) { // Not allocated yet + Hints.push_back(PhysReg); + return; + } + ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI); + if (PhysShape == VirtShape) + Hints.push_back(PhysReg); + }; + + SmallSet<MCPhysReg, 4> CopyHints; + CopyHints.insert(Hints.begin(), Hints.end()); + Hints.clear(); + for (auto Hint : CopyHints) { + if (RC.contains(Hint) && !MRI->isReserved(Hint)) + AddHint(Hint); + } + for (MCPhysReg PhysReg : Order) { + if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) && + !MRI->isReserved(PhysReg)) + AddHint(PhysReg); + } + +#define DEBUG_TYPE "tile-hint" + LLVM_DEBUG({ + dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n"; + for (auto Hint : Hints) { + dbgs() << "tmm" << Hint << ","; + } + dbgs() << "\n"; + }); +#undef DEBUG_TYPE + + return true; +} diff --git a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h index 7fd10ddd1a..4e4fb3f368 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.h @@ -125,12 +125,12 @@ public: int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; - /// findDeadCallerSavedReg - Return a caller-saved register that isn't live - /// when it reaches the "return" instruction. We can then pop a stack object - /// to this register without worry about clobbering it. - unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI) const; - + /// findDeadCallerSavedReg - Return a caller-saved register that isn't live + /// when it reaches the "return" instruction. We can then pop a stack object + /// to this register without worry about clobbering it. + unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI) const; + // Debug information queries. 
Register getFrameRegister(const MachineFunction &MF) const override; unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; @@ -144,11 +144,11 @@ public: Register getFramePtr() const { return FramePtr; } // FIXME: Move to FrameInfok unsigned getSlotSize() const { return SlotSize; } - - bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order, - SmallVectorImpl<MCPhysReg> &Hints, - const MachineFunction &MF, const VirtRegMap *VRM, - const LiveRegMatrix *Matrix) const override; + + bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, + const MachineFunction &MF, const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; }; } // End llvm namespace diff --git a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td index 75cbd4e1cf..29aa2bc252 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/libs/llvm12/lib/Target/X86/X86RegisterInfo.td @@ -265,9 +265,9 @@ let SubRegIndices = [sub_ymm] in { } } -// Tile config registers. -def TMMCFG: X86Reg<"tmmcfg", 0>; - +// Tile config registers. +def TMMCFG: X86Reg<"tmmcfg", 0>; + // Tile "registers". def TMM0: X86Reg<"tmm0", 0>; def TMM1: X86Reg<"tmm1", 1>; @@ -636,11 +636,11 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; // Tiles -let CopyCost = -1 in // Don't allow copying of tile registers -def TILE : RegisterClass<"X86", [x86amx], 8192, +let CopyCost = -1 in // Don't allow copying of tile registers +def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { - let CopyCost = -1; // Don't allow copying of tile config registers. - let isAllocatable = 1; - let Size = 512; -} +def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { + let CopyCost = -1; // Don't allow copying of tile config registers. + let isAllocatable = 1; + let Size = 512; +} diff --git a/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp index e76908ef4b..8cfdeea3d1 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -24,10 +24,10 @@ using namespace llvm; #define DEBUG_TYPE "x86-selectiondag-info" -static cl::opt<bool> - UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false), - cl::desc("Use fast short rep mov in memcpy lowering")); - +static cl::opt<bool> + UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false), + cl::desc("Use fast short rep mov in memcpy lowering")); + bool X86SelectionDAGInfo::isBaseRegConflictPossible( SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const { // We cannot use TRI->hasBasePointer() until *after* we select all basic @@ -310,10 +310,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); - // If enabled and available, use fast short rep mov. - if (UseFSRMForMemcpy && Subtarget.hasFSRM()) - return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8); - + // If enabled and available, use fast short rep mov. 
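  // Descriptive note on the step below: FSRM ("fast short REP MOV") CPUs
  // handle small rep movsb copies efficiently, so when the opt-in flag
  // -x86-use-fsrm-for-memcpy is set and the subtarget reports FSRM, the
  // lowering emits a single byte-granular REP MOVS and skips the
  // constant-size rep-stride selection performed below.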
+ if (UseFSRMForMemcpy && Subtarget.hasFSRM()) + return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8); + /// Handle constant sizes, if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size)) return emitConstantSizeRepmov( diff --git a/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 14a3fea240..c3ed77fcae 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -293,4 +293,4 @@ void DecodeVPPERMMask(const Constant *C, unsigned Width, } } -} // namespace llvm +} // namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp index d57871130b..5746c91c0b 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp @@ -161,7 +161,7 @@ bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction( // This branch requires adding an LFENCE. if (!PrevInstIsLFENCE) { - assert(FirstTerminator && "Unknown terminator instruction"); + assert(FirstTerminator && "Unknown terminator instruction"); BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE)); NumLFENCEsInserted++; Modified = true; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp index aa73d4bce6..0edb63589a 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -184,7 +184,7 @@ private: MachineBasicBlock::iterator InsertPt, DebugLoc Loc); void restoreEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - Register Reg); + Register Reg); void mergePredStateIntoSP(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, @@ -200,8 +200,8 @@ private: MachineInstr * sinkPostLoadHardenedInst(MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs); - bool canHardenRegister(Register Reg); - unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB, + bool canHardenRegister(Register Reg); + unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc); unsigned hardenPostLoad(MachineInstr &MI); @@ -1520,7 +1520,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( /// reliably lower. void X86SpeculativeLoadHardeningPass::restoreEFLAGS( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, - Register Reg) { + Register Reg) { BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg); ++NumInstsInserted; } @@ -1842,7 +1842,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // just bail. Also check that its register class is one of the ones we // can harden. 
Register UseDefReg = UseMI.getOperand(0).getReg(); - if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg)) + if (!UseDefReg.isVirtual() || !canHardenRegister(UseDefReg)) return {}; SingleUseMI = &UseMI; @@ -1864,7 +1864,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( return MI; } -bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) { +bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) { auto *RC = MRI->getRegClass(Reg); int RegBytes = TRI->getRegSizeInBits(*RC) / 8; if (RegBytes > 8) @@ -1908,10 +1908,10 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) { /// The new, hardened virtual register is returned. It will have the same /// register class as `Reg`. unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( - Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { assert(canHardenRegister(Reg) && "Cannot harden this register!"); - assert(Reg.isVirtual() && "Cannot harden a physical register!"); + assert(Reg.isVirtual() && "Cannot harden a physical register!"); auto *RC = MRI->getRegClass(Reg); int Bytes = TRI->getRegSizeInBits(*RC) / 8; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp index c95213c353..e0a96938be 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.cpp @@ -166,10 +166,10 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, return X86II::MO_DARWIN_NONLAZY_PIC_BASE; } - // 32-bit ELF references GlobalAddress directly in static relocation model. - // We cannot use MO_GOT because EBX may not be set up. - if (TM.getRelocationModel() == Reloc::Static) - return X86II::MO_NO_FLAG; + // 32-bit ELF references GlobalAddress directly in static relocation model. + // We cannot use MO_GOT because EBX may not be set up. + if (TM.getRelocationModel() == Reloc::Static) + return X86II::MO_NO_FLAG; return X86II::MO_GOT; } @@ -206,9 +206,9 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, (!F && M.getRtLibUseGOT())) && is64Bit()) return X86II::MO_GOTPCREL; - // Reference ExternalSymbol directly in static relocation model. - if (!is64Bit() && !GV && TM.getRelocationModel() == Reloc::Static) - return X86II::MO_NO_FLAG; + // Reference ExternalSymbol directly in static relocation model. + if (!is64Bit() && !GV && TM.getRelocationModel() == Reloc::Static) + return X86II::MO_NO_FLAG; return X86II::MO_PLT; } @@ -234,22 +234,22 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } -void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, - StringRef FS) { - if (CPU.empty()) - CPU = "generic"; +void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, + StringRef FS) { + if (CPU.empty()) + CPU = "generic"; - if (TuneCPU.empty()) - TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect. + if (TuneCPU.empty()) + TuneCPU = "i586"; // FIXME: "generic" is more modern than llc tests expect. 
- std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); - assert(!FullFS.empty() && "Failed to parse X86 triple"); + std::string FullFS = X86_MC::ParseX86Triple(TargetTriple); + assert(!FullFS.empty() && "Failed to parse X86 triple"); - if (!FS.empty()) - FullFS = (Twine(FullFS) + "," + FS).str(); + if (!FS.empty()) + FullFS = (Twine(FullFS) + "," + FS).str(); // Parse features string and set the CPU. - ParseSubtargetFeatures(CPU, TuneCPU, FullFS); + ParseSubtargetFeatures(CPU, TuneCPU, FullFS); // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of // 16-bytes and under that are reasonably fast. These features were @@ -265,13 +265,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, report_fatal_error("64-bit code requested on a subtarget that doesn't " "support it!"); - // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all - // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes - // following the i386 psABI, while on Illumos it is always 16 bytes. + // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and for all + // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes + // following the i386 psABI, while on Illumos it is always 16 bytes. if (StackAlignOverride) stackAlignment = *StackAlignOverride; - else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || - In64BitMode) + else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || + In64BitMode) stackAlignment = Align(16); // Consume the vector width attribute or apply any target specific limit. @@ -284,24 +284,24 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, } X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, - StringRef TuneCPU, + StringRef TuneCPU, StringRef FS) { - initSubtargetFeatures(CPU, TuneCPU, FS); + initSubtargetFeatures(CPU, TuneCPU, FS); return *this; } -X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, - StringRef FS, const X86TargetMachine &TM, +X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, + StringRef FS, const X86TargetMachine &TM, MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth) - : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS), - PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT), - StackAlignOverride(StackAlignOverride), + : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS), + PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT), + StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), - InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), - TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { + InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)), + TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (!isPositionIndependent()) setPICStyle(PICStyles::Style::None); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h index fa2622333d..3bd6599215 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86Subtarget.h @@ -189,8 +189,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor has RDSEED instructions. bool HasRDSEED = false; - /// Processor has LAHF/SAHF instructions in 64-bit mode. 
- bool HasLAHFSAHF64 = false; + /// Processor has LAHF/SAHF instructions in 64-bit mode. + bool HasLAHFSAHF64 = false; /// Processor has MONITORX/MWAITX instructions. bool HasMWAITX = false; @@ -302,9 +302,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// True if the processor has enhanced REP MOVSB/STOSB. bool HasERMSB = false; - /// True if the processor has fast short REP MOV. - bool HasFSRM = false; - + /// True if the processor has fast short REP MOV. + bool HasFSRM = false; + /// True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions = false; @@ -355,9 +355,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor has AVX-512 Vector Neural Network Instructions bool HasVNNI = false; - /// Processor has AVX Vector Neural Network Instructions - bool HasAVXVNNI = false; - + /// Processor has AVX Vector Neural Network Instructions + bool HasAVXVNNI = false; + /// Processor has AVX-512 bfloat16 floating-point extensions bool HasBF16 = false; @@ -398,15 +398,15 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor supports PCONFIG instruction bool HasPCONFIG = false; - /// Processor support key locker instructions - bool HasKL = false; - - /// Processor support key locker wide instructions - bool HasWIDEKL = false; - - /// Processor supports HRESET instruction - bool HasHRESET = false; - + /// Processor support key locker instructions + bool HasKL = false; + + /// Processor support key locker wide instructions + bool HasWIDEKL = false; + + /// Processor supports HRESET instruction + bool HasHRESET = false; + /// Processor supports SERIALIZE instruction bool HasSERIALIZE = false; @@ -418,9 +418,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool HasAMXBF16 = false; bool HasAMXINT8 = false; - /// Processor supports User Level Interrupt instructions - bool HasUINTR = false; - + /// Processor supports User Level Interrupt instructions + bool HasUINTR = false; + /// Processor has a single uop BEXTR implementation. bool HasFastBEXTR = false; @@ -472,8 +472,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// entry to the function and which must be maintained by every function. Align stackAlignment = Align(4); - Align TileConfigAlignment = Align(4); - + Align TileConfigAlignment = Align(4); + /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. /// // FIXME: this is a known good value for Yonah. How about others? @@ -515,13 +515,13 @@ private: unsigned RequiredVectorWidth; /// True if compiling for 64-bit, false for 16-bit or 32-bit. - bool In64BitMode = false; + bool In64BitMode = false; /// True if compiling for 32-bit, false for 16-bit or 64-bit. - bool In32BitMode = false; + bool In32BitMode = false; /// True if compiling for 16-bit, false for 32-bit or 64-bit. - bool In16BitMode = false; + bool In16BitMode = false; X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which @@ -534,7 +534,7 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. 
/// - X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, + X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, const X86TargetMachine &TM, MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth); @@ -557,9 +557,9 @@ public: return &getInstrInfo()->getRegisterInfo(); } - unsigned getTileConfigSize() const { return 64; } - Align getTileConfigAlignment() const { return TileConfigAlignment; } - + unsigned getTileConfigSize() const { return 64; } + Align getTileConfigAlignment() const { return TileConfigAlignment; } + /// Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. @@ -571,7 +571,7 @@ public: /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); /// Methods used by Global ISel const CallLowering *getCallLowering() const override; @@ -582,10 +582,10 @@ public: private: /// Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. - X86Subtarget &initializeSubtargetDependencies(StringRef CPU, - StringRef TuneCPU, - StringRef FS); - void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + X86Subtarget &initializeSubtargetDependencies(StringRef CPU, + StringRef TuneCPU, + StringRef FS); + void initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); public: /// Is this x86_64? (disregarding specific ABI / programming model) @@ -684,7 +684,7 @@ public: return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); } bool hasRDSEED() const { return HasRDSEED; } - bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } + bool hasLAHFSAHF() const { return HasLAHFSAHF64 || !is64Bit(); } bool hasMWAITX() const { return HasMWAITX; } bool hasCLZERO() const { return HasCLZERO; } bool hasCLDEMOTE() const { return HasCLDEMOTE; } @@ -717,7 +717,7 @@ public: bool hasMacroFusion() const { return HasMacroFusion; } bool hasBranchFusion() const { return HasBranchFusion; } bool hasERMSB() const { return HasERMSB; } - bool hasFSRM() const { return HasFSRM; } + bool hasFSRM() const { return HasFSRM; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } @@ -748,17 +748,17 @@ public: bool hasSGX() const { return HasSGX; } bool hasINVPCID() const { return HasINVPCID; } bool hasENQCMD() const { return HasENQCMD; } - bool hasKL() const { return HasKL; } - bool hasWIDEKL() const { return HasWIDEKL; } - bool hasHRESET() const { return HasHRESET; } + bool hasKL() const { return HasKL; } + bool hasWIDEKL() const { return HasWIDEKL; } + bool hasHRESET() const { return HasHRESET; } bool hasSERIALIZE() const { return HasSERIALIZE; } bool hasTSXLDTRK() const { return HasTSXLDTRK; } - bool hasUINTR() const { return HasUINTR; } + bool hasUINTR() const { return HasUINTR; } bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } bool useRetpolineIndirectBranches() const { return UseRetpolineIndirectBranches; } - bool hasAVXVNNI() const { return HasAVXVNNI; } + bool hasAVXVNNI() const { return HasAVXVNNI; } bool hasAMXTILE() const { return HasAMXTILE; 
} bool hasAMXBF16() const { return HasAMXBF16; } bool hasAMXINT8() const { return HasAMXINT8; } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp index c8f76c210a..762ea5bc6e 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.cpp @@ -62,7 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target()); PassRegistry &PR = *PassRegistry::getPassRegistry(); - initializeX86LowerAMXTypeLegacyPassPass(PR); + initializeX86LowerAMXTypeLegacyPassPass(PR); initializeGlobalISel(PR); initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); @@ -72,7 +72,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86FixupSetCCPassPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); - initializeX86TileConfigPass(PR); + initializeX86TileConfigPass(PR); initializeX86ExpandPseudoPass(PR); initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); @@ -85,7 +85,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); initializeX86PartialReductionPass(PR); - initializePseudoProbeInserterPass(PR); + initializePseudoProbeInserterPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -236,30 +236,30 @@ X86TargetMachine::~X86TargetMachine() = default; const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); - Attribute TuneAttr = F.getFnAttribute("tune-cpu"); + Attribute TuneAttr = F.getFnAttribute("tune-cpu"); Attribute FSAttr = F.getFnAttribute("target-features"); - StringRef CPU = - CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU; - StringRef TuneCPU = - TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU; - StringRef FS = - FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS; + StringRef CPU = + CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU; + StringRef TuneCPU = + TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU; + StringRef FS = + FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS; SmallString<512> Key; - // The additions here are ordered so that the definitely short strings are - // added first so we won't exceed the small size. We append the - // much longer FS string at the end so that we only heap allocate at most - // one time. + // The additions here are ordered so that the definitely short strings are + // added first so we won't exceed the small size. We append the + // much longer FS string at the end so that we only heap allocate at most + // one time. // Extract prefer-vector-width attribute. 
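  // Descriptive note on the step below: the "prefer-vector-width" and
  // "min-legal-vector-width" function attributes are parsed as integers and
  // folded into Key, so two functions that differ only in these attributes
  // receive distinct cached subtargets from SubtargetMap.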
unsigned PreferVectorWidthOverride = 0; - Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width"); - if (PreferVecWidthAttr.isValid()) { - StringRef Val = PreferVecWidthAttr.getValueAsString(); + Attribute PreferVecWidthAttr = F.getFnAttribute("prefer-vector-width"); + if (PreferVecWidthAttr.isValid()) { + StringRef Val = PreferVecWidthAttr.getValueAsString(); unsigned Width; if (!Val.getAsInteger(0, Width)) { - Key += "prefer-vector-width="; + Key += "prefer-vector-width="; Key += Val; PreferVectorWidthOverride = Width; } @@ -267,45 +267,45 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // Extract min-legal-vector-width attribute. unsigned RequiredVectorWidth = UINT32_MAX; - Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width"); - if (MinLegalVecWidthAttr.isValid()) { - StringRef Val = MinLegalVecWidthAttr.getValueAsString(); + Attribute MinLegalVecWidthAttr = F.getFnAttribute("min-legal-vector-width"); + if (MinLegalVecWidthAttr.isValid()) { + StringRef Val = MinLegalVecWidthAttr.getValueAsString(); unsigned Width; if (!Val.getAsInteger(0, Width)) { - Key += "min-legal-vector-width="; + Key += "min-legal-vector-width="; Key += Val; RequiredVectorWidth = Width; } } - // Add CPU to the Key. - Key += CPU; - - // Add tune CPU to the Key. - Key += "tune="; - Key += TuneCPU; - - // Keep track of the start of the feature portion of the string. - unsigned FSStart = Key.size(); - - // FIXME: This is related to the code below to reset the target options, - // we need to know whether or not the soft float flag is set on the - // function before we can generate a subtarget. We also need to use - // it as a key for the subtarget since that can be the only difference - // between two functions. - bool SoftFloat = - F.getFnAttribute("use-soft-float").getValueAsString() == "true"; - // If the soft float attribute is set on the function turn on the soft float - // subtarget feature. - if (SoftFloat) - Key += FS.empty() ? "+soft-float" : "+soft-float,"; - - Key += FS; - - // We may have added +soft-float to the features so move the StringRef to - // point to the full string in the Key. - FS = Key.substr(FSStart); - + // Add CPU to the Key. + Key += CPU; + + // Add tune CPU to the Key. + Key += "tune="; + Key += TuneCPU; + + // Keep track of the start of the feature portion of the string. + unsigned FSStart = Key.size(); + + // FIXME: This is related to the code below to reset the target options, + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions. + bool SoftFloat = + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + Key += FS.empty() ? "+soft-float" : "+soft-float,"; + + Key += FS; + + // We may have added +soft-float to the features so move the StringRef to + // point to the full string in the Key. + FS = Key.substr(FSStart); + auto &I = SubtargetMap[Key]; if (!I) { // This needs to be done before we create a new subtarget since any @@ -313,21 +313,21 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. 
resetTargetOptions(F); I = std::make_unique<X86Subtarget>( - TargetTriple, CPU, TuneCPU, FS, *this, + TargetTriple, CPU, TuneCPU, FS, *this, MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride, RequiredVectorWidth); } return I.get(); } -bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, - unsigned DestAS) const { - assert(SrcAS != DestAS && "Expected different address spaces!"); - if (getPointerSize(SrcAS) != getPointerSize(DestAS)) - return false; - return SrcAS < 256 && DestAS < 256; -} - +bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + assert(SrcAS != DestAS && "Expected different address spaces!"); + if (getPointerSize(SrcAS) != getPointerSize(DestAS)) + return false; + return SrcAS < 256 && DestAS < 256; +} + //===----------------------------------------------------------------------===// // X86 TTI query. //===----------------------------------------------------------------------===// @@ -381,7 +381,7 @@ public: void addPreEmitPass() override; void addPreEmitPass2() override; void addPreSched2() override; - bool addPreRewrite() override; + bool addPreRewrite() override; std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; @@ -410,7 +410,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { void X86PassConfig::addIRPasses() { addPass(createAtomicExpandPass()); - addPass(createX86LowerAMXTypePass()); + addPass(createX86LowerAMXTypePass()); TargetPassConfig::addIRPasses(); @@ -449,7 +449,7 @@ bool X86PassConfig::addInstSelector() { } bool X86PassConfig::addIRTranslator() { - addPass(new IRTranslator(getOptLevel())); + addPass(new IRTranslator(getOptLevel())); return false; } @@ -496,12 +496,12 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86SpeculativeLoadHardeningPass()); addPass(createX86FlagsCopyLoweringPass()); addPass(createX86WinAllocaExpander()); - - if (getOptLevel() != CodeGenOpt::None) { - addPass(createX86PreTileConfigPass()); - } + + if (getOptLevel() != CodeGenOpt::None) { + addPass(createX86PreTileConfigPass()); + } } - + void X86PassConfig::addMachineSSAOptimization() { addPass(createX86DomainReassignmentPass()); TargetPassConfig::addMachineSSAOptimization(); @@ -574,11 +574,11 @@ void X86PassConfig::addPreEmitPass2() { addPass(createX86LoadValueInjectionRetHardeningPass()); } -bool X86PassConfig::addPreRewrite() { - addPass(createX86TileConfigPass()); - return true; -} - +bool X86PassConfig::addPreRewrite() { + addPass(createX86TileConfigPass()); + return true; +} + std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const { return getStandardCSEConfigForOpt(TM->getOptLevel()); } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h index 69d7e48b89..3c77581146 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetMachine.h @@ -54,8 +54,8 @@ public: } bool isJIT() const { return IsJIT; } - - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; }; } // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h b/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h index f4bf52c837..dbed7df9c6 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetObjectFile.h @@ -36,7 +36,7 @@ namespace llvm { MCStreamer &Streamer) const 
override; }; - /// This implementation is used for X86 ELF targets that don't + /// This implementation is used for X86 ELF targets that don't /// have a further specialization. class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF { public: diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp index 71455237fb..0741fa9ad3 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.cpp @@ -232,16 +232,16 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, bool Op2Signed = false; unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); - bool SignedMode = Op1Signed || Op2Signed; + bool SignedMode = Op1Signed || Op2Signed; unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); if (OpMinSize <= 7) return LT.first * 3; // pmullw/sext - if (!SignedMode && OpMinSize <= 8) + if (!SignedMode && OpMinSize <= 8) return LT.first * 3; // pmullw/zext if (OpMinSize <= 15) return LT.first * 5; // pmullw/pmulhw/pshuf - if (!SignedMode && OpMinSize <= 16) + if (!SignedMode && OpMinSize <= 16) return LT.first * 5; // pmullw/pmulhw/pshuf } @@ -321,11 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. - - { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence - { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence - { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence - { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence + + { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -341,11 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. - - { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence - { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence - { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence - { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence + + { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -363,15 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. - - { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. - { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. - { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence - { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence - { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. - { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. 
- { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence - { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence + + { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. + { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. + { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. + { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. + { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence }; // XOP has faster vXi8 shifts. @@ -1128,9 +1128,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 - - {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw - {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb + + {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw + {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb }; if (ST->hasBWI()) @@ -1184,13 +1184,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, - - {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq - {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq - {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd - {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps - {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq - {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd + + {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq + {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq + {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd + {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps + {TTI::SK_Select, MVT::v8i64, 1}, // vblendmq + {TTI::SK_Select, MVT::v16i32, 1}, // vblendmd }; if (ST->hasAVX512()) @@ -1396,7 +1396,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, } int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -2018,7 +2018,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind)); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind)); MVT SimpleSrcTy = SrcTy.getSimpleVT(); MVT SimpleDstTy = DstTy.getSimpleVT(); @@ -2079,18 +2079,18 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return AdjustCost(Entry->Cost); } - return AdjustCost( - BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, - I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, + I); // Legalize the type. 
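  // Descriptive note on the step below: getTypeLegalizationCost() returns the
  // legalized MVT together with a split/promotion factor; that factor
  // (LT.first) multiplies the per-instruction entries looked up in the cost
  // tables that follow.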
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -2274,7 +2274,7 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * (ExtraCost + Entry->Cost); - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } @@ -2288,9 +2288,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll - - // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not - // specialized in these tables yet. + + // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not + // specialized in these tables yet. static const CostTblEntry AVX512CDCostTbl[] = { { ISD::CTLZ, MVT::v8i64, 1 }, { ISD::CTLZ, MVT::v16i32, 1 }, @@ -2306,8 +2306,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTLZ, MVT::v16i8, 4 }, }; static const CostTblEntry AVX512BWCostTbl[] = { - { ISD::ABS, MVT::v32i16, 1 }, - { ISD::ABS, MVT::v64i8, 1 }, + { ISD::ABS, MVT::v32i16, 1 }, + { ISD::ABS, MVT::v64i8, 1 }, { ISD::BITREVERSE, MVT::v8i64, 5 }, { ISD::BITREVERSE, MVT::v16i32, 5 }, { ISD::BITREVERSE, MVT::v32i16, 5 }, @@ -2326,28 +2326,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v64i8, 9 }, { ISD::SADDSAT, MVT::v32i16, 1 }, { ISD::SADDSAT, MVT::v64i8, 1 }, - { ISD::SMAX, MVT::v32i16, 1 }, - { ISD::SMAX, MVT::v64i8, 1 }, - { ISD::SMIN, MVT::v32i16, 1 }, - { ISD::SMIN, MVT::v64i8, 1 }, + { ISD::SMAX, MVT::v32i16, 1 }, + { ISD::SMAX, MVT::v64i8, 1 }, + { ISD::SMIN, MVT::v32i16, 1 }, + { ISD::SMIN, MVT::v64i8, 1 }, { ISD::SSUBSAT, MVT::v32i16, 1 }, { ISD::SSUBSAT, MVT::v64i8, 1 }, { ISD::UADDSAT, MVT::v32i16, 1 }, { ISD::UADDSAT, MVT::v64i8, 1 }, - { ISD::UMAX, MVT::v32i16, 1 }, - { ISD::UMAX, MVT::v64i8, 1 }, - { ISD::UMIN, MVT::v32i16, 1 }, - { ISD::UMIN, MVT::v64i8, 1 }, + { ISD::UMAX, MVT::v32i16, 1 }, + { ISD::UMAX, MVT::v64i8, 1 }, + { ISD::UMIN, MVT::v32i16, 1 }, + { ISD::UMIN, MVT::v64i8, 1 }, { ISD::USUBSAT, MVT::v32i16, 1 }, { ISD::USUBSAT, MVT::v64i8, 1 }, }; static const CostTblEntry AVX512CostTbl[] = { - { ISD::ABS, MVT::v8i64, 1 }, - { ISD::ABS, MVT::v16i32, 1 }, - { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split - { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split - { ISD::ABS, MVT::v4i64, 1 }, - { ISD::ABS, MVT::v2i64, 1 }, + { ISD::ABS, MVT::v8i64, 1 }, + { ISD::ABS, MVT::v16i32, 1 }, + { ISD::ABS, MVT::v32i16, 2 }, // FIXME: include split + { ISD::ABS, MVT::v64i8, 2 }, // FIXME: include split + { ISD::ABS, MVT::v4i64, 1 }, + { ISD::ABS, MVT::v2i64, 1 }, { ISD::BITREVERSE, MVT::v8i64, 36 }, { ISD::BITREVERSE, MVT::v16i32, 24 }, { ISD::BITREVERSE, MVT::v32i16, 10 }, @@ -2364,30 +2364,30 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v16i32, 28 }, { ISD::CTTZ, MVT::v32i16, 24 }, { ISD::CTTZ, MVT::v64i8, 18 }, - { ISD::SMAX, MVT::v8i64, 1 }, - { ISD::SMAX, MVT::v16i32, 1 }, - { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split - { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split - { ISD::SMAX, MVT::v4i64, 1 }, - { ISD::SMAX, MVT::v2i64, 1 }, - { ISD::SMIN, MVT::v8i64, 1 }, - { ISD::SMIN, MVT::v16i32, 1 }, - { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split - { ISD::SMIN, MVT::v64i8, 
2 }, // FIXME: include split - { ISD::SMIN, MVT::v4i64, 1 }, - { ISD::SMIN, MVT::v2i64, 1 }, - { ISD::UMAX, MVT::v8i64, 1 }, - { ISD::UMAX, MVT::v16i32, 1 }, - { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split - { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split - { ISD::UMAX, MVT::v4i64, 1 }, - { ISD::UMAX, MVT::v2i64, 1 }, - { ISD::UMIN, MVT::v8i64, 1 }, - { ISD::UMIN, MVT::v16i32, 1 }, - { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split - { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split - { ISD::UMIN, MVT::v4i64, 1 }, - { ISD::UMIN, MVT::v2i64, 1 }, + { ISD::SMAX, MVT::v8i64, 1 }, + { ISD::SMAX, MVT::v16i32, 1 }, + { ISD::SMAX, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SMAX, MVT::v64i8, 2 }, // FIXME: include split + { ISD::SMAX, MVT::v4i64, 1 }, + { ISD::SMAX, MVT::v2i64, 1 }, + { ISD::SMIN, MVT::v8i64, 1 }, + { ISD::SMIN, MVT::v16i32, 1 }, + { ISD::SMIN, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SMIN, MVT::v64i8, 2 }, // FIXME: include split + { ISD::SMIN, MVT::v4i64, 1 }, + { ISD::SMIN, MVT::v2i64, 1 }, + { ISD::UMAX, MVT::v8i64, 1 }, + { ISD::UMAX, MVT::v16i32, 1 }, + { ISD::UMAX, MVT::v32i16, 2 }, // FIXME: include split + { ISD::UMAX, MVT::v64i8, 2 }, // FIXME: include split + { ISD::UMAX, MVT::v4i64, 1 }, + { ISD::UMAX, MVT::v2i64, 1 }, + { ISD::UMIN, MVT::v8i64, 1 }, + { ISD::UMIN, MVT::v16i32, 1 }, + { ISD::UMIN, MVT::v32i16, 2 }, // FIXME: include split + { ISD::UMIN, MVT::v64i8, 2 }, // FIXME: include split + { ISD::UMIN, MVT::v4i64, 1 }, + { ISD::UMIN, MVT::v2i64, 1 }, { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq @@ -2428,10 +2428,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::BITREVERSE, MVT::i8, 3 } }; static const CostTblEntry AVX2CostTbl[] = { - { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) - { ISD::ABS, MVT::v8i32, 1 }, - { ISD::ABS, MVT::v16i16, 1 }, - { ISD::ABS, MVT::v32i8, 1 }, + { ISD::ABS, MVT::v4i64, 2 }, // VBLENDVPD(X,VPSUBQ(0,X),X) + { ISD::ABS, MVT::v8i32, 1 }, + { ISD::ABS, MVT::v16i16, 1 }, + { ISD::ABS, MVT::v32i8, 1 }, { ISD::BITREVERSE, MVT::v4i64, 5 }, { ISD::BITREVERSE, MVT::v8i32, 5 }, { ISD::BITREVERSE, MVT::v16i16, 5 }, @@ -2453,28 +2453,28 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v32i8, 9 }, { ISD::SADDSAT, MVT::v16i16, 1 }, { ISD::SADDSAT, MVT::v32i8, 1 }, - { ISD::SMAX, MVT::v8i32, 1 }, - { ISD::SMAX, MVT::v16i16, 1 }, - { ISD::SMAX, MVT::v32i8, 1 }, - { ISD::SMIN, MVT::v8i32, 1 }, - { ISD::SMIN, MVT::v16i16, 1 }, - { ISD::SMIN, MVT::v32i8, 1 }, + { ISD::SMAX, MVT::v8i32, 1 }, + { ISD::SMAX, MVT::v16i16, 1 }, + { ISD::SMAX, MVT::v32i8, 1 }, + { ISD::SMIN, MVT::v8i32, 1 }, + { ISD::SMIN, MVT::v16i16, 1 }, + { ISD::SMIN, MVT::v32i8, 1 }, { ISD::SSUBSAT, MVT::v16i16, 1 }, { ISD::SSUBSAT, MVT::v32i8, 1 }, { ISD::UADDSAT, MVT::v16i16, 1 }, { ISD::UADDSAT, MVT::v32i8, 1 }, { ISD::UADDSAT, MVT::v8i32, 3 }, // not + pminud + paddd - { ISD::UMAX, MVT::v8i32, 1 }, - { ISD::UMAX, MVT::v16i16, 1 }, - { ISD::UMAX, MVT::v32i8, 1 }, - { ISD::UMIN, MVT::v8i32, 1 }, - { ISD::UMIN, MVT::v16i16, 1 }, - { ISD::UMIN, MVT::v32i8, 1 }, + { ISD::UMAX, MVT::v8i32, 1 }, + { ISD::UMAX, MVT::v16i16, 1 }, + { ISD::UMAX, MVT::v32i8, 1 }, + { ISD::UMIN, MVT::v8i32, 1 }, + { ISD::UMIN, MVT::v16i16, 1 }, + { ISD::UMIN, MVT::v32i8, 1 }, { ISD::USUBSAT, MVT::v16i16, 1 }, { ISD::USUBSAT, MVT::v32i8, 1 }, { ISD::USUBSAT, MVT::v8i32, 2 }, // pmaxud + psubd - { ISD::FMAXNUM, MVT::v8f32, 3 
}, // MAXPS + CMPUNORDPS + BLENDVPS - { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD + { ISD::FMAXNUM, MVT::v8f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS + { ISD::FMAXNUM, MVT::v4f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD { ISD::FSQRT, MVT::f32, 7 }, // Haswell from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/ { ISD::FSQRT, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/ @@ -2483,10 +2483,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/ }; static const CostTblEntry AVX1CostTbl[] = { - { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) - { ISD::ABS, MVT::v8i32, 3 }, - { ISD::ABS, MVT::v16i16, 3 }, - { ISD::ABS, MVT::v32i8, 3 }, + { ISD::ABS, MVT::v4i64, 5 }, // VBLENDVPD(X,VPSUBQ(0,X),X) + { ISD::ABS, MVT::v8i32, 3 }, + { ISD::ABS, MVT::v16i16, 3 }, + { ISD::ABS, MVT::v32i8, 3 }, { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert @@ -2508,32 +2508,32 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert { ISD::SADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::SADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::SMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::SSUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::SSUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::UADDSAT, MVT::v8i32, 8 }, // 2 x 128-bit Op + extract/insert - { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert - { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMAX, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMAX, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMAX, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMIN, MVT::v8i32, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMIN, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert + { ISD::UMIN, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, 
MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert - { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS - { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS - { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? - { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD - { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD - { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? + { ISD::FMAXNUM, MVT::f32, 3 }, // MAXSS + CMPUNORDSS + BLENDVPS + { ISD::FMAXNUM, MVT::v4f32, 3 }, // MAXPS + CMPUNORDPS + BLENDVPS + { ISD::FMAXNUM, MVT::v8f32, 5 }, // MAXPS + CMPUNORDPS + BLENDVPS + ? + { ISD::FMAXNUM, MVT::f64, 3 }, // MAXSD + CMPUNORDSD + BLENDVPD + { ISD::FMAXNUM, MVT::v2f64, 3 }, // MAXPD + CMPUNORDPD + BLENDVPD + { ISD::FMAXNUM, MVT::v4f64, 5 }, // MAXPD + CMPUNORDPD + BLENDVPD + ? { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ @@ -2559,21 +2559,21 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ }; - static const CostTblEntry SSE41CostTbl[] = { - { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) - { ISD::SMAX, MVT::v4i32, 1 }, - { ISD::SMAX, MVT::v16i8, 1 }, - { ISD::SMIN, MVT::v4i32, 1 }, - { ISD::SMIN, MVT::v16i8, 1 }, - { ISD::UMAX, MVT::v4i32, 1 }, - { ISD::UMAX, MVT::v8i16, 1 }, - { ISD::UMIN, MVT::v4i32, 1 }, - { ISD::UMIN, MVT::v8i16, 1 }, - }; + static const CostTblEntry SSE41CostTbl[] = { + { ISD::ABS, MVT::v2i64, 2 }, // BLENDVPD(X,PSUBQ(0,X),X) + { ISD::SMAX, MVT::v4i32, 1 }, + { ISD::SMAX, MVT::v16i8, 1 }, + { ISD::SMIN, MVT::v4i32, 1 }, + { ISD::SMIN, MVT::v16i8, 1 }, + { ISD::UMAX, MVT::v4i32, 1 }, + { ISD::UMAX, MVT::v8i16, 1 }, + { ISD::UMIN, MVT::v4i32, 1 }, + { ISD::UMIN, MVT::v8i16, 1 }, + }; static const CostTblEntry SSSE3CostTbl[] = { - { ISD::ABS, MVT::v4i32, 1 }, - { ISD::ABS, MVT::v8i16, 1 }, - { ISD::ABS, MVT::v16i8, 1 }, + { ISD::ABS, MVT::v4i32, 1 }, + { ISD::ABS, MVT::v8i16, 1 }, + { ISD::ABS, MVT::v16i8, 1 }, { ISD::BITREVERSE, MVT::v2i64, 5 }, { ISD::BITREVERSE, MVT::v4i32, 5 }, { ISD::BITREVERSE, MVT::v8i16, 5 }, @@ -2595,10 +2595,10 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v16i8, 9 } }; static const CostTblEntry SSE2CostTbl[] = { - { ISD::ABS, MVT::v2i64, 4 }, - { ISD::ABS, MVT::v4i32, 3 }, - { ISD::ABS, MVT::v8i16, 2 }, - { ISD::ABS, MVT::v16i8, 2 }, + { ISD::ABS, MVT::v2i64, 4 }, + { ISD::ABS, MVT::v4i32, 3 }, + { ISD::ABS, MVT::v8i16, 2 }, + { ISD::ABS, MVT::v16i8, 2 }, { ISD::BITREVERSE, MVT::v2i64, 29 }, { ISD::BITREVERSE, MVT::v4i32, 27 }, { ISD::BITREVERSE, MVT::v8i16, 27 }, @@ -2620,16 +2620,16 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTTZ, MVT::v16i8, 13 }, { ISD::SADDSAT, MVT::v8i16, 1 }, { ISD::SADDSAT, MVT::v16i8, 1 }, - { ISD::SMAX, MVT::v8i16, 1 }, - { ISD::SMIN, MVT::v8i16, 1 }, + { ISD::SMAX, MVT::v8i16, 1 }, + { ISD::SMIN, MVT::v8i16, 1 }, { ISD::SSUBSAT, MVT::v8i16, 1 }, { ISD::SSUBSAT, MVT::v16i8, 1 }, { ISD::UADDSAT, MVT::v8i16, 1 }, { ISD::UADDSAT, MVT::v16i8, 1 }, - { ISD::UMAX, MVT::v8i16, 2 }, - { ISD::UMAX, MVT::v16i8, 1 }, - { ISD::UMIN, MVT::v8i16, 2 }, - { ISD::UMIN, MVT::v16i8, 1 }, + { ISD::UMAX, MVT::v8i16, 2 }, + { ISD::UMAX, MVT::v16i8, 1 }, + { ISD::UMIN, MVT::v8i16, 2 }, + { ISD::UMIN, MVT::v16i8, 1 }, { 
ISD::USUBSAT, MVT::v8i16, 1 }, { ISD::USUBSAT, MVT::v16i8, 1 }, { ISD::FMAXNUM, MVT::f64, 4 }, @@ -2668,18 +2668,18 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::CTPOP, MVT::i8, 1 }, }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV + { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, 14 }, { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH { ISD::CTPOP, MVT::i64, 10 }, { ISD::SADDO, MVT::i64, 1 }, { ISD::UADDO, MVT::i64, 1 }, - { ISD::UMULO, MVT::i64, 2 }, // mulq + seto + { ISD::UMULO, MVT::i64, 2 }, // mulq + seto }; static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets - { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV - { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV + { ISD::ABS, MVT::i32, 2 }, // SUB+CMOV + { ISD::ABS, MVT::i16, 2 }, // SUB+CMOV { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, @@ -2698,9 +2698,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::UADDO, MVT::i32, 1 }, { ISD::UADDO, MVT::i16, 1 }, { ISD::UADDO, MVT::i8, 1 }, - { ISD::UMULO, MVT::i32, 2 }, // mul + seto - { ISD::UMULO, MVT::i16, 2 }, - { ISD::UMULO, MVT::i8, 2 }, + { ISD::UMULO, MVT::i32, 2 }, // mul + seto + { ISD::UMULO, MVT::i16, 2 }, + { ISD::UMULO, MVT::i8, 2 }, }; Type *RetTy = ICA.getReturnType(); @@ -2710,9 +2710,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( switch (IID) { default: break; - case Intrinsic::abs: - ISD = ISD::ABS; - break; + case Intrinsic::abs: + ISD = ISD::ABS; + break; case Intrinsic::bitreverse: ISD = ISD::BITREVERSE; break; @@ -2736,24 +2736,24 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( case Intrinsic::sadd_sat: ISD = ISD::SADDSAT; break; - case Intrinsic::smax: - ISD = ISD::SMAX; - break; - case Intrinsic::smin: - ISD = ISD::SMIN; - break; + case Intrinsic::smax: + ISD = ISD::SMAX; + break; + case Intrinsic::smin: + ISD = ISD::SMIN; + break; case Intrinsic::ssub_sat: ISD = ISD::SSUBSAT; break; case Intrinsic::uadd_sat: ISD = ISD::UADDSAT; break; - case Intrinsic::umax: - ISD = ISD::UMAX; - break; - case Intrinsic::umin: - ISD = ISD::UMIN; - break; + case Intrinsic::umax: + ISD = ISD::UMAX; + break; + case Intrinsic::umin: + ISD = ISD::UMIN; + break; case Intrinsic::usub_sat: ISD = ISD::USUBSAT; break; @@ -2772,12 +2772,12 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( ISD = ISD::UADDO; OpTy = RetTy->getContainedType(0); break; - case Intrinsic::umul_with_overflow: - case Intrinsic::smul_with_overflow: - // SMULO has same costs so don't duplicate. - ISD = ISD::UMULO; - OpTy = RetTy->getContainedType(0); - break; + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + // SMULO has same costs so don't duplicate. + ISD = ISD::UMULO; + OpTy = RetTy->getContainedType(0); + break; } if (ISD != ISD::DELETED_NODE) { @@ -2786,121 +2786,121 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( MVT MTy = LT.second; // Attempt to lookup cost. - if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && - MTy.isVector()) { - // With PSHUFB the code is very similar for all types. If we have integer - // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types - // we also need a PSHUFB. - unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; - - // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB - // instructions. We also need an extract and an insert. 
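As a rough restatement of the GFNI bitreverse costing spelled out in the comment above, the rule boils down to the following standalone sketch. This is illustrative only: the two boolean parameters stand in for the element-type and subtarget-width checks (128-bit, AVX2 256-bit, BWI 512-bit) in the real code, and the result is still scaled by the legalization factor LT.first.

// Sketch only, not the LLVM implementation.
static unsigned gfniBitreverseCost(bool IsByteElement, bool FitsInOneOp) {
  // vXi8 needs a single GF2P8AFFINEQB; wider elements also need a PSHUFB.
  unsigned Cost = IsByteElement ? 1 : 2;
  // Types the subtarget cannot handle in one op pay for two copies of the
  // sequence plus an extract/insert pair.
  if (!FitsInOneOp)
    Cost = Cost * 2 + 2;
  return Cost; // the caller multiplies this by LT.first
}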
- if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || - (ST->hasBWI() && MTy.is512BitVector()))) - Cost = Cost * 2 + 2; - - return LT.first * Cost; - } - - auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost, - FastMathFlags FMF) { - // If there are no NANs to deal with, then these are reduced to a - // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we - // assume is used in the non-fast case. - if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { - if (FMF.noNaNs()) - return LegalizationCost * 1; - } - return LegalizationCost * (int)Entry.Cost; - }; - + if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && + MTy.isVector()) { + // With PSHUFB the code is very similar for all types. If we have integer + // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types + // we also need a PSHUFB. + unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; + + // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB + // instructions. We also need an extract and an insert. + if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || + (ST->hasBWI() && MTy.is512BitVector()))) + Cost = Cost * 2 + 2; + + return LT.first * Cost; + } + + auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost, + FastMathFlags FMF) { + // If there are no NANs to deal with, then these are reduced to a + // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we + // assume is used in the non-fast case. + if (Entry.ISD == ISD::FMAXNUM || Entry.ISD == ISD::FMINNUM) { + if (FMF.noNaNs()) + return LegalizationCost * 1; + } + return LegalizationCost * (int)Entry.Cost; + }; + if (ST->useGLMDivSqrtCosts()) if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->isSLM()) if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasCDI()) if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); - - if (ST->hasSSE41()) - if (const 
auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasSSE1()) if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (ST->hasBMI()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } if (ST->hasLZCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } if (ST->hasPOPCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } // TODO - add BMI (TZCNT) scalar handling if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) - return adjustTableCost(*Entry, LT.first, ICA.getFlags()); + return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -3119,32 +3119,32 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); } else { - // In each 128-lane, if at least one index is demanded but not all - // indices are demanded and this 128-lane is not the first 128-lane of - // the legalized-vector, then this 128-lane needs a extracti128; If in - // each 128-lane, there is at least one demanded index, this 128-lane - // needs a inserti128. - - // The following cases will help you build a better understanding: - // Assume we insert several elements into a v8i32 vector in avx2, - // Case#1: inserting into 1th index needs vpinsrd + inserti128. - // Case#2: inserting into 5th index needs extracti128 + vpinsrd + - // inserti128. - // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. 
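The Case#1/2/3 examples above can be condensed into a small standalone model of the per-128-bit-lane bookkeeping. This is only a sketch: it uses std::bitset over a fixed v8i32 (two 128-bit lanes, four i32 per lane) instead of APInt/MVT, and, as in the surrounding code, the per-element vpinsrd cost is assumed to be added separately via DemandedElts.countPopulation().

#include <bitset>

// Count the extra vextracti128/vinserti128 instructions needed when
// inserting into the demanded elements of a v8i32 on AVX2 (sketch only).
static unsigned laneInsertExtractCost(const std::bitset<8> &Demanded) {
  const unsigned EltsPerLane = 4; // 128 bits / 32-bit elements
  unsigned Cost = 0;
  for (unsigned Lane = 0; Lane < 8 / EltsPerLane; ++Lane) {
    unsigned Population = 0;
    for (unsigned I = 0; I < EltsPerLane; ++I)
      Population += Demanded[Lane * EltsPerLane + I];
    // Partially demanded lane that is not the first lane: extracti128.
    if (Population > 0 && Population != EltsPerLane && Lane != 0)
      ++Cost;
    // Any lane with at least one demanded element: inserti128.
    if (Population > 0)
      ++Cost;
  }
  return Cost;
}
// Demanded = 0b00000010 (element 1)    -> 1 (insert only)      : Case#1
// Demanded = 0b00100000 (element 5)    -> 2 (extract + insert) : Case#2
// Demanded = 0b11110000 (elements 4-7) -> 1 (insert only)      : Case#3
// plus one vpinsrd per demanded element, counted separately.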
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first; - unsigned NumElts = LT.second.getVectorNumElements() * LT.first; - APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); - unsigned Scale = NumElts / Num128Lanes; - // We iterate each 128-lane, and check if we need a - // extracti128/inserti128 for this 128-lane. - for (unsigned I = 0; I < NumElts; I += Scale) { - APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale); - APInt MaskedDE = Mask & WidenedDemandedElts; - unsigned Population = MaskedDE.countPopulation(); - Cost += (Population > 0 && Population != Scale && - I % LT.second.getVectorNumElements() != 0); - Cost += Population > 0; - } + // In each 128-lane, if at least one index is demanded but not all + // indices are demanded and this 128-lane is not the first 128-lane of + // the legalized-vector, then this 128-lane needs a extracti128; If in + // each 128-lane, there is at least one demanded index, this 128-lane + // needs a inserti128. + + // The following cases will help you build a better understanding: + // Assume we insert several elements into a v8i32 vector in avx2, + // Case#1: inserting into 1th index needs vpinsrd + inserti128. + // Case#2: inserting into 5th index needs extracti128 + vpinsrd + + // inserti128. + // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. + unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first; + unsigned NumElts = LT.second.getVectorNumElements() * LT.first; + APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); + unsigned Scale = NumElts / Num128Lanes; + // We iterate each 128-lane, and check if we need a + // extracti128/inserti128 for this 128-lane. + for (unsigned I = 0; I < NumElts; I += Scale) { + APInt Mask = WidenedDemandedElts.getBitsSet(NumElts, I, I + Scale); + APInt MaskedDE = Mask & WidenedDemandedElts; + unsigned Population = MaskedDE.countPopulation(); + Cost += (Population > 0 && Population != Scale && + I % LT.second.getVectorNumElements() != 0); + Cost += Population > 0; + } Cost += DemandedElts.countPopulation(); // For vXf32 cases, insertion into the 0'th index in each v4f32 @@ -3188,10 +3188,10 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, const Instruction *I) { // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) { - if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { + if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { // Store instruction with index and scale costs 2 Uops. // Check the preceding GEP to identify non-const indices. - if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { + if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) return TTI::TCC_Basic * 2; } @@ -3270,7 +3270,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, getScalarizationOverhead(MaskTy, DemandedElts, false, true); int ScalarCompareCost = getCmpSelInstrCost( Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + CmpInst::BAD_ICMP_PREDICATE, CostKind); int BranchCost = getCFInstrCost(Instruction::Br, CostKind); int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); int ValueSplitCost = @@ -3691,10 +3691,10 @@ int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // Otherwise fall back to cmp+select. 
- return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, - CostKind) + - getCmpSelInstrCost(Instruction::Select, Ty, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, + CostKind) + + getCmpSelInstrCost(Instruction::Select, Ty, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); } int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, @@ -3923,10 +3923,10 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, return std::max(1, Cost); } -int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind, - Instruction *Inst) { +int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind, + Instruction *Inst) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -4066,28 +4066,28 @@ X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { return CostKind == TTI::TCK_RecipThroughput ? 0 : 1; } -int X86TTIImpl::getGatherOverhead() const { - // Some CPUs have more overhead for gather. The specified overhead is relative - // to the Load operation. "2" is the number provided by Intel architects. This - // parameter is used for cost estimation of Gather Op and comparison with - // other alternatives. - // TODO: Remove the explicit hasAVX512()?, That would mean we would only - // enable gather with a -march. - if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) - return 2; - - return 1024; -} - -int X86TTIImpl::getScatterOverhead() const { - if (ST->hasAVX512()) - return 2; - - return 1024; -} - -// Return an average cost of Gather / Scatter instruction, maybe improved later. -// FIXME: Add TargetCostKind support. +int X86TTIImpl::getGatherOverhead() const { + // Some CPUs have more overhead for gather. The specified overhead is relative + // to the Load operation. "2" is the number provided by Intel architects. This + // parameter is used for cost estimation of Gather Op and comparison with + // other alternatives. + // TODO: Remove the explicit hasAVX512()?, That would mean we would only + // enable gather with a -march. + if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) + return 2; + + return 1024; +} + +int X86TTIImpl::getScatterOverhead() const { + if (ST->hasAVX512()) + return 2; + + return 1024; +} + +// Return an average cost of Gather / Scatter instruction, maybe improved later. +// FIXME: Add TargetCostKind support. int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, Align Alignment, unsigned AddressSpace) { @@ -4145,8 +4145,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. const int GSOverhead = (Opcode == Instruction::Load) - ? getGatherOverhead() - : getScatterOverhead(); + ? getGatherOverhead() + : getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), MaybeAlign(Alignment), AddressSpace, TTI::TCK_RecipThroughput); @@ -4160,7 +4160,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, /// Alignment - Alignment for one element. /// AddressSpace - pointer[s] address space. /// -/// FIXME: Add TargetCostKind support. +/// FIXME: Add TargetCostKind support. 
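To make the role of the gather/scatter overhead above concrete, here is a minimal sketch of how it combines with the per-lane memory cost in the vector path. This is not the LLVM API; the boolean flags stand in for the ST->hasAVX512() and ST->hasAVX2() + fast-gather subtarget queries.

// Sketch only: flat per-instruction overhead plus one scalar load per lane.
static int approxVectorGatherCost(bool HasAVX512, bool HasAVX2FastGather,
                                  int VF, int ScalarLoadCost) {
  // 2 is the figure attributed to Intel architects above; 1024 effectively
  // steers cost-based clients away from gathers when no usable hardware exists.
  int GatherOverhead = (HasAVX512 || HasAVX2FastGather) ? 2 : 1024;
  return GatherOverhead + VF * ScalarLoadCost;
}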
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, bool VariableMask, Align Alignment, unsigned AddressSpace) { @@ -4174,9 +4174,9 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); MaskUnpackCost = getScalarizationOverhead(MaskTy, DemandedElts, false, true); - int ScalarCompareCost = getCmpSelInstrCost( - Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, + CmpInst::BAD_ICMP_PREDICATE, CostKind); int BranchCost = getCFInstrCost(Instruction::Br, CostKind); MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); } @@ -4207,15 +4207,15 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { - if (CostKind != TTI::TCK_RecipThroughput) { - if ((Opcode == Instruction::Load && - isLegalMaskedGather(SrcVTy, Align(Alignment))) || - (Opcode == Instruction::Store && - isLegalMaskedScatter(SrcVTy, Align(Alignment)))) - return 1; - return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, - Alignment, CostKind, I); - } + if (CostKind != TTI::TCK_RecipThroughput) { + if ((Opcode == Instruction::Load && + isLegalMaskedGather(SrcVTy, Align(Alignment))) || + (Opcode == Instruction::Store && + isLegalMaskedScatter(SrcVTy, Align(Alignment)))) + return 1; + return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, + Alignment, CostKind, I); + } assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); @@ -4375,7 +4375,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { // scalarize it. 
if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) { unsigned NumElts = DataVTy->getNumElements(); - if (NumElts == 1) + if (NumElts == 1) return false; } Type *ScalarTy = DataTy->getScalarType(); diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h index 17570f1c04..3ebfef5d65 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/libs/llvm12/lib/Target/X86/X86TargetTransformInfo.h @@ -22,8 +22,8 @@ namespace llvm { -class InstCombiner; - +class InstCombiner; + class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { typedef BasicTTIImplBase<X86TTIImpl> BaseT; typedef TargetTransformInfo TTI; @@ -130,10 +130,10 @@ public: int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); @@ -153,18 +153,18 @@ public: int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr); - Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, - IntrinsicInst &II) const; - Optional<Value *> - simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, - APInt DemandedMask, KnownBits &Known, - bool &KnownBitsComputed) const; - Optional<Value *> simplifyDemandedVectorEltsIntrinsic( - InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, - APInt &UndefElts2, APInt &UndefElts3, - std::function<void(Instruction *, unsigned, APInt, APInt &)> - SimplifyAndSetOp) const; - + Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + Optional<Value *> + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const; + Optional<Value *> simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function<void(Instruction *, unsigned, APInt, APInt &)> + SimplifyAndSetOp) const; + unsigned getAtomicMemIntrinsicMaxElementSize() const; int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, @@ -204,9 +204,9 @@ public: unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); - int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty, TTI::TargetCostKind CostKind, - Instruction *Inst = nullptr); + int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty, TTI::TargetCostKind CostKind, + Instruction *Inst = nullptr); int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, @@ -245,9 +245,9 @@ private: int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr, Align Alignment, unsigned AddressSpace); - int getGatherOverhead() const; - int getScatterOverhead() const; - + int getGatherOverhead() const; + int getScatterOverhead() const; + /// @} }; diff --git a/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp 
b/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp index ef010bcd38..6164a84c73 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86TileConfig.cpp @@ -1,248 +1,248 @@ -//===-- X86TileConfig.cpp - Tile Register Configure----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file Pass to config the shape of AMX physical registers -/// AMX register need to be configured before use. In X86PreTileConfig pass -/// the pldtilecfg instruction is inserted, however at that time we don't -/// know the shape of each physical tile registers, because the register -/// allocation is not done yet. This pass runs after egister allocation -/// pass. It collects the shape information of each physical tile register -/// and store the shape in the stack slot that is allocated for load config -/// to tile config register. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrBuilder.h" -#include "X86MachineFunctionInfo.h" -#include "X86RegisterInfo.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TileShapeInfo.h" -#include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "tile-config" - -namespace { - -class X86TileConfig : public MachineFunctionPass { - // context - MachineFunction *MF = nullptr; - const X86Subtarget *ST = nullptr; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - MachineDominatorTree *DomTree = nullptr; - MachineRegisterInfo *MRI = nullptr; - VirtRegMap *VRM = nullptr; - LiveIntervals *LIS = nullptr; - - MachineInstr *getTileConfigPoint(); - void tileConfig(); - -public: - X86TileConfig() : MachineFunctionPass(ID) {} - - /// Return the pass name. - StringRef getPassName() const override { return "Tile Register Configure"; } - - /// X86TileConfig analysis usage. - void getAnalysisUsage(AnalysisUsage &AU) const override; - - /// Perform register allocation. 
- bool runOnMachineFunction(MachineFunction &mf) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoPHIs); - } - - static char ID; -}; - -} // end anonymous namespace - -char X86TileConfig::ID = 0; - -INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure", - false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure", - false, false) - -void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineDominatorTree>(); - AU.addRequired<LiveIntervals>(); - AU.addPreserved<SlotIndexes>(); - AU.addRequired<VirtRegMap>(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -static unsigned getTilePhysRegIndex(Register PhysReg) { - assert((PhysReg >= X86::TMM0 && X86::TMM0 <= X86::TMM7) && - "Tile register number is invalid"); - return (PhysReg - X86::TMM0); -} - -static MachineInstr * -storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - Register SrcReg, unsigned BitSize, int FrameIdx, int Offset, - const TargetInstrInfo *TII, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) { - - unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit; - unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr; - if (BitSize == TRI->getRegSizeInBits(*RC)) - SubIdx = 0; - MachineInstr *NewMI = - addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx, - Offset) - .addReg(SrcReg, 0, SubIdx); - return NewMI; -} - -static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - int64_t Imm, unsigned BitSize, - int FrameIdx, int Offset, - const TargetInstrInfo *TII) { - unsigned Opc = (BitSize == 8) ? X86::MOV8mi : X86::MOV16mi; - return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), - FrameIdx, Offset) - .addImm(Imm); -} - -MachineInstr *X86TileConfig::getTileConfigPoint() { - for (MachineBasicBlock &MBB : *MF) { - - // Traverse the basic block. - for (MachineInstr &MI : MBB) - // Refer X86PreTileConfig.cpp. - // We only support one tile config for now. - if (MI.getOpcode() == X86::PLDTILECFG) - return &MI; - } - - return nullptr; -} - -void X86TileConfig::tileConfig() { - MachineInstr *MI = getTileConfigPoint(); - if (!MI) - return; - MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(1).getIndex(); - BitVector PhysRegs(TRI->getNumRegs()); - - // Fill in the palette first. - auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII); - LIS->InsertMachineInstrInMaps(*NewMI); - // Fill in the shape of each tile physical register. - for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - Register VirtReg = Register::index2VirtReg(i); - if (MRI->reg_nodbg_empty(VirtReg)) - continue; - const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - if (RC.getID() != X86::TILERegClassID) - continue; - Register PhysReg = VRM->getPhys(VirtReg); - if (PhysRegs.test(PhysReg)) - continue; - PhysRegs.set(PhysReg); - ShapeT Shape = VRM->getShape(VirtReg); - Register RowReg = Shape.getRow()->getReg(); - Register ColReg = Shape.getCol()->getReg(); - - // Here is the data format for the tile config. - // 0 palette - // 1 start_row - // 2-15 reserved, must be zero - // 16-17 tile0.colsb Tile 0 bytes per row. - // 18-19 tile1.colsb Tile 1 bytes per row. 
- // 20-21 tile2.colsb Tile 2 bytes per row. - // ... (sequence continues) - // 30-31 tile7.colsb Tile 7 bytes per row. - // 32-47 reserved, must be zero - // 48 tile0.rows Tile 0 rows. - // 49 tile1.rows Tile 1 rows. - // 50 tile2.rows Tile 2 rows. - // ... (sequence continues) - // 55 tile7.rows Tile 7 rows. - // 56-63 reserved, must be zero - unsigned Index = getTilePhysRegIndex(PhysReg); - int RowOffset = 48 + Index; - int ColOffset = 16 + Index * 2; - - unsigned BitSize = 8; - for (const auto &Pair : {std::make_pair(RowReg, RowOffset), - std::make_pair(ColReg, ColOffset)}) { - int64_t Imm; - int ImmCount = 0; - // All def must be the same value, otherwise it is invalid MIs. - // Immediate is prefered. - for (const MachineOperand &MO : MRI->def_operands(Pair.first)) { - const auto *Inst = MO.getParent(); - if (Inst->isMoveImmediate()) { - ImmCount++; - Imm = Inst->getOperand(1).getImm(); - break; - } - } - auto StoreConfig = [&](int Offset) { - MachineInstr *NewMI = nullptr; - if (ImmCount) - NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII); - else { - const TargetRegisterClass *RC = MRI->getRegClass(Pair.first); - NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS, - Offset, TII, RC, TRI); - } - SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI); - if (!ImmCount) { - // Extend the live interval. - SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()}; - LiveInterval &Int = LIS->getInterval(Pair.first); - LIS->extendToIndices(Int, EndPoints); - } - }; - StoreConfig(Pair.second); - BitSize += 8; - } - } -} - -bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - MRI = &mf.getRegInfo(); - ST = &mf.getSubtarget<X86Subtarget>(); - TRI = ST->getRegisterInfo(); - TII = mf.getSubtarget().getInstrInfo(); - DomTree = &getAnalysis<MachineDominatorTree>(); - VRM = &getAnalysis<VirtRegMap>(); - LIS = &getAnalysis<LiveIntervals>(); - - if (VRM->isShapeMapEmpty()) - return false; - - tileConfig(); - return true; -} - -FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); } +//===-- X86TileConfig.cpp - Tile Register Configure----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Pass to config the shape of AMX physical registers +/// AMX register need to be configured before use. In X86PreTileConfig pass +/// the pldtilecfg instruction is inserted, however at that time we don't +/// know the shape of each physical tile registers, because the register +/// allocation is not done yet. This pass runs after egister allocation +/// pass. It collects the shape information of each physical tile register +/// and store the shape in the stack slot that is allocated for load config +/// to tile config register. 
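The stack slot mentioned above holds the 64-byte operand that ldtilecfg reads; its layout is spelled out in the comments of tileConfig(). Purely for illustration (the pass writes the individual bytes straight into the frame slot rather than going through a struct), that layout can be pictured as:

#include <cstdint>

struct TileConfigLayout {           // 64 bytes total
  uint8_t  Palette;                 // byte 0
  uint8_t  StartRow;                // byte 1
  uint8_t  Reserved0[14];           // bytes 2-15, must be zero
  uint16_t ColsB[8];                // bytes 16-31, tileN.colsb (bytes per row)
  uint8_t  Reserved1[16];           // bytes 32-47, must be zero
  uint8_t  Rows[8];                 // bytes 48-55, tileN.rows
  uint8_t  Reserved2[8];            // bytes 56-63, must be zero
};
static_assert(sizeof(TileConfigLayout) == 64, "ldtilecfg operand is 64 bytes");
// Hence the offsets used in tileConfig(): RowOffset = 48 + N, ColOffset = 16 + 2*N.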
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TileShapeInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "tile-config" + +namespace { + +class X86TileConfig : public MachineFunctionPass { + // context + MachineFunction *MF = nullptr; + const X86Subtarget *ST = nullptr; + const TargetRegisterInfo *TRI; + const TargetInstrInfo *TII; + MachineDominatorTree *DomTree = nullptr; + MachineRegisterInfo *MRI = nullptr; + VirtRegMap *VRM = nullptr; + LiveIntervals *LIS = nullptr; + + MachineInstr *getTileConfigPoint(); + void tileConfig(); + +public: + X86TileConfig() : MachineFunctionPass(ID) {} + + /// Return the pass name. + StringRef getPassName() const override { return "Tile Register Configure"; } + + /// X86TileConfig analysis usage. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Perform register allocation. + bool runOnMachineFunction(MachineFunction &mf) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoPHIs); + } + + static char ID; +}; + +} // end anonymous namespace + +char X86TileConfig::ID = 0; + +INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure", + false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure", + false, false) + +void X86TileConfig::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<LiveIntervals>(); + AU.addPreserved<SlotIndexes>(); + AU.addRequired<VirtRegMap>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +static unsigned getTilePhysRegIndex(Register PhysReg) { + assert((PhysReg >= X86::TMM0 && X86::TMM0 <= X86::TMM7) && + "Tile register number is invalid"); + return (PhysReg - X86::TMM0); +} + +static MachineInstr * +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + Register SrcReg, unsigned BitSize, int FrameIdx, int Offset, + const TargetInstrInfo *TII, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + + unsigned SubIdx = (BitSize == 8) ? X86::sub_8bit : X86::sub_16bit; + unsigned Opc = (BitSize == 8) ? X86::MOV8mr : X86::MOV16mr; + if (BitSize == TRI->getRegSizeInBits(*RC)) + SubIdx = 0; + MachineInstr *NewMI = + addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), FrameIdx, + Offset) + .addReg(SrcReg, 0, SubIdx); + return NewMI; +} + +static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + int64_t Imm, unsigned BitSize, + int FrameIdx, int Offset, + const TargetInstrInfo *TII) { + unsigned Opc = (BitSize == 8) ? 
X86::MOV8mi : X86::MOV16mi; + return addFrameReference(BuildMI(MBB, MI, DebugLoc(), TII->get(Opc)), + FrameIdx, Offset) + .addImm(Imm); +} + +MachineInstr *X86TileConfig::getTileConfigPoint() { + for (MachineBasicBlock &MBB : *MF) { + + // Traverse the basic block. + for (MachineInstr &MI : MBB) + // Refer X86PreTileConfig.cpp. + // We only support one tile config for now. + if (MI.getOpcode() == X86::PLDTILECFG) + return &MI; + } + + return nullptr; +} + +void X86TileConfig::tileConfig() { + MachineInstr *MI = getTileConfigPoint(); + if (!MI) + return; + MachineBasicBlock *MBB = MI->getParent(); + int SS = MI->getOperand(1).getIndex(); + BitVector PhysRegs(TRI->getNumRegs()); + + // Fill in the palette first. + auto *NewMI = storeImmToStackSlot(*MBB, *MI, 1, 8, SS, 0, TII); + LIS->InsertMachineInstrInMaps(*NewMI); + // Fill in the shape of each tile physical register. + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + Register VirtReg = Register::index2VirtReg(i); + if (MRI->reg_nodbg_empty(VirtReg)) + continue; + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + if (RC.getID() != X86::TILERegClassID) + continue; + Register PhysReg = VRM->getPhys(VirtReg); + if (PhysRegs.test(PhysReg)) + continue; + PhysRegs.set(PhysReg); + ShapeT Shape = VRM->getShape(VirtReg); + Register RowReg = Shape.getRow()->getReg(); + Register ColReg = Shape.getCol()->getReg(); + + // Here is the data format for the tile config. + // 0 palette + // 1 start_row + // 2-15 reserved, must be zero + // 16-17 tile0.colsb Tile 0 bytes per row. + // 18-19 tile1.colsb Tile 1 bytes per row. + // 20-21 tile2.colsb Tile 2 bytes per row. + // ... (sequence continues) + // 30-31 tile7.colsb Tile 7 bytes per row. + // 32-47 reserved, must be zero + // 48 tile0.rows Tile 0 rows. + // 49 tile1.rows Tile 1 rows. + // 50 tile2.rows Tile 2 rows. + // ... (sequence continues) + // 55 tile7.rows Tile 7 rows. + // 56-63 reserved, must be zero + unsigned Index = getTilePhysRegIndex(PhysReg); + int RowOffset = 48 + Index; + int ColOffset = 16 + Index * 2; + + unsigned BitSize = 8; + for (const auto &Pair : {std::make_pair(RowReg, RowOffset), + std::make_pair(ColReg, ColOffset)}) { + int64_t Imm; + int ImmCount = 0; + // All def must be the same value, otherwise it is invalid MIs. + // Immediate is prefered. + for (const MachineOperand &MO : MRI->def_operands(Pair.first)) { + const auto *Inst = MO.getParent(); + if (Inst->isMoveImmediate()) { + ImmCount++; + Imm = Inst->getOperand(1).getImm(); + break; + } + } + auto StoreConfig = [&](int Offset) { + MachineInstr *NewMI = nullptr; + if (ImmCount) + NewMI = storeImmToStackSlot(*MBB, *MI, Imm, BitSize, SS, Offset, TII); + else { + const TargetRegisterClass *RC = MRI->getRegClass(Pair.first); + NewMI = storeRegToStackSlot(*MBB, *MI, Pair.first, BitSize, SS, + Offset, TII, RC, TRI); + } + SlotIndex SIdx = LIS->InsertMachineInstrInMaps(*NewMI); + if (!ImmCount) { + // Extend the live interval. 
+ SmallVector<SlotIndex, 8> EndPoints = {SIdx.getRegSlot()}; + LiveInterval &Int = LIS->getInterval(Pair.first); + LIS->extendToIndices(Int, EndPoints); + } + }; + StoreConfig(Pair.second); + BitSize += 8; + } + } +} + +bool X86TileConfig::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + MRI = &mf.getRegInfo(); + ST = &mf.getSubtarget<X86Subtarget>(); + TRI = ST->getRegisterInfo(); + TII = mf.getSubtarget().getInstrInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + VRM = &getAnalysis<VirtRegMap>(); + LIS = &getAnalysis<LiveIntervals>(); + + if (VRM->isShapeMapEmpty()) + return false; + + tileConfig(); + return true; +} + +FunctionPass *llvm::createX86TileConfigPass() { return new X86TileConfig(); } diff --git a/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp b/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp index 8d8bd5e6b3..0122a5e83f 100644 --- a/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp +++ b/contrib/libs/llvm12/lib/Target/X86/X86WinEHState.cpp @@ -109,7 +109,7 @@ private: /// The linked list node subobject inside of RegNode. Value *Link = nullptr; }; -} // namespace +} // namespace FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); } diff --git a/contrib/libs/llvm12/lib/Target/X86/ya.make b/contrib/libs/llvm12/lib/Target/X86/ya.make index 1df03a55e7..efb9cc5da8 100644 --- a/contrib/libs/llvm12/lib/Target/X86/ya.make +++ b/contrib/libs/llvm12/lib/Target/X86/ya.make @@ -15,26 +15,26 @@ LICENSE( LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Analysis - contrib/libs/llvm12/lib/CodeGen - contrib/libs/llvm12/lib/CodeGen/AsmPrinter - contrib/libs/llvm12/lib/CodeGen/GlobalISel - contrib/libs/llvm12/lib/CodeGen/SelectionDAG - contrib/libs/llvm12/lib/IR - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/ProfileData - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target - contrib/libs/llvm12/lib/Target/X86/MCTargetDesc - contrib/libs/llvm12/lib/Target/X86/TargetInfo - contrib/libs/llvm12/lib/Transforms/CFGuard + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Analysis + contrib/libs/llvm12/lib/CodeGen + contrib/libs/llvm12/lib/CodeGen/AsmPrinter + contrib/libs/llvm12/lib/CodeGen/GlobalISel + contrib/libs/llvm12/lib/CodeGen/SelectionDAG + contrib/libs/llvm12/lib/IR + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/ProfileData + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target + contrib/libs/llvm12/lib/Target/X86/MCTargetDesc + contrib/libs/llvm12/lib/Target/X86/TargetInfo + contrib/libs/llvm12/lib/Transforms/CFGuard ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 - contrib/libs/llvm12/lib/Target/X86 + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/X86 + contrib/libs/llvm12/lib/Target/X86 ) NO_CLANG_COVERAGE() @@ -68,7 +68,7 @@ SRCS( X86IndirectThunks.cpp X86InsertPrefetch.cpp X86InsertWait.cpp - X86InstCombineIntrinsic.cpp + X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp X86InstrInfo.cpp @@ -77,14 +77,14 @@ SRCS( X86LegalizerInfo.cpp X86LoadValueInjectionLoadHardening.cpp X86LoadValueInjectionRetHardening.cpp - X86LowerAMXType.cpp + X86LowerAMXType.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp X86MacroFusion.cpp X86OptimizeLEAs.cpp X86PadShortFunction.cpp X86PartialReduction.cpp - X86PreTileConfig.cpp + X86PreTileConfig.cpp X86RegisterBankInfo.cpp X86RegisterInfo.cpp X86SelectionDAGInfo.cpp @@ -95,7 +95,7 @@ SRCS( 
X86TargetMachine.cpp X86TargetObjectFile.cpp X86TargetTransformInfo.cpp - X86TileConfig.cpp + X86TileConfig.cpp X86VZeroUpper.cpp X86WinAllocaExpander.cpp X86WinEHState.cpp