author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch)
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/ARM
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff)
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/ARM')
72 files changed, 7233 insertions, 7233 deletions
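The hunks below span several parts of the ARM backend: the straight-line-speculation (SLS) hardening features and pass hooks in ARM.td/ARM.h, the SpeculationBarrier pseudo-instruction lowering in ARMAsmPrinter, the Cortex-M7 bank-conflict hazard recognizer, and the machine-outliner LR/stack bookkeeping in ARMBaseInstrInfo. One self-contained detail that is easy to check by hand is the Thumb-2 predication overhead computed in ARMBaseInstrInfo::extraSizeToPredicateInstructions further down: an IT instruction is two bytes and can cover at most four instructions (only one when restrictIT() is in effect), while ARM mode pays nothing because every predicable instruction already carries a condition field. The sketch below re-derives that arithmetic outside of LLVM as a minimal standalone program; the function name extraPredicationBytes and the local divideCeil helper are illustrative, and only the constants come from the hunk itself.

```cpp
// Illustrative re-derivation (not code from this commit) of the IT-block
// size overhead used by ARMBaseInstrInfo::extraSizeToPredicateInstructions.
#include <cstdio>

// Ceiling division; LLVM has llvm::divideCeil, this local copy keeps the
// sketch self-contained.
static unsigned divideCeil(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den;
}

// Extra bytes needed to predicate NumInsts instructions.
//  - ARM mode: every predicable instruction has a condition field, so 0.
//  - Thumb-2: each 2-byte IT instruction predicates up to 4 instructions,
//    or only 1 when the subtarget restricts IT blocks (restrictIT()).
static unsigned extraPredicationBytes(unsigned NumInsts, bool IsThumb2,
                                      bool RestrictIT) {
  if (!IsThumb2)
    return 0;
  unsigned MaxInsts = RestrictIT ? 1 : 4;
  return divideCeil(NumInsts, MaxInsts) * 2;
}

int main() {
  std::printf("%u\n", extraPredicationBytes(5, /*IsThumb2=*/true, false)); // 4
  std::printf("%u\n", extraPredicationBytes(5, /*IsThumb2=*/true, true));  // 10
  return 0;
}
```

Predicating five instructions therefore costs two IT instructions (4 bytes) on a normal Thumb-2 subtarget but five (10 bytes) where IT blocks are restricted, which is roughly the size trade-off weighed against keeping the branch.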
diff --git a/contrib/libs/llvm12/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/libs/llvm12/lib/Target/ARM/A15SDOptimizer.cpp index bb81233cf8..6c6f49ff6d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/A15SDOptimizer.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/A15SDOptimizer.cpp @@ -359,7 +359,7 @@ void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI, SmallVector<MachineInstr *, 8> Front; Front.push_back(MI); while (Front.size() != 0) { - MI = Front.pop_back_val(); + MI = Front.pop_back_val(); // If we have already explored this MachineInstr, ignore it. if (Reached.find(MI) != Reached.end()) diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARM.h b/contrib/libs/llvm12/lib/Target/ARM/ARM.h index f4fdc98037..2fbfabe828 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARM.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARM.h @@ -37,7 +37,7 @@ class PassRegistry; Pass *createMVETailPredicationPass(); FunctionPass *createARMLowOverheadLoopsPass(); -FunctionPass *createARMBlockPlacementPass(); +FunctionPass *createARMBlockPlacementPass(); Pass *createARMParallelDSPPass(); FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -56,8 +56,8 @@ InstructionSelector * createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI, const ARMRegisterBankInfo &RBI); Pass *createMVEGatherScatterLoweringPass(); -FunctionPass *createARMSLSHardeningPass(); -FunctionPass *createARMIndirectThunks(); +FunctionPass *createARMSLSHardeningPass(); +FunctionPass *createARMIndirectThunks(); void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); @@ -72,10 +72,10 @@ void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); void initializeMVEVPTOptimisationsPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); -void initializeARMBlockPlacementPass(PassRegistry &); +void initializeARMBlockPlacementPass(PassRegistry &); void initializeMVETailPredicationPass(PassRegistry &); void initializeMVEGatherScatterLoweringPass(PassRegistry &); -void initializeARMSLSHardeningPass(PassRegistry &); +void initializeARMSLSHardeningPass(PassRegistry &); } // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARM.td b/contrib/libs/llvm12/lib/Target/ARM/ARM.td index 3d0a0bf7f8..9540784c7f 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARM.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARM.td @@ -535,10 +535,10 @@ def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true", [HasV8_5aOps, FeatureBF16, FeatureMatMulInt8]>; -def HasV8_7aOps : SubtargetFeature<"v8.7a", "HasV8_7aOps", "true", - "Support ARM v8.7a instructions", - [HasV8_6aOps]>; - +def HasV8_7aOps : SubtargetFeature<"v8.7a", "HasV8_7aOps", "true", + "Support ARM v8.7a instructions", + [HasV8_6aOps]>; + def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", "Support ARM v8-1M Mainline instructions", @@ -563,20 +563,20 @@ foreach i = {0-7} in [HasCDEOps]>; //===----------------------------------------------------------------------===// -// Control codegen mitigation against Straight Line Speculation vulnerability. 
-//===----------------------------------------------------------------------===// - -def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", - "HardenSlsRetBr", "true", - "Harden against straight line speculation across RETurn and BranchRegister " - "instructions">; -def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", - "HardenSlsBlr", "true", - "Harden against straight line speculation across indirect calls">; - - - -//===----------------------------------------------------------------------===// +// Control codegen mitigation against Straight Line Speculation vulnerability. +//===----------------------------------------------------------------------===// + +def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", + "HardenSlsRetBr", "true", + "Harden against straight line speculation across RETurn and BranchRegister " + "instructions">; +def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", + "HardenSlsBlr", "true", + "Harden against straight line speculation across indirect calls">; + + + +//===----------------------------------------------------------------------===// // ARM Processor subtarget features. // @@ -616,14 +616,14 @@ def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", []>; def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", []>; -def ProcA78C : SubtargetFeature<"a78c", "ARMProcFamily", "CortexA78C", - "Cortex-A78C ARM processors", []>; +def ProcA78C : SubtargetFeature<"a78c", "ARMProcFamily", "CortexA78C", + "Cortex-A78C ARM processors", []>; def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", "Cortex-X1 ARM processors", []>; -def ProcV1 : SubtargetFeature<"neoverse-v1", "ARMProcFamily", - "NeoverseV1", "Neoverse-V1 ARM processors", []>; - +def ProcV1 : SubtargetFeature<"neoverse-v1", "ARMProcFamily", + "NeoverseV1", "Neoverse-V1 ARM processors", []>; + def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", "Qualcomm Krait processors", []>; def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", @@ -662,8 +662,8 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52", def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", "Cortex-M3 ARM processors", []>; -def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7", - "Cortex-M7 ARM processors", []>; +def ProcM7 : SubtargetFeature<"m7", "ARMProcFamily", "CortexM7", + "Cortex-M7 ARM processors", []>; //===----------------------------------------------------------------------===// // ARM Helper classes. 
@@ -852,19 +852,19 @@ def ARMv86a : Architecture<"armv8.6-a", "ARMv86a", [HasV8_6aOps, FeatureCRC, FeatureRAS, FeatureDotProd]>; -def ARMv87a : Architecture<"armv8.7-a", "ARMv86a", [HasV8_7aOps, - FeatureAClass, - FeatureDB, - FeatureFPARMv8, - FeatureNEON, - FeatureDSP, - FeatureTrustZone, - FeatureMP, - FeatureVirtualization, - FeatureCrypto, - FeatureCRC, - FeatureRAS, - FeatureDotProd]>; +def ARMv87a : Architecture<"armv8.7-a", "ARMv86a", [HasV8_7aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, @@ -919,14 +919,14 @@ def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>; def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "ARMRegisterInfo.td" -include "ARMRegisterBanks.td" -include "ARMCallingConv.td" +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// +include "ARMRegisterInfo.td" +include "ARMRegisterBanks.td" +include "ARMCallingConv.td" + //===----------------------------------------------------------------------===// // ARM schedules. //===----------------------------------------------------------------------===// @@ -935,25 +935,25 @@ include "ARMPredicates.td" include "ARMSchedule.td" //===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "ARMInstrInfo.td" -def ARMInstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// ARM schedules -// -include "ARMScheduleV6.td" -include "ARMScheduleA8.td" -include "ARMScheduleA9.td" -include "ARMScheduleSwift.td" -include "ARMScheduleR52.td" -include "ARMScheduleA57.td" -include "ARMScheduleM4.td" -include "ARMScheduleM7.td" - -//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "ARMInstrInfo.td" +def ARMInstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// ARM schedules +// +include "ARMScheduleV6.td" +include "ARMScheduleA8.td" +include "ARMScheduleA9.td" +include "ARMScheduleSwift.td" +include "ARMScheduleR52.td" +include "ARMScheduleA57.td" +include "ARMScheduleM4.td" +include "ARMScheduleM7.td" + +//===----------------------------------------------------------------------===// // ARM processors // // Dummy CPU, used to target architectures @@ -1193,10 +1193,10 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, FeatureUseMISched, FeatureHasNoBranchPredictor]>; -def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em, - ProcM7, - FeatureFPARMv8_D16, - FeatureUseMISched]>; +def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em, + ProcM7, + FeatureFPARMv8_D16, + FeatureUseMISched]>; def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; @@ -1310,14 +1310,14 @@ def : 
ProcNoItin<"cortex-a78", [ARMv82a, ProcA78, FeatureFullFP16, FeatureDotProd]>; -def : ProcNoItin<"cortex-a78c", [ARMv82a, ProcA78C, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureDotProd, - FeatureFullFP16]>; - +def : ProcNoItin<"cortex-a78c", [ARMv82a, ProcA78C, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureDotProd, + FeatureFullFP16]>; + def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1, FeatureHWDivThumb, FeatureHWDivARM, @@ -1326,15 +1326,15 @@ def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1, FeatureFullFP16, FeatureDotProd]>; -def : ProcNoItin<"neoverse-v1", [ARMv84a, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFullFP16, - FeatureBF16, - FeatureMatMulInt8]>; - +def : ProcNoItin<"neoverse-v1", [ARMv84a, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureBF16, + FeatureMatMulInt8]>; + def : ProcNoItin<"neoverse-n1", [ARMv82a, FeatureHWDivThumb, FeatureHWDivARM, @@ -1342,11 +1342,11 @@ def : ProcNoItin<"neoverse-n1", [ARMv82a, FeatureCRC, FeatureDotProd]>; -def : ProcNoItin<"neoverse-n2", [ARMv85a, - FeatureBF16, - FeatureMatMulInt8, - FeaturePerfMon]>; - +def : ProcNoItin<"neoverse-n2", [ARMv85a, + FeatureBF16, + FeatureMatMulInt8, + FeaturePerfMon]>; + def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMAsmPrinter.cpp index 04e21867d5..31059e5910 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMAsmPrinter.cpp @@ -285,7 +285,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, return false; case 'y': // Print a VFP single precision register as indexed double. if (MI->getOperand(OpNum).isReg()) { - MCRegister Reg = MI->getOperand(OpNum).getReg().asMCReg(); + MCRegister Reg = MI->getOperand(OpNum).getReg().asMCReg(); const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); // Find the 'd' register that has this 's' register as a sub-register, // and determine the lane number. 
@@ -903,7 +903,7 @@ void ARMAsmPrinter::emitMachineConstantPoolValue( MCSymbol *MCSym; if (ACPV->isLSDA()) { - MCSym = getMBBExceptionSym(MF->front()); + MCSym = getMBBExceptionSym(MF->front()); } else if (ACPV->isBlockAddress()) { const BlockAddress *BA = cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(); @@ -1897,7 +1897,7 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { // LSJLJEH: Register SrcReg = MI->getOperand(0).getReg(); Register ValReg = MI->getOperand(1).getReg(); - MCSymbol *Label = OutContext.createTempSymbol("SJLJEH"); + MCSymbol *Label = OutContext.createTempSymbol("SJLJEH"); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) .addReg(ValReg) @@ -2180,49 +2180,49 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { case ARM::PATCHABLE_TAIL_CALL: LowerPATCHABLE_TAIL_CALL(*MI); return; - case ARM::SpeculationBarrierISBDSBEndBB: { - // Print DSB SYS + ISB - MCInst TmpInstDSB; - TmpInstDSB.setOpcode(ARM::DSB); - TmpInstDSB.addOperand(MCOperand::createImm(0xf)); - EmitToStreamer(*OutStreamer, TmpInstDSB); - MCInst TmpInstISB; - TmpInstISB.setOpcode(ARM::ISB); - TmpInstISB.addOperand(MCOperand::createImm(0xf)); - EmitToStreamer(*OutStreamer, TmpInstISB); - return; - } - case ARM::t2SpeculationBarrierISBDSBEndBB: { - // Print DSB SYS + ISB - MCInst TmpInstDSB; - TmpInstDSB.setOpcode(ARM::t2DSB); - TmpInstDSB.addOperand(MCOperand::createImm(0xf)); - TmpInstDSB.addOperand(MCOperand::createImm(ARMCC::AL)); - TmpInstDSB.addOperand(MCOperand::createReg(0)); - EmitToStreamer(*OutStreamer, TmpInstDSB); - MCInst TmpInstISB; - TmpInstISB.setOpcode(ARM::t2ISB); - TmpInstISB.addOperand(MCOperand::createImm(0xf)); - TmpInstISB.addOperand(MCOperand::createImm(ARMCC::AL)); - TmpInstISB.addOperand(MCOperand::createReg(0)); - EmitToStreamer(*OutStreamer, TmpInstISB); - return; - } - case ARM::SpeculationBarrierSBEndBB: { - // Print SB - MCInst TmpInstSB; - TmpInstSB.setOpcode(ARM::SB); - EmitToStreamer(*OutStreamer, TmpInstSB); - return; - } - case ARM::t2SpeculationBarrierSBEndBB: { - // Print SB - MCInst TmpInstSB; - TmpInstSB.setOpcode(ARM::t2SB); - EmitToStreamer(*OutStreamer, TmpInstSB); - return; - } - } + case ARM::SpeculationBarrierISBDSBEndBB: { + // Print DSB SYS + ISB + MCInst TmpInstDSB; + TmpInstDSB.setOpcode(ARM::DSB); + TmpInstDSB.addOperand(MCOperand::createImm(0xf)); + EmitToStreamer(*OutStreamer, TmpInstDSB); + MCInst TmpInstISB; + TmpInstISB.setOpcode(ARM::ISB); + TmpInstISB.addOperand(MCOperand::createImm(0xf)); + EmitToStreamer(*OutStreamer, TmpInstISB); + return; + } + case ARM::t2SpeculationBarrierISBDSBEndBB: { + // Print DSB SYS + ISB + MCInst TmpInstDSB; + TmpInstDSB.setOpcode(ARM::t2DSB); + TmpInstDSB.addOperand(MCOperand::createImm(0xf)); + TmpInstDSB.addOperand(MCOperand::createImm(ARMCC::AL)); + TmpInstDSB.addOperand(MCOperand::createReg(0)); + EmitToStreamer(*OutStreamer, TmpInstDSB); + MCInst TmpInstISB; + TmpInstISB.setOpcode(ARM::t2ISB); + TmpInstISB.addOperand(MCOperand::createImm(0xf)); + TmpInstISB.addOperand(MCOperand::createImm(ARMCC::AL)); + TmpInstISB.addOperand(MCOperand::createReg(0)); + EmitToStreamer(*OutStreamer, TmpInstISB); + return; + } + case ARM::SpeculationBarrierSBEndBB: { + // Print SB + MCInst TmpInstSB; + TmpInstSB.setOpcode(ARM::SB); + EmitToStreamer(*OutStreamer, TmpInstSB); + return; + } + case ARM::t2SpeculationBarrierSBEndBB: { + // Print SB + MCInst TmpInstSB; + TmpInstSB.setOpcode(ARM::t2SB); + EmitToStreamer(*OutStreamer, TmpInstSB); + return; + } + } 
MCInst TmpInst; LowerARMMachineInstrToMCInst(MI, TmpInst, *this); diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.cpp index e418d53b56..d3047e1ae7 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -19,7 +19,7 @@ #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" -#include "MVETailPredUtils.h" +#include "MVETailPredUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -36,8 +36,8 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/MultiHazardRecognizer.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MultiHazardRecognizer.h" #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -134,43 +134,43 @@ ARMBaseInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); } -// Called during: -// - pre-RA scheduling -// - post-RA scheduling when FeatureUseMISched is set -ScheduleHazardRecognizer *ARMBaseInstrInfo::CreateTargetMIHazardRecognizer( - const InstrItineraryData *II, const ScheduleDAGMI *DAG) const { - MultiHazardRecognizer *MHR = new MultiHazardRecognizer(); - - // We would like to restrict this hazard recognizer to only - // post-RA scheduling; we can tell that we're post-RA because we don't - // track VRegLiveness. - // Cortex-M7: TRM indicates that there is a single ITCM bank and two DTCM - // banks banked on bit 2. Assume that TCMs are in use. - if (Subtarget.isCortexM7() && !DAG->hasVRegLiveness()) - MHR->AddHazardRecognizer( - std::make_unique<ARMBankConflictHazardRecognizer>(DAG, 0x4, true)); - - // Not inserting ARMHazardRecognizerFPMLx because that would change - // legacy behavior - - auto BHR = TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); - MHR->AddHazardRecognizer(std::unique_ptr<ScheduleHazardRecognizer>(BHR)); - return MHR; -} - -// Called during post-RA scheduling when FeatureUseMISched is not set +// Called during: +// - pre-RA scheduling +// - post-RA scheduling when FeatureUseMISched is set +ScheduleHazardRecognizer *ARMBaseInstrInfo::CreateTargetMIHazardRecognizer( + const InstrItineraryData *II, const ScheduleDAGMI *DAG) const { + MultiHazardRecognizer *MHR = new MultiHazardRecognizer(); + + // We would like to restrict this hazard recognizer to only + // post-RA scheduling; we can tell that we're post-RA because we don't + // track VRegLiveness. + // Cortex-M7: TRM indicates that there is a single ITCM bank and two DTCM + // banks banked on bit 2. Assume that TCMs are in use. 
+ if (Subtarget.isCortexM7() && !DAG->hasVRegLiveness()) + MHR->AddHazardRecognizer( + std::make_unique<ARMBankConflictHazardRecognizer>(DAG, 0x4, true)); + + // Not inserting ARMHazardRecognizerFPMLx because that would change + // legacy behavior + + auto BHR = TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); + MHR->AddHazardRecognizer(std::unique_ptr<ScheduleHazardRecognizer>(BHR)); + return MHR; +} + +// Called during post-RA scheduling when FeatureUseMISched is not set ScheduleHazardRecognizer *ARMBaseInstrInfo:: CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const { - MultiHazardRecognizer *MHR = new MultiHazardRecognizer(); - + MultiHazardRecognizer *MHR = new MultiHazardRecognizer(); + if (Subtarget.isThumb2() || Subtarget.hasVFP2Base()) - MHR->AddHazardRecognizer(std::make_unique<ARMHazardRecognizerFPMLx>()); - - auto BHR = TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); - if (BHR) - MHR->AddHazardRecognizer(std::unique_ptr<ScheduleHazardRecognizer>(BHR)); - return MHR; + MHR->AddHazardRecognizer(std::make_unique<ARMHazardRecognizerFPMLx>()); + + auto BHR = TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); + if (BHR) + MHR->AddHazardRecognizer(std::unique_ptr<ScheduleHazardRecognizer>(BHR)); + return MHR; } MachineInstr *ARMBaseInstrInfo::convertToThreeAddress( @@ -351,8 +351,8 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, TBB = nullptr; FBB = nullptr; - MachineBasicBlock::instr_iterator I = MBB.instr_end(); - if (I == MBB.instr_begin()) + MachineBasicBlock::instr_iterator I = MBB.instr_end(); + if (I == MBB.instr_begin()) return false; // Empty blocks are easy. --I; @@ -364,12 +364,12 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // out. bool CantAnalyze = false; - // Skip over DEBUG values, predicated nonterminators and speculation - // barrier terminators. - while (I->isDebugInstr() || !I->isTerminator() || - isSpeculationBarrierEndBBOpcode(I->getOpcode()) || - I->getOpcode() == ARM::t2DoLoopStartTP){ - if (I == MBB.instr_begin()) + // Skip over DEBUG values, predicated nonterminators and speculation + // barrier terminators. + while (I->isDebugInstr() || !I->isTerminator() || + isSpeculationBarrierEndBBOpcode(I->getOpcode()) || + I->getOpcode() == ARM::t2DoLoopStartTP){ + if (I == MBB.instr_begin()) return false; --I; } @@ -393,7 +393,7 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, Cond.push_back(I->getOperand(2)); } else if (I->isReturn()) { // Returns can't be analyzed, but we should run cleanup. - CantAnalyze = true; + CantAnalyze = true; } else { // We encountered other unrecognized terminator. Bail out immediately. return true; @@ -414,30 +414,30 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // unconditional branch. if (AllowModify) { MachineBasicBlock::iterator DI = std::next(I); - while (DI != MBB.instr_end()) { + while (DI != MBB.instr_end()) { MachineInstr &InstToDelete = *DI; ++DI; - // Speculation barriers must not be deleted. - if (isSpeculationBarrierEndBBOpcode(InstToDelete.getOpcode())) - continue; + // Speculation barriers must not be deleted. + if (isSpeculationBarrierEndBBOpcode(InstToDelete.getOpcode())) + continue; InstToDelete.eraseFromParent(); } } } - if (CantAnalyze) { - // We may not be able to analyze the block, but we could still have - // an unconditional branch as the last instruction in the block, which - // just branches to layout successor. 
If this is the case, then just - // remove it if we're allowed to make modifications. - if (AllowModify && !isPredicated(MBB.back()) && - isUncondBranchOpcode(MBB.back().getOpcode()) && - TBB && MBB.isLayoutSuccessor(TBB)) - removeBranch(MBB); + if (CantAnalyze) { + // We may not be able to analyze the block, but we could still have + // an unconditional branch as the last instruction in the block, which + // just branches to layout successor. If this is the case, then just + // remove it if we're allowed to make modifications. + if (AllowModify && !isPredicated(MBB.back()) && + isUncondBranchOpcode(MBB.back().getOpcode()) && + TBB && MBB.isLayoutSuccessor(TBB)) + removeBranch(MBB); return true; - } + } - if (I == MBB.instr_begin()) + if (I == MBB.instr_begin()) return false; --I; @@ -586,18 +586,18 @@ bool ARMBaseInstrInfo::PredicateInstruction( MachineOperand &PMO = MI.getOperand(PIdx); PMO.setImm(Pred[0].getImm()); MI.getOperand(PIdx+1).setReg(Pred[1].getReg()); - - // Thumb 1 arithmetic instructions do not set CPSR when executed inside an - // IT block. This affects how they are printed. - const MCInstrDesc &MCID = MI.getDesc(); - if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) { - assert(MCID.OpInfo[1].isOptionalDef() && "CPSR def isn't expected operand"); - assert((MI.getOperand(1).isDead() || - MI.getOperand(1).getReg() != ARM::CPSR) && - "if conversion tried to stop defining used CPSR"); - MI.getOperand(1).setReg(ARM::NoRegister); - } - + + // Thumb 1 arithmetic instructions do not set CPSR when executed inside an + // IT block. This affects how they are printed. + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) { + assert(MCID.OpInfo[1].isOptionalDef() && "CPSR def isn't expected operand"); + assert((MI.getOperand(1).isDead() || + MI.getOperand(1).getReg() != ARM::CPSR) && + "if conversion tried to stop defining used CPSR"); + MI.getOperand(1).setReg(ARM::NoRegister); + } + return true; } return false; @@ -629,23 +629,23 @@ bool ARMBaseInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, } } -bool ARMBaseInstrInfo::ClobbersPredicate(MachineInstr &MI, - std::vector<MachineOperand> &Pred, - bool SkipDead) const { +bool ARMBaseInstrInfo::ClobbersPredicate(MachineInstr &MI, + std::vector<MachineOperand> &Pred, + bool SkipDead) const { bool Found = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); - bool ClobbersCPSR = MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR); - bool IsCPSR = MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR; - if (ClobbersCPSR || IsCPSR) { - - // Filter out T1 instructions that have a dead CPSR, - // allowing IT blocks to be generated containing T1 instructions - const MCInstrDesc &MCID = MI.getDesc(); - if (MCID.TSFlags & ARMII::ThumbArithFlagSetting && MO.isDead() && - SkipDead) - continue; - + bool ClobbersCPSR = MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR); + bool IsCPSR = MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR; + if (ClobbersCPSR || IsCPSR) { + + // Filter out T1 instructions that have a dead CPSR, + // allowing IT blocks to be generated containing T1 instructions + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.TSFlags & ARMII::ThumbArithFlagSetting && MO.isDead() && + SkipDead) + continue; + Pred.push_back(MO); Found = true; } @@ -703,23 +703,23 @@ bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const { if (!isEligibleForITBlock(&MI)) return false; - const MachineFunction *MF = MI.getParent()->getParent(); 
+ const MachineFunction *MF = MI.getParent()->getParent(); const ARMFunctionInfo *AFI = - MF->getInfo<ARMFunctionInfo>(); + MF->getInfo<ARMFunctionInfo>(); // Neon instructions in Thumb2 IT blocks are deprecated, see ARMARM. // In their ARM encoding, they can't be encoded in a conditional form. if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) return false; - // Make indirect control flow changes unpredicable when SLS mitigation is - // enabled. - const ARMSubtarget &ST = MF->getSubtarget<ARMSubtarget>(); - if (ST.hardenSlsRetBr() && isIndirectControlFlowNotComingBack(MI)) - return false; - if (ST.hardenSlsBlr() && isIndirectCall(MI)) - return false; - + // Make indirect control flow changes unpredicable when SLS mitigation is + // enabled. + const ARMSubtarget &ST = MF->getSubtarget<ARMSubtarget>(); + if (ST.hardenSlsRetBr() && isIndirectControlFlowNotComingBack(MI)) + return false; + if (ST.hardenSlsBlr() && isIndirectCall(MI)) + return false; + if (AFI->isThumb2Function()) { if (getSubtarget().restrictIT()) return isV8EligibleForIT(&MI); @@ -802,14 +802,14 @@ unsigned ARMBaseInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { Size = alignTo(Size, 4); return Size; } - case ARM::SpeculationBarrierISBDSBEndBB: - case ARM::t2SpeculationBarrierISBDSBEndBB: - // This gets lowered to 2 4-byte instructions. - return 8; - case ARM::SpeculationBarrierSBEndBB: - case ARM::t2SpeculationBarrierSBEndBB: - // This gets lowered to 1 4-byte instructions. - return 4; + case ARM::SpeculationBarrierISBDSBEndBB: + case ARM::t2SpeculationBarrierISBDSBEndBB: + // This gets lowered to 2 4-byte instructions. + return 8; + case ARM::SpeculationBarrierSBEndBB: + case ARM::t2SpeculationBarrierSBEndBB: + // This gets lowered to 1 4-byte instructions. + return 4; } } @@ -2175,12 +2175,12 @@ ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF, // Thumb2 needs a 2-byte IT instruction to predicate up to 4 instructions. // ARM has a condition code field in every predicable instruction, using it // doesn't change code size. - if (!Subtarget.isThumb2()) - return 0; - - // It's possible that the size of the IT is restricted to a single block. - unsigned MaxInsts = Subtarget.restrictIT() ? 1 : 4; - return divideCeil(NumInsts, MaxInsts) * 2; + if (!Subtarget.isThumb2()) + return 0; + + // It's possible that the size of the IT is restricted to a single block. + unsigned MaxInsts = Subtarget.restrictIT() ? 
1 : 4; + return divideCeil(NumInsts, MaxInsts) * 2; } unsigned @@ -3417,7 +3417,7 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, case ARM::t2SUBspImm: case ARM::t2ADDri: case ARM::t2SUBri: - MRI->constrainRegClass(UseMI.getOperand(0).getReg(), TRC); + MRI->constrainRegClass(UseMI.getOperand(0).getReg(), TRC); } return true; } @@ -4838,14 +4838,14 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI, } } } - if (MI.getOpcode() == ARM::MVE_VMOV_q_rr) { - assert(MI.getOperand(4).isImm() && MI.getOperand(5).isImm()); - if ((MI.getOperand(4).getImm() != 2 && MI.getOperand(4).getImm() != 3) || - MI.getOperand(4).getImm() != MI.getOperand(5).getImm() + 2) { - ErrInfo = "Incorrect array index for MVE_VMOV_q_rr"; - return false; - } - } + if (MI.getOpcode() == ARM::MVE_VMOV_q_rr) { + assert(MI.getOperand(4).isImm() && MI.getOperand(5).isImm()); + if ((MI.getOperand(4).getImm() != 2 && MI.getOperand(4).getImm() != 3) || + MI.getOperand(4).getImm() != MI.getOperand(5).getImm() + 2) { + ErrInfo = "Incorrect array index for MVE_VMOV_q_rr"; + return false; + } + } return true; } @@ -5531,8 +5531,8 @@ unsigned llvm::ConstantMaterializationCost(unsigned Val, return ForCodesize ? 4 : 1; if (ARM_AM::isSOImmTwoPartVal(Val)) // two instrs return ForCodesize ? 8 : 2; - if (ARM_AM::isSOImmTwoPartValNeg(Val)) // two instrs - return ForCodesize ? 8 : 2; + if (ARM_AM::isSOImmTwoPartValNeg(Val)) // two instrs + return ForCodesize ? 8 : 2; } if (Subtarget->useMovt()) // MOVW + MOVT return ForCodesize ? 8 : 2; @@ -5637,32 +5637,32 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, /// | Frame overhead in Bytes | 2 | 4 | /// | Stack fixup required | No | No | /// +-------------------------+--------+-----+ -/// -/// \p MachineOutlinerDefault implies that the function should be called with -/// a save and restore of LR to the stack. -/// -/// That is, -/// -/// I1 Save LR OUTLINED_FUNCTION: -/// I2 --> BL OUTLINED_FUNCTION I1 -/// I3 Restore LR I2 -/// I3 -/// BX LR -/// -/// +-------------------------+--------+-----+ -/// | | Thumb2 | ARM | -/// +-------------------------+--------+-----+ -/// | Call overhead in Bytes | 8 | 12 | -/// | Frame overhead in Bytes | 2 | 4 | -/// | Stack fixup required | Yes | Yes | -/// +-------------------------+--------+-----+ +/// +/// \p MachineOutlinerDefault implies that the function should be called with +/// a save and restore of LR to the stack. +/// +/// That is, +/// +/// I1 Save LR OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 Restore LR I2 +/// I3 +/// BX LR +/// +/// +-------------------------+--------+-----+ +/// | | Thumb2 | ARM | +/// +-------------------------+--------+-----+ +/// | Call overhead in Bytes | 8 | 12 | +/// | Frame overhead in Bytes | 2 | 4 | +/// | Stack fixup required | Yes | Yes | +/// +-------------------------+--------+-----+ enum MachineOutlinerClass { MachineOutlinerTailCall, MachineOutlinerThunk, MachineOutlinerNoLRSave, - MachineOutlinerRegSave, - MachineOutlinerDefault + MachineOutlinerRegSave, + MachineOutlinerDefault }; enum MachineOutlinerMBBFlags { @@ -5680,9 +5680,9 @@ struct OutlinerCosts { const int FrameNoLRSave; const int CallRegSave; const int FrameRegSave; - const int CallDefault; - const int FrameDefault; - const int SaveRestoreLROnStack; + const int CallDefault; + const int FrameDefault; + const int SaveRestoreLROnStack; OutlinerCosts(const ARMSubtarget &target) : CallTailCall(target.isThumb() ? 
4 : 4), @@ -5692,10 +5692,10 @@ struct OutlinerCosts { CallNoLRSave(target.isThumb() ? 4 : 4), FrameNoLRSave(target.isThumb() ? 4 : 4), CallRegSave(target.isThumb() ? 8 : 12), - FrameRegSave(target.isThumb() ? 2 : 4), - CallDefault(target.isThumb() ? 8 : 12), - FrameDefault(target.isThumb() ? 2 : 4), - SaveRestoreLROnStack(target.isThumb() ? 8 : 8) {} + FrameRegSave(target.isThumb() ? 2 : 4), + CallDefault(target.isThumb() ? 8 : 12), + FrameDefault(target.isThumb() ? 2 : 4), + SaveRestoreLROnStack(target.isThumb() ? 8 : 8) {} }; unsigned @@ -5720,37 +5720,37 @@ ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { return 0u; } -// Compute liveness of LR at the point after the interval [I, E), which -// denotes a *backward* iteration through instructions. Used only for return -// basic blocks, which do not end with a tail call. -static bool isLRAvailable(const TargetRegisterInfo &TRI, - MachineBasicBlock::reverse_iterator I, - MachineBasicBlock::reverse_iterator E) { - // At the end of the function LR dead. - bool Live = false; - for (; I != E; ++I) { - const MachineInstr &MI = *I; - - // Check defs of LR. - if (MI.modifiesRegister(ARM::LR, &TRI)) - Live = false; - - // Check uses of LR. - unsigned Opcode = MI.getOpcode(); - if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR || - Opcode == ARM::SUBS_PC_LR || Opcode == ARM::tBX_RET || - Opcode == ARM::tBXNS_RET) { - // These instructions use LR, but it's not an (explicit or implicit) - // operand. - Live = true; - continue; - } - if (MI.readsRegister(ARM::LR, &TRI)) - Live = true; - } - return !Live; -} - +// Compute liveness of LR at the point after the interval [I, E), which +// denotes a *backward* iteration through instructions. Used only for return +// basic blocks, which do not end with a tail call. +static bool isLRAvailable(const TargetRegisterInfo &TRI, + MachineBasicBlock::reverse_iterator I, + MachineBasicBlock::reverse_iterator E) { + // At the end of the function LR dead. + bool Live = false; + for (; I != E; ++I) { + const MachineInstr &MI = *I; + + // Check defs of LR. + if (MI.modifiesRegister(ARM::LR, &TRI)) + Live = false; + + // Check uses of LR. + unsigned Opcode = MI.getOpcode(); + if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR || + Opcode == ARM::SUBS_PC_LR || Opcode == ARM::tBX_RET || + Opcode == ARM::tBXNS_RET) { + // These instructions use LR, but it's not an (explicit or implicit) + // operand. + Live = true; + continue; + } + if (MI.readsRegister(ARM::LR, &TRI)) + Live = true; + } + return !Live; +} + outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; @@ -5796,7 +5796,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( // Erase every candidate that violates the restrictions above. (It could be // true that we have viable candidates, so it's not worth bailing out in // the case that, say, 1 out of 20 candidates violate the restructions.) - llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); + llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); // If the sequence doesn't have enough candidates left, then we're done. 
if (RepeatedSequenceLocs.size() < 2) @@ -5816,8 +5816,8 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( }; OutlinerCosts Costs(Subtarget); - unsigned FrameID = MachineOutlinerDefault; - unsigned NumBytesToCreateFrame = Costs.FrameDefault; + unsigned FrameID = MachineOutlinerDefault; + unsigned NumBytesToCreateFrame = Costs.FrameDefault; // If the last instruction in any candidate is a terminator, then we should // tail call all of the candidates. @@ -5826,31 +5826,31 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( NumBytesToCreateFrame = Costs.FrameTailCall; SetCandidateCallInfo(MachineOutlinerTailCall, Costs.CallTailCall); } else if (LastInstrOpcode == ARM::BL || LastInstrOpcode == ARM::BLX || - LastInstrOpcode == ARM::BLX_noip || LastInstrOpcode == ARM::tBL || - LastInstrOpcode == ARM::tBLXr || - LastInstrOpcode == ARM::tBLXr_noip || + LastInstrOpcode == ARM::BLX_noip || LastInstrOpcode == ARM::tBL || + LastInstrOpcode == ARM::tBLXr || + LastInstrOpcode == ARM::tBLXr_noip || LastInstrOpcode == ARM::tBLXi) { FrameID = MachineOutlinerThunk; NumBytesToCreateFrame = Costs.FrameThunk; SetCandidateCallInfo(MachineOutlinerThunk, Costs.CallThunk); } else { // We need to decide how to emit calls + frames. We can always emit the same - // frame if we don't need to save to the stack. If we have to save to the - // stack, then we need a different frame. + // frame if we don't need to save to the stack. If we have to save to the + // stack, then we need a different frame. unsigned NumBytesNoStackCalls = 0; std::vector<outliner::Candidate> CandidatesWithoutStackFixups; for (outliner::Candidate &C : RepeatedSequenceLocs) { C.initLRU(TRI); - // LR liveness is overestimated in return blocks, unless they end with a - // tail call. - const auto Last = C.getMBB()->rbegin(); - const bool LRIsAvailable = - C.getMBB()->isReturnBlock() && !Last->isCall() - ? isLRAvailable(TRI, Last, - (MachineBasicBlock::reverse_iterator)C.front()) - : C.LRU.available(ARM::LR); - if (LRIsAvailable) { + // LR liveness is overestimated in return blocks, unless they end with a + // tail call. + const auto Last = C.getMBB()->rbegin(); + const bool LRIsAvailable = + C.getMBB()->isReturnBlock() && !Last->isCall() + ? isLRAvailable(TRI, Last, + (MachineBasicBlock::reverse_iterator)C.front()) + : C.LRU.available(ARM::LR); + if (LRIsAvailable) { FrameID = MachineOutlinerNoLRSave; NumBytesNoStackCalls += Costs.CallNoLRSave; C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave); @@ -5865,157 +5865,157 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( C.setCallInfo(MachineOutlinerRegSave, Costs.CallRegSave); CandidatesWithoutStackFixups.push_back(C); } - - // Is SP used in the sequence at all? If not, we don't have to modify - // the stack, so we are guaranteed to get the same frame. - else if (C.UsedInSequence.available(ARM::SP)) { - NumBytesNoStackCalls += Costs.CallDefault; - C.setCallInfo(MachineOutlinerDefault, Costs.CallDefault); - CandidatesWithoutStackFixups.push_back(C); - } - - // If we outline this, we need to modify the stack. Pretend we don't - // outline this by saving all of its bytes. - else - NumBytesNoStackCalls += SequenceSize; + + // Is SP used in the sequence at all? If not, we don't have to modify + // the stack, so we are guaranteed to get the same frame. 
+ else if (C.UsedInSequence.available(ARM::SP)) { + NumBytesNoStackCalls += Costs.CallDefault; + C.setCallInfo(MachineOutlinerDefault, Costs.CallDefault); + CandidatesWithoutStackFixups.push_back(C); + } + + // If we outline this, we need to modify the stack. Pretend we don't + // outline this by saving all of its bytes. + else + NumBytesNoStackCalls += SequenceSize; } - // If there are no places where we have to save LR, then note that we don't - // have to update the stack. Otherwise, give every candidate the default - // call type - if (NumBytesNoStackCalls <= - RepeatedSequenceLocs.size() * Costs.CallDefault) { + // If there are no places where we have to save LR, then note that we don't + // have to update the stack. Otherwise, give every candidate the default + // call type + if (NumBytesNoStackCalls <= + RepeatedSequenceLocs.size() * Costs.CallDefault) { RepeatedSequenceLocs = CandidatesWithoutStackFixups; - FrameID = MachineOutlinerNoLRSave; + FrameID = MachineOutlinerNoLRSave; } else - SetCandidateCallInfo(MachineOutlinerDefault, Costs.CallDefault); - } - - // Does every candidate's MBB contain a call? If so, then we might have a - // call in the range. - if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { - // check if the range contains a call. These require a save + restore of - // the link register. - if (std::any_of(FirstCand.front(), FirstCand.back(), - [](const MachineInstr &MI) { return MI.isCall(); })) - NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; - - // Handle the last instruction separately. If it is tail call, then the - // last instruction is a call, we don't want to save + restore in this - // case. However, it could be possible that the last instruction is a - // call without it being valid to tail call this sequence. We should - // consider this as well. - else if (FrameID != MachineOutlinerThunk && - FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) - NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; - } - + SetCandidateCallInfo(MachineOutlinerDefault, Costs.CallDefault); + } + + // Does every candidate's MBB contain a call? If so, then we might have a + // call in the range. + if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { + // check if the range contains a call. These require a save + restore of + // the link register. + if (std::any_of(FirstCand.front(), FirstCand.back(), + [](const MachineInstr &MI) { return MI.isCall(); })) + NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; + + // Handle the last instruction separately. If it is tail call, then the + // last instruction is a call, we don't want to save + restore in this + // case. However, it could be possible that the last instruction is a + // call without it being valid to tail call this sequence. We should + // consider this as well. 
+ else if (FrameID != MachineOutlinerThunk && + FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) + NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; + } + return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID); } -bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, - int64_t Fixup, - bool Updt) const { - int SPIdx = MI->findRegisterUseOperandIdx(ARM::SP); - unsigned AddrMode = (MI->getDesc().TSFlags & ARMII::AddrModeMask); - if (SPIdx < 0) - // No SP operand - return true; - else if (SPIdx != 1 && (AddrMode != ARMII::AddrModeT2_i8s4 || SPIdx != 2)) - // If SP is not the base register we can't do much - return false; - - // Stack might be involved but addressing mode doesn't handle any offset. - // Rq: AddrModeT1_[1|2|4] don't operate on SP - if (AddrMode == ARMII::AddrMode1 // Arithmetic instructions - || AddrMode == ARMII::AddrMode4 // Load/Store Multiple - || AddrMode == ARMII::AddrMode6 // Neon Load/Store Multiple - || AddrMode == ARMII::AddrModeT2_so // SP can't be used as based register - || AddrMode == ARMII::AddrModeT2_pc // PCrel access - || AddrMode == ARMII::AddrMode2 // Used by PRE and POST indexed LD/ST - || AddrMode == ARMII::AddrModeT2_i7 // v8.1-M MVE - || AddrMode == ARMII::AddrModeT2_i7s2 // v8.1-M MVE - || AddrMode == ARMII::AddrModeT2_i7s4 // v8.1-M sys regs VLDR/VSTR - || AddrMode == ARMII::AddrModeNone) - return false; - - unsigned NumOps = MI->getDesc().getNumOperands(); - unsigned ImmIdx = NumOps - 3; - - const MachineOperand &Offset = MI->getOperand(ImmIdx); - assert(Offset.isImm() && "Is not an immediate"); - int64_t OffVal = Offset.getImm(); - - if (OffVal < 0) - // Don't override data if the are below SP. - return false; - - unsigned NumBits = 0; - unsigned Scale = 1; - - switch (AddrMode) { - case ARMII::AddrMode3: - if (ARM_AM::getAM3Op(OffVal) == ARM_AM::sub) - return false; - OffVal = ARM_AM::getAM3Offset(OffVal); - NumBits = 8; - break; - case ARMII::AddrMode5: - if (ARM_AM::getAM5Op(OffVal) == ARM_AM::sub) - return false; - OffVal = ARM_AM::getAM5Offset(OffVal); - NumBits = 8; - Scale = 4; - break; - case ARMII::AddrMode5FP16: - if (ARM_AM::getAM5FP16Op(OffVal) == ARM_AM::sub) - return false; - OffVal = ARM_AM::getAM5FP16Offset(OffVal); - NumBits = 8; - Scale = 2; - break; - case ARMII::AddrModeT2_i8: - NumBits = 8; - break; - case ARMII::AddrModeT2_i8s4: - // FIXME: Values are already scaled in this addressing mode. - assert((Fixup & 3) == 0 && "Can't encode this offset!"); - NumBits = 10; - break; - case ARMII::AddrModeT2_ldrex: - NumBits = 8; - Scale = 4; - break; - case ARMII::AddrModeT2_i12: - case ARMII::AddrMode_i12: - NumBits = 12; - break; - case ARMII::AddrModeT1_s: // SP-relative LD/ST - NumBits = 8; - Scale = 4; - break; - default: - llvm_unreachable("Unsupported addressing mode!"); - } - // Make sure the offset is encodable for instructions that scale the - // immediate. 
- assert(((OffVal * Scale + Fixup) & (Scale - 1)) == 0 && - "Can't encode this offset!"); - OffVal += Fixup / Scale; - - unsigned Mask = (1 << NumBits) - 1; - - if (OffVal <= Mask) { - if (Updt) - MI->getOperand(ImmIdx).setImm(OffVal); - return true; - } - - return false; - -} - +bool ARMBaseInstrInfo::checkAndUpdateStackOffset(MachineInstr *MI, + int64_t Fixup, + bool Updt) const { + int SPIdx = MI->findRegisterUseOperandIdx(ARM::SP); + unsigned AddrMode = (MI->getDesc().TSFlags & ARMII::AddrModeMask); + if (SPIdx < 0) + // No SP operand + return true; + else if (SPIdx != 1 && (AddrMode != ARMII::AddrModeT2_i8s4 || SPIdx != 2)) + // If SP is not the base register we can't do much + return false; + + // Stack might be involved but addressing mode doesn't handle any offset. + // Rq: AddrModeT1_[1|2|4] don't operate on SP + if (AddrMode == ARMII::AddrMode1 // Arithmetic instructions + || AddrMode == ARMII::AddrMode4 // Load/Store Multiple + || AddrMode == ARMII::AddrMode6 // Neon Load/Store Multiple + || AddrMode == ARMII::AddrModeT2_so // SP can't be used as based register + || AddrMode == ARMII::AddrModeT2_pc // PCrel access + || AddrMode == ARMII::AddrMode2 // Used by PRE and POST indexed LD/ST + || AddrMode == ARMII::AddrModeT2_i7 // v8.1-M MVE + || AddrMode == ARMII::AddrModeT2_i7s2 // v8.1-M MVE + || AddrMode == ARMII::AddrModeT2_i7s4 // v8.1-M sys regs VLDR/VSTR + || AddrMode == ARMII::AddrModeNone) + return false; + + unsigned NumOps = MI->getDesc().getNumOperands(); + unsigned ImmIdx = NumOps - 3; + + const MachineOperand &Offset = MI->getOperand(ImmIdx); + assert(Offset.isImm() && "Is not an immediate"); + int64_t OffVal = Offset.getImm(); + + if (OffVal < 0) + // Don't override data if the are below SP. + return false; + + unsigned NumBits = 0; + unsigned Scale = 1; + + switch (AddrMode) { + case ARMII::AddrMode3: + if (ARM_AM::getAM3Op(OffVal) == ARM_AM::sub) + return false; + OffVal = ARM_AM::getAM3Offset(OffVal); + NumBits = 8; + break; + case ARMII::AddrMode5: + if (ARM_AM::getAM5Op(OffVal) == ARM_AM::sub) + return false; + OffVal = ARM_AM::getAM5Offset(OffVal); + NumBits = 8; + Scale = 4; + break; + case ARMII::AddrMode5FP16: + if (ARM_AM::getAM5FP16Op(OffVal) == ARM_AM::sub) + return false; + OffVal = ARM_AM::getAM5FP16Offset(OffVal); + NumBits = 8; + Scale = 2; + break; + case ARMII::AddrModeT2_i8: + NumBits = 8; + break; + case ARMII::AddrModeT2_i8s4: + // FIXME: Values are already scaled in this addressing mode. + assert((Fixup & 3) == 0 && "Can't encode this offset!"); + NumBits = 10; + break; + case ARMII::AddrModeT2_ldrex: + NumBits = 8; + Scale = 4; + break; + case ARMII::AddrModeT2_i12: + case ARMII::AddrMode_i12: + NumBits = 12; + break; + case ARMII::AddrModeT1_s: // SP-relative LD/ST + NumBits = 8; + Scale = 4; + break; + default: + llvm_unreachable("Unsupported addressing mode!"); + } + // Make sure the offset is encodable for instructions that scale the + // immediate. 
+ assert(((OffVal * Scale + Fixup) & (Scale - 1)) == 0 && + "Can't encode this offset!"); + OffVal += Fixup / Scale; + + unsigned Mask = (1 << NumBits) - 1; + + if (OffVal <= Mask) { + if (Updt) + MI->getOperand(ImmIdx).setImm(OffVal); + return true; + } + + return false; + +} + bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom( MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { const Function &F = MF.getFunction(); @@ -6075,13 +6075,13 @@ bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) Flags |= MachineOutlinerMBBFlags::HasCalls; - // LR liveness is overestimated in return blocks. - - bool LRIsAvailable = - MBB.isReturnBlock() && !MBB.back().isCall() - ? isLRAvailable(getRegisterInfo(), MBB.rbegin(), MBB.rend()) - : LRU.available(ARM::LR); - if (!LRIsAvailable) + // LR liveness is overestimated in return blocks. + + bool LRIsAvailable = + MBB.isReturnBlock() && !MBB.back().isCall() + ? isLRAvailable(getRegisterInfo(), MBB.rbegin(), MBB.rend()) + : LRU.available(ARM::LR); + if (!LRIsAvailable) Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; return true; @@ -6119,9 +6119,9 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // Be conservative with ARMv8.1 MVE instructions. if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart || - Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd || - Opc == ARM::t2LoopEndDec) + Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || + Opc == ARM::t2LoopDec || Opc == ARM::t2LoopEnd || + Opc == ARM::t2LoopEndDec) return outliner::InstrType::Illegal; const MCInstrDesc &MCID = MI.getDesc(); @@ -6155,56 +6155,56 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, return outliner::InstrType::Illegal; if (MI.isCall()) { - // Get the function associated with the call. Look at each operand and find - // the one that represents the calle and get its name. - const Function *Callee = nullptr; - for (const MachineOperand &MOP : MI.operands()) { - if (MOP.isGlobal()) { - Callee = dyn_cast<Function>(MOP.getGlobal()); - break; - } - } - - // Dont't outline calls to "mcount" like functions, in particular Linux - // kernel function tracing relies on it. - if (Callee && - (Callee->getName() == "\01__gnu_mcount_nc" || - Callee->getName() == "\01mcount" || Callee->getName() == "__mcount")) - return outliner::InstrType::Illegal; - + // Get the function associated with the call. Look at each operand and find + // the one that represents the calle and get its name. + const Function *Callee = nullptr; + for (const MachineOperand &MOP : MI.operands()) { + if (MOP.isGlobal()) { + Callee = dyn_cast<Function>(MOP.getGlobal()); + break; + } + } + + // Dont't outline calls to "mcount" like functions, in particular Linux + // kernel function tracing relies on it. + if (Callee && + (Callee->getName() == "\01__gnu_mcount_nc" || + Callee->getName() == "\01mcount" || Callee->getName() == "__mcount")) + return outliner::InstrType::Illegal; + // If we don't know anything about the callee, assume it depends on the // stack layout of the caller. In that case, it's only legal to outline // as a tail-call. Explicitly list the call instructions we know about so // we don't get unexpected results with call pseudo-instructions. 
auto UnknownCallOutlineType = outliner::InstrType::Illegal; if (Opc == ARM::BL || Opc == ARM::tBL || Opc == ARM::BLX || - Opc == ARM::BLX_noip || Opc == ARM::tBLXr || Opc == ARM::tBLXr_noip || - Opc == ARM::tBLXi) + Opc == ARM::BLX_noip || Opc == ARM::tBLXr || Opc == ARM::tBLXr_noip || + Opc == ARM::tBLXi) UnknownCallOutlineType = outliner::InstrType::LegalTerminator; - if (!Callee) - return UnknownCallOutlineType; - - // We have a function we have information about. Check if it's something we - // can safely outline. - MachineFunction *MF = MI.getParent()->getParent(); - MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); - - // We don't know what's going on with the callee at all. Don't touch it. - if (!CalleeMF) - return UnknownCallOutlineType; - - // Check if we know anything about the callee saves on the function. If we - // don't, then don't touch it, since that implies that we haven't computed - // anything about its stack frame yet. - MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); - if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || - MFI.getNumObjects() > 0) - return UnknownCallOutlineType; - - // At this point, we can say that CalleeMF ought to not pass anything on the - // stack. Therefore, we can outline it. - return outliner::InstrType::Legal; + if (!Callee) + return UnknownCallOutlineType; + + // We have a function we have information about. Check if it's something we + // can safely outline. + MachineFunction *MF = MI.getParent()->getParent(); + MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); + + // We don't know what's going on with the callee at all. Don't touch it. + if (!CalleeMF) + return UnknownCallOutlineType; + + // Check if we know anything about the callee saves on the function. If we + // don't, then don't touch it, since that implies that we haven't computed + // anything about its stack frame yet. + MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || + MFI.getNumObjects() > 0) + return UnknownCallOutlineType; + + // At this point, we can say that CalleeMF ought to not pass anything on the + // stack. Therefore, we can outline it. + return outliner::InstrType::Legal; } // Since calls are handled, don't touch LR or PC @@ -6227,19 +6227,19 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, if (!MightNeedStackFixUp) return outliner::InstrType::Legal; - // Any modification of SP will break our code to save/restore LR. - // FIXME: We could handle some instructions which add a constant offset to - // SP, with a bit more work. - if (MI.modifiesRegister(ARM::SP, TRI)) - return outliner::InstrType::Illegal; - - // At this point, we have a stack instruction that we might need to fix up. - // up. We'll handle it if it's a load or store. - if (checkAndUpdateStackOffset(&MI, Subtarget.getStackAlignment().value(), - false)) - return outliner::InstrType::Legal; - - // We can't fix it up, so don't outline it. + // Any modification of SP will break our code to save/restore LR. + // FIXME: We could handle some instructions which add a constant offset to + // SP, with a bit more work. + if (MI.modifiesRegister(ARM::SP, TRI)) + return outliner::InstrType::Illegal; + + // At this point, we have a stack instruction that we might need to fix up. + // up. We'll handle it if it's a load or store. 
+ if (checkAndUpdateStackOffset(&MI, Subtarget.getStackAlignment().value(), + false)) + return outliner::InstrType::Legal; + + // We can't fix it up, so don't outline it. return outliner::InstrType::Illegal; } @@ -6255,104 +6255,104 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, return outliner::InstrType::Legal; } -void ARMBaseInstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { - for (MachineInstr &MI : MBB) { - checkAndUpdateStackOffset(&MI, Subtarget.getStackAlignment().value(), true); - } -} - -void ARMBaseInstrInfo::saveLROnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const { - unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM; - int Align = -Subtarget.getStackAlignment().value(); - BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP) - .addReg(ARM::LR, RegState::Kill) - .addReg(ARM::SP) - .addImm(Align) - .add(predOps(ARMCC::AL)); -} - -void ARMBaseInstrInfo::emitCFIForLRSaveOnStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int Align = Subtarget.getStackAlignment().value(); - // Add a CFI saying the stack was moved down. - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Align)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameSetup); - - // Add a CFI saying that the LR that we want to find is now higher than - // before. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfLR, -Align)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameSetup); -} - -void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It, - Register Reg) const { - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - - int64_t LRPosEntry = MF.addFrameInst( - MCCFIInstruction::createRegister(nullptr, DwarfLR, DwarfReg)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameSetup); -} - -void ARMBaseInstrInfo::restoreLRFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - unsigned Opc = Subtarget.isThumb() ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; - MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR) - .addReg(ARM::SP, RegState::Define) - .addReg(ARM::SP); - if (!Subtarget.isThumb()) - MIB.addReg(0); - MIB.addImm(Subtarget.getStackAlignment().value()).add(predOps(ARMCC::AL)); -} - -void ARMBaseInstrInfo::emitCFIForLRRestoreFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - // Now stack has moved back up... - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); - - // ... and we have restored LR. 
- int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); -} - -void ARMBaseInstrInfo::emitCFIForLRRestoreFromReg( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); -} - +void ARMBaseInstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { + for (MachineInstr &MI : MBB) { + checkAndUpdateStackOffset(&MI, Subtarget.getStackAlignment().value(), true); + } +} + +void ARMBaseInstrInfo::saveLROnStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const { + unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM; + int Align = -Subtarget.getStackAlignment().value(); + BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP) + .addReg(ARM::LR, RegState::Kill) + .addReg(ARM::SP) + .addImm(Align) + .add(predOps(ARMCC::AL)); +} + +void ARMBaseInstrInfo::emitCFIForLRSaveOnStack( + MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + int Align = Subtarget.getStackAlignment().value(); + // Add a CFI saying the stack was moved down. + int64_t StackPosEntry = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Align)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + + // Add a CFI saying that the LR that we want to find is now higher than + // before. + int64_t LRPosEntry = + MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfLR, -Align)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameSetup); +} + +void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It, + Register Reg) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + + int64_t LRPosEntry = MF.addFrameInst( + MCCFIInstruction::createRegister(nullptr, DwarfLR, DwarfReg)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameSetup); +} + +void ARMBaseInstrInfo::restoreLRFromStack( + MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { + unsigned Opc = Subtarget.isThumb() ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; + MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP); + if (!Subtarget.isThumb()) + MIB.addReg(0); + MIB.addImm(Subtarget.getStackAlignment().value()).add(predOps(ARMCC::AL)); +} + +void ARMBaseInstrInfo::emitCFIForLRRestoreFromStack( + MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { + // Now stack has moved back up... 
+ MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + int64_t StackPosEntry = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); + + // ... and we have restored LR. + int64_t LRPosEntry = + MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); +} + +void ARMBaseInstrInfo::emitCFIForLRRestoreFromReg( + MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + + int64_t LRPosEntry = + MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); +} + void ARMBaseInstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { @@ -6374,57 +6374,57 @@ void ARMBaseInstrInfo::buildOutlinedFrame( Call->eraseFromParent(); } - // Is there a call in the outlined range? - auto IsNonTailCall = [](MachineInstr &MI) { - return MI.isCall() && !MI.isReturn(); - }; - if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { - MachineBasicBlock::iterator It = MBB.begin(); - MachineBasicBlock::iterator Et = MBB.end(); - - if (OF.FrameConstructionID == MachineOutlinerTailCall || - OF.FrameConstructionID == MachineOutlinerThunk) - Et = std::prev(MBB.end()); - - // We have to save and restore LR, we need to add it to the liveins if it - // is not already part of the set. This is suffient since outlined - // functions only have one block. - if (!MBB.isLiveIn(ARM::LR)) - MBB.addLiveIn(ARM::LR); - - // Insert a save before the outlined region - saveLROnStack(MBB, It); - emitCFIForLRSaveOnStack(MBB, It); - - // Fix up the instructions in the range, since we're going to modify the - // stack. - assert(OF.FrameConstructionID != MachineOutlinerDefault && - "Can only fix up stack references once"); - fixupPostOutline(MBB); - - // Insert a restore before the terminator for the function. Restore LR. - restoreLRFromStack(MBB, Et); - emitCFIForLRRestoreFromStack(MBB, Et); - } - - // If this is a tail call outlined function, then there's already a return. - if (OF.FrameConstructionID == MachineOutlinerTailCall || - OF.FrameConstructionID == MachineOutlinerThunk) - return; - + // Is there a call in the outlined range? + auto IsNonTailCall = [](MachineInstr &MI) { + return MI.isCall() && !MI.isReturn(); + }; + if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { + MachineBasicBlock::iterator It = MBB.begin(); + MachineBasicBlock::iterator Et = MBB.end(); + + if (OF.FrameConstructionID == MachineOutlinerTailCall || + OF.FrameConstructionID == MachineOutlinerThunk) + Et = std::prev(MBB.end()); + + // We have to save and restore LR, we need to add it to the liveins if it + // is not already part of the set. This is suffient since outlined + // functions only have one block. 
+ if (!MBB.isLiveIn(ARM::LR)) + MBB.addLiveIn(ARM::LR); + + // Insert a save before the outlined region + saveLROnStack(MBB, It); + emitCFIForLRSaveOnStack(MBB, It); + + // Fix up the instructions in the range, since we're going to modify the + // stack. + assert(OF.FrameConstructionID != MachineOutlinerDefault && + "Can only fix up stack references once"); + fixupPostOutline(MBB); + + // Insert a restore before the terminator for the function. Restore LR. + restoreLRFromStack(MBB, Et); + emitCFIForLRRestoreFromStack(MBB, Et); + } + + // If this is a tail call outlined function, then there's already a return. + if (OF.FrameConstructionID == MachineOutlinerTailCall || + OF.FrameConstructionID == MachineOutlinerThunk) + return; + // Here we have to insert the return ourselves. Get the correct opcode from // current feature set. BuildMI(MBB, MBB.end(), DebugLoc(), get(Subtarget.getReturnOpcode())) .add(predOps(ARMCC::AL)); - - // Did we have to modify the stack by saving the link register? - if (OF.FrameConstructionID != MachineOutlinerDefault && - OF.Candidates[0].CallConstructionID != MachineOutlinerDefault) - return; - - // We modified the stack. - // Walk over the basic block and fix up all the stack accesses. - fixupPostOutline(MBB); + + // Did we have to modify the stack by saving the link register? + if (OF.FrameConstructionID != MachineOutlinerDefault && + OF.Candidates[0].CallConstructionID != MachineOutlinerDefault) + return; + + // We modified the stack. + // Walk over the basic block and fix up all the stack accesses. + fixupPostOutline(MBB); } MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( @@ -6456,14 +6456,14 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( CallMIB.add(predOps(ARMCC::AL)); CallMIB.addGlobalAddress(M.getNamedValue(MF.getName())); - if (C.CallConstructionID == MachineOutlinerNoLRSave || - C.CallConstructionID == MachineOutlinerThunk) { - // No, so just insert the call. - It = MBB.insert(It, CallMIB); - return It; - } - - const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); + if (C.CallConstructionID == MachineOutlinerNoLRSave || + C.CallConstructionID == MachineOutlinerThunk) { + // No, so just insert the call. + It = MBB.insert(It, CallMIB); + return It; + } + + const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); // Can we save to a register? if (C.CallConstructionID == MachineOutlinerRegSave) { unsigned Reg = findRegisterToSaveLRTo(C); @@ -6471,55 +6471,55 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( // Save and restore LR from that register. copyPhysReg(MBB, It, DebugLoc(), Reg, ARM::LR, true); - if (!AFI.isLRSpilled()) - emitCFIForLRSaveToReg(MBB, It, Reg); + if (!AFI.isLRSpilled()) + emitCFIForLRSaveToReg(MBB, It, Reg); CallPt = MBB.insert(It, CallMIB); copyPhysReg(MBB, It, DebugLoc(), ARM::LR, Reg, true); - if (!AFI.isLRSpilled()) - emitCFIForLRRestoreFromReg(MBB, It); + if (!AFI.isLRSpilled()) + emitCFIForLRRestoreFromReg(MBB, It); It--; return CallPt; } - // We have the default case. Save and restore from SP. 
- if (!MBB.isLiveIn(ARM::LR)) - MBB.addLiveIn(ARM::LR); - saveLROnStack(MBB, It); - if (!AFI.isLRSpilled()) - emitCFIForLRSaveOnStack(MBB, It); - CallPt = MBB.insert(It, CallMIB); - restoreLRFromStack(MBB, It); - if (!AFI.isLRSpilled()) - emitCFIForLRRestoreFromStack(MBB, It); - It--; - return CallPt; -} - -bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( - MachineFunction &MF) const { - return Subtarget.isMClass() && MF.getFunction().hasMinSize(); -} - -bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const { - // Try hard to rematerialize any VCTPs because if we spill P0, it will block - // the tail predication conversion. This means that the element count - // register has to be live for longer, but that has to be better than - // spill/restore and VPT predication. - return isVCTP(&MI) && !isPredicated(MI); -} - -unsigned llvm::getBLXOpcode(const MachineFunction &MF) { - return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_noip - : ARM::BLX; -} - -unsigned llvm::gettBLXrOpcode(const MachineFunction &MF) { - return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::tBLXr_noip - : ARM::tBLXr; -} - -unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) { - return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_pred_noip - : ARM::BLX_pred; -} - + // We have the default case. Save and restore from SP. + if (!MBB.isLiveIn(ARM::LR)) + MBB.addLiveIn(ARM::LR); + saveLROnStack(MBB, It); + if (!AFI.isLRSpilled()) + emitCFIForLRSaveOnStack(MBB, It); + CallPt = MBB.insert(It, CallMIB); + restoreLRFromStack(MBB, It); + if (!AFI.isLRSpilled()) + emitCFIForLRRestoreFromStack(MBB, It); + It--; + return CallPt; +} + +bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( + MachineFunction &MF) const { + return Subtarget.isMClass() && MF.getFunction().hasMinSize(); +} + +bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) const { + // Try hard to rematerialize any VCTPs because if we spill P0, it will block + // the tail predication conversion. This means that the element count + // register has to be live for longer, but that has to be better than + // spill/restore and VPT predication. + return isVCTP(&MI) && !isPredicated(MI); +} + +unsigned llvm::getBLXOpcode(const MachineFunction &MF) { + return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_noip + : ARM::BLX; +} + +unsigned llvm::gettBLXrOpcode(const MachineFunction &MF) { + return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::tBLXr_noip + : ARM::tBLXr; +} + +unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) { + return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? 
ARM::BLX_pred_noip + : ARM::BLX_pred; +} + diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.h index 1b843c4281..e61d557c1d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseInstrInfo.h @@ -132,10 +132,10 @@ public: const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * - CreateTargetMIHazardRecognizer(const InstrItineraryData *II, - const ScheduleDAGMI *DAG) const override; - - ScheduleHazardRecognizer * + CreateTargetMIHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAGMI *DAG) const override; + + ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; @@ -175,8 +175,8 @@ public: bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, ArrayRef<MachineOperand> Pred2) const override; - bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred, - bool SkipDead) const override; + bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred, + bool SkipDead) const override; bool isPredicable(const MachineInstr &MI) const override; @@ -361,60 +361,60 @@ public: MachineBasicBlock::iterator &It, MachineFunction &MF, const outliner::Candidate &C) const override; - /// Enable outlining by default at -Oz. - bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; - - bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { - return MI->getOpcode() == ARM::t2LoopEndDec || - MI->getOpcode() == ARM::t2DoLoopStartTP; - } - + /// Enable outlining by default at -Oz. + bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; + + bool isUnspillableTerminatorImpl(const MachineInstr *MI) const override { + return MI->getOpcode() == ARM::t2LoopEndDec || + MI->getOpcode() == ARM::t2DoLoopStartTP; + } + private: /// Returns an unused general-purpose register which can be used for /// constructing an outlined call if one exists. Returns 0 otherwise. unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; - // Adds an instruction which saves the link register on top of the stack into - /// the MachineBasicBlock \p MBB at position \p It. - void saveLROnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Adds an instruction which restores the link register from the top the - /// stack into the MachineBasicBlock \p MBB at position \p It. - void restoreLRFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// for the case when the LR is saved on the stack. - void emitCFIForLRSaveOnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// for the case when the LR is saved in the register \p Reg. - void emitCFIForLRSaveToReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It, - Register Reg) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// after the LR is was restored from the stack. - void emitCFIForLRRestoreFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// after the LR is was restored from a register. 
- void emitCFIForLRRestoreFromReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - /// \brief Sets the offsets on outlined instructions in \p MBB which use SP - /// so that they will be valid post-outlining. - /// - /// \param MBB A \p MachineBasicBlock in an outlined function. - void fixupPostOutline(MachineBasicBlock &MBB) const; - - /// Returns true if the machine instruction offset can handle the stack fixup - /// and updates it if requested. - bool checkAndUpdateStackOffset(MachineInstr *MI, int64_t Fixup, - bool Updt) const; - + // Adds an instruction which saves the link register on top of the stack into + /// the MachineBasicBlock \p MBB at position \p It. + void saveLROnStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + + /// Adds an instruction which restores the link register from the top the + /// stack into the MachineBasicBlock \p MBB at position \p It. + void restoreLRFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + + /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, + /// for the case when the LR is saved on the stack. + void emitCFIForLRSaveOnStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + + /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, + /// for the case when the LR is saved in the register \p Reg. + void emitCFIForLRSaveToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It, + Register Reg) const; + + /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, + /// after the LR is was restored from the stack. + void emitCFIForLRRestoreFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + + /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, + /// after the LR is was restored from a register. + void emitCFIForLRRestoreFromReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It) const; + /// \brief Sets the offsets on outlined instructions in \p MBB which use SP + /// so that they will be valid post-outlining. + /// + /// \param MBB A \p MachineBasicBlock in an outlined function. + void fixupPostOutline(MachineBasicBlock &MBB) const; + + /// Returns true if the machine instruction offset can handle the stack fixup + /// and updates it if requested. + bool checkAndUpdateStackOffset(MachineInstr *MI, int64_t Fixup, + bool Updt) const; + unsigned getInstBundleLength(const MachineInstr &MI) const; int getVLDMDefCycle(const InstrItineraryData *ItinData, @@ -477,9 +477,9 @@ private: MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AAResults *AA) const override; - + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) const override; + private: /// Modeling special VFP / NEON fp MLA / MLS hazards. 
@@ -644,77 +644,77 @@ static inline bool isJumpTableBranchOpcode(int Opc) { Opc == ARM::t2BR_JT; } -static inline bool isLowOverheadTerminatorOpcode(int Opc) { - return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || - Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec; -} - +static inline bool isLowOverheadTerminatorOpcode(int Opc) { + return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart || + Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec; +} + static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; } -static inline bool isIndirectCall(const MachineInstr &MI) { - int Opc = MI.getOpcode(); - switch (Opc) { - // indirect calls: - case ARM::BLX: - case ARM::BLX_noip: - case ARM::BLX_pred: - case ARM::BLX_pred_noip: - case ARM::BX_CALL: - case ARM::BMOVPCRX_CALL: - case ARM::TCRETURNri: - case ARM::TAILJMPr: - case ARM::TAILJMPr4: - case ARM::tBLXr: - case ARM::tBLXr_noip: - case ARM::tBLXNSr: - case ARM::tBLXNS_CALL: - case ARM::tBX_CALL: - case ARM::tTAILJMPr: - assert(MI.isCall(MachineInstr::IgnoreBundle)); - return true; - // direct calls: - case ARM::BL: - case ARM::BL_pred: - case ARM::BMOVPCB_CALL: - case ARM::BL_PUSHLR: - case ARM::BLXi: - case ARM::TCRETURNdi: - case ARM::TAILJMPd: - case ARM::SVC: - case ARM::HVC: - case ARM::TPsoft: - case ARM::tTAILJMPd: - case ARM::t2SMC: - case ARM::t2HVC: - case ARM::tBL: - case ARM::tBLXi: - case ARM::tBL_PUSHLR: - case ARM::tTAILJMPdND: - case ARM::tSVC: - case ARM::tTPsoft: - assert(MI.isCall(MachineInstr::IgnoreBundle)); - return false; - } - assert(!MI.isCall(MachineInstr::IgnoreBundle)); - return false; -} - -static inline bool isIndirectControlFlowNotComingBack(const MachineInstr &MI) { - int opc = MI.getOpcode(); - return MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode()) || - isJumpTableBranchOpcode(opc); -} - -static inline bool isSpeculationBarrierEndBBOpcode(int Opc) { - return Opc == ARM::SpeculationBarrierISBDSBEndBB || - Opc == ARM::SpeculationBarrierSBEndBB || - Opc == ARM::t2SpeculationBarrierISBDSBEndBB || - Opc == ARM::t2SpeculationBarrierSBEndBB; -} - +static inline bool isIndirectCall(const MachineInstr &MI) { + int Opc = MI.getOpcode(); + switch (Opc) { + // indirect calls: + case ARM::BLX: + case ARM::BLX_noip: + case ARM::BLX_pred: + case ARM::BLX_pred_noip: + case ARM::BX_CALL: + case ARM::BMOVPCRX_CALL: + case ARM::TCRETURNri: + case ARM::TAILJMPr: + case ARM::TAILJMPr4: + case ARM::tBLXr: + case ARM::tBLXr_noip: + case ARM::tBLXNSr: + case ARM::tBLXNS_CALL: + case ARM::tBX_CALL: + case ARM::tTAILJMPr: + assert(MI.isCall(MachineInstr::IgnoreBundle)); + return true; + // direct calls: + case ARM::BL: + case ARM::BL_pred: + case ARM::BMOVPCB_CALL: + case ARM::BL_PUSHLR: + case ARM::BLXi: + case ARM::TCRETURNdi: + case ARM::TAILJMPd: + case ARM::SVC: + case ARM::HVC: + case ARM::TPsoft: + case ARM::tTAILJMPd: + case ARM::t2SMC: + case ARM::t2HVC: + case ARM::tBL: + case ARM::tBLXi: + case ARM::tBL_PUSHLR: + case ARM::tTAILJMPdND: + case ARM::tSVC: + case ARM::tTPsoft: + assert(MI.isCall(MachineInstr::IgnoreBundle)); + return false; + } + assert(!MI.isCall(MachineInstr::IgnoreBundle)); + return false; +} + +static inline bool isIndirectControlFlowNotComingBack(const MachineInstr &MI) { + int opc = MI.getOpcode(); + return MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode()) || + isJumpTableBranchOpcode(opc); +} + +static inline bool isSpeculationBarrierEndBBOpcode(int Opc) { + return Opc == 
ARM::SpeculationBarrierISBDSBEndBB || + Opc == ARM::SpeculationBarrierSBEndBB || + Opc == ARM::t2SpeculationBarrierISBDSBEndBB || + Opc == ARM::t2SpeculationBarrierSBEndBB; +} + static inline bool isPopOpcode(int Opc) { return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET || Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD || @@ -886,17 +886,17 @@ inline bool isLegalAddressImm(unsigned Opcode, int Imm, return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0; case ARMII::AddrModeT2_i7s4: return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0; - case ARMII::AddrModeT2_i8: - return std::abs(Imm) < (((1 << 8) * 1) - 1); - case ARMII::AddrModeT2_i12: - return Imm >= 0 && Imm < (((1 << 12) * 1) - 1); + case ARMII::AddrModeT2_i8: + return std::abs(Imm) < (((1 << 8) * 1) - 1); + case ARMII::AddrModeT2_i12: + return Imm >= 0 && Imm < (((1 << 12) * 1) - 1); default: llvm_unreachable("Unhandled Addressing mode"); } } -// Return true if the given intrinsic is a gather -inline bool isGather(IntrinsicInst *IntInst) { +// Return true if the given intrinsic is a gather +inline bool isGather(IntrinsicInst *IntInst) { if (IntInst == nullptr) return false; unsigned IntrinsicID = IntInst->getIntrinsicID(); @@ -906,15 +906,15 @@ inline bool isGather(IntrinsicInst *IntInst) { IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb || IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated || IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated); -} - -// Return true if the given intrinsic is a scatter -inline bool isScatter(IntrinsicInst *IntInst) { - if (IntInst == nullptr) - return false; - unsigned IntrinsicID = IntInst->getIntrinsicID(); - return (IntrinsicID == Intrinsic::masked_scatter || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated); +} + +// Return true if the given intrinsic is a scatter +inline bool isScatter(IntrinsicInst *IntInst) { + if (IntInst == nullptr) + return false; + unsigned IntrinsicID = IntInst->getIntrinsicID(); + return (IntrinsicID == Intrinsic::masked_scatter || IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base || IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated || IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb || @@ -923,17 +923,17 @@ inline bool isScatter(IntrinsicInst *IntInst) { IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated); } -// Return true if the given intrinsic is a gather or scatter -inline bool isGatherScatter(IntrinsicInst *IntInst) { - if (IntInst == nullptr) - return false; - return isGather(IntInst) || isScatter(IntInst); -} - -unsigned getBLXOpcode(const MachineFunction &MF); -unsigned gettBLXrOpcode(const MachineFunction &MF); -unsigned getBLXpredOpcode(const MachineFunction &MF); - +// Return true if the given intrinsic is a gather or scatter +inline bool isGatherScatter(IntrinsicInst *IntInst) { + if (IntInst == nullptr) + return false; + return isGather(IntInst) || isScatter(IntInst); +} + +unsigned getBLXOpcode(const MachineFunction &MF); +unsigned gettBLXrOpcode(const MachineFunction &MF); +unsigned getBLXpredOpcode(const MachineFunction &MF); + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 1a264dabee..138431e36d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ 
b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -55,9 +55,9 @@ using namespace llvm; ARMBaseRegisterInfo::ARMBaseRegisterInfo() - : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) { - ARM_MC::initLLVMToCVRegMapping(this); -} + : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) { + ARM_MC::initLLVMToCVRegMapping(this); +} static unsigned getFramePointerReg(const ARMSubtarget &STI) { return STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11; @@ -330,13 +330,13 @@ bool ARMBaseRegisterInfo::getRegAllocationHints( case ARMRI::RegPairOdd: Odd = 1; break; - case ARMRI::RegLR: + case ARMRI::RegLR: TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); - if (MRI.getRegClass(VirtReg)->contains(ARM::LR)) - Hints.push_back(ARM::LR); + if (MRI.getRegClass(VirtReg)->contains(ARM::LR)) + Hints.push_back(ARM::LR); return false; - default: - return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); + default: + return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM); } // This register should preferably be even (Odd == 0) or odd (Odd == 1). @@ -640,10 +640,10 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { /// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to /// be a pointer to FrameIdx at the beginning of the basic block. -Register -ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - int FrameIdx, - int64_t Offset) const { +Register +ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + int FrameIdx, + int64_t Offset) const { ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : (AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri); @@ -657,7 +657,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); - Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass); + Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass); MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg) @@ -665,8 +665,8 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, if (!AFI->isThumb1OnlyFunction()) MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); - - return BaseReg; + + return BaseReg; } void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.h index 5afb6c6aa0..53e8aa657c 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -32,11 +32,11 @@ class LiveIntervals; namespace ARMRI { enum { - // Used for LDRD register pairs + // Used for LDRD register pairs RegPairOdd = 1, - RegPairEven = 2, - // Used to hint for lr in t2DoLoopStart - RegLR = 3 + RegPairEven = 2, + // Used to hint for lr in t2DoLoopStart + RegLR = 3 }; } // end namespace ARMRI @@ -168,8 +168,8 @@ public: int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, - int64_t Offset) const override; + Register 
materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, + int64_t Offset) const override; void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMBlockPlacement.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMBlockPlacement.cpp index 9ba16003a9..2cc6a5b4c1 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMBlockPlacement.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMBlockPlacement.cpp @@ -1,228 +1,228 @@ -//===-- ARMBlockPlacement.cpp - ARM block placement pass ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass re-arranges machine basic blocks to suit target requirements. -// Currently it only moves blocks to fix backwards WLS branches. -// -//===----------------------------------------------------------------------===// - -#include "ARM.h" -#include "ARMBaseInstrInfo.h" -#include "ARMBasicBlockInfo.h" -#include "ARMSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm-block-placement" -#define DEBUG_PREFIX "ARM Block Placement: " - -namespace llvm { -class ARMBlockPlacement : public MachineFunctionPass { -private: - const ARMBaseInstrInfo *TII; - std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; - MachineLoopInfo *MLI = nullptr; - -public: - static char ID; - ARMBlockPlacement() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After); - bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other); - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<MachineLoopInfo>(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // namespace llvm - -FunctionPass *llvm::createARMBlockPlacementPass() { - return new ARMBlockPlacement(); -} - -char ARMBlockPlacement::ID = 0; - -INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false, - false) - -bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(MF.getSubtarget()); - if (!ST.hasLOB()) - return false; - LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n"); - MLI = &getAnalysis<MachineLoopInfo>(); - TII = static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo()); - BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); - MF.RenumberBlocks(); - BBUtils->computeAllBlockSizes(); - BBUtils->adjustBBOffsetsAfter(&MF.front()); - bool Changed = false; - - // Find loops with a backwards branching WLS. - // This requires looping over the loops in the function, checking each - // preheader for a WLS and if its target is before the preheader. If moving - // the target block wouldn't produce another backwards WLS or a new forwards - // LE branch then move the target block after the preheader. 
- for (auto *ML : *MLI) { - MachineBasicBlock *Preheader = ML->getLoopPredecessor(); - if (!Preheader) - continue; - - for (auto &Terminator : Preheader->terminators()) { - if (Terminator.getOpcode() != ARM::t2WhileLoopStart) - continue; - MachineBasicBlock *LoopExit = Terminator.getOperand(1).getMBB(); - // We don't want to move the function's entry block. - if (!LoopExit->getPrevNode()) - continue; - if (blockIsBefore(Preheader, LoopExit)) - continue; - LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from " - << Preheader->getFullName() << " to " - << LoopExit->getFullName() << "\n"); - - // Make sure that moving the target block doesn't cause any of its WLSs - // that were previously not backwards to become backwards - bool CanMove = true; - for (auto &LoopExitTerminator : LoopExit->terminators()) { - if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStart) - continue; - // An example loop structure where the LoopExit can't be moved, since - // bb1's WLS will become backwards once it's moved after bb3 bb1: - - // LoopExit - // WLS bb2 - LoopExit2 - // bb2: - // ... - // bb3: - Preheader - // WLS bb1 - // bb4: - Header - MachineBasicBlock *LoopExit2 = - LoopExitTerminator.getOperand(1).getMBB(); - // If the WLS from LoopExit to LoopExit2 is already backwards then - // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is - // after the Preheader then moving will keep it as a forward branch, so - // it can be moved. If LoopExit2 is between the Preheader and LoopExit - // then moving LoopExit will make it a backwards branch, so it can't be - // moved since we'd fix one and introduce one backwards branch. - // TODO: Analyse the blocks to make a decision if it would be worth - // moving LoopExit even if LoopExit2 is between the Preheader and - // LoopExit. - if (!blockIsBefore(LoopExit2, LoopExit) && - (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) { - LLVM_DEBUG(dbgs() << DEBUG_PREFIX - << "Can't move the target block as it would " - "introduce a new backwards WLS branch\n"); - CanMove = false; - break; - } - } - - if (CanMove) { - // Make sure no LEs become forwards. - // An example loop structure where the LoopExit can't be moved, since - // bb2's LE will become forwards once bb1 is moved after bb3. - // bb1: - LoopExit - // bb2: - // LE bb1 - Terminator - // bb3: - Preheader - // WLS bb1 - // bb4: - Header - for (auto It = LoopExit->getIterator(); It != Preheader->getIterator(); - It++) { - MachineBasicBlock *MBB = &*It; - for (auto &Terminator : MBB->terminators()) { - if (Terminator.getOpcode() != ARM::t2LoopEndDec) - continue; - MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB(); - // The LE will become forwards branching if it branches to LoopExit - // which isn't allowed by the architecture, so we should avoid - // introducing these. 
- // TODO: Analyse the blocks to make a decision if it would be worth - // moving LoopExit even if we'd introduce a forwards LE - if (LETarget == LoopExit) { - LLVM_DEBUG(dbgs() << DEBUG_PREFIX - << "Can't move the target block as it would " - "introduce a new forwards LE branch\n"); - CanMove = false; - break; - } - } - } - - if (!CanMove) - break; - } - - if (CanMove) { - moveBasicBlock(LoopExit, Preheader); - Changed = true; - break; - } - } - } - - return Changed; -} - -bool ARMBlockPlacement::blockIsBefore(MachineBasicBlock *BB, - MachineBasicBlock *Other) { - return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB); -} - -void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB, - MachineBasicBlock *After) { - LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after " - << After->getName() << "\n"); - MachineBasicBlock *BBPrevious = BB->getPrevNode(); - assert(BBPrevious && "Cannot move the function entry basic block"); - MachineBasicBlock *AfterNext = After->getNextNode(); - MachineBasicBlock *BBNext = BB->getNextNode(); - - BB->moveAfter(After); - - auto FixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) { - LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from " - << From->getName() << " to " << To->getName() << "\n"); - assert(From->isSuccessor(To) && - "'To' is expected to be a successor of 'From'"); - MachineInstr &Terminator = *(--From->terminators().end()); - if (!Terminator.isUnconditionalBranch()) { - // The BB doesn't have an unconditional branch so it relied on - // fall-through. Fix by adding an unconditional branch to the moved BB. - MachineInstrBuilder MIB = - BuildMI(From, Terminator.getDebugLoc(), TII->get(ARM::t2B)); - MIB.addMBB(To); - MIB.addImm(ARMCC::CondCodes::AL); - MIB.addReg(ARM::NoRegister); - LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Adding unconditional branch from " - << From->getName() << " to " << To->getName() << ": " - << *MIB.getInstr()); - } - }; - - // Fix fall-through to the moved BB from the one that used to be before it. - if (BBPrevious->isSuccessor(BB)) - FixFallthrough(BBPrevious, BB); - // Fix fall through from the destination BB to the one that used to follow. - if (AfterNext && After->isSuccessor(AfterNext)) - FixFallthrough(After, AfterNext); - // Fix fall through from the moved BB to the one that used to follow. - if (BBNext && BB->isSuccessor(BBNext)) - FixFallthrough(BB, BBNext); - - BBUtils->adjustBBOffsetsAfter(After); -} +//===-- ARMBlockPlacement.cpp - ARM block placement pass ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass re-arranges machine basic blocks to suit target requirements. +// Currently it only moves blocks to fix backwards WLS branches. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMBasicBlockInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-block-placement" +#define DEBUG_PREFIX "ARM Block Placement: " + +namespace llvm { +class ARMBlockPlacement : public MachineFunctionPass { +private: + const ARMBaseInstrInfo *TII; + std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; + MachineLoopInfo *MLI = nullptr; + +public: + static char ID; + ARMBlockPlacement() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + void moveBasicBlock(MachineBasicBlock *BB, MachineBasicBlock *After); + bool blockIsBefore(MachineBasicBlock *BB, MachineBasicBlock *Other); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // namespace llvm + +FunctionPass *llvm::createARMBlockPlacementPass() { + return new ARMBlockPlacement(); +} + +char ARMBlockPlacement::ID = 0; + +INITIALIZE_PASS(ARMBlockPlacement, DEBUG_TYPE, "ARM block placement", false, + false) + +bool ARMBlockPlacement::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(MF.getSubtarget()); + if (!ST.hasLOB()) + return false; + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Running on " << MF.getName() << "\n"); + MLI = &getAnalysis<MachineLoopInfo>(); + TII = static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo()); + BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); + MF.RenumberBlocks(); + BBUtils->computeAllBlockSizes(); + BBUtils->adjustBBOffsetsAfter(&MF.front()); + bool Changed = false; + + // Find loops with a backwards branching WLS. + // This requires looping over the loops in the function, checking each + // preheader for a WLS and if its target is before the preheader. If moving + // the target block wouldn't produce another backwards WLS or a new forwards + // LE branch then move the target block after the preheader. + for (auto *ML : *MLI) { + MachineBasicBlock *Preheader = ML->getLoopPredecessor(); + if (!Preheader) + continue; + + for (auto &Terminator : Preheader->terminators()) { + if (Terminator.getOpcode() != ARM::t2WhileLoopStart) + continue; + MachineBasicBlock *LoopExit = Terminator.getOperand(1).getMBB(); + // We don't want to move the function's entry block. + if (!LoopExit->getPrevNode()) + continue; + if (blockIsBefore(Preheader, LoopExit)) + continue; + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Found a backwards WLS from " + << Preheader->getFullName() << " to " + << LoopExit->getFullName() << "\n"); + + // Make sure that moving the target block doesn't cause any of its WLSs + // that were previously not backwards to become backwards + bool CanMove = true; + for (auto &LoopExitTerminator : LoopExit->terminators()) { + if (LoopExitTerminator.getOpcode() != ARM::t2WhileLoopStart) + continue; + // An example loop structure where the LoopExit can't be moved, since + // bb1's WLS will become backwards once it's moved after bb3 bb1: - + // LoopExit + // WLS bb2 - LoopExit2 + // bb2: + // ... 
+ // bb3: - Preheader + // WLS bb1 + // bb4: - Header + MachineBasicBlock *LoopExit2 = + LoopExitTerminator.getOperand(1).getMBB(); + // If the WLS from LoopExit to LoopExit2 is already backwards then + // moving LoopExit won't affect it, so it can be moved. If LoopExit2 is + // after the Preheader then moving will keep it as a forward branch, so + // it can be moved. If LoopExit2 is between the Preheader and LoopExit + // then moving LoopExit will make it a backwards branch, so it can't be + // moved since we'd fix one and introduce one backwards branch. + // TODO: Analyse the blocks to make a decision if it would be worth + // moving LoopExit even if LoopExit2 is between the Preheader and + // LoopExit. + if (!blockIsBefore(LoopExit2, LoopExit) && + (LoopExit2 == Preheader || blockIsBefore(LoopExit2, Preheader))) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX + << "Can't move the target block as it would " + "introduce a new backwards WLS branch\n"); + CanMove = false; + break; + } + } + + if (CanMove) { + // Make sure no LEs become forwards. + // An example loop structure where the LoopExit can't be moved, since + // bb2's LE will become forwards once bb1 is moved after bb3. + // bb1: - LoopExit + // bb2: + // LE bb1 - Terminator + // bb3: - Preheader + // WLS bb1 + // bb4: - Header + for (auto It = LoopExit->getIterator(); It != Preheader->getIterator(); + It++) { + MachineBasicBlock *MBB = &*It; + for (auto &Terminator : MBB->terminators()) { + if (Terminator.getOpcode() != ARM::t2LoopEndDec) + continue; + MachineBasicBlock *LETarget = Terminator.getOperand(2).getMBB(); + // The LE will become forwards branching if it branches to LoopExit + // which isn't allowed by the architecture, so we should avoid + // introducing these. + // TODO: Analyse the blocks to make a decision if it would be worth + // moving LoopExit even if we'd introduce a forwards LE + if (LETarget == LoopExit) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX + << "Can't move the target block as it would " + "introduce a new forwards LE branch\n"); + CanMove = false; + break; + } + } + } + + if (!CanMove) + break; + } + + if (CanMove) { + moveBasicBlock(LoopExit, Preheader); + Changed = true; + break; + } + } + } + + return Changed; +} + +bool ARMBlockPlacement::blockIsBefore(MachineBasicBlock *BB, + MachineBasicBlock *Other) { + return BBUtils->getOffsetOf(Other) > BBUtils->getOffsetOf(BB); +} + +void ARMBlockPlacement::moveBasicBlock(MachineBasicBlock *BB, + MachineBasicBlock *After) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Moving " << BB->getName() << " after " + << After->getName() << "\n"); + MachineBasicBlock *BBPrevious = BB->getPrevNode(); + assert(BBPrevious && "Cannot move the function entry basic block"); + MachineBasicBlock *AfterNext = After->getNextNode(); + MachineBasicBlock *BBNext = BB->getNextNode(); + + BB->moveAfter(After); + + auto FixFallthrough = [&](MachineBasicBlock *From, MachineBasicBlock *To) { + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Checking for fallthrough from " + << From->getName() << " to " << To->getName() << "\n"); + assert(From->isSuccessor(To) && + "'To' is expected to be a successor of 'From'"); + MachineInstr &Terminator = *(--From->terminators().end()); + if (!Terminator.isUnconditionalBranch()) { + // The BB doesn't have an unconditional branch so it relied on + // fall-through. Fix by adding an unconditional branch to the moved BB. 
+ MachineInstrBuilder MIB = + BuildMI(From, Terminator.getDebugLoc(), TII->get(ARM::t2B)); + MIB.addMBB(To); + MIB.addImm(ARMCC::CondCodes::AL); + MIB.addReg(ARM::NoRegister); + LLVM_DEBUG(dbgs() << DEBUG_PREFIX << "Adding unconditional branch from " + << From->getName() << " to " << To->getName() << ": " + << *MIB.getInstr()); + } + }; + + // Fix fall-through to the moved BB from the one that used to be before it. + if (BBPrevious->isSuccessor(BB)) + FixFallthrough(BBPrevious, BB); + // Fix fall through from the destination BB to the one that used to follow. + if (AfterNext && After->isSuccessor(AfterNext)) + FixFallthrough(After, AfterNext); + // Fix fall through from the moved BB to the one that used to follow. + if (BBNext && BB->isSuccessor(BBNext)) + FixFallthrough(BB, BBNext); + + BBUtils->adjustBBOffsetsAfter(After); +} diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.cpp index 6feed82596..471474788e 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.cpp @@ -85,11 +85,11 @@ namespace { /// Helper class for values going out through an ABI boundary (used for handling /// function return values and call parameters). -struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler { - ARMOutgoingValueHandler(MachineIRBuilder &MIRBuilder, - MachineRegisterInfo &MRI, MachineInstrBuilder &MIB, - CCAssignFn *AssignFn) - : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} +struct ARMOutgoingValueHandler : public CallLowering::OutgoingValueHandler { + ARMOutgoingValueHandler(MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI, MachineInstrBuilder &MIB, + CCAssignFn *AssignFn) + : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -257,14 +257,14 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg()); - ARMOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, - AssignFn); + ARMOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, + AssignFn); return handleAssignments(MIRBuilder, SplitRetInfos, RetHandler); } bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, ArrayRef<Register> VRegs, - FunctionLoweringInfo &FLI) const { + const Value *Val, ArrayRef<Register> VRegs, + FunctionLoweringInfo &FLI) const { assert(!Val == VRegs.empty() && "Return value without a vreg"); auto const &ST = MIRBuilder.getMF().getSubtarget<ARMSubtarget>(); @@ -282,10 +282,10 @@ namespace { /// Helper class for values coming in through an ABI boundary (used for handling /// formal arguments and call return values). 
-struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler { - ARMIncomingValueHandler(MachineIRBuilder &MIRBuilder, - MachineRegisterInfo &MRI, CCAssignFn AssignFn) - : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} +struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler { + ARMIncomingValueHandler(MachineIRBuilder &MIRBuilder, + MachineRegisterInfo &MRI, CCAssignFn AssignFn) + : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -335,8 +335,8 @@ struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler { assert(VA.isRegLoc() && "Value shouldn't be assigned to reg"); assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?"); - uint64_t ValSize = VA.getValVT().getFixedSizeInBits(); - uint64_t LocSize = VA.getLocVT().getFixedSizeInBits(); + uint64_t ValSize = VA.getValVT().getFixedSizeInBits(); + uint64_t LocSize = VA.getLocVT().getFixedSizeInBits(); assert(ValSize <= 64 && "Unsupported value size"); assert(LocSize <= 64 && "Unsupported location size"); @@ -397,10 +397,10 @@ struct ARMIncomingValueHandler : public CallLowering::IncomingValueHandler { virtual void markPhysRegUsed(unsigned PhysReg) = 0; }; -struct FormalArgHandler : public ARMIncomingValueHandler { +struct FormalArgHandler : public ARMIncomingValueHandler { FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, CCAssignFn AssignFn) - : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn) {} + : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { MIRBuilder.getMRI()->addLiveIn(PhysReg); @@ -410,10 +410,10 @@ struct FormalArgHandler : public ARMIncomingValueHandler { } // end anonymous namespace -bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, - const Function &F, - ArrayRef<ArrayRef<Register>> VRegs, - FunctionLoweringInfo &FLI) const { +bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const { auto &TLI = *getTLI<ARMTargetLowering>(); auto Subtarget = TLI.getSubtarget(); @@ -434,7 +434,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, for (auto &Arg : F.args()) { if (!isSupportedType(DL, TLI, Arg.getType())) return false; - if (Arg.hasPassPointeeByValueCopyAttr()) + if (Arg.hasPassPointeeByValueCopyAttr()) return false; } @@ -468,10 +468,10 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, namespace { -struct CallReturnHandler : public ARMIncomingValueHandler { +struct CallReturnHandler : public ARMIncomingValueHandler { CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstrBuilder MIB, CCAssignFn *AssignFn) - : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + : ARMIncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} void markPhysRegUsed(unsigned PhysReg) override { MIB.addDef(PhysReg, RegState::Implicit); @@ -481,16 +481,16 @@ struct CallReturnHandler : public ARMIncomingValueHandler { }; // FIXME: This should move to the ARMSubtarget when it supports all the opcodes. -unsigned getCallOpcode(const MachineFunction &MF, const ARMSubtarget &STI, - bool isDirect) { +unsigned getCallOpcode(const MachineFunction &MF, const ARMSubtarget &STI, + bool isDirect) { if (isDirect) return STI.isThumb() ? 
ARM::tBL : ARM::BL; if (STI.isThumb()) - return gettBLXrOpcode(MF); + return gettBLXrOpcode(MF); if (STI.hasV5TOps()) - return getBLXOpcode(MF); + return getBLXOpcode(MF); if (STI.hasV4TOps()) return ARM::BX_CALL; @@ -518,7 +518,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo & // Create the call instruction so we can add the implicit uses of arg // registers, but don't insert it yet. bool IsDirect = !Info.Callee.isReg(); - auto CallOpcode = getCallOpcode(MF, STI, IsDirect); + auto CallOpcode = getCallOpcode(MF, STI, IsDirect); auto MIB = MIRBuilder.buildInstrNoInsert(CallOpcode); bool IsThumb = STI.isThumb(); @@ -549,8 +549,8 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo & splitToValueTypes(Arg, ArgInfos, MF); } - auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, Info.IsVarArg); - ARMOutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn); + auto ArgAssignFn = TLI.CCAssignFnForCall(Info.CallConv, Info.IsVarArg); + ARMOutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler)) return false; @@ -563,7 +563,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo & ArgInfos.clear(); splitToValueTypes(Info.OrigRet, ArgInfos, MF); - auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, Info.IsVarArg); + auto RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv, Info.IsVarArg); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.h b/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.h index 3be73d497d..9bff3564c5 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMCallLowering.h @@ -33,12 +33,12 @@ public: ARMCallLowering(const ARMTargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<Register> VRegs, - FunctionLoweringInfo &FLI) const override; + ArrayRef<Register> VRegs, + FunctionLoweringInfo &FLI) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef<ArrayRef<Register>> VRegs, - FunctionLoweringInfo &FLI) const override; + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMConstantIslandPass.cpp index 630490f6f9..86faf511c9 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -338,32 +338,32 @@ LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() { } #endif -// Align blocks where the previous block does not fall through. This may add -// extra NOP's but they will not be executed. It uses the PrefLoopAlignment as a -// measure of how much to align, and only runs at CodeGenOpt::Aggressive. 
-static bool AlignBlocks(MachineFunction *MF) { - if (MF->getTarget().getOptLevel() != CodeGenOpt::Aggressive || - MF->getFunction().hasOptSize()) - return false; - - auto *TLI = MF->getSubtarget().getTargetLowering(); - const Align Alignment = TLI->getPrefLoopAlignment(); - if (Alignment < 4) - return false; - - bool Changed = false; - bool PrevCanFallthough = true; - for (auto &MBB : *MF) { - if (!PrevCanFallthough) { - Changed = true; - MBB.setAlignment(Alignment); - } - PrevCanFallthough = MBB.canFallThrough(); - } - - return Changed; -} - +// Align blocks where the previous block does not fall through. This may add +// extra NOP's but they will not be executed. It uses the PrefLoopAlignment as a +// measure of how much to align, and only runs at CodeGenOpt::Aggressive. +static bool AlignBlocks(MachineFunction *MF) { + if (MF->getTarget().getOptLevel() != CodeGenOpt::Aggressive || + MF->getFunction().hasOptSize()) + return false; + + auto *TLI = MF->getSubtarget().getTargetLowering(); + const Align Alignment = TLI->getPrefLoopAlignment(); + if (Alignment < 4) + return false; + + bool Changed = false; + bool PrevCanFallthough = true; + for (auto &MBB : *MF) { + if (!PrevCanFallthough) { + Changed = true; + MBB.setAlignment(Alignment); + } + PrevCanFallthough = MBB.canFallThrough(); + } + + return Changed; +} + bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MCP = mf.getConstantPool(); @@ -385,10 +385,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isThumb2 = AFI->isThumb2Function(); bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); - // TBB generation code in this constant island pass has not been adapted to - // deal with speculation barriers. - if (STI->hardenSlsRetBr()) - GenerateTBB = false; + // TBB generation code in this constant island pass has not been adapted to + // deal with speculation barriers. + if (STI->hardenSlsRetBr()) + GenerateTBB = false; // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. @@ -406,9 +406,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { MF->RenumberBlocks(); } - // Align any non-fallthrough blocks - MadeChange |= AlignBlocks(MF); - + // Align any non-fallthrough blocks + MadeChange |= AlignBlocks(MF); + // Perform the initial placement of the constant pool entries. To start with, // we put them all at the end of the function. std::vector<MachineInstr*> CPEMIs; @@ -524,11 +524,11 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // The function needs to be as aligned as the basic blocks. The linker may // move functions around based on their alignment. - // Special case: halfword literals still need word alignment on the function. - Align FuncAlign = MaxAlign; - if (MaxAlign == 2) - FuncAlign = Align(4); - MF->ensureAlignment(FuncAlign); + // Special case: halfword literals still need word alignment on the function. + Align FuncAlign = MaxAlign; + if (MaxAlign == 2) + FuncAlign = Align(4); + MF->ensureAlignment(FuncAlign); // Order the entries in BB by descending alignment. That ensures correct // alignment of all entries as long as BB is sufficiently aligned. 
Keep @@ -543,7 +543,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { - unsigned Size = CPs[i].getSizeInBytes(TD); + unsigned Size = CPs[i].getSizeInBytes(TD); Align Alignment = CPs[i].getAlign(); // Verify that all constant pool entries are a multiple of their alignment. // If not, we would have to pad them out so that instructions stay aligned. @@ -586,12 +586,12 @@ void ARMConstantIslands::doInitialJumpTablePlacement( MachineBasicBlock *LastCorrectlyNumberedBB = nullptr; for (MachineBasicBlock &MBB : *MF) { auto MI = MBB.getLastNonDebugInstr(); - // Look past potential SpeculationBarriers at end of BB. - while (MI != MBB.end() && - (isSpeculationBarrierEndBBOpcode(MI->getOpcode()) || - MI->isDebugInstr())) - --MI; - + // Look past potential SpeculationBarriers at end of BB. + while (MI != MBB.end() && + (isSpeculationBarrierEndBBOpcode(MI->getOpcode()) || + MI->isDebugInstr())) + --MI; + if (MI == MBB.end()) continue; @@ -814,26 +814,26 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // Taking the address of a CP entry. case ARM::LEApcrel: - case ARM::LEApcrelJT: { - // This takes a SoImm, which is 8 bit immediate rotated. We'll - // pretend the maximum offset is 255 * 4. Since each instruction - // 4 byte wide, this is always correct. We'll check for other - // displacements that fits in a SoImm as well. - Bits = 8; - NegOk = true; - IsSoImm = true; - unsigned CPI = I.getOperand(op).getIndex(); - assert(CPI < CPEMIs.size()); - MachineInstr *CPEMI = CPEMIs[CPI]; - const Align CPEAlign = getCPEAlign(CPEMI); - const unsigned LogCPEAlign = Log2(CPEAlign); - if (LogCPEAlign >= 2) - Scale = 4; - else - // For constants with less than 4-byte alignment, - // we'll pretend the maximum offset is 255 * 1. - Scale = 1; - } + case ARM::LEApcrelJT: { + // This takes a SoImm, which is 8 bit immediate rotated. We'll + // pretend the maximum offset is 255 * 4. Since each instruction + // 4 byte wide, this is always correct. We'll check for other + // displacements that fits in a SoImm as well. + Bits = 8; + NegOk = true; + IsSoImm = true; + unsigned CPI = I.getOperand(op).getIndex(); + assert(CPI < CPEMIs.size()); + MachineInstr *CPEMI = CPEMIs[CPI]; + const Align CPEAlign = getCPEAlign(CPEMI); + const unsigned LogCPEAlign = Log2(CPEAlign); + if (LogCPEAlign >= 2) + Scale = 4; + else + // For constants with less than 4-byte alignment, + // we'll pretend the maximum offset is 255 * 1. 
+ Scale = 1; + } break; case ARM::t2LEApcrel: case ARM::t2LEApcrelJT: @@ -2124,7 +2124,7 @@ static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) { MachineFunction *MF = MBB->getParent(); ++MBB; - return MBB != MF->end() && !MBB->empty() && &*MBB->begin() == CPEMI; + return MBB != MF->end() && !MBB->empty() && &*MBB->begin() == CPEMI; } static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI, diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMExpandPseudoInsts.cpp index a7f1765a93..a38327ffe6 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -875,25 +875,25 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!"); unsigned ImmVal = (unsigned)MO.getImm(); - unsigned SOImmValV1 = 0, SOImmValV2 = 0; - - if (ARM_AM::isSOImmTwoPartVal(ImmVal)) { // Expand into a movi + orr. - LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); - HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg); - SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); - SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); - } else { // Expand into a mvn + sub. - LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), DstReg); - HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg); - SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(-ImmVal); - SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(-ImmVal); - SOImmValV1 = ~(-SOImmValV1); - } - + unsigned SOImmValV1 = 0, SOImmValV2 = 0; + + if (ARM_AM::isSOImmTwoPartVal(ImmVal)) { // Expand into a movi + orr. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); + SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + } else { // Expand into a mvn + sub. + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi), DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg); + SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(-ImmVal); + SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(-ImmVal); + SOImmValV1 = ~(-SOImmValV1); + } + unsigned MIFlags = MI.getFlags(); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); @@ -1871,66 +1871,66 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, default: return false; - case ARM::VBSPd: - case ARM::VBSPq: { - Register DstReg = MI.getOperand(0).getReg(); - if (DstReg == MI.getOperand(3).getReg()) { - // Expand to VBIT - unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBITd : ARM::VBITq; - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) - .add(MI.getOperand(0)) - .add(MI.getOperand(3)) - .add(MI.getOperand(2)) - .add(MI.getOperand(1)) - .addImm(MI.getOperand(4).getImm()) - .add(MI.getOperand(5)); - } else if (DstReg == MI.getOperand(2).getReg()) { - // Expand to VBIF - unsigned NewOpc = Opcode == ARM::VBSPd ? 
ARM::VBIFd : ARM::VBIFq; - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) - .add(MI.getOperand(0)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .add(MI.getOperand(1)) - .addImm(MI.getOperand(4).getImm()) - .add(MI.getOperand(5)); - } else { - // Expand to VBSL - unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBSLd : ARM::VBSLq; - if (DstReg == MI.getOperand(1).getReg()) { - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) - .add(MI.getOperand(0)) - .add(MI.getOperand(1)) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .addImm(MI.getOperand(4).getImm()) - .add(MI.getOperand(5)); - } else { - // Use move to satisfy constraints - unsigned MoveOpc = Opcode == ARM::VBSPd ? ARM::VORRd : ARM::VORRq; - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MoveOpc)) - .addReg(DstReg, - RegState::Define | - getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(1)) - .add(MI.getOperand(1)) - .addImm(MI.getOperand(4).getImm()) - .add(MI.getOperand(5)); - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) - .add(MI.getOperand(0)) - .addReg(DstReg, - RegState::Kill | - getRenamableRegState(MI.getOperand(0).isRenamable())) - .add(MI.getOperand(2)) - .add(MI.getOperand(3)) - .addImm(MI.getOperand(4).getImm()) - .add(MI.getOperand(5)); - } - } - MI.eraseFromParent(); - return true; - } - + case ARM::VBSPd: + case ARM::VBSPq: { + Register DstReg = MI.getOperand(0).getReg(); + if (DstReg == MI.getOperand(3).getReg()) { + // Expand to VBIT + unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBITd : ARM::VBITq; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } else if (DstReg == MI.getOperand(2).getReg()) { + // Expand to VBIF + unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBIFd : ARM::VBIFq; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } else { + // Expand to VBSL + unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBSLd : ARM::VBSLq; + if (DstReg == MI.getOperand(1).getReg()) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } else { + // Use move to satisfy constraints + unsigned MoveOpc = Opcode == ARM::VBSPd ? ARM::VORRd : ARM::VORRq; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MoveOpc)) + .addReg(DstReg, + RegState::Define | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .addReg(DstReg, + RegState::Kill | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } + } + MI.eraseFromParent(); + return true; + } + case ARM::TCRETURNdi: case ARM::TCRETURNri: { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -2304,9 +2304,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); - MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Thumb ? 
gettBLXrOpcode(*MF) : getBLXOpcode(*MF))); + MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Thumb ? gettBLXrOpcode(*MF) : getBLXOpcode(*MF))); if (Thumb) MIB.add(predOps(ARMCC::AL)); MIB.addReg(Reg, RegState::Kill); diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMFastISel.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMFastISel.cpp index da1d9af8d5..483aeb4d72 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMFastISel.cpp @@ -606,9 +606,9 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { } } - if ((Subtarget->isTargetELF() && Subtarget->isGVInGOT(GV)) || - (Subtarget->isTargetMachO() && IsIndirect) || - Subtarget->genLongCalls()) { + if ((Subtarget->isTargetELF() && Subtarget->isGVInGOT(GV)) || + (Subtarget->isTargetMachO() && IsIndirect) || + Subtarget->genLongCalls()) { MachineInstrBuilder MIB; unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); if (isThumb2) @@ -2175,7 +2175,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) { unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { if (UseReg) - return isThumb2 ? gettBLXrOpcode(*MF) : getBLXOpcode(*MF); + return isThumb2 ? gettBLXrOpcode(*MF) : getBLXOpcode(*MF); else return isThumb2 ? ARM::tBL : ARM::BL; } @@ -2266,11 +2266,11 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // BL / BLX don't take a predicate, but tBL / tBLX do. if (isThumb2) MIB.add(predOps(ARMCC::AL)); - if (Subtarget->genLongCalls()) { - CalleeReg = - constrainOperandRegClass(TII.get(CallOpc), CalleeReg, isThumb2 ? 2 : 0); + if (Subtarget->genLongCalls()) { + CalleeReg = + constrainOperandRegClass(TII.get(CallOpc), CalleeReg, isThumb2 ? 2 : 0); MIB.addReg(CalleeReg); - } else + } else MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. @@ -2408,11 +2408,11 @@ bool ARMFastISel::SelectCall(const Instruction *I, // ARM calls don't take a predicate, but tBL / tBLX do. if(isThumb2) MIB.add(predOps(ARMCC::AL)); - if (UseReg) { - CalleeReg = - constrainOperandRegClass(TII.get(CallOpc), CalleeReg, isThumb2 ? 2 : 0); + if (UseReg) { + CalleeReg = + constrainOperandRegClass(TII.get(CallOpc), CalleeReg, isThumb2 ? 2 : 0); MIB.addReg(CalleeReg); - } else if (!IntrMemName) + } else if (!IntrMemName) MIB.addGlobalAddress(GV, 0, 0); else MIB.addExternalSymbol(IntrMemName, 0); diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMFeatures.h b/contrib/libs/llvm12/lib/Target/ARM/ARMFeatures.h index 99e0ef05b5..6d8e75a2ec 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMFeatures.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMFeatures.h @@ -75,7 +75,7 @@ inline bool isV8EligibleForIT(const InstrType *Instr) { // there are some "conditionally deprecated" opcodes case ARM::tADDspr: case ARM::tBLXr: - case ARM::tBLXr_noip: + case ARM::tBLXr_noip: return Instr->getOperand(2).getReg() != ARM::PC; // ADD PC, SP and BLX PC were always unpredictable, // now on top of it they're deprecated diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.cpp index 9eeb7f20dc..e0a657b505 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.cpp @@ -883,10 +883,10 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, /// debug info. It's the same as what we use for resolving the code-gen /// references for now. 
FIXME: This can go wrong when references are /// SP-relative and simple call frames aren't used. -StackOffset ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - Register &FrameReg) const { - return StackOffset::getFixed(ResolveFrameIndexReference(MF, FI, FrameReg, 0)); +StackOffset ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + Register &FrameReg) const { + return StackOffset::getFixed(ResolveFrameIndexReference(MF, FI, FrameReg, 0)); } int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, @@ -2114,7 +2114,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned NumExtras = TargetAlign.value() / 4; SmallVector<unsigned, 2> Extras; while (NumExtras && !UnspilledCS1GPRs.empty()) { - unsigned Reg = UnspilledCS1GPRs.pop_back_val(); + unsigned Reg = UnspilledCS1GPRs.pop_back_val(); if (!MRI.isReserved(Reg) && (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) { Extras.push_back(Reg); @@ -2124,7 +2124,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // For non-Thumb1 functions, also check for hi-reg CS registers if (!AFI->isThumb1OnlyFunction()) { while (NumExtras && !UnspilledCS2GPRs.empty()) { - unsigned Reg = UnspilledCS2GPRs.pop_back_val(); + unsigned Reg = UnspilledCS2GPRs.pop_back_val(); if (!MRI.isReserved(Reg)) { Extras.push_back(Reg); NumExtras--; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.h b/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.h index 9822e2321b..c609c07043 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMFrameLowering.h @@ -10,7 +10,7 @@ #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H #include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/Support/TypeSize.h" +#include "llvm/Support/TypeSize.h" namespace llvm { @@ -48,8 +48,8 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; - StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, - Register &FrameReg) const override; + StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const override; int ResolveFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg, int SPAdj) const; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.cpp index f083fa6662..48df96b5e6 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -10,19 +10,19 @@ #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMSubtarget.h" -#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" - +#include "llvm/Support/CommandLine.h" + using namespace llvm; -static cl::opt<int> DataBankMask("arm-data-bank-mask", cl::init(-1), - cl::Hidden); -static cl::opt<bool> AssumeITCMConflict("arm-assume-itcm-bankconflict", - cl::init(false), cl::Hidden); - +static cl::opt<int> DataBankMask("arm-data-bank-mask", cl::init(-1), + cl::Hidden); +static cl::opt<bool> AssumeITCMConflict("arm-assume-itcm-bankconflict", + cl::init(false), cl::Hidden); + static bool 
hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, const TargetRegisterInfo &TRI) { // FIXME: Detect integer instructions properly. @@ -39,7 +39,7 @@ static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI, } ScheduleHazardRecognizer::HazardType -ARMHazardRecognizerFPMLx::getHazardType(SUnit *SU, int Stalls) { +ARMHazardRecognizerFPMLx::getHazardType(SUnit *SU, int Stalls) { assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead"); MachineInstr *MI = SU->getInstr(); @@ -76,15 +76,15 @@ ARMHazardRecognizerFPMLx::getHazardType(SUnit *SU, int Stalls) { } } } - return NoHazard; + return NoHazard; } -void ARMHazardRecognizerFPMLx::Reset() { +void ARMHazardRecognizerFPMLx::Reset() { LastMI = nullptr; FpMLxStalls = 0; } -void ARMHazardRecognizerFPMLx::EmitInstruction(SUnit *SU) { +void ARMHazardRecognizerFPMLx::EmitInstruction(SUnit *SU) { MachineInstr *MI = SU->getInstr(); if (!MI->isDebugInstr()) { LastMI = MI; @@ -92,177 +92,177 @@ void ARMHazardRecognizerFPMLx::EmitInstruction(SUnit *SU) { } } -void ARMHazardRecognizerFPMLx::AdvanceCycle() { +void ARMHazardRecognizerFPMLx::AdvanceCycle() { if (FpMLxStalls && --FpMLxStalls == 0) // Stalled for 4 cycles but still can't schedule any other instructions. LastMI = nullptr; } -void ARMHazardRecognizerFPMLx::RecedeCycle() { +void ARMHazardRecognizerFPMLx::RecedeCycle() { llvm_unreachable("reverse ARM hazard checking unsupported"); } - -///////// Bank conflicts handled as hazards ////////////// - -static bool getBaseOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, - int64_t &Offset) { - - uint64_t TSFlags = MI.getDesc().TSFlags; - unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); - unsigned IndexMode = - (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; - - // Address mode tells us what we want to know about operands for T2 - // instructions (but not size). It tells us size (but not about operands) - // for T1 instructions. - switch (AddrMode) { - default: - return false; - case ARMII::AddrModeT2_i8: - // t2LDRBT, t2LDRB_POST, t2LDRB_PRE, t2LDRBi8, - // t2LDRHT, t2LDRH_POST, t2LDRH_PRE, t2LDRHi8, - // t2LDRSBT, t2LDRSB_POST, t2LDRSB_PRE, t2LDRSBi8, - // t2LDRSHT, t2LDRSH_POST, t2LDRSH_PRE, t2LDRSHi8, - // t2LDRT, t2LDR_POST, t2LDR_PRE, t2LDRi8 - BaseOp = &MI.getOperand(1); - Offset = (IndexMode == ARMII::IndexModePost) - ? 0 - : (IndexMode == ARMII::IndexModePre || - IndexMode == ARMII::IndexModeUpd) - ? MI.getOperand(3).getImm() - : MI.getOperand(2).getImm(); - return true; - case ARMII::AddrModeT2_i12: - // t2LDRBi12, t2LDRHi12 - // t2LDRSBi12, t2LDRSHi12 - // t2LDRi12 - BaseOp = &MI.getOperand(1); - Offset = MI.getOperand(2).getImm(); - return true; - case ARMII::AddrModeT2_i8s4: - // t2LDRD_POST, t2LDRD_PRE, t2LDRDi8 - BaseOp = &MI.getOperand(2); - Offset = (IndexMode == ARMII::IndexModePost) - ? 0 - : (IndexMode == ARMII::IndexModePre || - IndexMode == ARMII::IndexModeUpd) - ? MI.getOperand(4).getImm() - : MI.getOperand(3).getImm(); - return true; - case ARMII::AddrModeT1_1: - // tLDRBi, tLDRBr (watch out!), TLDRSB - case ARMII::AddrModeT1_2: - // tLDRHi, tLDRHr (watch out!), TLDRSH - case ARMII::AddrModeT1_4: - // tLDRi, tLDRr (watch out!) - BaseOp = &MI.getOperand(1); - Offset = MI.getOperand(2).isImm() ? 
MI.getOperand(2).getImm() : 0; - return MI.getOperand(2).isImm(); - } - return false; -} - -ARMBankConflictHazardRecognizer::ARMBankConflictHazardRecognizer( - const ScheduleDAG *DAG, int64_t CPUBankMask, bool CPUAssumeITCMConflict) - : ScheduleHazardRecognizer(), MF(DAG->MF), DL(DAG->MF.getDataLayout()), - DataMask(DataBankMask.getNumOccurrences() ? int64_t(DataBankMask) - : CPUBankMask), - AssumeITCMBankConflict(AssumeITCMConflict.getNumOccurrences() - ? AssumeITCMConflict - : CPUAssumeITCMConflict) { - MaxLookAhead = 1; -} - -ScheduleHazardRecognizer::HazardType -ARMBankConflictHazardRecognizer::CheckOffsets(unsigned O0, unsigned O1) { - return (((O0 ^ O1) & DataMask) != 0) ? NoHazard : Hazard; -} - -ScheduleHazardRecognizer::HazardType -ARMBankConflictHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { - MachineInstr &L0 = *SU->getInstr(); - if (!L0.mayLoad() || L0.mayStore() || L0.getNumMemOperands() != 1) - return NoHazard; - - auto MO0 = *L0.memoperands().begin(); - auto BaseVal0 = MO0->getValue(); - auto BasePseudoVal0 = MO0->getPseudoValue(); - int64_t Offset0 = 0; - - if (MO0->getSize() > 4) - return NoHazard; - - bool SPvalid = false; - const MachineOperand *SP = nullptr; - int64_t SPOffset0 = 0; - - for (auto L1 : Accesses) { - auto MO1 = *L1->memoperands().begin(); - auto BaseVal1 = MO1->getValue(); - auto BasePseudoVal1 = MO1->getPseudoValue(); - int64_t Offset1 = 0; - - // Pointers to the same object - if (BaseVal0 && BaseVal1) { - const Value *Ptr0, *Ptr1; - Ptr0 = GetPointerBaseWithConstantOffset(BaseVal0, Offset0, DL, true); - Ptr1 = GetPointerBaseWithConstantOffset(BaseVal1, Offset1, DL, true); - if (Ptr0 == Ptr1 && Ptr0) - return CheckOffsets(Offset0, Offset1); - } - - if (BasePseudoVal0 && BasePseudoVal1 && - BasePseudoVal0->kind() == BasePseudoVal1->kind() && - BasePseudoVal0->kind() == PseudoSourceValue::FixedStack) { - // Spills/fills - auto FS0 = cast<FixedStackPseudoSourceValue>(BasePseudoVal0); - auto FS1 = cast<FixedStackPseudoSourceValue>(BasePseudoVal1); - Offset0 = MF.getFrameInfo().getObjectOffset(FS0->getFrameIndex()); - Offset1 = MF.getFrameInfo().getObjectOffset(FS1->getFrameIndex()); - return CheckOffsets(Offset0, Offset1); - } - - // Constant pools (likely in ITCM) - if (BasePseudoVal0 && BasePseudoVal1 && - BasePseudoVal0->kind() == BasePseudoVal1->kind() && - BasePseudoVal0->isConstantPool() && AssumeITCMBankConflict) - return Hazard; - - // Is this a stack pointer-relative access? We could in general try to - // use "is this the same register and is it unchanged?", but the - // memory operand tracking is highly likely to have already found that. - // What we're after here is bank conflicts between different objects in - // the stack frame. 
- if (!SPvalid) { // set up SP - if (!getBaseOffset(L0, SP, SPOffset0) || SP->getReg().id() != ARM::SP) - SP = nullptr; - SPvalid = true; - } - if (SP) { - int64_t SPOffset1; - const MachineOperand *SP1; - if (getBaseOffset(*L1, SP1, SPOffset1) && SP1->getReg().id() == ARM::SP) - return CheckOffsets(SPOffset0, SPOffset1); - } - } - - return NoHazard; -} - -void ARMBankConflictHazardRecognizer::Reset() { Accesses.clear(); } - -void ARMBankConflictHazardRecognizer::EmitInstruction(SUnit *SU) { - MachineInstr &MI = *SU->getInstr(); - if (!MI.mayLoad() || MI.mayStore() || MI.getNumMemOperands() != 1) - return; - - auto MO = *MI.memoperands().begin(); - uint64_t Size1 = MO->getSize(); - if (Size1 > 4) - return; - Accesses.push_back(&MI); -} - -void ARMBankConflictHazardRecognizer::AdvanceCycle() { Accesses.clear(); } - -void ARMBankConflictHazardRecognizer::RecedeCycle() { Accesses.clear(); } + +///////// Bank conflicts handled as hazards ////////////// + +static bool getBaseOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, + int64_t &Offset) { + + uint64_t TSFlags = MI.getDesc().TSFlags; + unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); + unsigned IndexMode = + (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift; + + // Address mode tells us what we want to know about operands for T2 + // instructions (but not size). It tells us size (but not about operands) + // for T1 instructions. + switch (AddrMode) { + default: + return false; + case ARMII::AddrModeT2_i8: + // t2LDRBT, t2LDRB_POST, t2LDRB_PRE, t2LDRBi8, + // t2LDRHT, t2LDRH_POST, t2LDRH_PRE, t2LDRHi8, + // t2LDRSBT, t2LDRSB_POST, t2LDRSB_PRE, t2LDRSBi8, + // t2LDRSHT, t2LDRSH_POST, t2LDRSH_PRE, t2LDRSHi8, + // t2LDRT, t2LDR_POST, t2LDR_PRE, t2LDRi8 + BaseOp = &MI.getOperand(1); + Offset = (IndexMode == ARMII::IndexModePost) + ? 0 + : (IndexMode == ARMII::IndexModePre || + IndexMode == ARMII::IndexModeUpd) + ? MI.getOperand(3).getImm() + : MI.getOperand(2).getImm(); + return true; + case ARMII::AddrModeT2_i12: + // t2LDRBi12, t2LDRHi12 + // t2LDRSBi12, t2LDRSHi12 + // t2LDRi12 + BaseOp = &MI.getOperand(1); + Offset = MI.getOperand(2).getImm(); + return true; + case ARMII::AddrModeT2_i8s4: + // t2LDRD_POST, t2LDRD_PRE, t2LDRDi8 + BaseOp = &MI.getOperand(2); + Offset = (IndexMode == ARMII::IndexModePost) + ? 0 + : (IndexMode == ARMII::IndexModePre || + IndexMode == ARMII::IndexModeUpd) + ? MI.getOperand(4).getImm() + : MI.getOperand(3).getImm(); + return true; + case ARMII::AddrModeT1_1: + // tLDRBi, tLDRBr (watch out!), TLDRSB + case ARMII::AddrModeT1_2: + // tLDRHi, tLDRHr (watch out!), TLDRSH + case ARMII::AddrModeT1_4: + // tLDRi, tLDRr (watch out!) + BaseOp = &MI.getOperand(1); + Offset = MI.getOperand(2).isImm() ? MI.getOperand(2).getImm() : 0; + return MI.getOperand(2).isImm(); + } + return false; +} + +ARMBankConflictHazardRecognizer::ARMBankConflictHazardRecognizer( + const ScheduleDAG *DAG, int64_t CPUBankMask, bool CPUAssumeITCMConflict) + : ScheduleHazardRecognizer(), MF(DAG->MF), DL(DAG->MF.getDataLayout()), + DataMask(DataBankMask.getNumOccurrences() ? int64_t(DataBankMask) + : CPUBankMask), + AssumeITCMBankConflict(AssumeITCMConflict.getNumOccurrences() + ? AssumeITCMConflict + : CPUAssumeITCMConflict) { + MaxLookAhead = 1; +} + +ScheduleHazardRecognizer::HazardType +ARMBankConflictHazardRecognizer::CheckOffsets(unsigned O0, unsigned O1) { + return (((O0 ^ O1) & DataMask) != 0) ? 
NoHazard : Hazard; +} + +ScheduleHazardRecognizer::HazardType +ARMBankConflictHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + MachineInstr &L0 = *SU->getInstr(); + if (!L0.mayLoad() || L0.mayStore() || L0.getNumMemOperands() != 1) + return NoHazard; + + auto MO0 = *L0.memoperands().begin(); + auto BaseVal0 = MO0->getValue(); + auto BasePseudoVal0 = MO0->getPseudoValue(); + int64_t Offset0 = 0; + + if (MO0->getSize() > 4) + return NoHazard; + + bool SPvalid = false; + const MachineOperand *SP = nullptr; + int64_t SPOffset0 = 0; + + for (auto L1 : Accesses) { + auto MO1 = *L1->memoperands().begin(); + auto BaseVal1 = MO1->getValue(); + auto BasePseudoVal1 = MO1->getPseudoValue(); + int64_t Offset1 = 0; + + // Pointers to the same object + if (BaseVal0 && BaseVal1) { + const Value *Ptr0, *Ptr1; + Ptr0 = GetPointerBaseWithConstantOffset(BaseVal0, Offset0, DL, true); + Ptr1 = GetPointerBaseWithConstantOffset(BaseVal1, Offset1, DL, true); + if (Ptr0 == Ptr1 && Ptr0) + return CheckOffsets(Offset0, Offset1); + } + + if (BasePseudoVal0 && BasePseudoVal1 && + BasePseudoVal0->kind() == BasePseudoVal1->kind() && + BasePseudoVal0->kind() == PseudoSourceValue::FixedStack) { + // Spills/fills + auto FS0 = cast<FixedStackPseudoSourceValue>(BasePseudoVal0); + auto FS1 = cast<FixedStackPseudoSourceValue>(BasePseudoVal1); + Offset0 = MF.getFrameInfo().getObjectOffset(FS0->getFrameIndex()); + Offset1 = MF.getFrameInfo().getObjectOffset(FS1->getFrameIndex()); + return CheckOffsets(Offset0, Offset1); + } + + // Constant pools (likely in ITCM) + if (BasePseudoVal0 && BasePseudoVal1 && + BasePseudoVal0->kind() == BasePseudoVal1->kind() && + BasePseudoVal0->isConstantPool() && AssumeITCMBankConflict) + return Hazard; + + // Is this a stack pointer-relative access? We could in general try to + // use "is this the same register and is it unchanged?", but the + // memory operand tracking is highly likely to have already found that. + // What we're after here is bank conflicts between different objects in + // the stack frame. 
+ if (!SPvalid) { // set up SP + if (!getBaseOffset(L0, SP, SPOffset0) || SP->getReg().id() != ARM::SP) + SP = nullptr; + SPvalid = true; + } + if (SP) { + int64_t SPOffset1; + const MachineOperand *SP1; + if (getBaseOffset(*L1, SP1, SPOffset1) && SP1->getReg().id() == ARM::SP) + return CheckOffsets(SPOffset0, SPOffset1); + } + } + + return NoHazard; +} + +void ARMBankConflictHazardRecognizer::Reset() { Accesses.clear(); } + +void ARMBankConflictHazardRecognizer::EmitInstruction(SUnit *SU) { + MachineInstr &MI = *SU->getInstr(); + if (!MI.mayLoad() || MI.mayStore() || MI.getNumMemOperands() != 1) + return; + + auto MO = *MI.memoperands().begin(); + uint64_t Size1 = MO->getSize(); + if (Size1 > 4) + return; + Accesses.push_back(&MI); +} + +void ARMBankConflictHazardRecognizer::AdvanceCycle() { Accesses.clear(); } + +void ARMBankConflictHazardRecognizer::RecedeCycle() { Accesses.clear(); } diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.h index c1f1bcd0a6..e6b5304488 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMHazardRecognizer.h @@ -13,28 +13,28 @@ #ifndef LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H #define LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H -#include "ARMBaseInstrInfo.h" -#include "llvm/ADT/BitmaskEnum.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/Support/DataTypes.h" -#include <array> -#include <initializer_list> +#include "ARMBaseInstrInfo.h" +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/Support/DataTypes.h" +#include <array> +#include <initializer_list> namespace llvm { -class DataLayout; -class MachineFunction; +class DataLayout; +class MachineFunction; class MachineInstr; -class ScheduleDAG; +class ScheduleDAG; -// Hazards related to FP MLx instructions -class ARMHazardRecognizerFPMLx : public ScheduleHazardRecognizer { +// Hazards related to FP MLx instructions +class ARMHazardRecognizerFPMLx : public ScheduleHazardRecognizer { MachineInstr *LastMI = nullptr; unsigned FpMLxStalls = 0; public: - ARMHazardRecognizerFPMLx() : ScheduleHazardRecognizer() { MaxLookAhead = 1; } + ARMHazardRecognizerFPMLx() : ScheduleHazardRecognizer() { MaxLookAhead = 1; } HazardType getHazardType(SUnit *SU, int Stalls) override; void Reset() override; @@ -43,27 +43,27 @@ public: void RecedeCycle() override; }; -// Hazards related to bank conflicts -class ARMBankConflictHazardRecognizer : public ScheduleHazardRecognizer { - SmallVector<MachineInstr *, 8> Accesses; - const MachineFunction &MF; - const DataLayout &DL; - int64_t DataMask; - bool AssumeITCMBankConflict; - -public: - ARMBankConflictHazardRecognizer(const ScheduleDAG *DAG, int64_t DDM, - bool ABC); - HazardType getHazardType(SUnit *SU, int Stalls) override; - void Reset() override; - void EmitInstruction(SUnit *SU) override; - void AdvanceCycle() override; - void RecedeCycle() override; - -private: - inline HazardType CheckOffsets(unsigned O0, unsigned O1); -}; - +// Hazards related to bank conflicts +class ARMBankConflictHazardRecognizer : public ScheduleHazardRecognizer { + SmallVector<MachineInstr *, 8> Accesses; + const MachineFunction &MF; + const DataLayout &DL; + int64_t DataMask; + bool AssumeITCMBankConflict; + +public: + ARMBankConflictHazardRecognizer(const ScheduleDAG *DAG, int64_t DDM, + bool ABC); + HazardType getHazardType(SUnit 
*SU, int Stalls) override; + void Reset() override; + void EmitInstruction(SUnit *SU) override; + void AdvanceCycle() override; + void RecedeCycle() override; + +private: + inline HazardType CheckOffsets(unsigned O0, unsigned O1); +}; + } // end namespace llvm #endif diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.cpp index 598062672a..2daf77fb5e 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.cpp @@ -143,7 +143,7 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal( cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); -cl::opt<unsigned> +cl::opt<unsigned> MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2)); @@ -289,8 +289,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); @@ -337,8 +337,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; @@ -443,9 +443,9 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); } } @@ -994,8 +994,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMAX); setTargetDAGCombine(ISD::FP_EXTEND); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); } if (!Subtarget->hasFP64()) { @@ -1725,11 +1725,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VCVTL: return "ARMISD::VCVTL"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; - case ARMISD::VQDMULH: return "ARMISD::VQDMULH"; + case ARMISD::VQDMULH: return "ARMISD::VQDMULH"; case ARMISD::VADDVs: return "ARMISD::VADDVs"; case ARMISD::VADDVu: return "ARMISD::VADDVu"; - case ARMISD::VADDVps: return "ARMISD::VADDVps"; - case ARMISD::VADDVpu: return "ARMISD::VADDVpu"; + case ARMISD::VADDVps: return "ARMISD::VADDVps"; + case ARMISD::VADDVpu: return "ARMISD::VADDVpu"; case ARMISD::VADDLVs: return "ARMISD::VADDLVs"; case ARMISD::VADDLVu: return "ARMISD::VADDLVu"; case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs"; @@ -1740,20 
+1740,20 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu"; case ARMISD::VMLAVs: return "ARMISD::VMLAVs"; case ARMISD::VMLAVu: return "ARMISD::VMLAVu"; - case ARMISD::VMLAVps: return "ARMISD::VMLAVps"; - case ARMISD::VMLAVpu: return "ARMISD::VMLAVpu"; + case ARMISD::VMLAVps: return "ARMISD::VMLAVps"; + case ARMISD::VMLAVpu: return "ARMISD::VMLAVpu"; case ARMISD::VMLALVs: return "ARMISD::VMLALVs"; case ARMISD::VMLALVu: return "ARMISD::VMLALVu"; - case ARMISD::VMLALVps: return "ARMISD::VMLALVps"; - case ARMISD::VMLALVpu: return "ARMISD::VMLALVpu"; + case ARMISD::VMLALVps: return "ARMISD::VMLALVps"; + case ARMISD::VMLALVpu: return "ARMISD::VMLALVpu"; case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs"; case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu"; - case ARMISD::VMLALVAps: return "ARMISD::VMLALVAps"; - case ARMISD::VMLALVApu: return "ARMISD::VMLALVApu"; - case ARMISD::VMINVu: return "ARMISD::VMINVu"; - case ARMISD::VMINVs: return "ARMISD::VMINVs"; - case ARMISD::VMAXVu: return "ARMISD::VMAXVu"; - case ARMISD::VMAXVs: return "ARMISD::VMAXVs"; + case ARMISD::VMLALVAps: return "ARMISD::VMLALVAps"; + case ARMISD::VMLALVApu: return "ARMISD::VMLALVApu"; + case ARMISD::VMINVu: return "ARMISD::VMINVu"; + case ARMISD::VMINVs: return "ARMISD::VMINVs"; + case ARMISD::VMAXVu: return "ARMISD::VMAXVu"; + case ARMISD::VMAXVs: return "ARMISD::VMAXVs"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; @@ -1777,7 +1777,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; - case ARMISD::VBSP: return "ARMISD::VBSP"; + case ARMISD::VBSP: return "ARMISD::VBSP"; case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; @@ -2531,9 +2531,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); @@ -3342,7 +3342,7 @@ ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i32, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4), MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); @@ -3556,7 +3556,7 @@ static bool allUsersAreInFunction(const Value *V, const Function *F) { while (!Worklist.empty()) { auto *U = Worklist.pop_back_val(); if (isa<ConstantExpr>(U)) { - append_range(Worklist, U->users()); + append_range(Worklist, U->users()); continue; } @@ -4443,26 +4443,26 @@ SDValue ARMTargetLowering::LowerFormalArguments( } // varargs - if (isVarArg && MFI.hasVAStart()) { - VarArgStyleRegisters(CCInfo, DAG, dl, 
Chain, CCInfo.getNextStackOffset(), + if (isVarArg && MFI.hasVAStart()) { + VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), TotalArgRegsSaveSize); - if (AFI->isCmseNSEntryFunction()) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function must not be variadic", dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - } + if (AFI->isCmseNSEntryFunction()) { + DiagnosticInfoUnsupported Diag( + DAG.getMachineFunction().getFunction(), + "secure entry function must not be variadic", dl.getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + } AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); - if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { - DiagnosticInfoUnsupported Diag( - DAG.getMachineFunction().getFunction(), - "secure entry function requires arguments on stack", dl.getDebugLoc()); - DAG.getContext()->diagnose(Diag); - } - + if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { + DiagnosticInfoUnsupported Diag( + DAG.getMachineFunction().getFunction(), + "secure entry function requires arguments on stack", dl.getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + return Chain; } @@ -5034,68 +5034,68 @@ static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, // x < k ? (x < -k ? -k : x) : k // etc. // -// LLVM canonicalizes these to either a min(max()) or a max(min()) -// pattern. This function tries to match one of these and will return a SSAT -// node if successful. +// LLVM canonicalizes these to either a min(max()) or a max(min()) +// pattern. This function tries to match one of these and will return a SSAT +// node if successful. // -// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 -// is a power of 2. -static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - SDValue V1 = Op.getOperand(0); - SDValue K1 = Op.getOperand(1); +// USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 +// is a power of 2. +static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + SDValue V1 = Op.getOperand(0); + SDValue K1 = Op.getOperand(1); SDValue TrueVal1 = Op.getOperand(2); SDValue FalseVal1 = Op.getOperand(3); ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get(); const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? 
FalseVal1 : TrueVal1; if (Op2.getOpcode() != ISD::SELECT_CC) - return SDValue(); + return SDValue(); - SDValue V2 = Op2.getOperand(0); - SDValue K2 = Op2.getOperand(1); + SDValue V2 = Op2.getOperand(0); + SDValue K2 = Op2.getOperand(1); SDValue TrueVal2 = Op2.getOperand(2); SDValue FalseVal2 = Op2.getOperand(3); ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get(); - SDValue V1Tmp = V1; - SDValue V2Tmp = V2; + SDValue V1Tmp = V1; + SDValue V2Tmp = V2; - // Check that the registers and the constants match a max(min()) or min(max()) - // pattern - if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || - K2 != FalseVal2 || - !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) - return SDValue(); + // Check that the registers and the constants match a max(min()) or min(max()) + // pattern + if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 || + K2 != FalseVal2 || + !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2)))) + return SDValue(); // Check that the constant in the lower-bound check is // the opposite of the constant in the upper-bound check // in 1's complement. - if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) - return SDValue(); - - int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); - int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); + if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2)) + return SDValue(); + + int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue(); + int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue(); int64_t PosVal = std::max(Val1, Val2); int64_t NegVal = std::min(Val1, Val2); - if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || - !isPowerOf2_64(PosVal + 1)) - return SDValue(); + if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) || + !isPowerOf2_64(PosVal + 1)) + return SDValue(); - // Handle the difference between USAT (unsigned) and SSAT (signed) - // saturation - // At this point, PosVal is guaranteed to be positive - uint64_t K = PosVal; - SDLoc dl(Op); - if (Val1 == ~Val2) - return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, - DAG.getConstant(countTrailingOnes(K), dl, VT)); - if (NegVal == 0) - return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, - DAG.getConstant(countTrailingOnes(K), dl, VT)); + // Handle the difference between USAT (unsigned) and SSAT (signed) + // saturation + // At this point, PosVal is guaranteed to be positive + uint64_t K = PosVal; + SDLoc dl(Op); + if (Val1 == ~Val2) + return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp, + DAG.getConstant(countTrailingOnes(K), dl, VT)); + if (NegVal == 0) + return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp, + DAG.getConstant(countTrailingOnes(K), dl, VT)); - return SDValue(); + return SDValue(); } // Check if a condition of the type x < k ? k : x can be converted into a @@ -5155,9 +5155,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // Try to convert two saturating conditional selects into a single SSAT - if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) - if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) - return SatValue; + if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) + if (SDValue SatValue = LowerSaturatingConditional(Op, DAG)) + return SatValue; // Try to convert expressions of the form x < k ? 
k : x (and similar forms) // into more efficient bit operations, which is possible when k is 0 or -1 @@ -5166,7 +5166,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // instructions. // Only allow this transformation on full-width (32-bit) operations SDValue LowerSatConstant; - SDValue SatValue; + SDValue SatValue; if (VT == MVT::i32 && isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, @@ -7750,19 +7750,19 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); - uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); - uint64_t VTSize = VT.getFixedSizeInBits(); - if (SrcVTSize == VTSize) + uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); + uint64_t VTSize = VT.getFixedSizeInBits(); + if (SrcVTSize == VTSize) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); - unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); + unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - if (SrcVTSize < VTSize) { - if (2 * SrcVTSize != VTSize) + if (SrcVTSize < VTSize) { + if (2 * SrcVTSize != VTSize) return SDValue(); // We can pad out the smaller vector for free, so if it's part of a // shuffle... @@ -7772,7 +7772,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, continue; } - if (SrcVTSize != 2 * VTSize) + if (SrcVTSize != 2 * VTSize) return SDValue(); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { @@ -7840,7 +7840,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); - int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), + int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; @@ -8642,23 +8642,23 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } -// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). -static SDValue LowerTruncatei1(SDValue N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - assert(ST->hasMVEIntegerOps() && "Expected MVE!"); - EVT VT = N.getValueType(); - assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) && - "Expected a vector i1 type!"); - SDValue Op = N.getOperand(0); - EVT FromVT = Op.getValueType(); - SDLoc DL(N); - - SDValue And = - DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT)); - return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT), - DAG.getCondCode(ISD::SETNE)); -} - +// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). 
+static SDValue LowerTruncatei1(SDValue N, SelectionDAG &DAG, + const ARMSubtarget *ST) { + assert(ST->hasMVEIntegerOps() && "Expected MVE!"); + EVT VT = N.getValueType(); + assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) && + "Expected a vector i1 type!"); + SDValue Op = N.getOperand(0); + EVT FromVT = Op.getValueType(); + SDLoc DL(N); + + SDValue And = + DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT)); + return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT), + DAG.getCondCode(ISD::SETNE)); +} + /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. @@ -8723,11 +8723,11 @@ static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return false; } -/// isZeroExtended - Check if a node is a vector value that is zero-extended (or -/// any-extended) or a constant BUILD_VECTOR with zero-extended elements. +/// isZeroExtended - Check if a node is a vector value that is zero-extended (or +/// any-extended) or a constant BUILD_VECTOR with zero-extended elements. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || - ISD::isZEXTLoad(N)) + if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || + ISD::isZEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; @@ -8795,14 +8795,14 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, -/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return -/// the unextended value. The unextended vector should be 64 bits so that it can +/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return +/// the unextended value. The unextended vector should be 64 bits so that it can /// be used as an operand to a VMULL instruction. If the original vector size /// before extension is less than 64 bits we add a an extension to resize /// the vector to 64 bits. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) + if (N->getOpcode() == ISD::SIGN_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), @@ -9770,7 +9770,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); - case ISD::TRUNCATE: return LowerTruncatei1(Op, DAG, Subtarget); + case ISD::TRUNCATE: return LowerTruncatei1(Op, DAG, Subtarget); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: @@ -10403,7 +10403,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. 
- SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); + SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); if (SMBB->isEHPad()) { @@ -10887,7 +10887,7 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); - BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) + BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) .add(predOps(ARMCC::AL)) .addReg(Reg, RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Kill) @@ -11266,14 +11266,14 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return EmitLowered__chkstk(MI, BB); case ARM::WIN__DBZCHK: return EmitLowered__dbzchk(MI, BB); - case ARM::t2DoLoopStart: - // We are just here to set a register allocation hint, prefering lr for the - // input register to make it more likely to be movable and removable, later - // in the pipeline. - Register R = MI.getOperand(1).getReg(); - MachineFunction *MF = MI.getParent()->getParent(); - MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0); - return BB; + case ARM::t2DoLoopStart: + // We are just here to set a register allocation hint, prefering lr for the + // input register to make it more likely to be movable and removable, later + // in the pipeline. + Register R = MI.getOperand(1).getReg(); + MachineFunction *MF = MI.getParent()->getParent(); + MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0); + return BB; } } @@ -12115,198 +12115,198 @@ static SDValue PerformAddeSubeCombine(SDNode *N, return SDValue(); } -static SDValue PerformSELECTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - SDLoc dl(N); - SDValue SetCC; - SDValue LHS; - SDValue RHS; - ISD::CondCode CC; - SDValue TrueVal; - SDValue FalseVal; - - if (N->getOpcode() == ISD::SELECT && - N->getOperand(0)->getOpcode() == ISD::SETCC) { - SetCC = N->getOperand(0); - LHS = SetCC->getOperand(0); - RHS = SetCC->getOperand(1); - CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); - TrueVal = N->getOperand(1); - FalseVal = N->getOperand(2); - } else if (N->getOpcode() == ISD::SELECT_CC) { - LHS = N->getOperand(0); - RHS = N->getOperand(1); - CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - TrueVal = N->getOperand(2); - FalseVal = N->getOperand(3); - } else { - return SDValue(); - } - - unsigned int Opcode = 0; - if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || - FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && - (CC == ISD::SETULT || CC == ISD::SETUGT)) { - Opcode = ARMISD::VMINVu; - if (CC == ISD::SETUGT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || - FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) && - (CC == ISD::SETLT || CC == ISD::SETGT)) { - Opcode = ARMISD::VMINVs; - if (CC == ISD::SETGT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || - FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && - (CC == ISD::SETUGT || CC == ISD::SETULT)) { - Opcode = ARMISD::VMAXVu; - if (CC == ISD::SETULT) - std::swap(TrueVal, FalseVal); - } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || - FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && - (CC == ISD::SETGT || CC == ISD::SETLT)) { - Opcode = ARMISD::VMAXVs; - if (CC == ISD::SETLT) - std::swap(TrueVal, FalseVal); - } else - return SDValue(); - - 
// Normalise to the right hand side being the vector reduction - switch (TrueVal->getOpcode()) { - case ISD::VECREDUCE_UMIN: - case ISD::VECREDUCE_SMIN: - case ISD::VECREDUCE_UMAX: - case ISD::VECREDUCE_SMAX: - std::swap(LHS, RHS); - std::swap(TrueVal, FalseVal); - break; - } - - EVT VectorType = FalseVal->getOperand(0).getValueType(); - - if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 && - VectorType != MVT::v4i32) - return SDValue(); - - EVT VectorScalarType = VectorType.getVectorElementType(); - - // The values being selected must also be the ones being compared - if (TrueVal != LHS || FalseVal != RHS) - return SDValue(); - - EVT LeftType = LHS->getValueType(0); - EVT RightType = RHS->getValueType(0); - - // The types must match the reduced type too - if (LeftType != VectorScalarType || RightType != VectorScalarType) - return SDValue(); - - // Legalise the scalar to an i32 - if (VectorScalarType != MVT::i32) - LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); - - // Generate the reduction as an i32 for legalisation purposes - auto Reduction = - DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0)); - - // The result isn't actually an i32 so truncate it back to its original type - if (VectorScalarType != MVT::i32) - Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction); - - return Reduction; -} - -// A special combine for the vqdmulh family of instructions. This is one of the -// potential set of patterns that could patch this instruction. The base pattern -// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))). -// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))), -// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as -// the max is unnecessary. -static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDValue Shft; - ConstantSDNode *Clamp; - - if (N->getOpcode() == ISD::SMIN) { - Shft = N->getOperand(0); - Clamp = isConstOrConstSplat(N->getOperand(1)); - } else if (N->getOpcode() == ISD::VSELECT) { - // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin. 
- SDValue Cmp = N->getOperand(0); - if (Cmp.getOpcode() != ISD::SETCC || - cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || - Cmp.getOperand(0) != N->getOperand(1) || - Cmp.getOperand(1) != N->getOperand(2)) - return SDValue(); - Shft = N->getOperand(1); - Clamp = isConstOrConstSplat(N->getOperand(2)); - } else - return SDValue(); - - if (!Clamp) - return SDValue(); - - MVT ScalarType; - int ShftAmt = 0; - switch (Clamp->getSExtValue()) { - case (1 << 7) - 1: - ScalarType = MVT::i8; - ShftAmt = 7; - break; - case (1 << 15) - 1: - ScalarType = MVT::i16; - ShftAmt = 15; - break; - case (1ULL << 31) - 1: - ScalarType = MVT::i32; - ShftAmt = 31; - break; - default: - return SDValue(); - } - - if (Shft.getOpcode() != ISD::SRA) - return SDValue(); - ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); - if (!N1 || N1->getSExtValue() != ShftAmt) - return SDValue(); - - SDValue Mul = Shft.getOperand(0); - if (Mul.getOpcode() != ISD::MUL) - return SDValue(); - - SDValue Ext0 = Mul.getOperand(0); - SDValue Ext1 = Mul.getOperand(1); - if (Ext0.getOpcode() != ISD::SIGN_EXTEND || - Ext1.getOpcode() != ISD::SIGN_EXTEND) - return SDValue(); - EVT VecVT = Ext0.getOperand(0).getValueType(); - if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8) - return SDValue(); - if (Ext1.getOperand(0).getValueType() != VecVT || - VecVT.getScalarType() != ScalarType || - VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) - return SDValue(); - - SDLoc DL(Mul); - SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0), - Ext1.getOperand(0)); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH); -} - +static SDValue PerformSELECTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEIntegerOps()) + return SDValue(); + + SDLoc dl(N); + SDValue SetCC; + SDValue LHS; + SDValue RHS; + ISD::CondCode CC; + SDValue TrueVal; + SDValue FalseVal; + + if (N->getOpcode() == ISD::SELECT && + N->getOperand(0)->getOpcode() == ISD::SETCC) { + SetCC = N->getOperand(0); + LHS = SetCC->getOperand(0); + RHS = SetCC->getOperand(1); + CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + TrueVal = N->getOperand(1); + FalseVal = N->getOperand(2); + } else if (N->getOpcode() == ISD::SELECT_CC) { + LHS = N->getOperand(0); + RHS = N->getOperand(1); + CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + TrueVal = N->getOperand(2); + FalseVal = N->getOperand(3); + } else { + return SDValue(); + } + + unsigned int Opcode = 0; + if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || + FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && + (CC == ISD::SETULT || CC == ISD::SETUGT)) { + Opcode = ARMISD::VMINVu; + if (CC == ISD::SETUGT) + std::swap(TrueVal, FalseVal); + } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || + FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) && + (CC == ISD::SETLT || CC == ISD::SETGT)) { + Opcode = ARMISD::VMINVs; + if (CC == ISD::SETGT) + std::swap(TrueVal, FalseVal); + } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || + FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && + (CC == ISD::SETUGT || CC == ISD::SETULT)) { + Opcode = ARMISD::VMAXVu; + if (CC == ISD::SETULT) + std::swap(TrueVal, FalseVal); + } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || + FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && + (CC == ISD::SETGT || CC == ISD::SETLT)) { + Opcode = ARMISD::VMAXVs; + if (CC == ISD::SETLT) + std::swap(TrueVal, FalseVal); + } else + return SDValue(); + + // 
Normalise to the right hand side being the vector reduction + switch (TrueVal->getOpcode()) { + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_SMAX: + std::swap(LHS, RHS); + std::swap(TrueVal, FalseVal); + break; + } + + EVT VectorType = FalseVal->getOperand(0).getValueType(); + + if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 && + VectorType != MVT::v4i32) + return SDValue(); + + EVT VectorScalarType = VectorType.getVectorElementType(); + + // The values being selected must also be the ones being compared + if (TrueVal != LHS || FalseVal != RHS) + return SDValue(); + + EVT LeftType = LHS->getValueType(0); + EVT RightType = RHS->getValueType(0); + + // The types must match the reduced type too + if (LeftType != VectorScalarType || RightType != VectorScalarType) + return SDValue(); + + // Legalise the scalar to an i32 + if (VectorScalarType != MVT::i32) + LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + + // Generate the reduction as an i32 for legalisation purposes + auto Reduction = + DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0)); + + // The result isn't actually an i32 so truncate it back to its original type + if (VectorScalarType != MVT::i32) + Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction); + + return Reduction; +} + +// A special combine for the vqdmulh family of instructions. This is one of the +// potential set of patterns that could patch this instruction. The base pattern +// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))). +// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))), +// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as +// the max is unnecessary. +static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue Shft; + ConstantSDNode *Clamp; + + if (N->getOpcode() == ISD::SMIN) { + Shft = N->getOperand(0); + Clamp = isConstOrConstSplat(N->getOperand(1)); + } else if (N->getOpcode() == ISD::VSELECT) { + // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin. 
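The comment above sketches the pattern PerformVQDMULHCombine matches. For 16-bit lanes, smin(ashr(mul(sext(a), sext(b)), 15), 32767) computes exactly a saturating doubling multiply returning the high half, and only the upper clamp can ever fire. A standalone C++ sketch of that equivalence, assuming arithmetic right shifts and the usual sat16((2*a*b) >> 16) definition (the helper name qdmulh_s16 is hypothetical, not from the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar analogue of one vqdmulh.s16 lane under the assumptions above.
static int16_t qdmulh_s16(int16_t a, int16_t b) {
  int64_t prod2 = 2 * int64_t(a) * int64_t(b);       // doubling multiply
  int32_t hi = int32_t(prod2 >> 16);                 // return the high half
  return int16_t(std::min(hi, int32_t(INT16_MAX)));  // only the upper clamp can fire
}

int main() {
  // The DAG shape being matched: smin(ashr(mul(sext(a), sext(b)), 15), 32767).
  auto pattern = [](int16_t a, int16_t b) {
    int32_t shifted = (int32_t(a) * int32_t(b)) >> 15;  // arithmetic shift assumed
    return int16_t(std::min(shifted, int32_t(INT16_MAX)));
  };
  for (int a = -32768; a <= 32767; a += 257)
    for (int b = -32768; b <= 32767; b += 263)
      assert(pattern(int16_t(a), int16_t(b)) == qdmulh_s16(int16_t(a), int16_t(b)));
  // The lower clamp is redundant: the only out-of-range product, (-32768)^2,
  // overflows upwards, which is why the combine only needs to see the smin.
  assert(qdmulh_s16(INT16_MIN, INT16_MIN) == INT16_MAX);
  return 0;
}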
+ SDValue Cmp = N->getOperand(0); + if (Cmp.getOpcode() != ISD::SETCC || + cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || + Cmp.getOperand(0) != N->getOperand(1) || + Cmp.getOperand(1) != N->getOperand(2)) + return SDValue(); + Shft = N->getOperand(1); + Clamp = isConstOrConstSplat(N->getOperand(2)); + } else + return SDValue(); + + if (!Clamp) + return SDValue(); + + MVT ScalarType; + int ShftAmt = 0; + switch (Clamp->getSExtValue()) { + case (1 << 7) - 1: + ScalarType = MVT::i8; + ShftAmt = 7; + break; + case (1 << 15) - 1: + ScalarType = MVT::i16; + ShftAmt = 15; + break; + case (1ULL << 31) - 1: + ScalarType = MVT::i32; + ShftAmt = 31; + break; + default: + return SDValue(); + } + + if (Shft.getOpcode() != ISD::SRA) + return SDValue(); + ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); + if (!N1 || N1->getSExtValue() != ShftAmt) + return SDValue(); + + SDValue Mul = Shft.getOperand(0); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + + SDValue Ext0 = Mul.getOperand(0); + SDValue Ext1 = Mul.getOperand(1); + if (Ext0.getOpcode() != ISD::SIGN_EXTEND || + Ext1.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + EVT VecVT = Ext0.getOperand(0).getValueType(); + if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8) + return SDValue(); + if (Ext1.getOperand(0).getValueType() != VecVT || + VecVT.getScalarType() != ScalarType || + VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) + return SDValue(); + + SDLoc DL(Mul); + SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0), + Ext1.getOperand(0)); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH); +} + static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - if (!Subtarget->hasMVEIntegerOps()) - return SDValue(); - - if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) - return V; - + if (!Subtarget->hasMVEIntegerOps()) + return SDValue(); + + if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) + return V; + // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). // // We need to re-implement this optimization here as the implementation in the @@ -12456,14 +12456,14 @@ static SDValue PerformADDVecReduce(SDNode *N, return M; if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) - return M; - if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) - return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) + return M; return SDValue(); } @@ -13358,7 +13358,7 @@ static SDValue PerformORCombine(SDNode *N, // Canonicalize the vector type to make instruction selection // simpler. EVT CanonicalVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; - SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, + SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), N1->getOperand(0)); @@ -13669,12 +13669,12 @@ static SDValue PerformVMOVrhCombine(SDNode *N, SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - // fold (VMOVrh (fpconst x)) -> const x - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { - APFloat V = C->getValueAPF(); - return DCI.DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); - } - + // fold (VMOVrh (fpconst x)) -> const x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { + APFloat V = C->getValueAPF(); + return DCI.DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); + } + // fold (VMOVrh (load x)) -> (zextload (i16*)x) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); @@ -13849,23 +13849,23 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); } - // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce - // more VPNOT which might get folded as else predicates. - if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { - SDValue X = - DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); - SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, - DCI.DAG.getConstant(65535, dl, MVT::i32)); - return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); - } - - // Only the bottom 16 bits of the source register are used. - if (Op.getValueType() == MVT::i32) { - APInt DemandedMask = APInt::getLowBitsSet(32, 16); - const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) - return SDValue(N, 0); - } + // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce + // more VPNOT which might get folded as else predicates. + if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { + SDValue X = + DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); + SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, + DCI.DAG.getConstant(65535, dl, MVT::i32)); + return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); + } + + // Only the bottom 16 bits of the source register are used. + if (Op.getValueType() == MVT::i32) { + APInt DemandedMask = APInt::getLowBitsSet(32, 16); + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) + return SDValue(N, 0); + } return SDValue(); } @@ -14078,13 +14078,13 @@ static SDValue CombineBaseUpdate(SDNode *N, NumVecs = 3; break; case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; NumVecs = 4; break; - case Intrinsic::arm_neon_vld1x2: - case Intrinsic::arm_neon_vld1x3: - case Intrinsic::arm_neon_vld1x4: + case Intrinsic::arm_neon_vld1x2: + case Intrinsic::arm_neon_vld1x3: + case Intrinsic::arm_neon_vld1x4: case Intrinsic::arm_neon_vld2dup: case Intrinsic::arm_neon_vld3dup: case Intrinsic::arm_neon_vld4dup: - // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip + // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip // combining base updates for such intrinsics. continue; case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; @@ -14676,39 +14676,39 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, // use the VMOVN over splitting the store. 
We are looking for patterns of: // !rev: 0 N 1 N+1 2 N+2 ... // rev: N 0 N+1 1 N+2 2 ... - // The shuffle may either be a single source (in which case N = NumElts/2) or - // two inputs extended with concat to the same size (in which case N = - // NumElts). - auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { - ArrayRef<int> M = SVN->getMask(); + // The shuffle may either be a single source (in which case N = NumElts/2) or + // two inputs extended with concat to the same size (in which case N = + // NumElts). + auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { + ArrayRef<int> M = SVN->getMask(); unsigned NumElts = ToVT.getVectorNumElements(); - if (SVN->getOperand(1).isUndef()) - NumElts /= 2; + if (SVN->getOperand(1).isUndef()) + NumElts /= 2; - unsigned Off0 = Rev ? NumElts : 0; - unsigned Off1 = Rev ? 0 : NumElts; + unsigned Off0 = Rev ? NumElts : 0; + unsigned Off1 = Rev ? 0 : NumElts; - for (unsigned I = 0; I < NumElts; I += 2) { - if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) + for (unsigned I = 0; I < NumElts; I += 2) { + if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) return false; - if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) + if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) return false; } return true; }; - // It may be preferable to keep the store unsplit as the trunc may end up - // being removed. Check that here. - if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) { - if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) { - DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U); - return SDValue(); - } - } - if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) - if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) + // It may be preferable to keep the store unsplit as the trunc may end up + // being removed. Check that here. + if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) { + if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) { + DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U); return SDValue(); + } + } + if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) + if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) + return SDValue(); LLVMContext &C = *DAG.getContext(); SDLoc DL(St); @@ -14728,8 +14728,8 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SmallVector<SDValue, 4> Stores; for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); + SDValue NewPtr = + DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), @@ -14782,15 +14782,15 @@ static SDValue PerformSTORECombine(SDNode *N, SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore( St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), - BasePtr, St->getPointerInfo(), St->getOriginalAlign(), + BasePtr, St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 
0 : 1), - OffsetPtr, St->getPointerInfo().getWithOffset(4), - St->getOriginalAlign(), + OffsetPtr, St->getPointerInfo().getWithOffset(4), + St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -14964,107 +14964,107 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, // VADDLV u/s 32 // VMLALV u/s 16/32 - // If the input vector is smaller than legal (v4i8/v4i16 for example) we can - // extend it and use v4i32 instead. - auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { - EVT AVT = A.getValueType(); - if (!AVT.is128BitVector()) - A = DAG.getNode(ExtendCode, dl, - AVT.changeVectorElementType(MVT::getIntegerVT( - 128 / AVT.getVectorMinNumElements())), - A); - return A; - }; + // If the input vector is smaller than legal (v4i8/v4i16 for example) we can + // extend it and use v4i32 instead. + auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { + EVT AVT = A.getValueType(); + if (!AVT.is128BitVector()) + A = DAG.getNode(ExtendCode, dl, + AVT.changeVectorElementType(MVT::getIntegerVT( + 128 / AVT.getVectorMinNumElements())), + A); + return A; + }; auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { if (ResVT != RetTy || N0->getOpcode() != ExtendCode) return SDValue(); SDValue A = N0->getOperand(0); if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) - return ExtendIfNeeded(A, ExtendCode); - return SDValue(); - }; - auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, - ArrayRef<MVT> ExtTypes, SDValue &Mask) { - if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || - !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) - return SDValue(); - Mask = N0->getOperand(0); - SDValue Ext = N0->getOperand(1); - if (Ext->getOpcode() != ExtendCode) - return SDValue(); - SDValue A = Ext->getOperand(0); - if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) - return ExtendIfNeeded(A, ExtendCode); + return ExtendIfNeeded(A, ExtendCode); return SDValue(); }; + auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, + ArrayRef<MVT> ExtTypes, SDValue &Mask) { + if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || + !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) + return SDValue(); + Mask = N0->getOperand(0); + SDValue Ext = N0->getOperand(1); + if (Ext->getOpcode() != ExtendCode) + return SDValue(); + SDValue A = Ext->getOperand(0); + if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) + return ExtendIfNeeded(A, ExtendCode); + return SDValue(); + }; auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, SDValue &A, SDValue &B) { - // For a vmla we are trying to match a larger pattern: - // ExtA = sext/zext A - // ExtB = sext/zext B - // Mul = mul ExtA, ExtB - // vecreduce.add Mul - // There might also be en extra extend between the mul and the addreduce, so - // long as the bitwidth is high enough to make them equivalent (for example - // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). - if (ResVT != RetTy) + // For a vmla we are trying to match a larger pattern: + // ExtA = sext/zext A + // ExtB = sext/zext B + // Mul = mul ExtA, ExtB + // vecreduce.add Mul + // There might also be en extra extend between the mul and the addreduce, so + // long as the bitwidth is high enough to make them equivalent (for example + // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). 
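A minimal scalar model of the vecreduce.add(mul(sext(A), sext(B))) shape that IsVMLAV matches, including the extra intermediate extend the comment above allows (standalone C++ sketch; the helper name vmlalv_s16 is hypothetical, not from the patch):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Widening dot product: the value a single VMLALVs computes for i16 inputs.
static int64_t vmlalv_s16(const std::vector<int16_t> &a,
                          const std::vector<int16_t> &b) {
  int64_t acc = 0;
  for (std::size_t i = 0; i < a.size(); ++i)
    acc += int64_t(a[i]) * int64_t(b[i]);  // sext both inputs, multiply, accumulate
  return acc;
}

int main() {
  std::vector<int16_t> a = {100, -200, 300, -400, 500, -600, 700, -800};
  std::vector<int16_t> b = {-1, 2, -3, 4, -5, 6, -7, 8};

  // The same sum with an extra intermediate extend (mul at i32, reduce at i64):
  // as long as the intermediate width holds the products exactly, the result
  // is unchanged, which is the equivalence the bitwidth check above verifies.
  int64_t viaI32 = 0;
  for (std::size_t i = 0; i < a.size(); ++i)
    viaI32 += int64_t(int32_t(a[i]) * int32_t(b[i]));

  assert(viaI32 == vmlalv_s16(a, b));
  return 0;
}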
+ if (ResVT != RetTy) return false; - SDValue Mul = N0; - if (Mul->getOpcode() == ExtendCode && - Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= - ResVT.getScalarSizeInBits()) - Mul = Mul->getOperand(0); - if (Mul->getOpcode() != ISD::MUL) - return false; - SDValue ExtA = Mul->getOperand(0); - SDValue ExtB = Mul->getOperand(1); + SDValue Mul = N0; + if (Mul->getOpcode() == ExtendCode && + Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= + ResVT.getScalarSizeInBits()) + Mul = Mul->getOperand(0); + if (Mul->getOpcode() != ISD::MUL) + return false; + SDValue ExtA = Mul->getOperand(0); + SDValue ExtB = Mul->getOperand(1); if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) return false; A = ExtA->getOperand(0); B = ExtB->getOperand(0); if (A.getValueType() == B.getValueType() && - llvm::any_of(ExtTypes, - [&A](MVT Ty) { return A.getValueType() == Ty; })) { - A = ExtendIfNeeded(A, ExtendCode); - B = ExtendIfNeeded(B, ExtendCode); + llvm::any_of(ExtTypes, + [&A](MVT Ty) { return A.getValueType() == Ty; })) { + A = ExtendIfNeeded(A, ExtendCode); + B = ExtendIfNeeded(B, ExtendCode); return true; - } - return false; - }; - auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, - SDValue &A, SDValue &B, SDValue &Mask) { - // Same as the pattern above with a select for the zero predicated lanes - // ExtA = sext/zext A - // ExtB = sext/zext B - // Mul = mul ExtA, ExtB - // N0 = select Mask, Mul, 0 - // vecreduce.add N0 - if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || - !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) - return false; - Mask = N0->getOperand(0); - SDValue Mul = N0->getOperand(1); - if (Mul->getOpcode() == ExtendCode && - Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= - ResVT.getScalarSizeInBits()) - Mul = Mul->getOperand(0); - if (Mul->getOpcode() != ISD::MUL) - return false; - SDValue ExtA = Mul->getOperand(0); - SDValue ExtB = Mul->getOperand(1); - if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) - return false; - A = ExtA->getOperand(0); - B = ExtB->getOperand(0); - if (A.getValueType() == B.getValueType() && - llvm::any_of(ExtTypes, - [&A](MVT Ty) { return A.getValueType() == Ty; })) { - A = ExtendIfNeeded(A, ExtendCode); - B = ExtendIfNeeded(B, ExtendCode); - return true; - } + } return false; }; + auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, + SDValue &A, SDValue &B, SDValue &Mask) { + // Same as the pattern above with a select for the zero predicated lanes + // ExtA = sext/zext A + // ExtB = sext/zext B + // Mul = mul ExtA, ExtB + // N0 = select Mask, Mul, 0 + // vecreduce.add N0 + if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || + !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) + return false; + Mask = N0->getOperand(0); + SDValue Mul = N0->getOperand(1); + if (Mul->getOpcode() == ExtendCode && + Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= + ResVT.getScalarSizeInBits()) + Mul = Mul->getOperand(0); + if (Mul->getOpcode() != ISD::MUL) + return false; + SDValue ExtA = Mul->getOperand(0); + SDValue ExtB = Mul->getOperand(1); + if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) + return false; + A = ExtA->getOperand(0); + B = ExtB->getOperand(0); + if (A.getValueType() == B.getValueType() && + llvm::any_of(ExtTypes, + [&A](MVT Ty) { return A.getValueType() == Ty; })) { + A = ExtendIfNeeded(A, ExtendCode); + B = ExtendIfNeeded(B, ExtendCode); + return true; + } + return false; + }; auto Create64bitNode = 
[&](unsigned Opcode, ArrayRef<SDValue> Ops) { SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, @@ -15075,93 +15075,93 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); - if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, - {MVT::v4i8, MVT::v4i16, MVT::v4i32})) + if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, + {MVT::v4i8, MVT::v4i16, MVT::v4i32})) return Create64bitNode(ARMISD::VADDLVs, {A}); - if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, - {MVT::v4i8, MVT::v4i16, MVT::v4i32})) + if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, + {MVT::v4i8, MVT::v4i16, MVT::v4i32})) return Create64bitNode(ARMISD::VADDLVu, {A}); - if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); - if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); - - SDValue Mask; - if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) - return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); - if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) - return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); - if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, - {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask)) - return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); - if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, - {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask)) - return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); - if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); - if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); - + if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); + if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); + + SDValue Mask; + if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) + return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); + if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) + return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); + if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, + {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask)) + return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); + if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, + {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask)) + return Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); + if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); + if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VADDVpu, 
dl, MVT::i32, A, Mask)); + SDValue A, B; if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); - if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, - {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B)) + if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, + {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B)) return Create64bitNode(ARMISD::VMLALVs, {A, B}); - if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, - {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B)) + if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, + {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B)) return Create64bitNode(ARMISD::VMLALVu, {A, B}); - if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); - if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); - - if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); - if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); - if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, - {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, - B, Mask)) - return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); - if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, - {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, - B, Mask)) - return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); - if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); - if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) - return DAG.getNode(ISD::TRUNCATE, dl, ResVT, - DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); - - // Some complications. We can get a case where the two inputs of the mul are - // the same, then the output sext will have been helpfully converted to a - // zext. Turn it back. 
- SDValue Op = N0; - if (Op->getOpcode() == ISD::VSELECT) - Op = Op->getOperand(1); - if (Op->getOpcode() == ISD::ZERO_EXTEND && - Op->getOperand(0)->getOpcode() == ISD::MUL) { - SDValue Mul = Op->getOperand(0); - if (Mul->getOperand(0) == Mul->getOperand(1) && - Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { - SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); - if (Op != N0) - Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), - N0->getOperand(0), Ext, N0->getOperand(2)); - return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); - } - } - + if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); + if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); + + if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask)) + return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); + if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask)) + return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); + if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, + {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, + B, Mask)) + return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); + if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, + {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, + B, Mask)) + return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask}); + if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask)) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask)); + if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask)) + return DAG.getNode(ISD::TRUNCATE, dl, ResVT, + DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask)); + + // Some complications. We can get a case where the two inputs of the mul are + // the same, then the output sext will have been helpfully converted to a + // zext. Turn it back. 
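The "turn the zext back into a sext" step described above works because the product of two identical sign-extended operands is never negative, so either extension of that product yields the same value. A small standalone C++ check of the identity (illustrative only, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int x = -128; x <= 127; ++x) {
    int8_t v = int8_t(x);
    int16_t prod = int16_t(int16_t(v) * int16_t(v));      // sext(x) * sext(x) at i16
    assert(prod >= 0);                                    // squaring never goes negative
    int64_t viaSext = int64_t(prod);                      // sign-extend the product
    int64_t viaZext = int64_t(uint64_t(uint16_t(prod)));  // zero-extend the product
    assert(viaSext == viaZext);                           // so zext and sext agree
  }
  return 0;
}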
+ SDValue Op = N0; + if (Op->getOpcode() == ISD::VSELECT) + Op = Op->getOperand(1); + if (Op->getOpcode() == ISD::ZERO_EXTEND && + Op->getOperand(0)->getOpcode() == ISD::MUL) { + SDValue Mul = Op->getOperand(0); + if (Mul->getOperand(0) == Mul->getOperand(1) && + Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) { + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul); + if (Op != N0) + Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0), + N0->getOperand(0), Ext, N0->getOperand(2)); + return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext); + } + } + return SDValue(); } @@ -15613,13 +15613,13 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SmallVector<SDValue, 4> Chains; for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; - SDValue NewPtr = - DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); + SDValue NewPtr = + DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, - Alignment, MMOFlags, AAInfo); + Alignment, MMOFlags, AAInfo); Loads.push_back(NewLoad); Chains.push_back(SDValue(NewLoad.getNode(), 1)); } @@ -15707,9 +15707,9 @@ static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, if (!ST->hasMVEIntegerOps()) return SDValue(); - if (SDValue V = PerformVQDMULHCombine(N, DAG)) - return V; - + if (SDValue V = PerformVQDMULHCombine(N, DAG)) + return V; + if (VT != MVT::v4i32 && VT != MVT::v8i16) return SDValue(); @@ -16317,8 +16317,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; - case ISD::SELECT_CC: - case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); + case ISD::SELECT_CC: + case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); @@ -16735,19 +16735,19 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, switch (II->getIntrinsicID()) { case Intrinsic::fma: return !IsFMS(I); - case Intrinsic::arm_mve_add_predicated: - case Intrinsic::arm_mve_mul_predicated: - case Intrinsic::arm_mve_qadd_predicated: - case Intrinsic::arm_mve_hadd_predicated: - case Intrinsic::arm_mve_vqdmull_predicated: - case Intrinsic::arm_mve_qdmulh_predicated: - case Intrinsic::arm_mve_qrdmulh_predicated: - case Intrinsic::arm_mve_fma_predicated: - return true; - case Intrinsic::arm_mve_sub_predicated: - case Intrinsic::arm_mve_qsub_predicated: - case Intrinsic::arm_mve_hsub_predicated: - return Operand == 1; + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_fma_predicated: + return true; + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_hsub_predicated: + return Operand == 1; default: return false; } @@ -17476,7 +17476,7 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, return; KnownBits KnownRHS = 
DAG.computeKnownBits(Op.getOperand(1), Depth+1); - Known = KnownBits::commonBits(Known, KnownRHS); + Known = KnownBits::commonBits(Known, KnownRHS); return; } case ISD::INTRINSIC_W_CHAIN: { @@ -18349,9 +18349,9 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return false; if (VT == MVT::f16 && Subtarget->hasFullFP16()) return ARM_AM::getFP16Imm(Imm) != -1; - if (VT == MVT::f32 && Subtarget->hasFullFP16() && - ARM_AM::getFP32FP16Imm(Imm) != -1) - return true; + if (VT == MVT::f32 && Subtarget->hasFullFP16() && + ARM_AM::getFP32FP16Imm(Imm) != -1) + return true; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; if (VT == MVT::f64 && Subtarget->hasFP64()) @@ -18661,8 +18661,8 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { : AtomicExpansionKind::None; } -// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 -// bits, and up to 64 bits on the non-M profiles. +// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 +// bits, and up to 64 bits on the non-M profiles. TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // At -O0, fast-regalloc cannot cope with the live vregs necessary to @@ -18670,11 +18670,11 @@ ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. - unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); + unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); bool HasAtomicCmpXchg = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); - if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && - Size <= (Subtarget->isMClass() ? 32U : 64U)) + if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && + Size <= (Subtarget->isMClass() ? 
32U : 64U)) return AtomicExpansionKind::LLSC; return AtomicExpansionKind::None; } @@ -19129,7 +19129,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); - append_range(Ops, Shuffles); + append_range(Ops, Shuffles); Ops.push_back(Builder.getInt32(SI->getAlignment())); Builder.CreateCall(VstNFunc, Ops); } else { @@ -19145,7 +19145,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); - append_range(Ops, Shuffles); + append_range(Ops, Shuffles); for (unsigned F = 0; F < Factor; F++) { Ops.push_back(Builder.getInt32(F)); Builder.CreateCall(VstNFunc, Ops); diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.h b/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.h index 61a127af07..5b04ba8c6d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMISelLowering.h @@ -216,37 +216,37 @@ class VectorType; VMULLs, // ...signed VMULLu, // ...unsigned - VQDMULH, // MVE vqdmulh instruction - + VQDMULH, // MVE vqdmulh instruction + // MVE reductions VADDVs, // sign- or zero-extend the elements of a vector to i32, VADDVu, // add them all together, and return an i32 of their sum - VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask - VADDVpu, + VADDVps, // Same as VADDV[su] but with a v4i1 predicate mask + VADDVpu, VADDLVs, // sign- or zero-extend elements to i64 and sum, returning VADDLVu, // the low and high 32-bit halves of the sum - VADDLVAs, // Same as VADDLV[su] but also add an input accumulator + VADDLVAs, // Same as VADDLV[su] but also add an input accumulator VADDLVAu, // provided as low and high halves - VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask - VADDLVpu, - VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask - VADDLVApu, - VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply them - VMLAVu, // and add the results together, returning an i32 of their sum - VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask - VMLAVpu, - VMLALVs, // Same as VMLAV but with i64, returning the low and - VMLALVu, // high 32-bit halves of the sum - VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask - VMLALVpu, - VMLALVAs, // Same as VMLALV but also add an input accumulator - VMLALVAu, // provided as low and high halves - VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask - VMLALVApu, - VMINVu, // Find minimum unsigned value of a vector and register - VMINVs, // Find minimum signed value of a vector and register - VMAXVu, // Find maximum unsigned value of a vector and register - VMAXVs, // Find maximum signed value of a vector and register + VADDLVps, // Same as VADDLV[su] but with a v4i1 predicate mask + VADDLVpu, + VADDLVAps, // Same as VADDLVp[su] but with a v4i1 predicate mask + VADDLVApu, + VMLAVs, // sign- or zero-extend the elements of two vectors to i32, multiply them + VMLAVu, // and add the results together, returning an i32 of their sum + VMLAVps, // Same as VMLAV[su] with a v4i1 predicate mask + VMLAVpu, + VMLALVs, // Same as VMLAV but with i64, returning the low and + VMLALVu, // high 32-bit halves of the sum + VMLALVps, // Same as VMLALV[su] with a v4i1 predicate mask + VMLALVpu, + VMLALVAs, // Same as VMLALV but also add an input accumulator + VMLALVAu, // provided as low and high halves + VMLALVAps, // Same as VMLALVA[su] with a v4i1 predicate mask + 
VMLALVApu, + VMINVu, // Find minimum unsigned value of a vector and register + VMINVs, // Find minimum signed value of a vector and register + VMAXVu, // Find maximum unsigned value of a vector and register + VMAXVs, // Find maximum signed value of a vector and register SMULWB, // Signed multiply word by half word, bottom SMULWT, // Signed multiply word by half word, top @@ -285,8 +285,8 @@ class VectorType; // Vector AND with NOT of immediate VBICIMM, - // Pseudo vector bitwise select - VBSP, + // Pseudo vector bitwise select + VBSP, // Pseudo-instruction representing a memory copy using ldm/stm // instructions. diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrFormats.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrFormats.td index 85da7c5a53..7937353678 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrFormats.td @@ -403,9 +403,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, bit isUnaryDataProc = 0; bit canXformTo16Bit = 0; // The instruction is a 16-bit flag setting Thumb instruction. Used - // by the parser and if-converter to determine whether to require the 'S' - // suffix on the mnemonic (when not in an IT block) or preclude it (when - // in an IT block). + // by the parser and if-converter to determine whether to require the 'S' + // suffix on the mnemonic (when not in an IT block) or preclude it (when + // in an IT block). bit thumbArithFlagSetting = 0; bit validForTailPredication = 0; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrInfo.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrInfo.td index 8dcb319923..2fe8cbc613 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrInfo.td @@ -162,9 +162,9 @@ def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, [SDNPInGlue]>; def ARMsubs : SDNode<"ARMISD::SUBS", SDTIntBinOp, [SDNPOutGlue]>; -def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; +def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; -def ARMusat : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>; +def ARMusat : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>; def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; @@ -371,11 +371,11 @@ def imm_not_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), SDLoc(N), MVT::i32); }]>; -// asr_imm_XFORM - Returns a shift immediate with bit {5} set to 1 -def asr_imm_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(0x20 | N->getZExtValue(), SDLoc(N), MVT:: i32); -}]>; - +// asr_imm_XFORM - Returns a shift immediate with bit {5} set to 1 +def asr_imm_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(0x20 | N->getZExtValue(), SDLoc(N), MVT:: i32); +}]>; + /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. 
def imm16_31 : ImmLeaf<i32, [{ return (int32_t)Imm >= 16 && (int32_t)Imm < 32; @@ -442,8 +442,8 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>; def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>; -def asr_imm : ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }], asr_imm_XFORM>; - +def asr_imm : ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }], asr_imm_XFORM>; + //===----------------------------------------------------------------------===// // NEON/MVE pattern fragments // @@ -496,19 +496,19 @@ def SubReg_i32_lane : SDNodeXForm<imm, [{ }]>; -def ARMimmAllZerosV: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 0))))>; -def ARMimmAllZerosD: PatLeaf<(bitconvert (v2i32 (ARMvmovImm (i32 0))))>; -def ARMimmAllOnesV: PatLeaf<(bitconvert (v16i8 (ARMvmovImm (i32 0xEFF))))>; -def ARMimmAllOnesD: PatLeaf<(bitconvert (v8i8 (ARMvmovImm (i32 0xEFF))))>; - -def ARMimmOneV: PatLeaf<(ARMvmovImm (i32 timm)), [{ - ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); - unsigned EltBits = 0; - uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); - return (EltBits == N->getValueType(0).getScalarSizeInBits() && EltVal == 0x01); -}]>; - +def ARMimmAllZerosV: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 0))))>; +def ARMimmAllZerosD: PatLeaf<(bitconvert (v2i32 (ARMvmovImm (i32 0))))>; +def ARMimmAllOnesV: PatLeaf<(bitconvert (v16i8 (ARMvmovImm (i32 0xEFF))))>; +def ARMimmAllOnesD: PatLeaf<(bitconvert (v8i8 (ARMvmovImm (i32 0xEFF))))>; +def ARMimmOneV: PatLeaf<(ARMvmovImm (i32 timm)), [{ + ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); + unsigned EltBits = 0; + uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits); + return (EltBits == N->getValueType(0).getScalarSizeInBits() && EltVal == 0x01); +}]>; + + //===----------------------------------------------------------------------===// // Operand Definitions. // @@ -822,9 +822,9 @@ def mod_imm_neg : Operand<i32>, PatLeaf<(imm), [{ def arm_i32imm : IntImmLeaf<i32, [{ if (Subtarget->useMovt()) return true; - if (ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue())) - return true; - return ARM_AM::isSOImmTwoPartValNeg(Imm.getZExtValue()); + if (ARM_AM::isSOImmTwoPartVal(Imm.getZExtValue())) + return true; + return ARM_AM::isSOImmTwoPartValNeg(Imm.getZExtValue()); }]>; /// imm0_1 predicate - Immediate in the range [0,1]. 
@@ -2492,30 +2492,30 @@ let isCall = 1, } // ARMv5T and above - def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", []>, + def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm, IIC_Br, "blx\t$func", []>, Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { bits<4> func; let Inst{31-4} = 0b1110000100101111111111110011; let Inst{3-0} = func; } - def BLX_noip : ARMPseudoExpand<(outs), (ins GPRnoip:$func), - 4, IIC_Br, [], (BLX GPR:$func)>, - Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]>; - + def BLX_noip : ARMPseudoExpand<(outs), (ins GPRnoip:$func), + 4, IIC_Br, [], (BLX GPR:$func)>, + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]>; + def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm, - IIC_Br, "blx", "\t$func", []>, + IIC_Br, "blx", "\t$func", []>, Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> { bits<4> func; let Inst{27-4} = 0b000100101111111111110011; let Inst{3-0} = func; } - def BLX_pred_noip : ARMPseudoExpand<(outs), (ins GPRnoip:$func), - 4, IIC_Br, [], - (BLX_pred GPR:$func, (ops 14, zero_reg))>, - Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]>; - + def BLX_pred_noip : ARMPseudoExpand<(outs), (ins GPRnoip:$func), + 4, IIC_Br, [], + (BLX_pred GPR:$func, (ops 14, zero_reg))>, + Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]>; + // ARMv4T // Note: Restrict $func to the tGPR regclass to prevent it being in LR. def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func), @@ -2540,16 +2540,16 @@ let isCall = 1, Requires<[IsARM]>, Sched<[WriteBr]>; } -def : ARMPat<(ARMcall GPR:$func), (BLX $func)>, - Requires<[IsARM, HasV5T, NoSLSBLRMitigation]>; -def : ARMPat<(ARMcall GPRnoip:$func), (BLX_noip $func)>, - Requires<[IsARM, HasV5T, SLSBLRMitigation]>; -def : ARMPat<(ARMcall_pred GPR:$func), (BLX_pred $func)>, - Requires<[IsARM, HasV5T, NoSLSBLRMitigation]>; -def : ARMPat<(ARMcall_pred GPRnoip:$func), (BLX_pred_noip $func)>, - Requires<[IsARM, HasV5T, SLSBLRMitigation]>; - - +def : ARMPat<(ARMcall GPR:$func), (BLX $func)>, + Requires<[IsARM, HasV5T, NoSLSBLRMitigation]>; +def : ARMPat<(ARMcall GPRnoip:$func), (BLX_noip $func)>, + Requires<[IsARM, HasV5T, SLSBLRMitigation]>; +def : ARMPat<(ARMcall_pred GPR:$func), (BLX_pred $func)>, + Requires<[IsARM, HasV5T, NoSLSBLRMitigation]>; +def : ARMPat<(ARMcall_pred GPRnoip:$func), (BLX_pred_noip $func)>, + Requires<[IsARM, HasV5T, SLSBLRMitigation]>; + + let isBranch = 1, isTerminator = 1 in { // FIXME: should be able to write a pattern for ARMBrcond, but can't use // a two-value operand where a dag node expects two operands. 
:( @@ -4089,32 +4089,32 @@ def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), (USAT imm0_31:$pos, GPRnopc:$a, 0)>; -def : ARMPat<(ARMssat GPRnopc:$Rn, imm0_31:$imm), +def : ARMPat<(ARMssat GPRnopc:$Rn, imm0_31:$imm), (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; -def : ARMPat<(ARMusat GPRnopc:$Rn, imm0_31:$imm), +def : ARMPat<(ARMusat GPRnopc:$Rn, imm0_31:$imm), (USAT imm0_31:$imm, GPRnopc:$Rn, 0)>; def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos), (SSAT16 imm1_16:$pos, GPRnopc:$a)>; def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos), (USAT16 imm0_15:$pos, GPRnopc:$a)>; -def : ARMV6Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos), - (SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>; -def : ARMV6Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos), - (SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>; -def : ARMV6Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), - (USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; -def : ARMV6Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos), - (USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>; -def : ARMPat<(ARMssat (shl GPRnopc:$Rn, imm0_31:$shft), imm0_31:$pos), - (SSAT imm0_31:$pos, GPRnopc:$Rn, imm0_31:$shft)>; -def : ARMPat<(ARMssat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), - (SSAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; -def : ARMPat<(ARMusat (shl GPRnopc:$Rn, imm0_31:$shft), imm0_31:$pos), - (USAT imm0_31:$pos, GPRnopc:$Rn, imm0_31:$shft)>; -def : ARMPat<(ARMusat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), - (USAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; - - +def : ARMV6Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : ARMV6Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>; +def : ARMV6Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : ARMV6Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>; +def : ARMPat<(ARMssat (shl GPRnopc:$Rn, imm0_31:$shft), imm0_31:$pos), + (SSAT imm0_31:$pos, GPRnopc:$Rn, imm0_31:$shft)>; +def : ARMPat<(ARMssat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), + (SSAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; +def : ARMPat<(ARMusat (shl GPRnopc:$Rn, imm0_31:$shft), imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$Rn, imm0_31:$shft)>; +def : ARMPat<(ARMusat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; + + //===----------------------------------------------------------------------===// // Bitwise Instructions. // @@ -6381,15 +6381,15 @@ def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn), NoItinerary, [(set GPR:$Rd, (int_arm_space timm:$size, GPR:$Rn))]>; -// SpeculationBarrierEndBB must only be used after an unconditional control -// flow, i.e. after a terminator for which isBarrier is True. -let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { - def SpeculationBarrierISBDSBEndBB - : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; - def SpeculationBarrierSBEndBB - : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; -} - +// SpeculationBarrierEndBB must only be used after an unconditional control +// flow, i.e. after a terminator for which isBarrier is True. 
+let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + def SpeculationBarrierISBDSBEndBB + : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; + def SpeculationBarrierSBEndBB + : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; +} + //===---------------------------------- // Atomic cmpxchg for -O0 //===---------------------------------- diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrMVE.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrMVE.td index 0dfea68887..64cef5d967 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrMVE.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrMVE.td @@ -318,78 +318,78 @@ def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>; def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>; def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>; -multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt, - dag PredOperands, Instruction Inst, - SDPatternOperator IdentityVec = null_frag> { - // Unpredicated - def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; - - // Predicated with select - if !ne(VTI.Size, 0b11) then { - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$mask), - (VTI.Vec (Op (VTI.Vec MQPR:$Qm), - (VTI.Vec MQPR:$Qn))), - (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; - - // Optionally with the select folded through the op - def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), - (VTI.Vec (vselect (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$Qn), - (VTI.Vec IdentityVec))))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$Qm)))>; - } - - // Predicated with intrinsic - def : Pat<(VTI.Vec !con((PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), - PredOperands, - (? (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; -} - -multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt, - dag PredOperands, Instruction Inst, - SDPatternOperator IdentityVec = null_frag> { - // Unpredicated - def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn)))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn))>; - - // Predicated with select - if !ne(VTI.Size, 0b11) then { - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$mask), - (VTI.Vec (Op (VTI.Vec MQPR:$Qm), - (VTI.Vec (ARMvdup rGPR:$Rn)))), - (VTI.Vec MQPR:$inactive))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; - - // Optionally with the select folded through the op - def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), - (VTI.Vec (vselect (VTI.Pred VCCR:$mask), - (ARMvdup rGPR:$Rn), - (VTI.Vec IdentityVec))))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$Qm)))>; - } - - // Predicated with intrinsic - def : Pat<(VTI.Vec !con((PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn))), - PredOperands, - (? 
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))), - (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, - ARMVCCThen, (VTI.Pred VCCR:$mask), - (VTI.Vec MQPR:$inactive)))>; -} - +multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt, + dag PredOperands, Instruction Inst, + SDPatternOperator IdentityVec = null_frag> { + // Unpredicated + def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated with select + if !ne(VTI.Size, 0b11) then { + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$mask), + (VTI.Vec (Op (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn))), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + + // Optionally with the select folded through the op + def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), + (VTI.Vec (vselect (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$Qn), + (VTI.Vec IdentityVec))))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$Qm)))>; + } + + // Predicated with intrinsic + def : Pat<(VTI.Vec !con((PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), + PredOperands, + (? (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; +} + +multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt, + dag PredOperands, Instruction Inst, + SDPatternOperator IdentityVec = null_frag> { + // Unpredicated + def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn))>; + + // Predicated with select + if !ne(VTI.Size, 0b11) then { + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$mask), + (VTI.Vec (Op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$Rn)))), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + + // Optionally with the select folded through the op + def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), + (VTI.Vec (vselect (VTI.Pred VCCR:$mask), + (ARMvdup rGPR:$Rn), + (VTI.Vec IdentityVec))))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$Qm)))>; + } + + // Predicated with intrinsic + def : Pat<(VTI.Vec !con((PredInt (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn))), + PredOperands, + (? 
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn, + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; +} + // --------- Start of base classes for the instructions themselves class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm, @@ -450,7 +450,7 @@ class MVE_ScalarShift<string iname, dag oops, dag iops, string asm, string cstr, : MVE_MI_with_pred<oops, iops, NoItinerary, iname, asm, cstr, pattern> { let Inst{31-20} = 0b111010100101; let Inst{8} = 0b1; - let validForTailPredication=1; + let validForTailPredication=1; } class MVE_ScalarShiftSingleReg<string iname, dag iops, string asm, string cstr, @@ -684,13 +684,13 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr, let validForTailPredication = 1; } -def SDTVecReduceP : SDTypeProfile<1, 2, [ // VADDLVp - SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2> -]>; +def SDTVecReduceP : SDTypeProfile<1, 2, [ // VADDLVp + SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2> +]>; def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>; def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>; -def ARMVADDVps : SDNode<"ARMISD::VADDVps", SDTVecReduceP>; -def ARMVADDVpu : SDNode<"ARMISD::VADDVpu", SDTVecReduceP>; +def ARMVADDVps : SDNode<"ARMISD::VADDVps", SDTVecReduceP>; +def ARMVADDVpu : SDNode<"ARMISD::VADDVpu", SDTVecReduceP>; multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> { def acc : MVE_VADDV<"vaddva", VTI.Suffix, @@ -707,39 +707,39 @@ multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> { if VTI.Unsigned then { def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))), (i32 (InstN $vec))>; - def : Pat<(i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec MQPR:$vec), - (VTI.Vec ARMimmAllZerosV))))), - (i32 (InstN $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$vec), + (VTI.Vec ARMimmAllZerosV))))), + (i32 (InstN $vec, ARMVCCThen, $pred))>; def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), (i32 (InstN $vec))>; - def : Pat<(i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), - (i32 (InstN $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), + (i32 (InstN $vec, ARMVCCThen, $pred))>; def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))), (i32 tGPREven:$acc))), (i32 (InstA $acc, $vec))>; - def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec MQPR:$vec), - (VTI.Vec ARMimmAllZerosV))))), - (i32 tGPREven:$acc))), - (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$vec), + (VTI.Vec ARMimmAllZerosV))))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), (i32 tGPREven:$acc))), (i32 (InstA $acc, $vec))>; - def : Pat<(i32 (add (i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), - (i32 tGPREven:$acc))), - (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (i32 (ARMVADDVpu (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; } else { def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), (i32 (InstN $vec))>; def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), (i32 tGPREven:$acc))), (i32 (InstA $acc, $vec))>; - def : Pat<(i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), - (i32 (InstN $vec, ARMVCCThen, $pred))>; - def : Pat<(i32 (add (i32 
(ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), - (i32 tGPREven:$acc))), - (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), + (i32 (InstN $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (i32 (ARMVADDVps (VTI.Vec MQPR:$vec), (VTI.Pred VCCR:$pred))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; } def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), @@ -944,14 +944,14 @@ multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> { defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>; } -def SDTVecReduceR : SDTypeProfile<1, 2, [ // Reduction of an integer and vector into an integer - SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2> -]>; -def ARMVMINVu : SDNode<"ARMISD::VMINVu", SDTVecReduceR>; -def ARMVMINVs : SDNode<"ARMISD::VMINVs", SDTVecReduceR>; -def ARMVMAXVu : SDNode<"ARMISD::VMAXVu", SDTVecReduceR>; -def ARMVMAXVs : SDNode<"ARMISD::VMAXVs", SDTVecReduceR>; - +def SDTVecReduceR : SDTypeProfile<1, 2, [ // Reduction of an integer and vector into an integer + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2> +]>; +def ARMVMINVu : SDNode<"ARMISD::VMINVu", SDTVecReduceR>; +def ARMVMINVs : SDNode<"ARMISD::VMINVs", SDTVecReduceR>; +def ARMVMAXVu : SDNode<"ARMISD::VMAXVu", SDTVecReduceR>; +def ARMVMAXVs : SDNode<"ARMISD::VMAXVs", SDTVecReduceR>; + defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">; defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">; @@ -982,32 +982,32 @@ let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_umin (v4i32 MQPR:$src))), (i32 (MVE_VMINVu32 (t2MOVi (i32 4294967295)), $src))>; - def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v16i8 MQPR:$src))), - (i32 (MVE_VMINVu8 $x, $src))>; - def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v8i16 MQPR:$src))), - (i32 (MVE_VMINVu16 $x, $src))>; - def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v4i32 MQPR:$src))), - (i32 (MVE_VMINVu32 $x, $src))>; - def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v16i8 MQPR:$src))), - (i32 (MVE_VMINVs8 $x, $src))>; - def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v8i16 MQPR:$src))), - (i32 (MVE_VMINVs16 $x, $src))>; - def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v4i32 MQPR:$src))), - (i32 (MVE_VMINVs32 $x, $src))>; - - def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v16i8 MQPR:$src))), - (i32 (MVE_VMAXVu8 $x, $src))>; - def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v8i16 MQPR:$src))), - (i32 (MVE_VMAXVu16 $x, $src))>; - def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v4i32 MQPR:$src))), - (i32 (MVE_VMAXVu32 $x, $src))>; - def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v16i8 MQPR:$src))), - (i32 (MVE_VMAXVs8 $x, $src))>; - def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v8i16 MQPR:$src))), - (i32 (MVE_VMAXVs16 $x, $src))>; - def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v4i32 MQPR:$src))), - (i32 (MVE_VMAXVs32 $x, $src))>; - + def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v16i8 MQPR:$src))), + (i32 (MVE_VMINVu8 $x, $src))>; + def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v8i16 MQPR:$src))), + (i32 (MVE_VMINVu16 $x, $src))>; + def : Pat<(i32 (ARMVMINVu (i32 rGPR:$x), (v4i32 MQPR:$src))), + (i32 (MVE_VMINVu32 $x, $src))>; + def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v16i8 MQPR:$src))), + (i32 (MVE_VMINVs8 $x, $src))>; + def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v8i16 MQPR:$src))), + (i32 (MVE_VMINVs16 $x, $src))>; + def : Pat<(i32 (ARMVMINVs (i32 rGPR:$x), (v4i32 MQPR:$src))), + (i32 (MVE_VMINVs32 $x, $src))>; + + def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVu8 $x, $src))>; 
+ def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVu16 $x, $src))>; + def : Pat<(i32 (ARMVMAXVu (i32 rGPR:$x), (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVu32 $x, $src))>; + def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v16i8 MQPR:$src))), + (i32 (MVE_VMAXVs8 $x, $src))>; + def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v8i16 MQPR:$src))), + (i32 (MVE_VMAXVs16 $x, $src))>; + def : Pat<(i32 (ARMVMAXVs (i32 rGPR:$x), (v4i32 MQPR:$src))), + (i32 (MVE_VMAXVs32 $x, $src))>; + } multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> { @@ -1139,28 +1139,28 @@ def SDTVecReduce2LA : SDTypeProfile<2, 4, [ // VMLALVA SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, SDTCisVec<4>, SDTCisVec<5> ]>; -def SDTVecReduce2P : SDTypeProfile<1, 3, [ // VMLAV - SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3> -]>; -def SDTVecReduce2LP : SDTypeProfile<2, 3, [ // VMLALV - SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4> -]>; -def SDTVecReduce2LAP : SDTypeProfile<2, 5, [ // VMLALVA - SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, - SDTCisVec<4>, SDTCisVec<5>, SDTCisVec<6> -]>; +def SDTVecReduce2P : SDTypeProfile<1, 3, [ // VMLAV + SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3> +]>; +def SDTVecReduce2LP : SDTypeProfile<2, 3, [ // VMLALV + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4> +]>; +def SDTVecReduce2LAP : SDTypeProfile<2, 5, [ // VMLALVA + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4>, SDTCisVec<5>, SDTCisVec<6> +]>; def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>; def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>; def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>; def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>; -def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>; -def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>; -def ARMVMLAVps : SDNode<"ARMISD::VMLAVps", SDTVecReduce2P>; -def ARMVMLAVpu : SDNode<"ARMISD::VMLAVpu", SDTVecReduce2P>; -def ARMVMLALVps : SDNode<"ARMISD::VMLALVps", SDTVecReduce2LP>; -def ARMVMLALVpu : SDNode<"ARMISD::VMLALVpu", SDTVecReduce2LP>; -def ARMVMLALVAps : SDNode<"ARMISD::VMLALVAps", SDTVecReduce2LAP>; -def ARMVMLALVApu : SDNode<"ARMISD::VMLALVApu", SDTVecReduce2LAP>; +def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>; +def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>; +def ARMVMLAVps : SDNode<"ARMISD::VMLAVps", SDTVecReduce2P>; +def ARMVMLAVpu : SDNode<"ARMISD::VMLAVpu", SDTVecReduce2P>; +def ARMVMLALVps : SDNode<"ARMISD::VMLALVps", SDTVecReduce2LP>; +def ARMVMLALVpu : SDNode<"ARMISD::VMLALVpu", SDTVecReduce2LP>; +def ARMVMLALVAps : SDNode<"ARMISD::VMLALVAps", SDTVecReduce2LAP>; +def ARMVMLALVApu : SDNode<"ARMISD::VMLALVApu", SDTVecReduce2LAP>; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))), @@ -1179,68 +1179,68 @@ let Predicates = [HasMVEInt] in { (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))), - (i32 tGPREven:$src3))), + (i32 tGPREven:$src3))), (i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>; def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))), - (i32 tGPREven:$src3))), + (i32 tGPREven:$src3))), (i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>; def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)), (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 
MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)), (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))), - (i32 tGPREven:$src3))), + (i32 tGPREven:$src3))), (i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>; def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)), (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)), (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - - // Predicated - def : Pat<(i32 (vecreduce_add (vselect (v4i1 VCCR:$pred), - (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)), - (v4i32 ARMimmAllZerosV)))), - (i32 (MVE_VMLADAVu32 $src1, $src2, ARMVCCThen, $pred))>; - def : Pat<(i32 (vecreduce_add (vselect (v8i1 VCCR:$pred), - (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)), - (v8i16 ARMimmAllZerosV)))), - (i32 (MVE_VMLADAVu16 $src1, $src2, ARMVCCThen, $pred))>; - def : Pat<(i32 (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))), - (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; - def : Pat<(i32 (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))), - (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; - def : Pat<(i32 (vecreduce_add (vselect (v16i1 VCCR:$pred), - (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)), - (v16i8 ARMimmAllZerosV)))), - (i32 (MVE_VMLADAVu8 $src1, $src2, ARMVCCThen, $pred))>; - def : Pat<(i32 (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))), - (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; - def : Pat<(i32 (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))), - (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; - - def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v4i1 VCCR:$pred), - (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)), - (v4i32 ARMimmAllZerosV)))), - (i32 tGPREven:$src3))), - (i32 (MVE_VMLADAVau32 $src3, $src1, $src2, ARMVCCThen, $pred))>; - def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v8i1 VCCR:$pred), - (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)), - (v8i16 ARMimmAllZerosV)))), - (i32 tGPREven:$src3))), - (i32 (MVE_VMLADAVau16 $src3, $src1, $src2, ARMVCCThen, $pred))>; - def : Pat<(i32 (add (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)), - (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; - def : Pat<(i32 (add (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)), - (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; - def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v16i1 VCCR:$pred), - (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)), - (v16i8 ARMimmAllZerosV)))), - (i32 tGPREven:$src3))), - (i32 (MVE_VMLADAVau8 $src3, $src1, $src2, ARMVCCThen, $pred))>; - def : Pat<(i32 (add (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)), - (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; - def : Pat<(i32 (add (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)), - (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), 
(v16i8 MQPR:$val2), ARMVCCThen, $pred))>; + + // Predicated + def : Pat<(i32 (vecreduce_add (vselect (v4i1 VCCR:$pred), + (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)), + (v4i32 ARMimmAllZerosV)))), + (i32 (MVE_VMLADAVu32 $src1, $src2, ARMVCCThen, $pred))>; + def : Pat<(i32 (vecreduce_add (vselect (v8i1 VCCR:$pred), + (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)), + (v8i16 ARMimmAllZerosV)))), + (i32 (MVE_VMLADAVu16 $src1, $src2, ARMVCCThen, $pred))>; + def : Pat<(i32 (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))), + (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; + def : Pat<(i32 (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred))), + (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; + def : Pat<(i32 (vecreduce_add (vselect (v16i1 VCCR:$pred), + (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)), + (v16i8 ARMimmAllZerosV)))), + (i32 (MVE_VMLADAVu8 $src1, $src2, ARMVCCThen, $pred))>; + def : Pat<(i32 (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))), + (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; + def : Pat<(i32 (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred))), + (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; + + def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v4i1 VCCR:$pred), + (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)), + (v4i32 ARMimmAllZerosV)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau32 $src3, $src1, $src2, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v8i1 VCCR:$pred), + (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)), + (v8i16 ARMimmAllZerosV)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau16 $src3, $src1, $src2, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (ARMVMLAVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; + def : Pat<(i32 (add (ARMVMLAVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred))>; + def : Pat<(i32 (add (i32 (vecreduce_add (vselect (v16i1 VCCR:$pred), + (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)), + (v16i8 ARMimmAllZerosV)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau8 $src3, $src1, $src2, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (ARMVMLAVps (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; + def : Pat<(i32 (add (ARMVMLAVpu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), (v16i1 VCCR:$pred)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2), ARMVCCThen, $pred))>; } // vmlav aliases vmladav @@ -1360,25 +1360,25 @@ let Predicates = [HasMVEInt] in { (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; - - // Predicated - def : Pat<(ARMVMLALVps (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), - (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; - def : Pat<(ARMVMLALVpu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), - 
(MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; - def : Pat<(ARMVMLALVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), - (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; - def : Pat<(ARMVMLALVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), - (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; - - def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), - (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; - def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), - (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; - def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), - (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; - def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), - (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; + + // Predicated + def : Pat<(ARMVMLALVps (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), + (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; + def : Pat<(ARMVMLALVpu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), + (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; + def : Pat<(ARMVMLALVps (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), + (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; + def : Pat<(ARMVMLALVpu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), + (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; + + def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), + (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; + def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), (v4i1 VCCR:$pred)), + (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2), ARMVCCThen, $pred)>; + def : Pat<(ARMVMLALVAps tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), + (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; + def : Pat<(ARMVMLALVApu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), (v8i1 VCCR:$pred)), + (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2), ARMVCCThen, $pred)>; } // vmlalv aliases vmlaldav @@ -1426,7 +1426,7 @@ class MVE_comp<InstrItinClass itin, string iname, string suffix, } class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21, - list<dag> pattern=[]> + list<dag> pattern=[]> : MVE_comp<NoItinerary, iname, suffix, "", pattern> { let Inst{28} = 0b1; @@ -1442,18 +1442,18 @@ class MVE_VMINMAXNM<string iname, string suffix, bit sz, bit bit_21, let Predicates = [HasMVEFloat]; } -multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, MVEVectorVTInfo VTI, SDNode Op, Intrinsic PredInt> { - def "" : MVE_VMINMAXNM<iname, VTI.Suffix, VTI.Size{0}, bit_4>; +multiclass MVE_VMINMAXNM_m<string iname, bit bit_4, 
MVEVectorVTInfo VTI, SDNode Op, Intrinsic PredInt> { + def "" : MVE_VMINMAXNM<iname, VTI.Suffix, VTI.Size{0}, bit_4>; - let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 0)), !cast<Instruction>(NAME)>; - } + let Predicates = [HasMVEFloat] in { + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 0)), !cast<Instruction>(NAME)>; + } } -defm MVE_VMAXNMf32 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v4f32, fmaxnum, int_arm_mve_max_predicated>; -defm MVE_VMAXNMf16 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v8f16, fmaxnum, int_arm_mve_max_predicated>; -defm MVE_VMINNMf32 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v4f32, fminnum, int_arm_mve_min_predicated>; -defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, fminnum, int_arm_mve_min_predicated>; +defm MVE_VMAXNMf32 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v4f32, fmaxnum, int_arm_mve_max_predicated>; +defm MVE_VMAXNMf16 : MVE_VMINMAXNM_m<"vmaxnm", 0b0, MVE_v8f16, fmaxnum, int_arm_mve_max_predicated>; +defm MVE_VMINNMf32 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v4f32, fminnum, int_arm_mve_min_predicated>; +defm MVE_VMINNMf16 : MVE_VMINMAXNM_m<"vminnm", 0b1, MVE_v8f16, fminnum, int_arm_mve_min_predicated>; class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size, @@ -1472,11 +1472,11 @@ class MVE_VMINMAX<string iname, string suffix, bit U, bits<2> size, } multiclass MVE_VMINMAX_m<string iname, bit bit_4, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VMINMAX<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, bit_4>; let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), !cast<Instruction>(NAME)>; } } @@ -1649,39 +1649,39 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f (MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; } -let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<MVE_v16i8, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; - defm : MVE_TwoOpPattern<MVE_v8i16, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; - defm : MVE_TwoOpPattern<MVE_v4i32, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; - defm : MVE_TwoOpPattern<MVE_v2i64, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; - - defm : MVE_TwoOpPattern<MVE_v16i8, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; - defm : MVE_TwoOpPattern<MVE_v8i16, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; - defm : MVE_TwoOpPattern<MVE_v4i32, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; - defm : MVE_TwoOpPattern<MVE_v2i64, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; - - defm : MVE_TwoOpPattern<MVE_v16i8, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; - defm : MVE_TwoOpPattern<MVE_v8i16, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; - defm : MVE_TwoOpPattern<MVE_v4i32, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; - defm : MVE_TwoOpPattern<MVE_v2i64, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; - - defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_bic_predicated, (? ), MVE_VBIC>; - defm : MVE_TwoOpPattern<MVE_v8i16, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_bic_predicated, (? 
), MVE_VBIC>; - defm : MVE_TwoOpPattern<MVE_v4i32, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_bic_predicated, (? ), MVE_VBIC>; - defm : MVE_TwoOpPattern<MVE_v2i64, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_bic_predicated, (? ), MVE_VBIC>; - - defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_orn_predicated, (? ), MVE_VORN>; - defm : MVE_TwoOpPattern<MVE_v8i16, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_orn_predicated, (? ), MVE_VORN>; - defm : MVE_TwoOpPattern<MVE_v4i32, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_orn_predicated, (? ), MVE_VORN>; - defm : MVE_TwoOpPattern<MVE_v2i64, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, - int_arm_mve_orn_predicated, (? ), MVE_VORN>; +let Predicates = [HasMVEInt] in { + defm : MVE_TwoOpPattern<MVE_v16i8, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; + defm : MVE_TwoOpPattern<MVE_v8i16, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; + defm : MVE_TwoOpPattern<MVE_v4i32, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; + defm : MVE_TwoOpPattern<MVE_v2i64, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>; + + defm : MVE_TwoOpPattern<MVE_v16i8, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<MVE_v8i16, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<MVE_v4i32, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<MVE_v2i64, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>; + + defm : MVE_TwoOpPattern<MVE_v16i8, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<MVE_v8i16, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<MVE_v4i32, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<MVE_v2i64, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>; + + defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_bic_predicated, (? ), MVE_VBIC>; + defm : MVE_TwoOpPattern<MVE_v8i16, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_bic_predicated, (? ), MVE_VBIC>; + defm : MVE_TwoOpPattern<MVE_v4i32, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_bic_predicated, (? ), MVE_VBIC>; + defm : MVE_TwoOpPattern<MVE_v2i64, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_bic_predicated, (? ), MVE_VBIC>; + + defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_orn_predicated, (? ), MVE_VORN>; + defm : MVE_TwoOpPattern<MVE_v8i16, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_orn_predicated, (? ), MVE_VORN>; + defm : MVE_TwoOpPattern<MVE_v4i32, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_orn_predicated, (? ), MVE_VORN>; + defm : MVE_TwoOpPattern<MVE_v2i64, BinOpFrag<(or node:$LHS, (vnotq node:$RHS))>, + int_arm_mve_orn_predicated, (? 
), MVE_VORN>; } class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps> @@ -1718,8 +1718,8 @@ multiclass MVE_bit_cmode_p<string iname, bit opcode, defvar UnpredPat = (VTI.Vec (op (VTI.Vec MQPR:$src), timm:$simm)); let Predicates = [HasMVEInt] in { - def : Pat<UnpredPat, - (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>; + def : Pat<UnpredPat, + (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>; def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), UnpredPat, (VTI.Vec MQPR:$src))), (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm, @@ -1929,18 +1929,18 @@ class MVE_VMULt1<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -multiclass MVE_VMUL_m<MVEVectorVTInfo VTI> { - def "" : MVE_VMULt1<"vmul", VTI.Suffix, VTI.Size>; +multiclass MVE_VMUL_m<MVEVectorVTInfo VTI> { + def "" : MVE_VMULt1<"vmul", VTI.Suffix, VTI.Size>; let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<VTI, mul, int_arm_mve_mul_predicated, (? ), - !cast<Instruction>(NAME), ARMimmOneV>; + defm : MVE_TwoOpPattern<VTI, mul, int_arm_mve_mul_predicated, (? ), + !cast<Instruction>(NAME), ARMimmOneV>; } } -defm MVE_VMULi8 : MVE_VMUL_m<MVE_v16i8>; -defm MVE_VMULi16 : MVE_VMUL_m<MVE_v8i16>; -defm MVE_VMULi32 : MVE_VMUL_m<MVE_v4i32>; +defm MVE_VMULi8 : MVE_VMUL_m<MVE_v16i8>; +defm MVE_VMULi16 : MVE_VMUL_m<MVE_v8i16>; +defm MVE_VMULi32 : MVE_VMUL_m<MVE_v4i32>; class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding, list<dag> pattern=[]> @@ -1952,30 +1952,30 @@ class MVE_VQxDMULH_Base<string iname, string suffix, bits<2> size, bit rounding, let Inst{12-8} = 0b01011; let Inst{4} = 0b0; let Inst{0} = 0b0; - let validForTailPredication = 1; + let validForTailPredication = 1; } -def MVEvqdmulh : SDNode<"ARMISD::VQDMULH", SDTIntBinOp>; - +def MVEvqdmulh : SDNode<"ARMISD::VQDMULH", SDTIntBinOp>; + multiclass MVE_VQxDMULH_m<string iname, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic unpred_int, Intrinsic pred_int, + SDNode Op, Intrinsic unpred_int, Intrinsic pred_int, bit rounding> { def "" : MVE_VQxDMULH_Base<iname, VTI.Suffix, VTI.Size, rounding>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<VTI, Op, pred_int, (? ), Inst>; - - // Extra unpredicated multiply intrinsic patterns - def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + defm : MVE_TwoOpPattern<VTI, Op, pred_int, (? ), Inst>; + + // Extra unpredicated multiply intrinsic patterns + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; } } multiclass MVE_VQxDMULH<string iname, MVEVectorVTInfo VTI, bit rounding> - : MVE_VQxDMULH_m<iname, VTI, !if(rounding, null_frag, - MVEvqdmulh), - !if(rounding, int_arm_mve_vqrdmulh, + : MVE_VQxDMULH_m<iname, VTI, !if(rounding, null_frag, + MVEvqdmulh), + !if(rounding, int_arm_mve_vqrdmulh, int_arm_mve_vqdmulh), !if(rounding, int_arm_mve_qrdmulh_predicated, int_arm_mve_qdmulh_predicated), @@ -2003,12 +2003,12 @@ class MVE_VADDSUB<string iname, string suffix, bits<2> size, bit subtract, } multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VADDSUB<iname, VTI.Suffix, VTI.Size, subtract>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? 
), !cast<Instruction>(NAME), ARMimmAllZerosV>; } } @@ -2046,13 +2046,13 @@ class MVE_VQSUB_<string suffix, bit U, bits<2> size> : MVE_VQADDSUB<"vqsub", suffix, U, 0b1, size>; multiclass MVE_VQADD_m<MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VQADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), - !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), + !cast<Instruction>(NAME)>; } } @@ -2067,13 +2067,13 @@ defm MVE_VQADDu16 : MVE_VQADD<MVE_v8u16, uaddsat>; defm MVE_VQADDu32 : MVE_VQADD<MVE_v4u32, uaddsat>; multiclass MVE_VQSUB_m<MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VQSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), - !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), + !cast<Instruction>(NAME)>; } } @@ -2199,32 +2199,32 @@ defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>; // modelling that here with these patterns, but we're using no wrap forms of // add to ensure that the extra bit of information is not needed for the // arithmetic or the rounding. -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), - (v16i8 (ARMvmovImm (i32 3585)))), - (i32 1))), - (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), - (v8i16 (ARMvmovImm (i32 2049)))), - (i32 1))), - (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), - (v4i32 (ARMvmovImm (i32 1)))), - (i32 1))), - (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), - (v16i8 (ARMvmovImm (i32 3585)))), - (i32 1))), - (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), - (v8i16 (ARMvmovImm (i32 2049)))), - (i32 1))), - (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), - (v4i32 (ARMvmovImm (i32 1)))), - (i32 1))), - (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; -} +let Predicates = [HasMVEInt] in { + def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvmovImm (i32 3585)))), + (i32 1))), + (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvmovImm (i32 2049)))), + (i32 1))), + (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvmovImm (i32 1)))), + (i32 1))), + (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvmovImm (i32 3585)))), + (i32 1))), + (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvmovImm (i32 2049)))), + (i32 1))), + (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvmovImm (i32 1)))), + (i32 1))), + (MVE_VRHADDu32 MQPR:$Qm, 
MQPR:$Qn)>; +} class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, @@ -2473,9 +2473,9 @@ multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate, let Predicates = [HasMVEInt] in { // VQABS and VQNEG have more difficult isel patterns defined elsewhere - if !not(saturate) then { - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), - (VTI.Vec (Inst $v))>; + if !not(saturate) then { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), + (VTI.Vec (Inst $v))>; } def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask), @@ -3032,7 +3032,7 @@ multiclass MVE_VSHRN_patterns<MVE_shift_imm_partial inst, defvar outparams = (inst (OutVTI.Vec MQPR:$QdSrc), (InVTI.Vec MQPR:$Qm), (imm:$imm)); - def : Pat<(OutVTI.Vec !setdagop(inparams, int_arm_mve_vshrn)), + def : Pat<(OutVTI.Vec !setdagop(inparams, int_arm_mve_vshrn)), (OutVTI.Vec outparams)>; def : Pat<(OutVTI.Vec !con(inparams, (int_arm_mve_vshrn_predicated (InVTI.Pred VCCR:$pred)))), @@ -3234,7 +3234,7 @@ multiclass MVE_VSxI_patterns<MVE_VSxI_imm inst, string name, defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # name); defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # name # "_predicated"); - def : Pat<(VTI.Vec !setdagop(inparams, unpred_int)), + def : Pat<(VTI.Vec !setdagop(inparams, unpred_int)), (VTI.Vec outparams)>; def : Pat<(VTI.Vec !con(inparams, (pred_int (VTI.Pred VCCR:$pred)))), (VTI.Vec !con(outparams, (? ARMVCCThen, VCCR:$pred)))>; @@ -3586,12 +3586,12 @@ class MVE_VMUL_fp<string iname, string suffix, bit size, list<dag> pattern=[]> } multiclass MVE_VMULT_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VMUL_fp<iname, VTI.Suffix, VTI.Size{0}>; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? 
), !cast<Instruction>(NAME)>; } } @@ -3682,23 +3682,23 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> { let Predicates = [HasMVEFloat] in { if fms then { - def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), - (Inst $add, $m1, $m2)>; - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec (fma (fneg m1), m2, add)), - add)), - (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), + (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec (fma (fneg m1), m2, add)), + add)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)), (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)), (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; } else { - def : Pat<(VTI.Vec (fma m1, m2, add)), - (Inst $add, $m1, $m2)>; - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec (fma m1, m2, add)), - add)), - (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + def : Pat<(VTI.Vec (fma m1, m2, add)), + (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec (fma m1, m2, add)), + add)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)), (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; } @@ -3711,14 +3711,14 @@ defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>; defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> { let validForTailPredication = 1; } defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { - defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>; + defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? 
), !cast<Instruction>(NAME)>; } } @@ -3820,15 +3820,15 @@ multiclass MVE_VABD_fp_m<MVEVectorVTInfo VTI> : MVE_VABDT_fp_m<VTI, int_arm_mve_vabd, int_arm_mve_abd_predicated>; defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>; -defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>; - -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))), - (MVE_VABDf16 MQPR:$Qm, MQPR:$Qn)>; - def : Pat<(v4f32 (fabs (fsub (v4f32 MQPR:$Qm), (v4f32 MQPR:$Qn)))), - (MVE_VABDf32 MQPR:$Qm, MQPR:$Qn)>; -} - +defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>; + +let Predicates = [HasMVEFloat] in { + def : Pat<(v8f16 (fabs (fsub (v8f16 MQPR:$Qm), (v8f16 MQPR:$Qn)))), + (MVE_VABDf16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4f32 (fabs (fsub (v4f32 MQPR:$Qm), (v4f32 MQPR:$Qn)))), + (MVE_VABDf32 MQPR:$Qm, MQPR:$Qn)>; +} + class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op, Operand imm_operand_type> : MVE_float<"vcvt", suffix, @@ -4047,8 +4047,8 @@ multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int, defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEInt] in { - def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), - (VTI.Vec (Inst $v))>; + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), + (VTI.Vec (Inst $v))>; def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>; @@ -4083,8 +4083,8 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12, let Inst{4} = 0b0; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; - - let isCommutable = 1; + + let isCommutable = 1; } multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI, @@ -4410,10 +4410,10 @@ let Predicates = [HasMVEInt] in { // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; -def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 4; -}]>; - +def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; + let Predicates = [HasMVEInt] in { foreach VT = [ v4i1, v8i1, v16i1 ] in { def : Pat<(i32 (predicate_cast (VT VCCR:$src))), @@ -4426,13 +4426,13 @@ let Predicates = [HasMVEInt] in { (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } - // If we happen to be casting from a load we can convert that straight - // into a predicate load, so long as the load is of the correct type. - foreach VT = [ v4i1, v8i1, v16i1 ] in { - def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))), - (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>; - } - + // If we happen to be casting from a load we can convert that straight + // into a predicate load, so long as the load is of the correct type. + foreach VT = [ v4i1, v8i1, v16i1 ] in { + def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))), + (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>; + } + // Here we match the specific SDNode type 'ARMVectorRegCastImpl' // rather than the more general 'ARMVectorRegCast' which would also // match some bitconverts. 
If we use the latter in cases where the @@ -4441,8 +4441,8 @@ let Predicates = [HasMVEInt] in { foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in - def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), - (VT MQPR:$src)>; + def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), + (VT MQPR:$src)>; } // end of MVE compares @@ -4770,7 +4770,7 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17, let Inst{16} = 0b1; let Inst{12} = T; let Inst{8} = 0b0; - let Inst{7} = !not(bit_17); + let Inst{7} = !not(bit_17); let Inst{0} = 0b1; let validForTailPredication = 1; let retainsPreviousHalfElement = 1; @@ -4801,7 +4801,7 @@ multiclass MVE_VMOVN_p<Instruction Inst, bit top, (VTI.Vec MQPR:$Qm), (i32 top))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>; - if !not(top) then { + if !not(top) then { // If we see MVEvmovn(a,ARMvrev(b),1), that wants to overwrite the odd // lanes of a with the odd lanes of b. In other words, the lanes we're // _keeping_ from a are the even ones. So we can flip it round and say that @@ -5173,11 +5173,11 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, // Vector-scalar add/sub multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>; - let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>; - } + let Predicates = [HasMVEInt] in { + defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>; + } } multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI> @@ -5196,35 +5196,35 @@ defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>; // Vector-scalar saturating add/sub multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt> { + SDNode Op, Intrinsic PredInt> { def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract, 0b0, VTI.Unsigned>; - - let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? (i32 VTI.Unsigned)), - !cast<Instruction>(NAME)>; - } + + let Predicates = [HasMVEInt] in { + defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? 
(i32 VTI.Unsigned)), + !cast<Instruction>(NAME)>; + } } -multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI, SDNode Op> - : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, Op, int_arm_mve_qadd_predicated>; +multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI, SDNode Op> + : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, Op, int_arm_mve_qadd_predicated>; -multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI, SDNode Op> - : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, Op, int_arm_mve_qsub_predicated>; +multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI, SDNode Op> + : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, Op, int_arm_mve_qsub_predicated>; -defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8, saddsat>; -defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16, saddsat>; -defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32, saddsat>; -defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8, uaddsat>; -defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16, uaddsat>; -defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32, uaddsat>; +defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8, saddsat>; +defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16, saddsat>; +defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32, saddsat>; +defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8, uaddsat>; +defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16, uaddsat>; +defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32, uaddsat>; -defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8, ssubsat>; -defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16, ssubsat>; -defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32, ssubsat>; -defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8, usubsat>; -defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16, usubsat>; -defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32, usubsat>; +defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8, ssubsat>; +defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16, ssubsat>; +defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32, ssubsat>; +defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8, usubsat>; +defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16, usubsat>; +defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32, usubsat>; class MVE_VQDMULL_qr<string iname, string suffix, bit size, bit T, string cstr="", list<dag> pattern=[]> @@ -5315,23 +5315,23 @@ defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>; defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; -multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, - SDNode Op, Intrinsic PredInt> { - def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract>; - defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ), - !cast<Instruction>(NAME)>; -} - +multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, + SDNode Op, Intrinsic PredInt> { + def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, subtract>; + defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? 
), + !cast<Instruction>(NAME)>; +} + let Predicates = [HasMVEFloat] in { - defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd, - int_arm_mve_add_predicated>; - defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd, - int_arm_mve_add_predicated>; + defm MVE_VADD_qr_f32 : MVE_VADDSUB_qr_f<"vadd", MVE_v4f32, 0b0, fadd, + int_arm_mve_add_predicated>; + defm MVE_VADD_qr_f16 : MVE_VADDSUB_qr_f<"vadd", MVE_v8f16, 0b0, fadd, + int_arm_mve_add_predicated>; - defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub, - int_arm_mve_sub_predicated>; - defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub, - int_arm_mve_sub_predicated>; + defm MVE_VSUB_qr_f32 : MVE_VADDSUB_qr_f<"vsub", MVE_v4f32, 0b1, fsub, + int_arm_mve_sub_predicated>; + defm MVE_VSUB_qr_f16 : MVE_VADDSUB_qr_f<"vsub", MVE_v8f16, 0b1, fsub, + int_arm_mve_sub_predicated>; } class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, @@ -5461,10 +5461,10 @@ class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size> multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> { def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>; - let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPatternDup<VTI, mul, int_arm_mve_mul_predicated, (? ), - !cast<Instruction>(NAME), ARMimmOneV>; - } + let Predicates = [HasMVEInt] in { + defm : MVE_TwoOpPatternDup<VTI, mul, int_arm_mve_mul_predicated, (? ), + !cast<Instruction>(NAME), ARMimmOneV>; + } } defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>; @@ -5481,25 +5481,25 @@ class MVE_VxxMUL_qr<string iname, string suffix, let Inst{12} = 0b0; let Inst{8} = 0b0; let Inst{5} = 0b1; - let validForTailPredication = 1; + let validForTailPredication = 1; } multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28, - PatFrag Op, Intrinsic int_unpred, Intrinsic int_pred> { + PatFrag Op, Intrinsic int_unpred, Intrinsic int_pred> { def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>; - - let Predicates = [HasMVEInt] in { - defm : MVE_TwoOpPatternDup<VTI, Op, int_pred, (? ), !cast<Instruction>(NAME)>; - } - defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, int_unpred, int_pred>; + + let Predicates = [HasMVEInt] in { + defm : MVE_TwoOpPatternDup<VTI, Op, int_pred, (? ), !cast<Instruction>(NAME)>; + } + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, int_unpred, int_pred>; } multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> : - MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, MVEvqdmulh, + MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, MVEvqdmulh, int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>; multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> : - MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, null_frag, + MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, null_frag, int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>; defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>; @@ -5510,17 +5510,17 @@ defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>; defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>; defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>; -multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI> { - let validForTailPredication = 1 in - def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11>; - defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? ), - !cast<Instruction>(NAME)>; +multiclass MVE_VxxMUL_qr_f_m<MVEVectorVTInfo VTI> { + let validForTailPredication = 1 in + def "" : MVE_VxxMUL_qr<"vmul", VTI.Suffix, VTI.Size{0}, 0b11>; + defm : MVE_TwoOpPatternDup<VTI, fmul, int_arm_mve_mul_predicated, (? 
), + !cast<Instruction>(NAME)>; } -let Predicates = [HasMVEFloat] in { - defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16>; - defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32>; -} +let Predicates = [HasMVEFloat] in { + defm MVE_VMUL_qr_f16 : MVE_VxxMUL_qr_f_m<MVE_v8f16>; + defm MVE_VMUL_qr_f32 : MVE_VxxMUL_qr_f_m<MVE_v4f32>; +} class MVE_VFMAMLA_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit S, @@ -5595,10 +5595,10 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI, if scalar_addend then { def : Pat<(VTI.Vec (fma v1, v2, vs)), (VTI.Vec (Inst v1, v2, is))>; - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec (fma v1, v2, vs)), - v1)), - (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec (fma v1, v2, vs)), + v1)), + (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>; def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)), (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>; } else { @@ -5606,14 +5606,14 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI, (VTI.Vec (Inst v2, v1, is))>; def : Pat<(VTI.Vec (fma vs, v1, v2)), (VTI.Vec (Inst v2, v1, is))>; - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec (fma vs, v2, v1)), - v1)), - (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>; - def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), - (VTI.Vec (fma v2, vs, v1)), - v1)), - (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec (fma vs, v2, v1)), + v1)), + (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + (VTI.Vec (fma v2, vs, v1)), + v1)), + (VTI.Vec (Inst v1, v2, is, ARMVCCThen, $pred))>; def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)), (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>; def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)), @@ -5742,7 +5742,7 @@ def MVE_VDWDUPu8 : MVE_VxWDUP<"vdwdup", "u8", 0b00, 0b1>; def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; -let isReMaterializable = 1 in +let isReMaterializable = 1 in class MVE_VCTPInst<string suffix, bits<2> size, list<dag> pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { @@ -5766,8 +5766,8 @@ multiclass MVE_VCTP<MVEVectorVTInfo VTI, Intrinsic intr> { defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEInt] in { - def : Pat<(intr rGPR:$Rn), - (VTI.Pred (Inst rGPR:$Rn))>; + def : Pat<(intr rGPR:$Rn), + (VTI.Pred (Inst rGPR:$Rn))>; def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)), (VTI.Pred (Inst rGPR:$Rn, ARMVCCThen, VCCR:$mask))>; } @@ -5845,41 +5845,41 @@ def MVE_VMOV_rr_q : MVE_VMOV_64bit<(outs rGPR:$Rt, rGPR:$Rt2), (ins MQPR:$Qd), let AsmMatchConverter = "cvtMVEVMOVQtoDReg"; } -let Predicates = [HasMVEInt] in { - // Double lane moves. There are a number of patterns here. We know that the - // insertelt's will be in descending order by index, and need to match the 5 - // patterns that might contain 2-0 or 3-1 pairs. These are: - // 3 2 1 0 -> vmovqrr 31; vmovqrr 20 - // 3 2 1 -> vmovqrr 31; vmov 2 - // 3 1 -> vmovqrr 31 - // 2 1 0 -> vmovqrr 20; vmov 1 - // 2 0 -> vmovqrr 20 - // The other potential patterns will be handled by single lane inserts. 
- def : Pat<(insertelt (insertelt (insertelt (insertelt (v4i32 MQPR:$src1), - rGPR:$srcA, (i32 0)), - rGPR:$srcB, (i32 1)), - rGPR:$srcC, (i32 2)), - rGPR:$srcD, (i32 3)), - (MVE_VMOV_q_rr (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcC, (i32 2), (i32 0)), - rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>; - def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1), - rGPR:$srcB, (i32 1)), - rGPR:$srcC, (i32 2)), - rGPR:$srcD, (i32 3)), - (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 2)), - rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>; - def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 1)), rGPR:$srcB, (i32 3)), - (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 3), (i32 1))>; - def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1), - rGPR:$srcB, (i32 0)), - rGPR:$srcC, (i32 1)), - rGPR:$srcD, (i32 2)), - (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 1)), - rGPR:$srcB, rGPR:$srcD, (i32 2), (i32 0))>; - def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 0)), rGPR:$srcB, (i32 2)), - (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 2), (i32 0))>; -} - +let Predicates = [HasMVEInt] in { + // Double lane moves. There are a number of patterns here. We know that the + // insertelt's will be in descending order by index, and need to match the 5 + // patterns that might contain 2-0 or 3-1 pairs. These are: + // 3 2 1 0 -> vmovqrr 31; vmovqrr 20 + // 3 2 1 -> vmovqrr 31; vmov 2 + // 3 1 -> vmovqrr 31 + // 2 1 0 -> vmovqrr 20; vmov 1 + // 2 0 -> vmovqrr 20 + // The other potential patterns will be handled by single lane inserts. + def : Pat<(insertelt (insertelt (insertelt (insertelt (v4i32 MQPR:$src1), + rGPR:$srcA, (i32 0)), + rGPR:$srcB, (i32 1)), + rGPR:$srcC, (i32 2)), + rGPR:$srcD, (i32 3)), + (MVE_VMOV_q_rr (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcC, (i32 2), (i32 0)), + rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>; + def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1), + rGPR:$srcB, (i32 1)), + rGPR:$srcC, (i32 2)), + rGPR:$srcD, (i32 3)), + (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 2)), + rGPR:$srcB, rGPR:$srcD, (i32 3), (i32 1))>; + def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 1)), rGPR:$srcB, (i32 3)), + (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 3), (i32 1))>; + def : Pat<(insertelt (insertelt (insertelt (v4i32 MQPR:$src1), + rGPR:$srcB, (i32 0)), + rGPR:$srcC, (i32 1)), + rGPR:$srcD, (i32 2)), + (MVE_VMOV_q_rr (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$srcC, (i32 1)), + rGPR:$srcB, rGPR:$srcD, (i32 2), (i32 0))>; + def : Pat<(insertelt (insertelt (v4i32 MQPR:$src1), rGPR:$srcA, (i32 0)), rGPR:$srcB, (i32 2)), + (MVE_VMOV_q_rr MQPR:$src1, rGPR:$srcA, rGPR:$srcB, (i32 2), (i32 0))>; +} + // end of coproc mov // start of MVE interleaving load/store @@ -5908,7 +5908,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size, let mayLoad = load; let mayStore = !eq(load,0); let hasSideEffects = 0; - let validForTailPredication = load; + let validForTailPredication = load; } // A parameter class used to encapsulate all the ways the writeback @@ -6518,7 +6518,7 @@ class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> patte let Inst{4} = 0b0; let Defs = [VPR]; - let validForTailPredication=1; + let validForTailPredication=1; } class MVE_VPTt1<string suffix, bits<2> size, dag iops> @@ -6631,7 +6631,7 @@ class MVE_VPTf<string suffix, bit size, dag iops, string asm, list<dag> pattern= let 
Defs = [VPR]; let Predicates = [HasMVEFloat]; - let validForTailPredication=1; + let validForTailPredication=1; } class MVE_VPTft1<string suffix, bit size> @@ -7107,7 +7107,7 @@ class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst, class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst, PatFrag LoadKind, int shift> - : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty (ARMvmovImm (i32 0))))), + : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty (ARMvmovImm (i32 0))))), (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>; multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind, @@ -7274,11 +7274,11 @@ multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string (VT (LoadUInst taddrmode_imm7<Shift>:$addr))>; // Masked ext loads - def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), + def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>; - def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), + def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), (VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>; - def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), + def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))), (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>; } diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrNEON.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrNEON.td index a8c0d05d91..0f5d53b57d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrNEON.td @@ -509,7 +509,7 @@ def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>; def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>; def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>; -def NEONvbsp : SDNode<"ARMISD::VBSP", +def NEONvbsp : SDNode<"ARMISD::VBSP", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -4197,10 +4197,10 @@ def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16", defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, "vaddl", "s", add, sext, 1>; defm VADDLu : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, - "vaddl", "u", add, zanyext, 1>; + "vaddl", "u", add, zanyext, 1>; // VADDW : Vector Add Wide (Q = Q + D) defm VADDWs : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>; -defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zanyext, 0>; +defm VADDWu : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zanyext, 0>; // VHADD : Vector Halving Add defm VHADDs : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, @@ -4512,9 +4512,9 @@ let Predicates = [HasNEON, HasV8_1a] in { (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), - (v4i32 (ARMvduplane (v4i32 QPR:$src3), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane)))))), (v4i32 (VQRDMLAHslv4i32 (v4i32 
QPR:$src1), (v4i32 QPR:$src2), @@ -4565,17 +4565,17 @@ let Predicates = [HasNEON, HasV8_1a] in { (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), - (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), - (v8i16 (ARMvduplane (v8i16 QPR:$src3), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane)))))), (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), - (v4i16 (EXTRACT_SUBREG + (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; @@ -4587,7 +4587,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), - (v2i32 (EXTRACT_SUBREG + (v2i32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; @@ -5045,10 +5045,10 @@ def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16", defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, "vsubl", "s", sub, sext, 0>; defm VSUBLu : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, - "vsubl", "u", sub, zanyext, 0>; + "vsubl", "u", sub, zanyext, 0>; // VSUBW : Vector Subtract Wide (Q = Q - D) defm VSUBWs : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>; -defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zanyext, 0>; +defm VSUBWu : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zanyext, 0>; // VHSUB : Vector Halving Subtract defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, @@ -5259,9 +5259,9 @@ def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", // Vector Bitwise Operations. def vnotd : PatFrag<(ops node:$in), - (xor node:$in, ARMimmAllOnesD)>; + (xor node:$in, ARMimmAllOnesD)>; def vnotq : PatFrag<(ops node:$in), - (xor node:$in, ARMimmAllOnesV)>; + (xor node:$in, ARMimmAllOnesV)>; // VAND : Vector Bitwise AND @@ -5428,84 +5428,84 @@ def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>; def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; } -// The TwoAddress pass will not go looking for equivalent operations -// with different register constraints; it just inserts copies. -// That is why pseudo VBSP implemented. Is is expanded later into -// VBIT/VBIF/VBSL taking into account register constraints to avoid copies. -def VBSPd - : PseudoNeonI<(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), - IIC_VBINiD, "", - [(set DPR:$Vd, - (v2i32 (NEONvbsp DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; +// The TwoAddress pass will not go looking for equivalent operations +// with different register constraints; it just inserts copies. +// That is why pseudo VBSP implemented. Is is expanded later into +// VBIT/VBIF/VBSL taking into account register constraints to avoid copies. 
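For readers skimming these patterns, a minimal scalar sketch in C++ of the select semantics that VBSP/VBSL implement, matching the (or (and Vn, Vd), (and Vm, (vnot Vd))) patterns below; the helper name and the 32-bit width are illustrative assumptions, not anything defined in this patch:

    #include <cstdint>

    // Per-bit select: take the bit from n where the mask (the original Vd)
    // is 1, and from m where it is 0. The VBSP pseudo is expanded later into
    // VBIT/VBIF/VBSL according to register constraints, but all of them
    // compute this same function.
    inline uint32_t bitwiseSelect(uint32_t mask, uint32_t n, uint32_t m) {
      return (n & mask) | (m & ~mask);
    }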
+def VBSPd + : PseudoNeonI<(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + IIC_VBINiD, "", + [(set DPR:$Vd, + (v2i32 (NEONvbsp DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; let Predicates = [HasNEON] in { def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1), (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))), - (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))), - (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))), - (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))), - (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1), (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))), - (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; } -def VBSPq - : PseudoNeonI<(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), - IIC_VBINiQ, "", - [(set QPR:$Vd, - (v4i32 (NEONvbsp QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; +def VBSPq + : PseudoNeonI<(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + IIC_VBINiQ, "", + [(set QPR:$Vd, + (v4i32 (NEONvbsp QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; let Predicates = [HasNEON] in { def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1), (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))), - (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1), (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))), - (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1), (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))), - (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))), - (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1), (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))), - (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; -} - -// VBSL : Vector Bitwise Select -def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), - (ins DPR:$src1, DPR:$Vn, DPR:$Vm), - N3RegFrm, IIC_VBINiD, - "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - []>; - -def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), - (ins QPR:$src1, QPR:$Vn, QPR:$Vm), - N3RegFrm, IIC_VBINiQ, - "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - []>; - + (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; +} + +// VBSL : Vector Bitwise Select +def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VBINiD, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; + +def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins 
QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VBINiQ, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; + // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, @@ -6040,9 +6040,9 @@ defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, // Vector Negate. def vnegd : PatFrag<(ops node:$in), - (sub ARMimmAllZerosD, node:$in)>; + (sub ARMimmAllZerosD, node:$in)>; def vnegq : PatFrag<(ops node:$in), - (sub ARMimmAllZerosV, node:$in)>; + (sub ARMimmAllZerosV, node:$in)>; class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty> : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm), @@ -6256,11 +6256,11 @@ defm : NEONImmReplicateInstAlias<i32, VMOVv2i32, VMOVv4i32, let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in { def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm, - [(set DPR:$Vd, (v2i32 ARMimmAllZerosD))], + [(set DPR:$Vd, (v2i32 ARMimmAllZerosD))], (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>, Requires<[HasZCZ]>; def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm, - [(set QPR:$Vd, (v4i32 ARMimmAllZerosV))], + [(set QPR:$Vd, (v4i32 ARMimmAllZerosV))], (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>, Requires<[HasZCZ]>; } @@ -7946,7 +7946,7 @@ let Predicates = [HasNEON,IsLE] in { (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; } -// The following patterns are basically a copy of the patterns above, +// The following patterns are basically a copy of the patterns above, // however with an additional VREV16d instruction to convert data // loaded by VLD1LN into proper vector format in big endian mode. let Predicates = [HasNEON,IsBE] in { @@ -9079,11 +9079,11 @@ multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; } -def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>; -def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>; +def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>; +def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>; -defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>; -defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>; +defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; class BF16MM<bit Q, RegisterClass RegTy, string opc> @@ -9091,8 +9091,8 @@ class BF16MM<bit Q, RegisterClass RegTy, (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD, "", "", [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd), - (v8bf16 QPR:$Vn), - (v8bf16 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); let DecoderNamespace = "VFPV8"; @@ -9106,8 +9106,8 @@ class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode> NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "", [(set (v4f32 QPR:$dst), (OpNode (v4f32 QPR:$Vd), - (v8bf16 QPR:$Vn), - (v8bf16 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let DecoderNamespace = "VFPV8"; } @@ -9128,9 +9128,9 @@ multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> { def : Pat< (v4f32 (OpNode (v4f32 
QPR:$Vd), - (v8bf16 QPR:$Vn), - (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), - VectorIndex16:$lane)))), + (v8bf16 QPR:$Vn), + (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), + VectorIndex16:$lane)))), (!cast<Instruction>(NAME) QPR:$Vd, QPR:$Vn, (EXTRACT_SUBREG QPR:$Vm, diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb.td index 3a33dfeecd..0b0c510102 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb.td @@ -548,19 +548,19 @@ let isCall = 1, // Also used for Thumb2 def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br, - "blx${p}\t$func", []>, + "blx${p}\t$func", []>, Requires<[IsThumb, HasV5T]>, T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24; bits<4> func; let Inst{6-3} = func; let Inst{2-0} = 0b000; } - def tBLXr_noip : ARMPseudoExpand<(outs), (ins pred:$p, GPRnoip:$func), - 2, IIC_Br, [], (tBLXr pred:$p, GPR:$func)>, - Requires<[IsThumb, HasV5T]>, - Sched<[WriteBrL]>; - + def tBLXr_noip : ARMPseudoExpand<(outs), (ins pred:$p, GPRnoip:$func), + 2, IIC_Br, [], (tBLXr pred:$p, GPR:$func)>, + Requires<[IsThumb, HasV5T]>, + Sched<[WriteBrL]>; + // ARMv8-M Security Extensions def tBLXNSr : TI<(outs), (ins pred:$p, GPRnopc:$func), IIC_Br, "blxns${p}\t$func", []>, @@ -590,11 +590,11 @@ let isCall = 1, Requires<[IsThumb]>, Sched<[WriteBr]>; } -def : ARMPat<(ARMcall GPR:$func), (tBLXr $func)>, - Requires<[IsThumb, HasV5T, NoSLSBLRMitigation]>; -def : ARMPat<(ARMcall GPRnoip:$func), (tBLXr_noip $func)>, - Requires<[IsThumb, HasV5T, SLSBLRMitigation]>; - +def : ARMPat<(ARMcall GPR:$func), (tBLXr $func)>, + Requires<[IsThumb, HasV5T, NoSLSBLRMitigation]>; +def : ARMPat<(ARMcall GPRnoip:$func), (tBLXr_noip $func)>, + Requires<[IsThumb, HasV5T, SLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isPredicable = 1 in def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br, diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb2.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb2.td index 5642cab32e..b79212a48b 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrThumb2.td @@ -1724,7 +1724,7 @@ def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb), // only. 
// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4 class T2IstT<bits<2> type, string opc, InstrItinClass ii> - : T2Ii8<(outs), (ins rGPR:$Rt, t2addrmode_imm8:$addr), ii, opc, + : T2Ii8<(outs), (ins rGPR:$Rt, t2addrmode_imm8:$addr), ii, opc, "\t$Rt, $addr", []>, Sched<[WriteST]> { let Inst{31-27} = 0b11111; let Inst{26-25} = 0b00; @@ -2623,9 +2623,9 @@ def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn), let Inst{4} = 0; } -def : T2Pat<(ARMssat GPRnopc:$Rn, imm0_31:$imm), +def : T2Pat<(ARMssat GPRnopc:$Rn, imm0_31:$imm), (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; -def : T2Pat<(ARMusat GPRnopc:$Rn, imm0_31:$imm), +def : T2Pat<(ARMusat GPRnopc:$Rn, imm0_31:$imm), (t2USAT imm0_31:$imm, GPRnopc:$Rn, 0)>; def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>; @@ -2635,24 +2635,24 @@ def : T2Pat<(int_arm_ssat16 GPR:$a, imm1_16:$pos), (t2SSAT16 imm1_16:$pos, GPR:$a)>; def : T2Pat<(int_arm_usat16 GPR:$a, imm0_15:$pos), (t2USAT16 imm0_15:$pos, GPR:$a)>; -def : T2Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos), - (t2SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>; -def : T2Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos), - (t2SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>; -def : T2Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), - (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; -def : T2Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos), - (t2USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>; -def : T2Pat<(ARMssat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), - (t2SSAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; -def : T2Pat<(ARMssat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), - (t2SSAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; -def : T2Pat<(ARMusat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), - (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; -def : T2Pat<(ARMusat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), - (t2USAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; - - +def : T2Pat<(int_arm_ssat (shl GPRnopc:$a, imm0_31:$shft), imm1_32:$pos), + (t2SSAT imm1_32:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : T2Pat<(int_arm_ssat (sra GPRnopc:$a, asr_imm:$shft), imm1_32:$pos), + (t2SSAT imm1_32:$pos, GPRnopc:$a, asr_imm:$shft)>; +def : T2Pat<(int_arm_usat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), + (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : T2Pat<(int_arm_usat (sra GPRnopc:$a, asr_imm:$shft), imm0_31:$pos), + (t2USAT imm0_31:$pos, GPRnopc:$a, asr_imm:$shft)>; +def : T2Pat<(ARMssat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), + (t2SSAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : T2Pat<(ARMssat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), + (t2SSAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; +def : T2Pat<(ARMusat (shl GPRnopc:$a, imm0_31:$shft), imm0_31:$pos), + (t2USAT imm0_31:$pos, GPRnopc:$a, imm0_31:$shft)>; +def : T2Pat<(ARMusat (sra GPRnopc:$Rn, asr_imm:$shft), imm0_31:$pos), + (t2USAT imm0_31:$pos, GPRnopc:$Rn, asr_imm:$shft)>; + + //===----------------------------------------------------------------------===// // Shift and rotate Instructions. // @@ -4935,15 +4935,15 @@ def : InstAlias<"pssbb", (t2DSB 0x4, 14, 0), 1>, Requires<[HasDB, IsThumb2]>; // Armv8-R 'Data Full Barrier' def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>; -// SpeculationBarrierEndBB must only be used after an unconditional control -// flow, i.e. after a terminator for which isBarrier is True. 
-let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { - def t2SpeculationBarrierISBDSBEndBB - : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; - def t2SpeculationBarrierSBEndBB - : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; -} - +// SpeculationBarrierEndBB must only be used after an unconditional control +// flow, i.e. after a terminator for which isBarrier is True. +let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in { + def t2SpeculationBarrierISBDSBEndBB + : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; + def t2SpeculationBarrierSBEndBB + : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; +} + // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional // width specifier. def : t2InstAlias<"ldr${p} $Rt, $addr", @@ -5429,17 +5429,17 @@ def t2LE : t2LOL<(outs ), (ins lelabel_u11:$label), "le", "$label"> { let isTerminator = 1; } -let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in { - -let usesCustomInserter = 1 in +let Predicates = [IsThumb2, HasV8_1MMainline, HasLOB] in { + +let usesCustomInserter = 1 in def t2DoLoopStart : - t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, - [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; - -let isTerminator = 1, hasSideEffects = 1 in -def t2DoLoopStartTP : - t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts), 4, IIC_Br, + [(set GPRlr:$X, (int_start_loop_iterations rGPR:$elts))]>; +let isTerminator = 1, hasSideEffects = 1 in +def t2DoLoopStartTP : + t2PseudoInst<(outs GPRlr:$X), (ins rGPR:$elts, rGPR:$count), 4, IIC_Br, []>; + let hasSideEffects = 0 in def t2LoopDec : t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), @@ -5458,14 +5458,14 @@ def t2LoopEnd : t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; -def t2LoopEndDec : - t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), - 8, IIC_Br, []>, Sched<[WriteBr]>; - +def t2LoopEndDec : + t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$elts, brtarget:$target), + 8, IIC_Br, []>, Sched<[WriteBr]>; + } // end isBranch, isTerminator, hasSideEffects -} - +} + } // end isNotDuplicable class CS<string iname, bits<4> opcode, list<dag> pattern=[]> @@ -5484,7 +5484,7 @@ class CS<string iname, bits<4> opcode, list<dag> pattern=[]> let Inst{3-0} = Rm{3-0}; let Uses = [CPSR]; - let hasSideEffects = 0; + let hasSideEffects = 0; } def t2CSEL : CS<"csel", 0b1000>; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrVFP.td b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrVFP.td index 2be58d7a0e..9034b35ded 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMInstrVFP.td @@ -54,16 +54,16 @@ def vfp_f16imm : Operand<f16>, let ParserMatchClass = FPImmOperand; } -def vfp_f32f16imm_xform : SDNodeXForm<fpimm, [{ - APFloat InVal = N->getValueAPF(); - uint32_t enc = ARM_AM::getFP32FP16Imm(InVal); - return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); - }]>; - -def vfp_f32f16imm : PatLeaf<(f32 fpimm), [{ - return ARM_AM::getFP32FP16Imm(N->getValueAPF()) != -1; - }], vfp_f32f16imm_xform>; - +def vfp_f32f16imm_xform : SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = ARM_AM::getFP32FP16Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>; + +def vfp_f32f16imm : PatLeaf<(f32 fpimm), [{ + return ARM_AM::getFP32FP16Imm(N->getValueAPF()) != -1; + }], 
vfp_f32f16imm_xform>; + def vfp_f32imm_xform : SDNodeXForm<fpimm, [{ APFloat InVal = N->getValueAPF(); uint32_t enc = ARM_AM::getFP32Imm(InVal); @@ -1561,8 +1561,8 @@ class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let Inst{5} = Sm{0}; let Inst{15-12} = Sd{4-1}; let Inst{22} = Sd{0}; - - let hasSideEffects = 0; + + let hasSideEffects = 0; } class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, @@ -2626,11 +2626,11 @@ def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm), } } -def : Pat<(f32 (vfp_f32f16imm:$imm)), - (f32 (COPY_TO_REGCLASS (f16 (FCONSTH (vfp_f32f16imm_xform (f32 $imm)))), SPR))> { - let Predicates = [HasFullFP16]; -} - +def : Pat<(f32 (vfp_f32f16imm:$imm)), + (f32 (COPY_TO_REGCLASS (f16 (FCONSTH (vfp_f32f16imm_xform (f32 $imm)))), SPR))> { + let Predicates = [HasFullFP16]; +} + //===----------------------------------------------------------------------===// // Assembler aliases. // @@ -2846,12 +2846,12 @@ let Predicates = [HasV8_1MMainline, HasMVEInt] in { } defm VSTR_P0 : vfp_vstrldr_sysreg<0b0,0b1101, "p0", (outs), (ins VCCR:$P0)>; - - let Defs = [VPR] in { - defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">; - } - defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0", - (outs VCCR:$P0), (ins)>; + + let Defs = [VPR] in { + defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">; + } + defm VLDR_P0 : vfp_vstrldr_sysreg<0b1,0b1101, "p0", + (outs VCCR:$P0), (ins)>; } let Uses = [FPSCR] in { diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMLegalizerInfo.cpp index d9b60f4c4e..92b7dd5047 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -88,7 +88,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR}) .legalFor({s32}) - .clampScalar(0, s32, s32); + .clampScalar(0, s32, s32); if (ST.hasNEON()) getActionDefinitionsBuilder({G_ADD, G_SUB}) diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index aa1fe4e4ff..e264726f91 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1268,7 +1268,7 @@ findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg, bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // Thumb1 is already using updating loads/stores. if (isThumb1) return false; - LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI); + LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI); const MachineOperand &BaseOP = MI->getOperand(0); Register Base = BaseOP.getReg(); @@ -1320,10 +1320,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { return false; } } - if (MergeInstr != MBB.end()) { - LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr); + if (MergeInstr != MBB.end()) { + LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr); MBB.erase(MergeInstr); - } + } unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) @@ -1338,7 +1338,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { // Transfer memoperands. 
MIB.setMemRefs(MI->memoperands()); - LLVM_DEBUG(dbgs() << " Added new load/store: " << *MIB); + LLVM_DEBUG(dbgs() << " Added new load/store: " << *MIB); MBB.erase(MBBI); return true; } @@ -1386,27 +1386,27 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, case ARM::t2LDRi8: case ARM::t2LDRi12: return ARM::t2LDR_POST; - case ARM::t2LDRBi8: - case ARM::t2LDRBi12: - return ARM::t2LDRB_POST; - case ARM::t2LDRSBi8: - case ARM::t2LDRSBi12: - return ARM::t2LDRSB_POST; - case ARM::t2LDRHi8: - case ARM::t2LDRHi12: - return ARM::t2LDRH_POST; - case ARM::t2LDRSHi8: - case ARM::t2LDRSHi12: - return ARM::t2LDRSH_POST; + case ARM::t2LDRBi8: + case ARM::t2LDRBi12: + return ARM::t2LDRB_POST; + case ARM::t2LDRSBi8: + case ARM::t2LDRSBi12: + return ARM::t2LDRSB_POST; + case ARM::t2LDRHi8: + case ARM::t2LDRHi12: + return ARM::t2LDRH_POST; + case ARM::t2LDRSHi8: + case ARM::t2LDRSHi12: + return ARM::t2LDRSH_POST; case ARM::t2STRi8: case ARM::t2STRi12: return ARM::t2STR_POST; - case ARM::t2STRBi8: - case ARM::t2STRBi12: - return ARM::t2STRB_POST; - case ARM::t2STRHi8: - case ARM::t2STRHi12: - return ARM::t2STRH_POST; + case ARM::t2STRBi8: + case ARM::t2STRBi12: + return ARM::t2STRB_POST; + case ARM::t2STRHi8: + case ARM::t2STRHi12: + return ARM::t2STRH_POST; case ARM::MVE_VLDRBS16: return ARM::MVE_VLDRBS16_post; @@ -1449,7 +1449,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // Thumb1 doesn't have updating LDR/STR. // FIXME: Use LDM/STM with single register instead. if (isThumb1) return false; - LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI); + LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << *MI); Register Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); @@ -1491,7 +1491,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { } else return false; } - LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr); + LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr); MBB.erase(MergeInstr); ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; @@ -1503,54 +1503,54 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // updating load/store-multiple instructions can be used with only one // register.) MachineOperand &MO = MI->getOperand(0); - auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) - .addReg(Base, getDefRegState(true)) // WB base register - .addReg(Base, getKillRegState(isLd ? BaseKill : false)) - .addImm(Pred) - .addReg(PredReg) - .addReg(MO.getReg(), (isLd ? getDefRegState(true) - : getKillRegState(MO.isKill()))) - .cloneMemRefs(*MI); - (void)MIB; - LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); + auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) + .addReg(Base, getDefRegState(true)) // WB base register + .addReg(Base, getKillRegState(isLd ? BaseKill : false)) + .addImm(Pred) + .addReg(PredReg) + .addReg(MO.getReg(), (isLd ? 
getDefRegState(true) + : getKillRegState(MO.isKill()))) + .cloneMemRefs(*MI); + (void)MIB; + LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); } else if (isLd) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { - auto MIB = - BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) - .addReg(Base, RegState::Define) - .addReg(Base) - .addImm(Offset) - .addImm(Pred) - .addReg(PredReg) - .cloneMemRefs(*MI); - (void)MIB; - LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); + auto MIB = + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base) + .addImm(Offset) + .addImm(Pred) + .addReg(PredReg) + .cloneMemRefs(*MI); + (void)MIB; + LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); } else { int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); - auto MIB = - BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) - .addReg(Base, RegState::Define) - .addReg(Base) - .addReg(0) - .addImm(Imm) - .add(predOps(Pred, PredReg)) - .cloneMemRefs(*MI); - (void)MIB; - LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); + auto MIB = + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base) + .addReg(0) + .addImm(Imm) + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); + (void)MIB; + LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); } } else { // t2LDR_PRE, t2LDR_POST - auto MIB = - BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) - .addReg(Base, RegState::Define) - .addReg(Base) - .addImm(Offset) - .add(predOps(Pred, PredReg)) - .cloneMemRefs(*MI); - (void)MIB; - LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); + auto MIB = + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) + .addReg(Base, RegState::Define) + .addReg(Base) + .addImm(Offset) + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); + (void)MIB; + LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); } } else { MachineOperand &MO = MI->getOperand(0); @@ -1560,25 +1560,25 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM2 && NewOpc == ARM::STR_POST_IMM) { int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST - auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) - .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base) - .addReg(0) - .addImm(Imm) - .add(predOps(Pred, PredReg)) - .cloneMemRefs(*MI); - (void)MIB; - LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); + auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base) + .addReg(0) + .addImm(Imm) + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); + (void)MIB; + LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); } else { // t2STR_PRE, t2STR_POST - auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) - .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base) - .addImm(Offset) - .add(predOps(Pred, PredReg)) - .cloneMemRefs(*MI); - (void)MIB; - LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); + auto MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) + .addReg(Base) + .addImm(Offset) + .add(predOps(Pred, PredReg)) + .cloneMemRefs(*MI); + (void)MIB; + LLVM_DEBUG(dbgs() << " Added new instruction: " << *MIB); } } MBB.erase(MBBI); @@ -1592,7 +1592,7 @@ bool 
ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { "Must have t2STRDi8 or t2LDRDi8"); if (MI.getOperand(3).getImm() != 0) return false; - LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << MI); + LLVM_DEBUG(dbgs() << "Attempting to merge update of: " << MI); // Behaviour for writeback is undefined if base register is the same as one // of the others. @@ -1620,7 +1620,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { } else return false; } - LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr); + LLVM_DEBUG(dbgs() << " Erasing old increment: " << *MergeInstr); MBB.erase(MergeInstr); DebugLoc DL = MI.getDebugLoc(); @@ -1642,7 +1642,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { MIB.add(MO); MIB.cloneMemRefs(MI); - LLVM_DEBUG(dbgs() << " Added new load/store: " << *MIB); + LLVM_DEBUG(dbgs() << " Added new load/store: " << *MIB); MBB.erase(MBBI); return true; } @@ -2585,169 +2585,169 @@ static int getBaseOperandIndex(MachineInstr &MI) { case ARM::MVE_VSTRBU8: case ARM::MVE_VSTRHU16: case ARM::MVE_VSTRWU32: - case ARM::t2LDRHi8: - case ARM::t2LDRHi12: - case ARM::t2LDRSHi8: - case ARM::t2LDRSHi12: - case ARM::t2LDRBi8: - case ARM::t2LDRBi12: - case ARM::t2LDRSBi8: - case ARM::t2LDRSBi12: - case ARM::t2STRBi8: - case ARM::t2STRBi12: - case ARM::t2STRHi8: - case ARM::t2STRHi12: + case ARM::t2LDRHi8: + case ARM::t2LDRHi12: + case ARM::t2LDRSHi8: + case ARM::t2LDRSHi12: + case ARM::t2LDRBi8: + case ARM::t2LDRBi12: + case ARM::t2LDRSBi8: + case ARM::t2LDRSBi12: + case ARM::t2STRBi8: + case ARM::t2STRBi12: + case ARM::t2STRHi8: + case ARM::t2STRHi12: return 1; - case ARM::MVE_VLDRBS16_post: - case ARM::MVE_VLDRBS32_post: - case ARM::MVE_VLDRBU16_post: - case ARM::MVE_VLDRBU32_post: - case ARM::MVE_VLDRHS32_post: - case ARM::MVE_VLDRHU32_post: - case ARM::MVE_VLDRBU8_post: - case ARM::MVE_VLDRHU16_post: - case ARM::MVE_VLDRWU32_post: - case ARM::MVE_VSTRB16_post: - case ARM::MVE_VSTRB32_post: - case ARM::MVE_VSTRH32_post: - case ARM::MVE_VSTRBU8_post: - case ARM::MVE_VSTRHU16_post: - case ARM::MVE_VSTRWU32_post: - case ARM::MVE_VLDRBS16_pre: - case ARM::MVE_VLDRBS32_pre: - case ARM::MVE_VLDRBU16_pre: - case ARM::MVE_VLDRBU32_pre: - case ARM::MVE_VLDRHS32_pre: - case ARM::MVE_VLDRHU32_pre: - case ARM::MVE_VLDRBU8_pre: - case ARM::MVE_VLDRHU16_pre: - case ARM::MVE_VLDRWU32_pre: - case ARM::MVE_VSTRB16_pre: - case ARM::MVE_VSTRB32_pre: - case ARM::MVE_VSTRH32_pre: - case ARM::MVE_VSTRBU8_pre: - case ARM::MVE_VSTRHU16_pre: - case ARM::MVE_VSTRWU32_pre: - return 2; + case ARM::MVE_VLDRBS16_post: + case ARM::MVE_VLDRBS32_post: + case ARM::MVE_VLDRBU16_post: + case ARM::MVE_VLDRBU32_post: + case ARM::MVE_VLDRHS32_post: + case ARM::MVE_VLDRHU32_post: + case ARM::MVE_VLDRBU8_post: + case ARM::MVE_VLDRHU16_post: + case ARM::MVE_VLDRWU32_post: + case ARM::MVE_VSTRB16_post: + case ARM::MVE_VSTRB32_post: + case ARM::MVE_VSTRH32_post: + case ARM::MVE_VSTRBU8_post: + case ARM::MVE_VSTRHU16_post: + case ARM::MVE_VSTRWU32_post: + case ARM::MVE_VLDRBS16_pre: + case ARM::MVE_VLDRBS32_pre: + case ARM::MVE_VLDRBU16_pre: + case ARM::MVE_VLDRBU32_pre: + case ARM::MVE_VLDRHS32_pre: + case ARM::MVE_VLDRHU32_pre: + case ARM::MVE_VLDRBU8_pre: + case ARM::MVE_VLDRHU16_pre: + case ARM::MVE_VLDRWU32_pre: + case ARM::MVE_VSTRB16_pre: + case ARM::MVE_VSTRB32_pre: + case ARM::MVE_VSTRH32_pre: + case ARM::MVE_VSTRBU8_pre: + case ARM::MVE_VSTRHU16_pre: + case ARM::MVE_VSTRWU32_pre: + return 2; } return -1; } -static bool isPostIndex(MachineInstr 
&MI) { - switch (MI.getOpcode()) { - case ARM::MVE_VLDRBS16_post: - case ARM::MVE_VLDRBS32_post: - case ARM::MVE_VLDRBU16_post: - case ARM::MVE_VLDRBU32_post: - case ARM::MVE_VLDRHS32_post: - case ARM::MVE_VLDRHU32_post: - case ARM::MVE_VLDRBU8_post: - case ARM::MVE_VLDRHU16_post: - case ARM::MVE_VLDRWU32_post: - case ARM::MVE_VSTRB16_post: - case ARM::MVE_VSTRB32_post: - case ARM::MVE_VSTRH32_post: - case ARM::MVE_VSTRBU8_post: - case ARM::MVE_VSTRHU16_post: - case ARM::MVE_VSTRWU32_post: - return true; - } - return false; -} - -static bool isPreIndex(MachineInstr &MI) { - switch (MI.getOpcode()) { - case ARM::MVE_VLDRBS16_pre: - case ARM::MVE_VLDRBS32_pre: - case ARM::MVE_VLDRBU16_pre: - case ARM::MVE_VLDRBU32_pre: - case ARM::MVE_VLDRHS32_pre: - case ARM::MVE_VLDRHU32_pre: - case ARM::MVE_VLDRBU8_pre: - case ARM::MVE_VLDRHU16_pre: - case ARM::MVE_VLDRWU32_pre: - case ARM::MVE_VSTRB16_pre: - case ARM::MVE_VSTRB32_pre: - case ARM::MVE_VSTRH32_pre: - case ARM::MVE_VSTRBU8_pre: - case ARM::MVE_VSTRHU16_pre: - case ARM::MVE_VSTRWU32_pre: - return true; - } - return false; -} - -// Given a memory access Opcode, check that the give Imm would be a valid Offset -// for this instruction (same as isLegalAddressImm), Or if the instruction -// could be easily converted to one where that was valid. For example converting -// t2LDRi12 to t2LDRi8 for negative offsets. Works in conjunction with -// AdjustBaseAndOffset below. -static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm, - const TargetInstrInfo *TII, - int &CodesizeEstimate) { - if (isLegalAddressImm(Opcode, Imm, TII)) - return true; - - // We can convert AddrModeT2_i12 to AddrModeT2_i8. - const MCInstrDesc &Desc = TII->get(Opcode); - unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); - switch (AddrMode) { - case ARMII::AddrModeT2_i12: - CodesizeEstimate += 1; - return std::abs(Imm) < (((1 << 8) * 1) - 1); - } - return false; -} - -// Given an MI adjust its address BaseReg to use NewBaseReg and address offset -// by -Offset. This can either happen in-place or be a replacement as MI is -// converted to another instruction type. 
-static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, - int Offset, const TargetInstrInfo *TII) { - unsigned BaseOp = getBaseOperandIndex(*MI); - MI->getOperand(BaseOp).setReg(NewBaseReg); - int OldOffset = MI->getOperand(BaseOp + 1).getImm(); - if (isLegalAddressImm(MI->getOpcode(), OldOffset - Offset, TII)) - MI->getOperand(BaseOp + 1).setImm(OldOffset - Offset); - else { - unsigned ConvOpcode; - switch (MI->getOpcode()) { - case ARM::t2LDRHi12: - ConvOpcode = ARM::t2LDRHi8; - break; - case ARM::t2LDRSHi12: - ConvOpcode = ARM::t2LDRSHi8; - break; - case ARM::t2LDRBi12: - ConvOpcode = ARM::t2LDRBi8; - break; - case ARM::t2LDRSBi12: - ConvOpcode = ARM::t2LDRSBi8; - break; - case ARM::t2STRHi12: - ConvOpcode = ARM::t2STRHi8; - break; - case ARM::t2STRBi12: - ConvOpcode = ARM::t2STRBi8; - break; - default: - llvm_unreachable("Unhandled convertable opcode"); - } - assert(isLegalAddressImm(ConvOpcode, OldOffset - Offset, TII) && - "Illegal Address Immediate after convert!"); - - const MCInstrDesc &MCID = TII->get(ConvOpcode); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)) - .addImm(OldOffset - Offset) - .add(MI->getOperand(3)) - .add(MI->getOperand(4)) - .cloneMemRefs(*MI); - MI->eraseFromParent(); - } -} - +static bool isPostIndex(MachineInstr &MI) { + switch (MI.getOpcode()) { + case ARM::MVE_VLDRBS16_post: + case ARM::MVE_VLDRBS32_post: + case ARM::MVE_VLDRBU16_post: + case ARM::MVE_VLDRBU32_post: + case ARM::MVE_VLDRHS32_post: + case ARM::MVE_VLDRHU32_post: + case ARM::MVE_VLDRBU8_post: + case ARM::MVE_VLDRHU16_post: + case ARM::MVE_VLDRWU32_post: + case ARM::MVE_VSTRB16_post: + case ARM::MVE_VSTRB32_post: + case ARM::MVE_VSTRH32_post: + case ARM::MVE_VSTRBU8_post: + case ARM::MVE_VSTRHU16_post: + case ARM::MVE_VSTRWU32_post: + return true; + } + return false; +} + +static bool isPreIndex(MachineInstr &MI) { + switch (MI.getOpcode()) { + case ARM::MVE_VLDRBS16_pre: + case ARM::MVE_VLDRBS32_pre: + case ARM::MVE_VLDRBU16_pre: + case ARM::MVE_VLDRBU32_pre: + case ARM::MVE_VLDRHS32_pre: + case ARM::MVE_VLDRHU32_pre: + case ARM::MVE_VLDRBU8_pre: + case ARM::MVE_VLDRHU16_pre: + case ARM::MVE_VLDRWU32_pre: + case ARM::MVE_VSTRB16_pre: + case ARM::MVE_VSTRB32_pre: + case ARM::MVE_VSTRH32_pre: + case ARM::MVE_VSTRBU8_pre: + case ARM::MVE_VSTRHU16_pre: + case ARM::MVE_VSTRWU32_pre: + return true; + } + return false; +} + +// Given a memory access Opcode, check that the give Imm would be a valid Offset +// for this instruction (same as isLegalAddressImm), Or if the instruction +// could be easily converted to one where that was valid. For example converting +// t2LDRi12 to t2LDRi8 for negative offsets. Works in conjunction with +// AdjustBaseAndOffset below. +static bool isLegalOrConvertableAddressImm(unsigned Opcode, int Imm, + const TargetInstrInfo *TII, + int &CodesizeEstimate) { + if (isLegalAddressImm(Opcode, Imm, TII)) + return true; + + // We can convert AddrModeT2_i12 to AddrModeT2_i8. + const MCInstrDesc &Desc = TII->get(Opcode); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + switch (AddrMode) { + case ARMII::AddrModeT2_i12: + CodesizeEstimate += 1; + return std::abs(Imm) < (((1 << 8) * 1) - 1); + } + return false; +} + +// Given an MI adjust its address BaseReg to use NewBaseReg and address offset +// by -Offset. This can either happen in-place or be a replacement as MI is +// converted to another instruction type. 
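A rough sketch of the rebasing rule that AdjustBaseAndOffset below applies; the helper names and the 0..4095 / |imm| < 255 encoding ranges are stated here as assumptions about the Thumb-2 i12 and i8 addressing modes rather than taken from this patch:

    #include <cstdlib>

    // Rebase an access by the folded increment; if the result no longer fits
    // the unsigned i12 encoding (for example it went negative), the
    // instruction is rewritten to its i8 form, which accepts small signed
    // offsets.
    int rebaseOffset(int oldOffset, int increment) { return oldOffset - increment; }
    bool fitsT2i12(int imm) { return imm >= 0 && imm < 4096; }
    bool fitsT2i8(int imm) { return std::abs(imm) < 255; }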
+static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, + int Offset, const TargetInstrInfo *TII) { + unsigned BaseOp = getBaseOperandIndex(*MI); + MI->getOperand(BaseOp).setReg(NewBaseReg); + int OldOffset = MI->getOperand(BaseOp + 1).getImm(); + if (isLegalAddressImm(MI->getOpcode(), OldOffset - Offset, TII)) + MI->getOperand(BaseOp + 1).setImm(OldOffset - Offset); + else { + unsigned ConvOpcode; + switch (MI->getOpcode()) { + case ARM::t2LDRHi12: + ConvOpcode = ARM::t2LDRHi8; + break; + case ARM::t2LDRSHi12: + ConvOpcode = ARM::t2LDRSHi8; + break; + case ARM::t2LDRBi12: + ConvOpcode = ARM::t2LDRBi8; + break; + case ARM::t2LDRSBi12: + ConvOpcode = ARM::t2LDRSBi8; + break; + case ARM::t2STRHi12: + ConvOpcode = ARM::t2STRHi8; + break; + case ARM::t2STRBi12: + ConvOpcode = ARM::t2STRBi8; + break; + default: + llvm_unreachable("Unhandled convertable opcode"); + } + assert(isLegalAddressImm(ConvOpcode, OldOffset - Offset, TII) && + "Illegal Address Immediate after convert!"); + + const MCInstrDesc &MCID = TII->get(ConvOpcode); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(OldOffset - Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + MI->eraseFromParent(); + } +} + static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, Register NewReg, const TargetInstrInfo *TII, @@ -2766,70 +2766,70 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, TRC = TII->getRegClass(MCID, 2, TRI, *MF); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); - unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); - switch (AddrMode) { - case ARMII::AddrModeT2_i7: - case ARMII::AddrModeT2_i7s2: - case ARMII::AddrModeT2_i7s4: - // Any MVE load/store - return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) - .addReg(NewReg, RegState::Define) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)) - .addImm(Offset) - .add(MI->getOperand(3)) - .add(MI->getOperand(4)) - .cloneMemRefs(*MI); - case ARMII::AddrModeT2_i8: - if (MI->mayLoad()) { - return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) - .add(MI->getOperand(0)) - .addReg(NewReg, RegState::Define) - .add(MI->getOperand(1)) - .addImm(Offset) - .add(MI->getOperand(3)) - .add(MI->getOperand(4)) - .cloneMemRefs(*MI); - } else { - return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) - .addReg(NewReg, RegState::Define) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)) - .addImm(Offset) - .add(MI->getOperand(3)) - .add(MI->getOperand(4)) - .cloneMemRefs(*MI); - } - default: - llvm_unreachable("Unhandled createPostIncLoadStore"); - } + unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); + switch (AddrMode) { + case ARMII::AddrModeT2_i7: + case ARMII::AddrModeT2_i7s2: + case ARMII::AddrModeT2_i7s4: + // Any MVE load/store + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .addReg(NewReg, RegState::Define) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + case ARMII::AddrModeT2_i8: + if (MI->mayLoad()) { + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .add(MI->getOperand(0)) + .addReg(NewReg, RegState::Define) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + } else { + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .addReg(NewReg, RegState::Define) + 
.add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); + } + default: + llvm_unreachable("Unhandled createPostIncLoadStore"); + } } // Given a Base Register, optimise the load/store uses to attempt to create more -// post-inc accesses and less register moves. We do this by taking zero offset -// loads/stores with an add, and convert them to a postinc load/store of the -// same type. Any subsequent accesses will be adjusted to use and account for -// the post-inc value. +// post-inc accesses and less register moves. We do this by taking zero offset +// loads/stores with an add, and convert them to a postinc load/store of the +// same type. Any subsequent accesses will be adjusted to use and account for +// the post-inc value. // For example: // LDR #0 LDR_POSTINC #16 // LDR #4 LDR #-12 // LDR #8 LDR #-8 // LDR #12 LDR #-4 // ADD #16 -// -// At the same time if we do not find an increment but do find an existing -// pre/post inc instruction, we can still adjust the offsets of subsequent -// instructions to save the register move that would otherwise be needed for the -// in-place increment. +// +// At the same time if we do not find an increment but do find an existing +// pre/post inc instruction, we can still adjust the offsets of subsequent +// instructions to save the register move that would otherwise be needed for the +// in-place increment. bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { // We are looking for: // One zero offset load/store that can become postinc MachineInstr *BaseAccess = nullptr; - MachineInstr *PrePostInc = nullptr; + MachineInstr *PrePostInc = nullptr; // An increment that can be folded in MachineInstr *Increment = nullptr; // Other accesses after BaseAccess that will need to be updated to use the - // postinc value. + // postinc value. SmallPtrSet<MachineInstr *, 8> OtherAccesses; for (auto &Use : MRI->use_nodbg_instructions(Base)) { if (!Increment && getAddSubImmediate(Use) != 0) { @@ -2844,81 +2844,81 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { if (!Use.getOperand(BaseOp).isReg() || Use.getOperand(BaseOp).getReg() != Base) return false; - if (isPreIndex(Use) || isPostIndex(Use)) - PrePostInc = &Use; - else if (Use.getOperand(BaseOp + 1).getImm() == 0) + if (isPreIndex(Use) || isPostIndex(Use)) + PrePostInc = &Use; + else if (Use.getOperand(BaseOp + 1).getImm() == 0) BaseAccess = &Use; else OtherAccesses.insert(&Use); } - int IncrementOffset; - Register NewBaseReg; - if (BaseAccess && Increment) { - if (PrePostInc || BaseAccess->getParent() != Increment->getParent()) - return false; - Register PredReg; - if (Increment->definesRegister(ARM::CPSR) || - getInstrPredicate(*Increment, PredReg) != ARMCC::AL) - return false; - - LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg " - << Base.virtRegIndex() << "\n"); - - // Make sure that Increment has no uses before BaseAccess. - for (MachineInstr &Use : - MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) { - if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) { - LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n"); - return false; - } - } - - // Make sure that Increment can be folded into Base - IncrementOffset = getAddSubImmediate(*Increment); - unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode( - BaseAccess->getOpcode(), IncrementOffset > 0 ? 
ARM_AM::add : ARM_AM::sub); - if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) { - LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n"); + int IncrementOffset; + Register NewBaseReg; + if (BaseAccess && Increment) { + if (PrePostInc || BaseAccess->getParent() != Increment->getParent()) + return false; + Register PredReg; + if (Increment->definesRegister(ARM::CPSR) || + getInstrPredicate(*Increment, PredReg) != ARMCC::AL) + return false; + + LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg " + << Base.virtRegIndex() << "\n"); + + // Make sure that Increment has no uses before BaseAccess. + for (MachineInstr &Use : + MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) { + if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) { + LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n"); + return false; + } + } + + // Make sure that Increment can be folded into Base + IncrementOffset = getAddSubImmediate(*Increment); + unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode( + BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub); + if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) { + LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n"); return false; } } - else if (PrePostInc) { - // If we already have a pre/post index load/store then set BaseAccess, - // IncrementOffset and NewBaseReg to the values it already produces, - // allowing us to update and subsequent uses of BaseOp reg with the - // incremented value. - if (Increment) - return false; - - LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on already " - << "indexed VirtualReg " << Base.virtRegIndex() << "\n"); - int BaseOp = getBaseOperandIndex(*PrePostInc); - IncrementOffset = PrePostInc->getOperand(BaseOp+1).getImm(); - BaseAccess = PrePostInc; - NewBaseReg = PrePostInc->getOperand(0).getReg(); - } - else + else if (PrePostInc) { + // If we already have a pre/post index load/store then set BaseAccess, + // IncrementOffset and NewBaseReg to the values it already produces, + // allowing us to update and subsequent uses of BaseOp reg with the + // incremented value. + if (Increment) + return false; + + LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on already " + << "indexed VirtualReg " << Base.virtRegIndex() << "\n"); + int BaseOp = getBaseOperandIndex(*PrePostInc); + IncrementOffset = PrePostInc->getOperand(BaseOp+1).getImm(); + BaseAccess = PrePostInc; + NewBaseReg = PrePostInc->getOperand(0).getReg(); + } + else return false; // And make sure that the negative value of increment can be added to all // other offsets after the BaseAccess. We rely on either // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess) // to keep things simple. - // This also adds a simple codesize metric, to detect if an instruction (like - // t2LDRBi12) which can often be shrunk to a thumb1 instruction (tLDRBi) - // cannot because it is converted to something else (t2LDRBi8). We start this - // at -1 for the gain from removing the increment. + // This also adds a simple codesize metric, to detect if an instruction (like + // t2LDRBi12) which can often be shrunk to a thumb1 instruction (tLDRBi) + // cannot because it is converted to something else (t2LDRBi8). We start this + // at -1 for the gain from removing the increment. 
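A compact restatement of the size heuristic tracked below, using an illustrative helper name that is not part of the function's real interface:

    // Start the estimate at -1 (the separate increment instruction goes away),
    // then add +1 for every use whose rebased offset only fits after an
    // i12 -> i8 conversion (e.g. t2LDRBi12 to t2LDRBi8), since the i8 form can
    // no longer shrink to a 16-bit Thumb-1 encoding such as tLDRBi.
    bool profitableUnderMinSize(int numConvertedUses) {
      int codesizeEstimate = -1 + numConvertedUses;
      return codesizeEstimate <= 0; // mirrors the hasMinSize() bail-out below
    }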
SmallPtrSet<MachineInstr *, 4> SuccessorAccesses; - int CodesizeEstimate = -1; + int CodesizeEstimate = -1; for (auto *Use : OtherAccesses) { if (DT->dominates(BaseAccess, Use)) { SuccessorAccesses.insert(Use); unsigned BaseOp = getBaseOperandIndex(*Use); - if (!isLegalOrConvertableAddressImm(Use->getOpcode(), - Use->getOperand(BaseOp + 1).getImm() - - IncrementOffset, - TII, CodesizeEstimate)) { + if (!isLegalOrConvertableAddressImm(Use->getOpcode(), + Use->getOperand(BaseOp + 1).getImm() - + IncrementOffset, + TII, CodesizeEstimate)) { LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n"); return false; } @@ -2928,27 +2928,27 @@ bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { return false; } } - if (STI->hasMinSize() && CodesizeEstimate > 0) { - LLVM_DEBUG(dbgs() << " Expected to grow instructions under minsize\n"); - return false; - } - - if (!PrePostInc) { - // Replace BaseAccess with a post inc - LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump()); - LLVM_DEBUG(dbgs() << " And : "; Increment->dump()); - NewBaseReg = Increment->getOperand(0).getReg(); - MachineInstr *BaseAccessPost = - createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI); - BaseAccess->eraseFromParent(); - Increment->eraseFromParent(); - (void)BaseAccessPost; - LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump()); - } + if (STI->hasMinSize() && CodesizeEstimate > 0) { + LLVM_DEBUG(dbgs() << " Expected to grow instructions under minsize\n"); + return false; + } + + if (!PrePostInc) { + // Replace BaseAccess with a post inc + LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump()); + LLVM_DEBUG(dbgs() << " And : "; Increment->dump()); + NewBaseReg = Increment->getOperand(0).getReg(); + MachineInstr *BaseAccessPost = + createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI); + BaseAccess->eraseFromParent(); + Increment->eraseFromParent(); + (void)BaseAccessPost; + LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump()); + } for (auto *Use : SuccessorAccesses) { LLVM_DEBUG(dbgs() << "Changing: "; Use->dump()); - AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII); + AdjustBaseAndOffset(Use, NewBaseReg, IncrementOffset, TII); LLVM_DEBUG(dbgs() << " To : "; Use->dump()); } diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMLowOverheadLoops.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMLowOverheadLoops.cpp index 8dc5320584..144e845550 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -56,7 +56,7 @@ #include "ARMBaseRegisterInfo.h" #include "ARMBasicBlockInfo.h" #include "ARMSubtarget.h" -#include "MVETailPredUtils.h" +#include "MVETailPredUtils.h" #include "Thumb2InstrInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" @@ -74,37 +74,37 @@ using namespace llvm; #define DEBUG_TYPE "arm-low-overhead-loops" #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" -static cl::opt<bool> -DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden, - cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"), - cl::init(false)); - -static bool isVectorPredicated(MachineInstr *MI) { - int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); - return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR; -} - -static bool isVectorPredicate(MachineInstr *MI) { - return MI->findRegisterDefOperandIdx(ARM::VPR) != -1; -} - -static bool hasVPRUse(MachineInstr &MI) { - return MI.findRegisterUseOperandIdx(ARM::VPR) != -1; -} - 
-static bool isDomainMVE(MachineInstr *MI) { - uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; - return Domain == ARMII::DomainMVE; -} - -static bool shouldInspect(MachineInstr &MI) { - return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); -} - -static bool isDo(MachineInstr *MI) { - return MI->getOpcode() != ARM::t2WhileLoopStart; -} - +static cl::opt<bool> +DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden, + cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"), + cl::init(false)); + +static bool isVectorPredicated(MachineInstr *MI) { + int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); + return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR; +} + +static bool isVectorPredicate(MachineInstr *MI) { + return MI->findRegisterDefOperandIdx(ARM::VPR) != -1; +} + +static bool hasVPRUse(MachineInstr &MI) { + return MI.findRegisterUseOperandIdx(ARM::VPR) != -1; +} + +static bool isDomainMVE(MachineInstr *MI) { + uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; + return Domain == ARMII::DomainMVE; +} + +static bool shouldInspect(MachineInstr &MI) { + return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); +} + +static bool isDo(MachineInstr *MI) { + return MI->getOpcode() != ARM::t2WhileLoopStart; +} + namespace { using InstSet = SmallPtrSetImpl<MachineInstr *>; @@ -143,7 +143,7 @@ namespace { // Insert exit blocks. SmallVector<MachineBasicBlock*, 2> ExitBlocks; ML.getExitBlocks(ExitBlocks); - append_range(Order, ExitBlocks); + append_range(Order, ExitBlocks); // Then add the loop body. Search(ML.getHeader()); @@ -174,187 +174,187 @@ namespace { } }; - // Represent the current state of the VPR and hold all instances which - // represent a VPT block, which is a list of instructions that begins with a - // VPT/VPST and has a maximum of four proceeding instructions. All - // instructions within the block are predicated upon the vpr and we allow - // instructions to define the vpr within in the block too. - class VPTState { - friend struct LowOverheadLoop; - - SmallVector<MachineInstr *, 4> Insts; - - static SmallVector<VPTState, 4> Blocks; - static SetVector<MachineInstr *> CurrentPredicates; - static std::map<MachineInstr *, - std::unique_ptr<PredicatedMI>> PredicatedInsts; - - static void CreateVPTBlock(MachineInstr *MI) { - assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR)) - && "Can't begin VPT without predicate"); - Blocks.emplace_back(MI); - // The execution of MI is predicated upon the current set of instructions - // that are AND'ed together to form the VPR predicate value. In the case - // that MI is a VPT, CurrentPredicates will also just be MI. - PredicatedInsts.emplace( - MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates)); - } - - static void reset() { - Blocks.clear(); - PredicatedInsts.clear(); - CurrentPredicates.clear(); - } - - static void addInst(MachineInstr *MI) { - Blocks.back().insert(MI); - PredicatedInsts.emplace( - MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates)); - } - - static void addPredicate(MachineInstr *MI) { - LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI); - CurrentPredicates.insert(MI); + // Represent the current state of the VPR and hold all instances which + // represent a VPT block, which is a list of instructions that begins with a + // VPT/VPST and has a maximum of four proceeding instructions. 
All + // instructions within the block are predicated upon the vpr and we allow + // instructions to define the vpr within in the block too. + class VPTState { + friend struct LowOverheadLoop; + + SmallVector<MachineInstr *, 4> Insts; + + static SmallVector<VPTState, 4> Blocks; + static SetVector<MachineInstr *> CurrentPredicates; + static std::map<MachineInstr *, + std::unique_ptr<PredicatedMI>> PredicatedInsts; + + static void CreateVPTBlock(MachineInstr *MI) { + assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR)) + && "Can't begin VPT without predicate"); + Blocks.emplace_back(MI); + // The execution of MI is predicated upon the current set of instructions + // that are AND'ed together to form the VPR predicate value. In the case + // that MI is a VPT, CurrentPredicates will also just be MI. + PredicatedInsts.emplace( + MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates)); } - static void resetPredicate(MachineInstr *MI) { - LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI); - CurrentPredicates.clear(); - CurrentPredicates.insert(MI); + static void reset() { + Blocks.clear(); + PredicatedInsts.clear(); + CurrentPredicates.clear(); } - public: + static void addInst(MachineInstr *MI) { + Blocks.back().insert(MI); + PredicatedInsts.emplace( + MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates)); + } + + static void addPredicate(MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI); + CurrentPredicates.insert(MI); + } + + static void resetPredicate(MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI); + CurrentPredicates.clear(); + CurrentPredicates.insert(MI); + } + + public: // Have we found an instruction within the block which defines the vpr? If // so, not all the instructions in the block will have the same predicate. - static bool hasUniformPredicate(VPTState &Block) { - return getDivergent(Block) == nullptr; + static bool hasUniformPredicate(VPTState &Block) { + return getDivergent(Block) == nullptr; } - // If it exists, return the first internal instruction which modifies the - // VPR. - static MachineInstr *getDivergent(VPTState &Block) { - SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); - for (unsigned i = 1; i < Insts.size(); ++i) { - MachineInstr *Next = Insts[i]; - if (isVectorPredicate(Next)) - return Next; // Found an instruction altering the vpr. - } - return nullptr; - } - - // Return whether the given instruction is predicated upon a VCTP. - static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) { - SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates; - if (Exclusive && Predicates.size() != 1) - return false; - for (auto *PredMI : Predicates) - if (isVCTP(PredMI)) - return true; - return false; - } - - // Is the VPST, controlling the block entry, predicated upon a VCTP. - static bool isEntryPredicatedOnVCTP(VPTState &Block, - bool Exclusive = false) { - SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); - return isPredicatedOnVCTP(Insts.front(), Exclusive); - } - - // If this block begins with a VPT, we can check whether it's using - // at least one predicated input(s), as well as possible loop invariant - // which would result in it being implicitly predicated. 
- static bool hasImplicitlyValidVPT(VPTState &Block, - ReachingDefAnalysis &RDA) { - SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); - MachineInstr *VPT = Insts.front(); - assert(isVPTOpcode(VPT->getOpcode()) && - "Expected VPT block to begin with VPT/VPST"); - - if (VPT->getOpcode() == ARM::MVE_VPST) - return false; - - auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { - MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx)); - return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op); - }; - - auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) { - MachineOperand &MO = MI->getOperand(Idx); - if (!MO.isReg() || !MO.getReg()) - return true; - - SmallPtrSet<MachineInstr *, 2> Defs; - RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs); - if (Defs.empty()) - return true; - - for (auto *Def : Defs) - if (Def->getParent() == VPT->getParent()) - return false; - return true; - }; - - // Check that at least one of the operands is directly predicated on a - // vctp and allow an invariant value too. - return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) && - (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) && - (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2)); - } - - static bool isValid(ReachingDefAnalysis &RDA) { - // All predication within the loop should be based on vctp. If the block - // isn't predicated on entry, check whether the vctp is within the block - // and that all other instructions are then predicated on it. - for (auto &Block : Blocks) { - if (isEntryPredicatedOnVCTP(Block, false) || - hasImplicitlyValidVPT(Block, RDA)) - continue; - - SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); - // We don't know how to convert a block with just a VPT;VCTP into - // anything valid once we remove the VCTP. For now just bail out. - assert(isVPTOpcode(Insts.front()->getOpcode()) && - "Expected VPT block to start with a VPST or VPT!"); - if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST && - isVCTP(Insts.back())) - return false; - - for (auto *MI : Insts) { - // Check that any internal VCTPs are 'Then' predicated. - if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then) - return false; - // Skip other instructions that build up the predicate. - if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI)) - continue; - // Check that any other instructions are predicated upon a vctp. - // TODO: We could infer when VPTs are implicitly predicated on the - // vctp (when the operands are predicated). - if (!isPredicatedOnVCTP(MI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI); - return false; - } - } - } - return true; + // If it exists, return the first internal instruction which modifies the + // VPR. + static MachineInstr *getDivergent(VPTState &Block) { + SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); + for (unsigned i = 1; i < Insts.size(); ++i) { + MachineInstr *Next = Insts[i]; + if (isVectorPredicate(Next)) + return Next; // Found an instruction altering the vpr. + } + return nullptr; } - VPTState(MachineInstr *MI) { Insts.push_back(MI); } - - void insert(MachineInstr *MI) { - Insts.push_back(MI); - // VPT/VPST + 4 predicated instructions. - assert(Insts.size() <= 5 && "Too many instructions in VPT block!"); + // Return whether the given instruction is predicated upon a VCTP. 
+ static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) { + SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates; + if (Exclusive && Predicates.size() != 1) + return false; + for (auto *PredMI : Predicates) + if (isVCTP(PredMI)) + return true; + return false; + } + + // Is the VPST, controlling the block entry, predicated upon a VCTP. + static bool isEntryPredicatedOnVCTP(VPTState &Block, + bool Exclusive = false) { + SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); + return isPredicatedOnVCTP(Insts.front(), Exclusive); } - bool containsVCTP() const { - for (auto *MI : Insts) - if (isVCTP(MI)) - return true; - return false; + // If this block begins with a VPT, we can check whether it's using + // at least one predicated input(s), as well as possible loop invariant + // which would result in it being implicitly predicated. + static bool hasImplicitlyValidVPT(VPTState &Block, + ReachingDefAnalysis &RDA) { + SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); + MachineInstr *VPT = Insts.front(); + assert(isVPTOpcode(VPT->getOpcode()) && + "Expected VPT block to begin with VPT/VPST"); + + if (VPT->getOpcode() == ARM::MVE_VPST) + return false; + + auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { + MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx)); + return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op); + }; + + auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) { + MachineOperand &MO = MI->getOperand(Idx); + if (!MO.isReg() || !MO.getReg()) + return true; + + SmallPtrSet<MachineInstr *, 2> Defs; + RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs); + if (Defs.empty()) + return true; + + for (auto *Def : Defs) + if (Def->getParent() == VPT->getParent()) + return false; + return true; + }; + + // Check that at least one of the operands is directly predicated on a + // vctp and allow an invariant value too. + return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) && + (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) && + (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2)); } - unsigned size() const { return Insts.size(); } - SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } + static bool isValid(ReachingDefAnalysis &RDA) { + // All predication within the loop should be based on vctp. If the block + // isn't predicated on entry, check whether the vctp is within the block + // and that all other instructions are then predicated on it. + for (auto &Block : Blocks) { + if (isEntryPredicatedOnVCTP(Block, false) || + hasImplicitlyValidVPT(Block, RDA)) + continue; + + SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); + // We don't know how to convert a block with just a VPT;VCTP into + // anything valid once we remove the VCTP. For now just bail out. + assert(isVPTOpcode(Insts.front()->getOpcode()) && + "Expected VPT block to start with a VPST or VPT!"); + if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST && + isVCTP(Insts.back())) + return false; + + for (auto *MI : Insts) { + // Check that any internal VCTPs are 'Then' predicated. + if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then) + return false; + // Skip other instructions that build up the predicate. + if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI)) + continue; + // Check that any other instructions are predicated upon a vctp. + // TODO: We could infer when VPTs are implicitly predicated on the + // vctp (when the operands are predicated). 
+ if (!isPredicatedOnVCTP(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI); + return false; + } + } + } + return true; + } + + VPTState(MachineInstr *MI) { Insts.push_back(MI); } + + void insert(MachineInstr *MI) { + Insts.push_back(MI); + // VPT/VPST + 4 predicated instructions. + assert(Insts.size() <= 5 && "Too many instructions in VPT block!"); + } + + bool containsVCTP() const { + for (auto *MI : Insts) + if (isVCTP(MI)) + return true; + return false; + } + + unsigned size() const { return Insts.size(); } + SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } }; struct LowOverheadLoop { @@ -366,13 +366,13 @@ namespace { const TargetRegisterInfo &TRI; const ARMBaseInstrInfo &TII; MachineFunction *MF = nullptr; - MachineBasicBlock::iterator StartInsertPt; - MachineBasicBlock *StartInsertBB = nullptr; + MachineBasicBlock::iterator StartInsertPt; + MachineBasicBlock *StartInsertBB = nullptr; MachineInstr *Start = nullptr; MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; - MachineOperand TPNumElements; - SmallVector<MachineInstr*, 4> VCTPs; + MachineOperand TPNumElements; + SmallVector<MachineInstr*, 4> VCTPs; SmallPtrSet<MachineInstr*, 4> ToRemove; SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute; bool Revert = false; @@ -381,14 +381,14 @@ namespace { LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, const ARMBaseInstrInfo &TII) - : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), - TPNumElements(MachineOperand::CreateImm(0)) { + : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), + TPNumElements(MachineOperand::CreateImm(0)) { MF = ML.getHeader()->getParent(); if (auto *MBB = ML.getLoopPreheader()) Preheader = MBB; else if (auto *MBB = MLI.findLoopPreheader(&ML, true)) Preheader = MBB; - VPTState::reset(); + VPTState::reset(); } // If this is an MVE instruction, check that we know how to use tail @@ -403,18 +403,18 @@ namespace { bool IsTailPredicationLegal() const { // For now, let's keep things really simple and only support a single // block for tail predication. - return !Revert && FoundAllComponents() && !VCTPs.empty() && + return !Revert && FoundAllComponents() && !VCTPs.empty() && !CannotTailPredicate && ML.getNumBlocks() == 1; } - // Given that MI is a VCTP, check that is equivalent to any other VCTPs - // found. - bool AddVCTP(MachineInstr *MI); - + // Given that MI is a VCTP, check that is equivalent to any other VCTPs + // found. + bool AddVCTP(MachineInstr *MI); + // Check that the predication in the loop will be equivalent once we // perform the conversion. Also ensure that we can provide the number // of elements to the loop start instruction. - bool ValidateTailPredicate(); + bool ValidateTailPredicate(); // Check that any values available outside of the loop will be the same // after tail predication conversion. @@ -427,41 +427,41 @@ namespace { // Check the branch targets are within range and we satisfy our // restrictions. - void Validate(ARMBasicBlockUtils *BBUtils); + void Validate(ARMBasicBlockUtils *BBUtils); bool FoundAllComponents() const { return Start && Dec && End; } - SmallVectorImpl<VPTState> &getVPTBlocks() { - return VPTState::Blocks; - } + SmallVectorImpl<VPTState> &getVPTBlocks() { + return VPTState::Blocks; + } - // Return the operand for the loop start instruction. This will be the loop - // iteration count, or the number of elements if we're tail predicating. 
- MachineOperand &getLoopStartOperand() { - if (IsTailPredicationLegal()) - return TPNumElements; - return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0); + // Return the operand for the loop start instruction. This will be the loop + // iteration count, or the number of elements if we're tail predicating. + MachineOperand &getLoopStartOperand() { + if (IsTailPredicationLegal()) + return TPNumElements; + return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0); } unsigned getStartOpcode() const { - bool IsDo = isDo(Start); + bool IsDo = isDo(Start); if (!IsTailPredicationLegal()) return IsDo ? ARM::t2DLS : ARM::t2WLS; - return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo); + return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo); } void dump() const { if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; - if (!VCTPs.empty()) { - dbgs() << "ARM Loops: Found VCTP(s):\n"; - for (auto *MI : VCTPs) - dbgs() << " - " << *MI; - } + if (!VCTPs.empty()) { + dbgs() << "ARM Loops: Found VCTP(s):\n"; + for (auto *MI : VCTPs) + dbgs() << " - " << *MI; + } if (!FoundAllComponents()) dbgs() << "ARM Loops: Not a low-overhead loop.\n"; else if (!(Start && Dec && End)) @@ -508,14 +508,14 @@ namespace { bool RevertNonLoops(); void RevertWhile(MachineInstr *MI) const; - void RevertDo(MachineInstr *MI) const; + void RevertDo(MachineInstr *MI) const; bool RevertLoopDec(MachineInstr *MI) const; void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; - void RevertLoopEndDec(MachineInstr *MI) const; - + void RevertLoopEndDec(MachineInstr *MI) const; + void ConvertVPTBlocks(LowOverheadLoop &LoLoop); MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); @@ -528,230 +528,230 @@ namespace { char ARMLowOverheadLoops::ID = 0; -SmallVector<VPTState, 4> VPTState::Blocks; -SetVector<MachineInstr *> VPTState::CurrentPredicates; -std::map<MachineInstr *, - std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts; - +SmallVector<VPTState, 4> VPTState::Blocks; +SetVector<MachineInstr *> VPTState::CurrentPredicates; +std::map<MachineInstr *, + std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts; + INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA, - InstSet &ToRemove, InstSet &Ignore) { - - // Check that we can remove all of Killed without having to modify any IT - // blocks. - auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) { - // Collect the dead code and the MBBs in which they reside. - SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; - for (auto *Dead : Killed) - BasicBlocks.insert(Dead->getParent()); - - // Collect IT blocks in all affected basic blocks. - std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; - for (auto *MBB : BasicBlocks) { - for (auto &IT : *MBB) { - if (IT.getOpcode() != ARM::t2IT) - continue; - RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE), - ITBlocks[&IT]); - } - } - - // If we're removing all of the instructions within an IT block, then - // also remove the IT instruction. 
- SmallPtrSet<MachineInstr *, 2> ModifiedITs; - SmallPtrSet<MachineInstr *, 2> RemoveITs; - for (auto *Dead : Killed) { - if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) { - MachineInstr *IT = RDA.getMIOperand(Dead, *MO); - RemoveITs.insert(IT); - auto &CurrentBlock = ITBlocks[IT]; - CurrentBlock.erase(Dead); - if (CurrentBlock.empty()) - ModifiedITs.erase(IT); - else - ModifiedITs.insert(IT); - } +static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA, + InstSet &ToRemove, InstSet &Ignore) { + + // Check that we can remove all of Killed without having to modify any IT + // blocks. + auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) { + // Collect the dead code and the MBBs in which they reside. + SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; + for (auto *Dead : Killed) + BasicBlocks.insert(Dead->getParent()); + + // Collect IT blocks in all affected basic blocks. + std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; + for (auto *MBB : BasicBlocks) { + for (auto &IT : *MBB) { + if (IT.getOpcode() != ARM::t2IT) + continue; + RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE), + ITBlocks[&IT]); + } + } + + // If we're removing all of the instructions within an IT block, then + // also remove the IT instruction. + SmallPtrSet<MachineInstr *, 2> ModifiedITs; + SmallPtrSet<MachineInstr *, 2> RemoveITs; + for (auto *Dead : Killed) { + if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) { + MachineInstr *IT = RDA.getMIOperand(Dead, *MO); + RemoveITs.insert(IT); + auto &CurrentBlock = ITBlocks[IT]; + CurrentBlock.erase(Dead); + if (CurrentBlock.empty()) + ModifiedITs.erase(IT); + else + ModifiedITs.insert(IT); + } + } + if (!ModifiedITs.empty()) + return false; + Killed.insert(RemoveITs.begin(), RemoveITs.end()); + return true; + }; + + SmallPtrSet<MachineInstr *, 2> Uses; + if (!RDA.isSafeToRemove(MI, Uses, Ignore)) + return false; + + if (WontCorruptITs(Uses, RDA)) { + ToRemove.insert(Uses.begin(), Uses.end()); + LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI + << " - can also remove:\n"; + for (auto *Use : Uses) + dbgs() << " - " << *Use); + + SmallPtrSet<MachineInstr*, 4> Killed; + RDA.collectKilledOperands(MI, Killed); + if (WontCorruptITs(Killed, RDA)) { + ToRemove.insert(Killed.begin(), Killed.end()); + LLVM_DEBUG(for (auto *Dead : Killed) + dbgs() << " - " << *Dead); } - if (!ModifiedITs.empty()) - return false; - Killed.insert(RemoveITs.begin(), RemoveITs.end()); - return true; - }; - - SmallPtrSet<MachineInstr *, 2> Uses; - if (!RDA.isSafeToRemove(MI, Uses, Ignore)) - return false; - - if (WontCorruptITs(Uses, RDA)) { - ToRemove.insert(Uses.begin(), Uses.end()); - LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI - << " - can also remove:\n"; - for (auto *Use : Uses) - dbgs() << " - " << *Use); - - SmallPtrSet<MachineInstr*, 4> Killed; - RDA.collectKilledOperands(MI, Killed); - if (WontCorruptITs(Killed, RDA)) { - ToRemove.insert(Killed.begin(), Killed.end()); - LLVM_DEBUG(for (auto *Dead : Killed) - dbgs() << " - " << *Dead); - } - return true; - } - return false; -} - -bool LowOverheadLoop::ValidateTailPredicate() { - if (!IsTailPredicationLegal()) { - LLVM_DEBUG(if (VCTPs.empty()) - dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; - dbgs() << "ARM Loops: Tail-predication is not valid.\n"); - return false; + return true; } - - assert(!VCTPs.empty() && "VCTP instruction expected but is not set"); - assert(ML.getBlocks().size() == 1 && - "Shouldn't be processing a loop 
with more than one block"); - - if (DisableTailPredication) { - LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n"); + return false; +} + +bool LowOverheadLoop::ValidateTailPredicate() { + if (!IsTailPredicationLegal()) { + LLVM_DEBUG(if (VCTPs.empty()) + dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; + dbgs() << "ARM Loops: Tail-predication is not valid.\n"); return false; - } + } - if (!VPTState::isValid(RDA)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n"); - return false; - } + assert(!VCTPs.empty() && "VCTP instruction expected but is not set"); + assert(ML.getBlocks().size() == 1 && + "Shouldn't be processing a loop with more than one block"); - if (!ValidateLiveOuts()) { - LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n"); + if (DisableTailPredication) { + LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n"); return false; } - // Check that creating a [W|D]LSTP, which will define LR with an element - // count instead of iteration count, won't affect any other instructions - // than the LoopStart and LoopDec. - // TODO: We should try to insert the [W|D]LSTP after any of the other uses. - Register StartReg = isDo(Start) ? Start->getOperand(1).getReg() - : Start->getOperand(0).getReg(); - if (StartInsertPt == Start && StartReg == ARM::LR) { - if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) { - SmallPtrSet<MachineInstr *, 2> Uses; - RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses); - for (auto *Use : Uses) { - if (Use != Start && Use != Dec) { - LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use); - return false; - } + if (!VPTState::isValid(RDA)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n"); + return false; + } + + if (!ValidateLiveOuts()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n"); + return false; + } + + // Check that creating a [W|D]LSTP, which will define LR with an element + // count instead of iteration count, won't affect any other instructions + // than the LoopStart and LoopDec. + // TODO: We should try to insert the [W|D]LSTP after any of the other uses. + Register StartReg = isDo(Start) ? Start->getOperand(1).getReg() + : Start->getOperand(0).getReg(); + if (StartInsertPt == Start && StartReg == ARM::LR) { + if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) { + SmallPtrSet<MachineInstr *, 2> Uses; + RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses); + for (auto *Use : Uses) { + if (Use != Start && Use != Dec) { + LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use); + return false; + } } } } - // For tail predication, we need to provide the number of elements, instead - // of the iteration count, to the loop start instruction. The number of - // elements is provided to the vctp instruction, so we need to check that - // we can use this register at InsertPt. - MachineInstr *VCTP = VCTPs.back(); - if (Start->getOpcode() == ARM::t2DoLoopStartTP) { - TPNumElements = Start->getOperand(2); - StartInsertPt = Start; - StartInsertBB = Start->getParent(); - } else { - TPNumElements = VCTP->getOperand(1); - MCRegister NumElements = TPNumElements.getReg().asMCReg(); - - // If the register is defined within loop, then we can't perform TP. - // TODO: Check whether this is just a mov of a register that would be - // available. 
- if (RDA.hasLocalDefBefore(VCTP, NumElements)) { - LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); + // For tail predication, we need to provide the number of elements, instead + // of the iteration count, to the loop start instruction. The number of + // elements is provided to the vctp instruction, so we need to check that + // we can use this register at InsertPt. + MachineInstr *VCTP = VCTPs.back(); + if (Start->getOpcode() == ARM::t2DoLoopStartTP) { + TPNumElements = Start->getOperand(2); + StartInsertPt = Start; + StartInsertBB = Start->getParent(); + } else { + TPNumElements = VCTP->getOperand(1); + MCRegister NumElements = TPNumElements.getReg().asMCReg(); + + // If the register is defined within loop, then we can't perform TP. + // TODO: Check whether this is just a mov of a register that would be + // available. + if (RDA.hasLocalDefBefore(VCTP, NumElements)) { + LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); + return false; + } + + // The element count register maybe defined after InsertPt, in which case we + // need to try to move either InsertPt or the def so that the [w|d]lstp can + // use the value. + + if (StartInsertPt != StartInsertBB->end() && + !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) { + if (auto *ElemDef = + RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) { + if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) { + ElemDef->removeFromParent(); + StartInsertBB->insert(StartInsertPt, ElemDef); + LLVM_DEBUG(dbgs() + << "ARM Loops: Moved element count def: " << *ElemDef); + } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) { + StartInsertPt->removeFromParent(); + StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), + &*StartInsertPt); + LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); + } else { + // If we fail to move an instruction and the element count is provided + // by a mov, use the mov operand if it will have the same value at the + // insertion point + MachineOperand Operand = ElemDef->getOperand(1); + if (isMovRegOpcode(ElemDef->getOpcode()) && + RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) == + RDA.getUniqueReachingMIDef(&*StartInsertPt, + Operand.getReg().asMCReg())) { + TPNumElements = Operand; + NumElements = TPNumElements.getReg(); + } else { + LLVM_DEBUG(dbgs() + << "ARM Loops: Unable to move element count to loop " + << "start instruction.\n"); + return false; + } + } + } + } + + // Especially in the case of while loops, InsertBB may not be the + // preheader, so we need to check that the register isn't redefined + // before entering the loop. + auto CannotProvideElements = [this](MachineBasicBlock *MBB, + MCRegister NumElements) { + if (MBB->empty()) + return false; + // NumElements is redefined in this block. + if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) + return true; + + // Don't continue searching up through multiple predecessors. + if (MBB->pred_size() > 1) + return true; + return false; + }; + + // Search backwards for a def, until we get to InsertBB. + MachineBasicBlock *MBB = Preheader; + while (MBB && MBB != StartInsertBB) { + if (CannotProvideElements(MBB, NumElements)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); + return false; + } + MBB = *MBB->pred_begin(); } - - // The element count register maybe defined after InsertPt, in which case we - // need to try to move either InsertPt or the def so that the [w|d]lstp can - // use the value. 
- - if (StartInsertPt != StartInsertBB->end() && - !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) { - if (auto *ElemDef = - RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) { - if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) { - ElemDef->removeFromParent(); - StartInsertBB->insert(StartInsertPt, ElemDef); - LLVM_DEBUG(dbgs() - << "ARM Loops: Moved element count def: " << *ElemDef); - } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) { - StartInsertPt->removeFromParent(); - StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), - &*StartInsertPt); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); - } else { - // If we fail to move an instruction and the element count is provided - // by a mov, use the mov operand if it will have the same value at the - // insertion point - MachineOperand Operand = ElemDef->getOperand(1); - if (isMovRegOpcode(ElemDef->getOpcode()) && - RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) == - RDA.getUniqueReachingMIDef(&*StartInsertPt, - Operand.getReg().asMCReg())) { - TPNumElements = Operand; - NumElements = TPNumElements.getReg(); - } else { - LLVM_DEBUG(dbgs() - << "ARM Loops: Unable to move element count to loop " - << "start instruction.\n"); - return false; - } - } - } - } - - // Especially in the case of while loops, InsertBB may not be the - // preheader, so we need to check that the register isn't redefined - // before entering the loop. - auto CannotProvideElements = [this](MachineBasicBlock *MBB, - MCRegister NumElements) { - if (MBB->empty()) - return false; - // NumElements is redefined in this block. - if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) - return true; - - // Don't continue searching up through multiple predecessors. - if (MBB->pred_size() > 1) - return true; - - return false; - }; - - // Search backwards for a def, until we get to InsertBB. - MachineBasicBlock *MBB = Preheader; - while (MBB && MBB != StartInsertBB) { - if (CannotProvideElements(MBB, NumElements)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); - return false; - } - MBB = *MBB->pred_begin(); - } - } - - // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect - // world the [w|d]lstp instruction would be last instruction in the preheader - // and so it would only affect instructions within the loop body. But due to - // scheduling, and/or the logic in this pass (above), the insertion point can - // be moved earlier. So if the Loop Start isn't the last instruction in the - // preheader, and if the initial element count is smaller than the vector - // width, the Loop Start instruction will immediately generate one or more - // false lane mask which can, incorrectly, affect the proceeding MVE - // instructions in the preheader. - if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n"); - return false; } + // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect + // world the [w|d]lstp instruction would be last instruction in the preheader + // and so it would only affect instructions within the loop body. But due to + // scheduling, and/or the logic in this pass (above), the insertion point can + // be moved earlier. 
So if the Loop Start isn't the last instruction in the + // preheader, and if the initial element count is smaller than the vector + // width, the Loop Start instruction will immediately generate one or more + // false lane mask which can, incorrectly, affect the proceeding MVE + // instructions in the preheader. + if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n"); + return false; + } + // Check that the value change of the element count is what we expect and // that the predication will be equivalent. For this we need: // NumElements = NumElements - VectorWidth. The sub will be a sub immediate @@ -760,20 +760,20 @@ bool LowOverheadLoop::ValidateTailPredicate() { return -getAddSubImmediate(*MI) == ExpectedVecWidth; }; - MachineBasicBlock *MBB = VCTP->getParent(); - // Remove modifications to the element count since they have no purpose in a - // tail predicated loop. Explicitly refer to the vctp operand no matter which - // register NumElements has been assigned to, since that is what the - // modifications will be using - if (auto *Def = RDA.getUniqueReachingMIDef( - &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) { + MachineBasicBlock *MBB = VCTP->getParent(); + // Remove modifications to the element count since they have no purpose in a + // tail predicated loop. Explicitly refer to the vctp operand no matter which + // register NumElements has been assigned to, since that is what the + // modifications will be using + if (auto *Def = RDA.getUniqueReachingMIDef( + &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) { SmallPtrSet<MachineInstr*, 2> ElementChain; - SmallPtrSet<MachineInstr*, 2> Ignore; + SmallPtrSet<MachineInstr*, 2> Ignore; unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); - Ignore.insert(VCTPs.begin(), VCTPs.end()); + Ignore.insert(VCTPs.begin(), VCTPs.end()); - if (TryRemove(Def, RDA, ElementChain, Ignore)) { + if (TryRemove(Def, RDA, ElementChain, Ignore)) { bool FoundSub = false; for (auto *MI : ElementChain) { @@ -781,17 +781,17 @@ bool LowOverheadLoop::ValidateTailPredicate() { continue; if (isSubImmOpcode(MI->getOpcode())) { - if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" - " count: " << *MI); + if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" + " count: " << *MI); return false; - } + } FoundSub = true; - } else { - LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" - " count: " << *MI); + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" + " count: " << *MI); return false; - } + } } ToRemove.insert(ElementChain.begin(), ElementChain.end()); } @@ -868,18 +868,18 @@ static bool producesFalseLanesZero(MachineInstr &MI, if (canGenerateNonZeros(MI)) return false; - bool isPredicated = isVectorPredicated(&MI); - // Predicated loads will write zeros to the falsely predicated bytes of the - // destination register. - if (MI.mayLoad()) - return isPredicated; - - auto IsZeroInit = [](MachineInstr *Def) { - return !isVectorPredicated(Def) && - Def->getOpcode() == ARM::MVE_VMOVimmi32 && - Def->getOperand(1).getImm() == 0; - }; - + bool isPredicated = isVectorPredicated(&MI); + // Predicated loads will write zeros to the falsely predicated bytes of the + // destination register. 
+ if (MI.mayLoad()) + return isPredicated; + + auto IsZeroInit = [](MachineInstr *Def) { + return !isVectorPredicated(Def) && + Def->getOpcode() == ARM::MVE_VMOVimmi32 && + Def->getOperand(1).getImm() == 0; + }; + bool AllowScalars = isHorizontalReduction(MI); for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg()) @@ -887,21 +887,21 @@ static bool producesFalseLanesZero(MachineInstr &MI, if (!isRegInClass(MO, QPRs) && AllowScalars) continue; - // Check that this instruction will produce zeros in its false lanes: - // - If it only consumes false lanes zero or constant 0 (vmov #0) - // - If it's predicated, it only matters that it's def register already has - // false lane zeros, so we can ignore the uses. - SmallPtrSet<MachineInstr *, 2> Defs; - RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs); - for (auto *Def : Defs) { - if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def)) - continue; - if (MO.isUse() && isPredicated) - continue; + // Check that this instruction will produce zeros in its false lanes: + // - If it only consumes false lanes zero or constant 0 (vmov #0) + // - If it's predicated, it only matters that it's def register already has + // false lane zeros, so we can ignore the uses. + SmallPtrSet<MachineInstr *, 2> Defs; + RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs); + for (auto *Def : Defs) { + if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def)) + continue; + if (MO.isUse() && isPredicated) + continue; return false; - } + } } - LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); + LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); return true; } @@ -921,7 +921,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { // the false lanes are zeroed and here we're trying to track that those false // lanes remain zero, or where they change, the differences are masked away // by their user(s). - // All MVE stores have to be predicated, so we know that any predicate load + // All MVE stores have to be predicated, so we know that any predicate load // operands, or stored results are equivalent already. Other explicitly // predicated instructions will perform the same operation in the original // loop and the tail-predicated form too. 
Because of this, we can insert @@ -934,32 +934,32 @@ bool LowOverheadLoop::ValidateLiveOuts() { MachineBasicBlock *Header = ML.getHeader(); for (auto &MI : *Header) { - if (!shouldInspect(MI)) + if (!shouldInspect(MI)) continue; if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode())) continue; - bool isPredicated = isVectorPredicated(&MI); - bool retainsOrReduces = - retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); - - if (isPredicated) + bool isPredicated = isVectorPredicated(&MI); + bool retainsOrReduces = + retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); + + if (isPredicated) Predicated.insert(&MI); - if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) - FalseLanesZero.insert(&MI); - else if (MI.getNumDefs() == 0) + if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) + FalseLanesZero.insert(&MI); + else if (MI.getNumDefs() == 0) continue; - else if (!isPredicated && retainsOrReduces) - return false; - else if (!isPredicated) + else if (!isPredicated && retainsOrReduces) + return false; + else if (!isPredicated) FalseLanesUnknown.insert(&MI); } auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, SmallPtrSetImpl<MachineInstr *> &Predicated) { SmallPtrSet<MachineInstr *, 2> Uses; - RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses); + RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses); for (auto *Use : Uses) { if (Use != MI && !Predicated.count(Use)) return false; @@ -982,12 +982,12 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); NonPredicated.insert(MI); - break; + break; } } // Any unknown false lanes have been masked away by the user(s). - if (!NonPredicated.contains(MI)) - Predicated.insert(MI); + if (!NonPredicated.contains(MI)) + Predicated.insert(MI); } SmallPtrSet<MachineInstr *, 2> LiveOutMIs; @@ -997,13 +997,13 @@ bool LowOverheadLoop::ValidateLiveOuts() { assert(ExitBlocks.size() == 1 && "Expected a single exit block"); MachineBasicBlock *ExitBB = ExitBlocks.front(); for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { - // TODO: Instead of blocking predication, we could move the vctp to the exit - // block and calculate it's operand there in or the preheader. - if (RegMask.PhysReg == ARM::VPR) - return false; + // TODO: Instead of blocking predication, we could move the vctp to the exit + // block and calculate it's operand there in or the preheader. + if (RegMask.PhysReg == ARM::VPR) + return false; // Check Q-regs that are live in the exit blocks. We don't collect scalars // because they won't be affected by lane predication. - if (QPRs->contains(RegMask.PhysReg)) + if (QPRs->contains(RegMask.PhysReg)) if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg)) LiveOutMIs.insert(MI); } @@ -1014,123 +1014,123 @@ bool LowOverheadLoop::ValidateLiveOuts() { // instruction needs to be predicated, so check this here. The instructions // in NonPredicated have been found to be a reduction that we can ensure its // legality. 
- for (auto *MI : LiveOutMIs) { - if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI); + for (auto *MI : LiveOutMIs) { + if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI); return false; - } - } + } + } return true; } -void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { +void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { if (Revert) return; - // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP] - // can only jump back. - auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End, - ARMBasicBlockUtils *BBUtils, MachineLoop &ML) { - MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd - ? End->getOperand(1).getMBB() - : End->getOperand(2).getMBB(); - // TODO Maybe there's cases where the target doesn't have to be the header, - // but for now be safe and revert. - if (TgtBB != ML.getHeader()) { - LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n"); - return false; - } - - // The WLS and LE instructions have 12-bits for the label offset. WLS - // requires a positive offset, while LE uses negative. - if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) || - !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) { - LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); - return false; - } - - if (Start->getOpcode() == ARM::t2WhileLoopStart && - (BBUtils->getOffsetOf(Start) > - BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || - !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { - LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); - return false; - } - return true; - }; - - // Find a suitable position to insert the loop start instruction. It needs to - // be able to safely define LR. - auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec, - MachineBasicBlock::iterator &InsertPt, - MachineBasicBlock *&InsertBB, - ReachingDefAnalysis &RDA, - InstSet &ToRemove) { - // For a t2DoLoopStart it is always valid to use the start insertion point. - // For WLS we can define LR if LR already contains the same value. - if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) { - InsertPt = MachineBasicBlock::iterator(Start); - InsertBB = Start->getParent(); - return true; - } - - // We've found no suitable LR def and Start doesn't use LR directly. Can we - // just define LR anyway? - if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR))) - return false; - - InsertPt = MachineBasicBlock::iterator(Start); - InsertBB = Start->getParent(); - return true; - }; - - if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA, - ToRemove)) { + // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP] + // can only jump back. + auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End, + ARMBasicBlockUtils *BBUtils, MachineLoop &ML) { + MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd + ? End->getOperand(1).getMBB() + : End->getOperand(2).getMBB(); + // TODO Maybe there's cases where the target doesn't have to be the header, + // but for now be safe and revert. + if (TgtBB != ML.getHeader()) { + LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n"); + return false; + } + + // The WLS and LE instructions have 12-bits for the label offset. WLS + // requires a positive offset, while LE uses negative. 
+ if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) || + !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) { + LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); + return false; + } + + if (Start->getOpcode() == ARM::t2WhileLoopStart && + (BBUtils->getOffsetOf(Start) > + BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || + !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { + LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); + return false; + } + return true; + }; + + // Find a suitable position to insert the loop start instruction. It needs to + // be able to safely define LR. + auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec, + MachineBasicBlock::iterator &InsertPt, + MachineBasicBlock *&InsertBB, + ReachingDefAnalysis &RDA, + InstSet &ToRemove) { + // For a t2DoLoopStart it is always valid to use the start insertion point. + // For WLS we can define LR if LR already contains the same value. + if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) { + InsertPt = MachineBasicBlock::iterator(Start); + InsertBB = Start->getParent(); + return true; + } + + // We've found no suitable LR def and Start doesn't use LR directly. Can we + // just define LR anyway? + if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR))) + return false; + + InsertPt = MachineBasicBlock::iterator(Start); + InsertBB = Start->getParent(); + return true; + }; + + if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA, + ToRemove)) { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); Revert = true; return; - } - LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end()) - dbgs() << "ARM Loops: Will insert LoopStart at end of block\n"; - else - dbgs() << "ARM Loops: Will insert LoopStart at " - << *StartInsertPt - ); - - Revert = !ValidateRanges(Start, End, BBUtils, ML); - CannotTailPredicate = !ValidateTailPredicate(); -} - -bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { - LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); - if (VCTPs.empty()) { - VCTPs.push_back(MI); - return true; + } + LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end()) + dbgs() << "ARM Loops: Will insert LoopStart at end of block\n"; + else + dbgs() << "ARM Loops: Will insert LoopStart at " + << *StartInsertPt + ); + + Revert = !ValidateRanges(Start, End, BBUtils, ML); + CannotTailPredicate = !ValidateTailPredicate(); +} + +bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { + LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); + if (VCTPs.empty()) { + VCTPs.push_back(MI); + return true; } - // If we find another VCTP, check whether it uses the same value as the main VCTP. - // If it does, store it in the VCTPs set, else refuse it. - MachineInstr *Prev = VCTPs.back(); - if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) || - !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " - "definition from the main VCTP"); - return false; - } - VCTPs.push_back(MI); - return true; + // If we find another VCTP, check whether it uses the same value as the main VCTP. + // If it does, store it in the VCTPs set, else refuse it. 
+ MachineInstr *Prev = VCTPs.back(); + if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) || + !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " + "definition from the main VCTP"); + return false; + } + VCTPs.push_back(MI); + return true; } bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { if (CannotTailPredicate) return false; - if (!shouldInspect(*MI)) - return true; + if (!shouldInspect(*MI)) + return true; - if (MI->getOpcode() == ARM::MVE_VPSEL || - MI->getOpcode() == ARM::MVE_VPNOT) { + if (MI->getOpcode() == ARM::MVE_VPSEL || + MI->getOpcode() == ARM::MVE_VPNOT) { // TODO: Allow VPSEL and VPNOT, we currently cannot because: // 1) It will use the VPR as a predicate operand, but doesn't have to be // instead a VPT block, which means we can assert while building up @@ -1142,24 +1142,24 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { return false; } - // Record all VCTPs and check that they're equivalent to one another. - if (isVCTP(MI) && !AddVCTP(MI)) - return false; - - // Inspect uses first so that any instructions that alter the VPR don't - // alter the predicate upon themselves. - const MCInstrDesc &MCID = MI->getDesc(); + // Record all VCTPs and check that they're equivalent to one another. + if (isVCTP(MI) && !AddVCTP(MI)) + return false; + + // Inspect uses first so that any instructions that alter the VPR don't + // alter the predicate upon themselves. + const MCInstrDesc &MCID = MI->getDesc(); bool IsUse = false; - unsigned LastOpIdx = MI->getNumOperands() - 1; - for (auto &Op : enumerate(reverse(MCID.operands()))) { - const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index()); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR) + unsigned LastOpIdx = MI->getNumOperands() - 1; + for (auto &Op : enumerate(reverse(MCID.operands()))) { + const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index()); + if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR) continue; - if (ARM::isVpred(Op.value().OperandType)) { - VPTState::addInst(MI); + if (ARM::isVpred(Op.value().OperandType)) { + VPTState::addInst(MI); IsUse = true; - } else if (MI->getOpcode() != ARM::MVE_VPST) { + } else if (MI->getOpcode() != ARM::MVE_VPST) { LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); return false; } @@ -1168,36 +1168,36 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { // If we find an instruction that has been marked as not valid for tail // predication, only allow the instruction if it's contained within a valid // VPT block. - bool RequiresExplicitPredication = - (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; - if (isDomainMVE(MI) && RequiresExplicitPredication) { - LLVM_DEBUG(if (!IsUse) - dbgs() << "ARM Loops: Can't tail predicate: " << *MI); - return IsUse; + bool RequiresExplicitPredication = + (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; + if (isDomainMVE(MI) && RequiresExplicitPredication) { + LLVM_DEBUG(if (!IsUse) + dbgs() << "ARM Loops: Can't tail predicate: " << *MI); + return IsUse; } // If the instruction is already explicitly predicated, then the conversion - // will be fine, but ensure that all store operations are predicated. - if (MI->mayStore()) - return IsUse; - - // If this instruction defines the VPR, update the predicate for the - // proceeding instructions. 
- if (isVectorPredicate(MI)) { - // Clear the existing predicate when we're not in VPT Active state, - // otherwise we add to it. - if (!isVectorPredicated(MI)) - VPTState::resetPredicate(MI); - else - VPTState::addPredicate(MI); - } - - // Finally once the predicate has been modified, we can start a new VPT - // block if necessary. - if (isVPTOpcode(MI->getOpcode())) - VPTState::CreateVPTBlock(MI); - - return true; + // will be fine, but ensure that all store operations are predicated. + if (MI->mayStore()) + return IsUse; + + // If this instruction defines the VPR, update the predicate for the + // proceeding instructions. + if (isVectorPredicate(MI)) { + // Clear the existing predicate when we're not in VPT Active state, + // otherwise we add to it. + if (!isVectorPredicated(MI)) + VPTState::resetPredicate(MI); + else + VPTState::addPredicate(MI); + } + + // Finally once the predicate has been modified, we can start a new VPT + // block if necessary. + if (isVPTOpcode(MI->getOpcode())) + VPTState::CreateVPTBlock(MI); + + return true; } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { @@ -1220,7 +1220,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { bool Changed = false; for (auto ML : *MLI) { - if (ML->isOutermost()) + if (ML->isOutermost()) Changed |= ProcessLoop(ML); } Changed |= RevertNonLoops(); @@ -1279,8 +1279,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { LoLoop.Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) LoLoop.End = &MI; - else if (MI.getOpcode() == ARM::t2LoopEndDec) - LoLoop.End = LoLoop.Dec = &MI; + else if (MI.getOpcode() == ARM::t2LoopEndDec) + LoLoop.End = LoLoop.Dec = &MI; else if (isLoopStart(MI)) LoLoop.Start = &MI; else if (MI.getDesc().isCall()) { @@ -1303,18 +1303,18 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { return false; } - // Check that the only instruction using LoopDec is LoopEnd. This can only - // happen when the Dec and End are separate, not a single t2LoopEndDec. + // Check that the only instruction using LoopDec is LoopEnd. This can only + // happen when the Dec and End are separate, not a single t2LoopEndDec. // TODO: Check for copy chains that really have no effect. - if (LoLoop.Dec != LoLoop.End) { - SmallPtrSet<MachineInstr *, 2> Uses; - RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses); - if (Uses.size() > 1 || !Uses.count(LoLoop.End)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n"); - LoLoop.Revert = true; - } + if (LoLoop.Dec != LoLoop.End) { + SmallPtrSet<MachineInstr *, 2> Uses; + RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses); + if (Uses.size() > 1 || !Uses.count(LoLoop.End)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n"); + LoLoop.Revert = true; + } } - LoLoop.Validate(BBUtils.get()); + LoLoop.Validate(BBUtils.get()); Expand(LoLoop); return true; } @@ -1329,14 +1329,14 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 
ARM::tBcc : ARM::t2Bcc; - RevertWhileLoopStart(MI, TII, BrOpc); -} - -void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { - LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); - RevertDoLoopStart(MI, TII); + RevertWhileLoopStart(MI, TII, BrOpc); } +void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); + RevertDoLoopStart(MI, TII); +} + bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); @@ -1349,10 +1349,10 @@ bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { } // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. - bool SetFlags = - RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore); + bool SetFlags = + RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore); - llvm::RevertLoopDec(MI, TII, SetFlags); + llvm::RevertLoopDec(MI, TII, SetFlags); return SetFlags; } @@ -1364,35 +1364,35 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; - llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); -} - -// Generate a subs, or sub and cmp, and a branch instead of an LE. -void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { - LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); - assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!"); - MachineBasicBlock *MBB = MI->getParent(); - + llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); +} + +// Generate a subs, or sub and cmp, and a branch instead of an LE. +void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); + assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!"); + MachineBasicBlock *MBB = MI->getParent(); + MachineInstrBuilder MIB = - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); - MIB.addDef(ARM::LR); - MIB.add(MI->getOperand(1)); - MIB.addImm(1); - MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::NoRegister); - MIB.addReg(ARM::CPSR); - MIB->getOperand(5).setIsDef(true); - - MachineBasicBlock *DestBB = MI->getOperand(2).getMBB(); - unsigned BrOpc = - BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; - - // Create bne - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); - MIB.add(MI->getOperand(2)); // branch target - MIB.addImm(ARMCC::NE); // condition code + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); + MIB.addDef(ARM::LR); + MIB.add(MI->getOperand(1)); + MIB.addImm(1); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); MIB.addReg(ARM::CPSR); - + MIB->getOperand(5).setIsDef(true); + + MachineBasicBlock *DestBB = MI->getOperand(2).getMBB(); + unsigned BrOpc = + BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; + + // Create bne + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); + MIB.add(MI->getOperand(2)); // branch target + MIB.addImm(ARMCC::NE); // condition code + MIB.addReg(ARM::CPSR); + MI->eraseFromParent(); } @@ -1403,7 +1403,7 @@ void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { // // $lr = big-itercount-expression // .. -// $lr = t2DoLoopStart renamable $lr +// $lr = t2DoLoopStart renamable $lr // vector.body: // .. 
// $vpr = MVE_VCTP32 renamable $r3 @@ -1426,8 +1426,8 @@ void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n"); - MachineInstr *Def = - RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 1 : 0); + MachineInstr *Def = + RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 1 : 0); if (!Def) { LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n"); return; @@ -1435,8 +1435,8 @@ void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { // Collect and remove the users of iteration count. SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, - LoLoop.End }; - if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed)) + LoLoop.End }; + if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed)) LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n"); } @@ -1446,18 +1446,18 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { // calculate the number of loop iterations. IterationCountDCE(LoLoop); - MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; + MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; MachineInstr *Start = LoLoop.Start; - MachineBasicBlock *MBB = LoLoop.StartInsertBB; + MachineBasicBlock *MBB = LoLoop.StartInsertBB; unsigned Opc = LoLoop.getStartOpcode(); - MachineOperand &Count = LoLoop.getLoopStartOperand(); + MachineOperand &Count = LoLoop.getLoopStartOperand(); MachineInstrBuilder MIB = - BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc)); + BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); MIB.add(Count); - if (!isDo(Start)) + if (!isDo(Start)) MIB.add(Start->getOperand(1)); LoLoop.ToRemove.insert(Start); @@ -1467,50 +1467,50 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { - if (MI->isDebugInstr()) - return; + if (MI->isDebugInstr()) + return; LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); - int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); - assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction"); - assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && - "Expected Then predicate!"); - MI->getOperand(PIdx).setImm(ARMVCC::None); - MI->getOperand(PIdx + 1).setReg(0); + int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); + assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction"); + assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && + "Expected Then predicate!"); + MI->getOperand(PIdx).setImm(ARMVCC::None); + MI->getOperand(PIdx + 1).setReg(0); }; for (auto &Block : LoLoop.getVPTBlocks()) { - SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); - - auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) { - assert(TheVCMP && "Replacing a removed or non-existent VCMP"); - // Replace the VCMP with a VPT - MachineInstrBuilder MIB = - BuildMI(*At->getParent(), At, At->getDebugLoc(), - TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode()))); - MIB.addImm(ARMVCC::Then); - // Register one - MIB.add(TheVCMP->getOperand(1)); - // Register two - MIB.add(TheVCMP->getOperand(2)); - // The comparison code, e.g. 
ge, eq, lt - MIB.add(TheVCMP->getOperand(3)); - LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB); - LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); - LoLoop.ToRemove.insert(TheVCMP); - TheVCMP = nullptr; - }; - - if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) { - MachineInstr *VPST = Insts.front(); - if (VPTState::hasUniformPredicate(Block)) { - // A vpt block starting with VPST, is only predicated upon vctp and has no - // internal vpr defs: - // - Remove vpst. - // - Unpredicate the remaining instructions. - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); - for (unsigned i = 1; i < Insts.size(); ++i) - RemovePredicate(Insts[i]); - } else { + SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); + + auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) { + assert(TheVCMP && "Replacing a removed or non-existent VCMP"); + // Replace the VCMP with a VPT + MachineInstrBuilder MIB = + BuildMI(*At->getParent(), At, At->getDebugLoc(), + TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode()))); + MIB.addImm(ARMVCC::Then); + // Register one + MIB.add(TheVCMP->getOperand(1)); + // Register two + MIB.add(TheVCMP->getOperand(2)); + // The comparison code, e.g. ge, eq, lt + MIB.add(TheVCMP->getOperand(3)); + LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB); + LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); + LoLoop.ToRemove.insert(TheVCMP); + TheVCMP = nullptr; + }; + + if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) { + MachineInstr *VPST = Insts.front(); + if (VPTState::hasUniformPredicate(Block)) { + // A vpt block starting with VPST, is only predicated upon vctp and has no + // internal vpr defs: + // - Remove vpst. + // - Unpredicate the remaining instructions. + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); + for (unsigned i = 1; i < Insts.size(); ++i) + RemovePredicate(Insts[i]); + } else { // The VPT block has a non-uniform predicate but it uses a vpst and its // entry is guarded only by a vctp, which means we: // - Need to remove the original vpst. @@ -1518,88 +1518,88 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { // we come across the divergent vpr def. // - Insert a new vpst to predicate the instruction(s) that following // the divergent vpr def. - MachineInstr *Divergent = VPTState::getDivergent(Block); - MachineBasicBlock *MBB = Divergent->getParent(); - auto DivergentNext = ++MachineBasicBlock::iterator(Divergent); - while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr()) - ++DivergentNext; - - bool DivergentNextIsPredicated = - DivergentNext != MBB->end() && - getVPTInstrPredicate(*DivergentNext) != ARMVCC::None; - - for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext; - I != E; ++I) + MachineInstr *Divergent = VPTState::getDivergent(Block); + MachineBasicBlock *MBB = Divergent->getParent(); + auto DivergentNext = ++MachineBasicBlock::iterator(Divergent); + while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr()) + ++DivergentNext; + + bool DivergentNextIsPredicated = + DivergentNext != MBB->end() && + getVPTInstrPredicate(*DivergentNext) != ARMVCC::None; + + for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext; + I != E; ++I) RemovePredicate(&*I); - // Check if the instruction defining vpr is a vcmp so it can be combined - // with the VPST This should be the divergent instruction - MachineInstr *VCMP = - VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? 
Divergent : nullptr; - - if (DivergentNextIsPredicated) { - // Insert a VPST at the divergent only if the next instruction - // would actually use it. A VCMP following a VPST can be - // merged into a VPT so do that instead if the VCMP exists. - if (!VCMP) { - // Create a VPST (with a null mask for now, we'll recompute it - // later) - MachineInstrBuilder MIB = - BuildMI(*Divergent->getParent(), Divergent, - Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST)); - MIB.addImm(0); - LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); - LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); - } else { - // No RDA checks are necessary here since the VPST would have been - // directly after the VCMP - ReplaceVCMPWithVPT(VCMP, VCMP); - } + // Check if the instruction defining vpr is a vcmp so it can be combined + // with the VPST This should be the divergent instruction + MachineInstr *VCMP = + VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr; + + if (DivergentNextIsPredicated) { + // Insert a VPST at the divergent only if the next instruction + // would actually use it. A VCMP following a VPST can be + // merged into a VPT so do that instead if the VCMP exists. + if (!VCMP) { + // Create a VPST (with a null mask for now, we'll recompute it + // later) + MachineInstrBuilder MIB = + BuildMI(*Divergent->getParent(), Divergent, + Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST)); + MIB.addImm(0); + LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); + LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); + } else { + // No RDA checks are necessary here since the VPST would have been + // directly after the VCMP + ReplaceVCMPWithVPT(VCMP, VCMP); + } } } - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); - LoLoop.ToRemove.insert(VPST); - } else if (Block.containsVCTP()) { - // The vctp will be removed, so either the entire block will be dead or - // the block mask of the vp(s)t will need to be recomputed. - MachineInstr *VPST = Insts.front(); - if (Block.size() == 2) { - assert(VPST->getOpcode() == ARM::MVE_VPST && - "Found a VPST in an otherwise empty vpt block"); - LoLoop.ToRemove.insert(VPST); - } else - LoLoop.BlockMasksToRecompute.insert(VPST); - } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) { - // If this block starts with a VPST then attempt to merge it with the - // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT - // block that no longer exists - MachineInstr *VPST = Insts.front(); - auto Next = ++MachineBasicBlock::iterator(VPST); - assert(getVPTInstrPredicate(*Next) != ARMVCC::None && - "The instruction after a VPST must be predicated"); - (void)Next; - MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR); - if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) && - !LoLoop.ToRemove.contains(VprDef)) { - MachineInstr *VCMP = VprDef; - // The VCMP and VPST can only be merged if the VCMP's operands will have - // the same values at the VPST. - // If any of the instructions between the VCMP and VPST are predicated - // then a different code path is expected to have merged the VCMP and - // VPST already. 
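The guard in the check that follows (no VPR use between the VCMP and the VPST, plus identical reaching definitions for both VCMP operands) is what makes the merge safe. As a rough, self-contained sketch of the first half of that guard — not the pass's actual hasVPRUse helper, whose definition lies outside this hunk — an operand scan over a MachineInstr could look like this, assuming the pass's usual ARM backend headers and `using namespace llvm` are in scope:

static bool touchesVPR(const MachineInstr &MI) {
  // Conservatively treat any read or write of VPR as blocking the merge,
  // since it could change the predicate value the combined VPT would use.
  for (const MachineOperand &MO : MI.operands())
    if (MO.isReg() && MO.getReg() == ARM::VPR)
      return true;
  return false;
}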
- if (!std::any_of(++MachineBasicBlock::iterator(VCMP), - MachineBasicBlock::iterator(VPST), hasVPRUse) && - RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) && - RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) { - ReplaceVCMPWithVPT(VCMP, VPST); - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); - LoLoop.ToRemove.insert(VPST); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); + LoLoop.ToRemove.insert(VPST); + } else if (Block.containsVCTP()) { + // The vctp will be removed, so either the entire block will be dead or + // the block mask of the vp(s)t will need to be recomputed. + MachineInstr *VPST = Insts.front(); + if (Block.size() == 2) { + assert(VPST->getOpcode() == ARM::MVE_VPST && + "Found a VPST in an otherwise empty vpt block"); + LoLoop.ToRemove.insert(VPST); + } else + LoLoop.BlockMasksToRecompute.insert(VPST); + } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) { + // If this block starts with a VPST then attempt to merge it with the + // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT + // block that no longer exists + MachineInstr *VPST = Insts.front(); + auto Next = ++MachineBasicBlock::iterator(VPST); + assert(getVPTInstrPredicate(*Next) != ARMVCC::None && + "The instruction after a VPST must be predicated"); + (void)Next; + MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR); + if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) && + !LoLoop.ToRemove.contains(VprDef)) { + MachineInstr *VCMP = VprDef; + // The VCMP and VPST can only be merged if the VCMP's operands will have + // the same values at the VPST. + // If any of the instructions between the VCMP and VPST are predicated + // then a different code path is expected to have merged the VCMP and + // VPST already. + if (!std::any_of(++MachineBasicBlock::iterator(VCMP), + MachineBasicBlock::iterator(VPST), hasVPRUse) && + RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) && + RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) { + ReplaceVCMPWithVPT(VCMP, VPST); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); + LoLoop.ToRemove.insert(VPST); } } } } - - LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end()); + + LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end()); } void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { @@ -1613,9 +1613,9 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); - unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0; - MIB.add(End->getOperand(Off + 0)); - MIB.add(End->getOperand(Off + 1)); + unsigned Off = LoLoop.Dec == LoLoop.End ? 
1 : 0; + MIB.add(End->getOperand(Off + 0)); + MIB.add(End->getOperand(Off + 1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); LoLoop.ToRemove.insert(LoLoop.Dec); LoLoop.ToRemove.insert(End); @@ -1643,17 +1643,17 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart) RevertWhile(LoLoop.Start); else - RevertDo(LoLoop.Start); - if (LoLoop.Dec == LoLoop.End) - RevertLoopEndDec(LoLoop.End); - else - RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec)); + RevertDo(LoLoop.Start); + if (LoLoop.Dec == LoLoop.End) + RevertLoopEndDec(LoLoop.End); + else + RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec)); } else { LoLoop.Start = ExpandLoopStart(LoLoop); RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) + if (LoLoop.IsTailPredicationLegal()) ConvertVPTBlocks(LoLoop); for (auto *I : LoLoop.ToRemove) { LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); @@ -1691,7 +1691,7 @@ bool ARMLowOverheadLoops::RevertNonLoops() { SmallVector<MachineInstr*, 4> Starts; SmallVector<MachineInstr*, 4> Decs; SmallVector<MachineInstr*, 4> Ends; - SmallVector<MachineInstr *, 4> EndDecs; + SmallVector<MachineInstr *, 4> EndDecs; for (auto &I : MBB) { if (isLoopStart(I)) @@ -1700,11 +1700,11 @@ bool ARMLowOverheadLoops::RevertNonLoops() { Decs.push_back(&I); else if (I.getOpcode() == ARM::t2LoopEnd) Ends.push_back(&I); - else if (I.getOpcode() == ARM::t2LoopEndDec) - EndDecs.push_back(&I); + else if (I.getOpcode() == ARM::t2LoopEndDec) + EndDecs.push_back(&I); } - if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty()) + if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty()) continue; Changed = true; @@ -1713,15 +1713,15 @@ bool ARMLowOverheadLoops::RevertNonLoops() { if (Start->getOpcode() == ARM::t2WhileLoopStart) RevertWhile(Start); else - RevertDo(Start); + RevertDo(Start); } for (auto *Dec : Decs) RevertLoopDec(Dec); for (auto *End : Ends) RevertLoopEnd(End); - for (auto *End : EndDecs) - RevertLoopEndDec(End); + for (auto *End : EndDecs) + RevertLoopEndDec(End); } return Changed; } diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMParallelDSP.cpp index 9a7c1f541a..cd3c3b4ca6 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMParallelDSP.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMParallelDSP.cpp @@ -22,7 +22,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsARM.h" @@ -202,7 +202,7 @@ namespace { public: WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide) : NewLd(Wide) { - append_range(Loads, Lds); + append_range(Loads, Lds); } LoadInst *getLoad() { return NewLd; @@ -374,7 +374,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { DepMap RAWDeps; // Record any writes that may alias a load. 
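The unknown location size taken on the next line keeps the alias query conservative, since the final widened access size is not known at this point. A minimal standalone illustration of that kind of query, using LLVM's AliasAnalysis and MemoryLocation APIs rather than the pass's exact loop structure (StoreInst/LoadInst stand in for the recorded writes and loads), could be:

static bool writeMayAliasLoad(AliasAnalysis &AA, StoreInst *Write,
                              LoadInst *Read) {
  // beforeOrAfterPointer() means "unknown extent around the pointer", so the
  // check stays safe even before the loads are widened.
  const auto Size = LocationSize::beforeOrAfterPointer();
  MemoryLocation WriteLoc(Write->getPointerOperand(), Size);
  MemoryLocation ReadLoc(Read->getPointerOperand(), Size);
  return !AA.isNoAlias(WriteLoc, ReadLoc);
}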
- const auto Size = LocationSize::beforeOrAfterPointer(); + const auto Size = LocationSize::beforeOrAfterPointer(); for (auto Write : Writes) { for (auto Read : Loads) { MemoryLocation ReadLoc = diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMPredicates.td b/contrib/libs/llvm12/lib/Target/ARM/ARMPredicates.td index 2dc097566d..3c03b95e26 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMPredicates.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMPredicates.td @@ -77,8 +77,8 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; -def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, - AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; +def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, + AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">, AssemblerPredicate<(all_of FeatureVFP2_SP), "VFP2">; @@ -189,9 +189,9 @@ let RecomputePerFunction = 1 in { def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&" " TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||" "Subtarget->hasMinSize())">; - def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<ARMSubtarget>().hardenSlsBlr() }]>; - def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<ARMSubtarget>().hardenSlsBlr() }]>; - + def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<ARMSubtarget>().hardenSlsBlr() }]>; + def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<ARMSubtarget>().hardenSlsBlr() }]>; + } def UseMulOps : Predicate<"Subtarget->useMulOps()">; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterBankInfo.cpp index 1a7f10a13e..eb905282dc 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -156,10 +156,10 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) "Subclass not added?"); assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) && "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnoip_and_tcGPRRegClassID)) && + assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnoip_and_tcGPRRegClassID)) && "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass( - ARM::tGPREven_and_GPRnoip_and_tcGPRRegClassID)) && + assert(RBGPR.covers(*TRI.getRegClass( + ARM::tGPREven_and_GPRnoip_and_tcGPRRegClassID)) && "Subclass not added?"); assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) && "Subclass not added?"); @@ -182,12 +182,12 @@ ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, switch (RC.getID()) { case GPRRegClassID: case GPRwithAPSRRegClassID: - case GPRnoipRegClassID: + case GPRnoipRegClassID: case GPRnopcRegClassID: - case GPRnoip_and_GPRnopcRegClassID: + case GPRnoip_and_GPRnopcRegClassID: case rGPRRegClassID: case GPRspRegClassID: - case GPRnoip_and_tcGPRRegClassID: + case GPRnoip_and_tcGPRRegClassID: case tcGPRRegClassID: case tGPRRegClassID: case tGPREvenRegClassID: @@ -195,7 +195,7 @@ ARMRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, case tGPR_and_tGPREvenRegClassID: case tGPR_and_tGPROddRegClassID: case tGPREven_and_tcGPRRegClassID: - case tGPREven_and_GPRnoip_and_tcGPRRegClassID: + case tGPREven_and_GPRnoip_and_tcGPRRegClassID: case tGPROdd_and_tcGPRRegClassID: return 
getRegBank(ARM::GPRRegBankID); case HPRRegClassID: diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterInfo.td b/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterInfo.td index fe3243315d..8ac3e3c402 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMRegisterInfo.td @@ -235,23 +235,23 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), let DiagnosticString = "operand must be a register in range [r0, r15]"; } -// Register set that excludes registers that are reserved for procedure calls. -// This is used for pseudo-instructions that are actually implemented using a -// procedure call. -def GPRnoip : RegisterClass<"ARM", [i32], 32, (sub GPR, R12, LR)> { - // Allocate LR as the first CSR since it is always saved anyway. - // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't - // know how to spill them. If we make our prologue/epilogue code smarter at - // some point, we can go back to using the above allocation orders for the - // Thumb1 instructions that know how to use hi regs. - let AltOrders = [(add GPRnoip, GPRnoip), (trunc GPRnoip, 8), - (add (trunc GPRnoip, 8), (shl GPRnoip, 8))]; - let AltOrderSelect = [{ - return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF); - }]; - let DiagnosticString = "operand must be a register in range [r0, r14]"; -} - +// Register set that excludes registers that are reserved for procedure calls. +// This is used for pseudo-instructions that are actually implemented using a +// procedure call. +def GPRnoip : RegisterClass<"ARM", [i32], 32, (sub GPR, R12, LR)> { + // Allocate LR as the first CSR since it is always saved anyway. + // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't + // know how to spill them. If we make our prologue/epilogue code smarter at + // some point, we can go back to using the above allocation orders for the + // Thumb1 instructions that know how to use hi regs. + let AltOrders = [(add GPRnoip, GPRnoip), (trunc GPRnoip, 8), + (add (trunc GPRnoip, 8), (shl GPRnoip, 8))]; + let AltOrderSelect = [{ + return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF); + }]; + let DiagnosticString = "operand must be a register in range [r0, r14]"; +} + // GPRs without the PC. Some ARM instructions do not allow the PC in // certain operand slots, particularly as the destination. Primarily // useful for disassembly. diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMSLSHardening.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMSLSHardening.cpp index cfcc7d5a04..de2cd45c14 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMSLSHardening.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMSLSHardening.cpp @@ -1,416 +1,416 @@ -//===- ARMSLSHardening.cpp - Harden Straight Line Missspeculation ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass to insert code to mitigate against side channel -// vulnerabilities that may happen under straight line miss-speculation. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM.h" -#include "ARMInstrInfo.h" -#include "ARMSubtarget.h" -#include "llvm/CodeGen/IndirectThunks.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/IR/DebugLoc.h" -#include <cassert> - -using namespace llvm; - -#define DEBUG_TYPE "arm-sls-hardening" - -#define ARM_SLS_HARDENING_NAME "ARM sls hardening pass" - -namespace { - -class ARMSLSHardening : public MachineFunctionPass { -public: - const TargetInstrInfo *TII; - const ARMSubtarget *ST; - - static char ID; - - ARMSLSHardening() : MachineFunctionPass(ID) { - initializeARMSLSHardeningPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { return ARM_SLS_HARDENING_NAME; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const; - bool hardenIndirectCalls(MachineBasicBlock &MBB) const; - MachineBasicBlock & - ConvertIndirectCallToIndirectJump(MachineBasicBlock &MBB, - MachineBasicBlock::iterator) const; -}; - -} // end anonymous namespace - -char ARMSLSHardening::ID = 0; - -INITIALIZE_PASS(ARMSLSHardening, "arm-sls-hardening", - ARM_SLS_HARDENING_NAME, false, false) - -static void insertSpeculationBarrier(const ARMSubtarget *ST, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL, - bool AlwaysUseISBDSB = false) { - assert(MBBI != MBB.begin() && - "Must not insert SpeculationBarrierEndBB as only instruction in MBB."); - assert(std::prev(MBBI)->isBarrier() && - "SpeculationBarrierEndBB must only follow unconditional control flow " - "instructions."); - assert(std::prev(MBBI)->isTerminator() && - "SpeculationBarrierEndBB must only follow terminators."); - const TargetInstrInfo *TII = ST->getInstrInfo(); - assert(ST->hasDataBarrier() || ST->hasSB()); - bool ProduceSB = ST->hasSB() && !AlwaysUseISBDSB; - unsigned BarrierOpc = - ProduceSB ? (ST->isThumb() ? ARM::t2SpeculationBarrierSBEndBB - : ARM::SpeculationBarrierSBEndBB) - : (ST->isThumb() ? 
ARM::t2SpeculationBarrierISBDSBEndBB - : ARM::SpeculationBarrierISBDSBEndBB); - if (MBBI == MBB.end() || !isSpeculationBarrierEndBBOpcode(MBBI->getOpcode())) - BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc)); -} - -bool ARMSLSHardening::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget<ARMSubtarget>(); - TII = MF.getSubtarget().getInstrInfo(); - - bool Modified = false; - for (auto &MBB : MF) { - Modified |= hardenReturnsAndBRs(MBB); - Modified |= hardenIndirectCalls(MBB); - } - - return Modified; -} - -bool ARMSLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const { - if (!ST->hardenSlsRetBr()) - return false; - assert(!ST->isThumb1Only()); - bool Modified = false; - MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end(); - MachineBasicBlock::iterator NextMBBI; - for (; MBBI != E; MBBI = NextMBBI) { - MachineInstr &MI = *MBBI; - NextMBBI = std::next(MBBI); - if (isIndirectControlFlowNotComingBack(MI)) { - assert(MI.isTerminator()); - assert(!TII->isPredicated(MI)); - insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc()); - Modified = true; - } - } - return Modified; -} - -static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; - -static const struct ThunkNameRegMode { - const char* Name; - Register Reg; - bool isThumb; -} SLSBLRThunks[] = { - {"__llvm_slsblr_thunk_arm_r0", ARM::R0, false}, - {"__llvm_slsblr_thunk_arm_r1", ARM::R1, false}, - {"__llvm_slsblr_thunk_arm_r2", ARM::R2, false}, - {"__llvm_slsblr_thunk_arm_r3", ARM::R3, false}, - {"__llvm_slsblr_thunk_arm_r4", ARM::R4, false}, - {"__llvm_slsblr_thunk_arm_r5", ARM::R5, false}, - {"__llvm_slsblr_thunk_arm_r6", ARM::R6, false}, - {"__llvm_slsblr_thunk_arm_r7", ARM::R7, false}, - {"__llvm_slsblr_thunk_arm_r8", ARM::R8, false}, - {"__llvm_slsblr_thunk_arm_r9", ARM::R9, false}, - {"__llvm_slsblr_thunk_arm_r10", ARM::R10, false}, - {"__llvm_slsblr_thunk_arm_r11", ARM::R11, false}, - {"__llvm_slsblr_thunk_arm_sp", ARM::SP, false}, - {"__llvm_slsblr_thunk_arm_pc", ARM::PC, false}, - {"__llvm_slsblr_thunk_thumb_r0", ARM::R0, true}, - {"__llvm_slsblr_thunk_thumb_r1", ARM::R1, true}, - {"__llvm_slsblr_thunk_thumb_r2", ARM::R2, true}, - {"__llvm_slsblr_thunk_thumb_r3", ARM::R3, true}, - {"__llvm_slsblr_thunk_thumb_r4", ARM::R4, true}, - {"__llvm_slsblr_thunk_thumb_r5", ARM::R5, true}, - {"__llvm_slsblr_thunk_thumb_r6", ARM::R6, true}, - {"__llvm_slsblr_thunk_thumb_r7", ARM::R7, true}, - {"__llvm_slsblr_thunk_thumb_r8", ARM::R8, true}, - {"__llvm_slsblr_thunk_thumb_r9", ARM::R9, true}, - {"__llvm_slsblr_thunk_thumb_r10", ARM::R10, true}, - {"__llvm_slsblr_thunk_thumb_r11", ARM::R11, true}, - {"__llvm_slsblr_thunk_thumb_sp", ARM::SP, true}, - {"__llvm_slsblr_thunk_thumb_pc", ARM::PC, true}, -}; - -namespace { -struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> { - const char *getThunkPrefix() { return SLSBLRNamePrefix; } - bool mayUseThunk(const MachineFunction &MF) { - // FIXME: This could also check if there are any indirect calls in the - // function to more accurately reflect if a thunk will be needed. - return MF.getSubtarget<ARMSubtarget>().hardenSlsBlr(); - } - void insertThunks(MachineModuleInfo &MMI); - void populateThunk(MachineFunction &MF); -}; -} // namespace - -void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) { - // FIXME: It probably would be possible to filter which thunks to produce - // based on which registers are actually used in indirect calls in this - // function. But would that be a worthwhile optimization? 
- for (auto T : SLSBLRThunks) - createThunkFunction(MMI, T.Name); -} - -void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) { - // FIXME: How to better communicate Register number, rather than through - // name and lookup table? - assert(MF.getName().startswith(getThunkPrefix())); - auto ThunkIt = llvm::find_if( - SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); }); - assert(ThunkIt != std::end(SLSBLRThunks)); - Register ThunkReg = ThunkIt->Reg; - bool isThumb = ThunkIt->isThumb; - - const TargetInstrInfo *TII = MF.getSubtarget<ARMSubtarget>().getInstrInfo(); - MachineBasicBlock *Entry = &MF.front(); - Entry->clear(); - - // These thunks need to consist of the following instructions: - // __llvm_slsblr_thunk_(arm/thumb)_rN: - // bx rN - // barrierInsts - Entry->addLiveIn(ThunkReg); - if (isThumb) - BuildMI(Entry, DebugLoc(), TII->get(ARM::tBX)) - .addReg(ThunkReg) - .add(predOps(ARMCC::AL)); - else - BuildMI(Entry, DebugLoc(), TII->get(ARM::BX)) - .addReg(ThunkReg); - - // Make sure the thunks do not make use of the SB extension in case there is - // a function somewhere that will call to it that for some reason disabled - // the SB extension locally on that function, even though it's enabled for - // the module otherwise. Therefore set AlwaysUseISBSDB to true. - insertSpeculationBarrier(&MF.getSubtarget<ARMSubtarget>(), *Entry, - Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/); -} - -MachineBasicBlock &ARMSLSHardening::ConvertIndirectCallToIndirectJump( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { - // Transform an indirect call to an indirect jump as follows: - // Before: - // |-----------------------------| - // | ... | - // | instI | - // | BLX rN | - // | instJ | - // | ... | - // |-----------------------------| - // - // After: - // |---------- -------------------------| - // | ... | - // | instI | - // | *call* __llvm_slsblr_thunk_mode_xN | - // | instJ | - // | ... | - // |--------------------------------------| - // - // __llvm_slsblr_thunk_mode_xN: - // |-----------------------------| - // | BX rN | - // | barrierInsts | - // |-----------------------------| - // - // The __llvm_slsblr_thunk_mode_xN thunks are created by the - // SLSBLRThunkInserter. - // This function merely needs to transform an indirect call to a direct call - // to __llvm_slsblr_thunk_xN. - MachineInstr &IndirectCall = *MBBI; - assert(isIndirectCall(IndirectCall) && !IndirectCall.isReturn()); - int RegOpIdxOnIndirectCall = -1; - bool isThumb; - switch (IndirectCall.getOpcode()) { - case ARM::BLX: // !isThumb2 - case ARM::BLX_noip: // !isThumb2 - isThumb = false; - RegOpIdxOnIndirectCall = 0; - break; - case ARM::tBLXr: // isThumb2 - case ARM::tBLXr_noip: // isThumb2 - isThumb = true; - RegOpIdxOnIndirectCall = 2; - break; - default: - llvm_unreachable("unhandled Indirect Call"); - } - - Register Reg = IndirectCall.getOperand(RegOpIdxOnIndirectCall).getReg(); - // Since linkers are allowed to clobber R12 on function calls, the above - // mitigation only works if the original indirect call instruction was not - // using R12. Code generation before must make sure that no indirect call - // using R12 was produced if the mitigation is enabled. - // Also, the transformation is incorrect if the indirect call uses LR, so - // also have to avoid that. 
- assert(Reg != ARM::R12 && Reg != ARM::LR); - bool RegIsKilled = IndirectCall.getOperand(RegOpIdxOnIndirectCall).isKill(); - - DebugLoc DL = IndirectCall.getDebugLoc(); - - MachineFunction &MF = *MBBI->getMF(); - auto ThunkIt = llvm::find_if(SLSBLRThunks, [Reg, isThumb](auto T) { - return T.Reg == Reg && T.isThumb == isThumb; - }); - assert(ThunkIt != std::end(SLSBLRThunks)); - Module *M = MF.getFunction().getParent(); - const GlobalValue *GV = cast<GlobalValue>(M->getNamedValue(ThunkIt->Name)); - - MachineInstr *BL = - isThumb ? BuildMI(MBB, MBBI, DL, TII->get(ARM::tBL)) - .addImm(IndirectCall.getOperand(0).getImm()) - .addReg(IndirectCall.getOperand(1).getReg()) - .addGlobalAddress(GV) - : BuildMI(MBB, MBBI, DL, TII->get(ARM::BL)).addGlobalAddress(GV); - - // Now copy the implicit operands from IndirectCall to BL and copy other - // necessary info. - // However, both IndirectCall and BL instructions implictly use SP and - // implicitly define LR. Blindly copying implicit operands would result in SP - // and LR operands to be present multiple times. While this may not be too - // much of an issue, let's avoid that for cleanliness, by removing those - // implicit operands from the BL created above before we copy over all - // implicit operands from the IndirectCall. - int ImpLROpIdx = -1; - int ImpSPOpIdx = -1; - for (unsigned OpIdx = BL->getNumExplicitOperands(); - OpIdx < BL->getNumOperands(); OpIdx++) { - MachineOperand Op = BL->getOperand(OpIdx); - if (!Op.isReg()) - continue; - if (Op.getReg() == ARM::LR && Op.isDef()) - ImpLROpIdx = OpIdx; - if (Op.getReg() == ARM::SP && !Op.isDef()) - ImpSPOpIdx = OpIdx; - } - assert(ImpLROpIdx != -1); - assert(ImpSPOpIdx != -1); - int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx); - int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx); - BL->RemoveOperand(FirstOpIdxToRemove); - BL->RemoveOperand(SecondOpIdxToRemove); - // Now copy over the implicit operands from the original IndirectCall - BL->copyImplicitOps(MF, IndirectCall); - MF.moveCallSiteInfo(&IndirectCall, BL); - // Also add the register called in the IndirectCall as being used in the - // called thunk. - BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/, - RegIsKilled /*isKill*/)); - // Remove IndirectCallinstruction - MBB.erase(MBBI); - return MBB; -} - -bool ARMSLSHardening::hardenIndirectCalls(MachineBasicBlock &MBB) const { - if (!ST->hardenSlsBlr()) - return false; - bool Modified = false; - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MachineBasicBlock::iterator NextMBBI; - for (; MBBI != E; MBBI = NextMBBI) { - MachineInstr &MI = *MBBI; - NextMBBI = std::next(MBBI); - // Tail calls are both indirect calls and "returns". - // They are also indirect jumps, so should be handled by sls-harden-retbr, - // rather than sls-harden-blr. 
- if (isIndirectCall(MI) && !MI.isReturn()) { - ConvertIndirectCallToIndirectJump(MBB, MBBI); - Modified = true; - } - } - return Modified; -} - - - -FunctionPass *llvm::createARMSLSHardeningPass() { - return new ARMSLSHardening(); -} - -namespace { -class ARMIndirectThunks : public MachineFunctionPass { -public: - static char ID; - - ARMIndirectThunks() : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { return "ARM Indirect Thunks"; } - - bool doInitialization(Module &M) override; - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineModuleInfoWrapperPass>(); - AU.addPreserved<MachineModuleInfoWrapperPass>(); - } - -private: - std::tuple<SLSBLRThunkInserter> TIs; - - // FIXME: When LLVM moves to C++17, these can become folds - template <typename... ThunkInserterT> - static void initTIs(Module &M, - std::tuple<ThunkInserterT...> &ThunkInserters) { - (void)std::initializer_list<int>{ - (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...}; - } - template <typename... ThunkInserterT> - static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF, - std::tuple<ThunkInserterT...> &ThunkInserters) { - bool Modified = false; - (void)std::initializer_list<int>{ - Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...}; - return Modified; - } -}; - -} // end anonymous namespace - -char ARMIndirectThunks::ID = 0; - -FunctionPass *llvm::createARMIndirectThunks() { - return new ARMIndirectThunks(); -} - -bool ARMIndirectThunks::doInitialization(Module &M) { - initTIs(M, TIs); - return false; -} - -bool ARMIndirectThunks::runOnMachineFunction(MachineFunction &MF) { - LLVM_DEBUG(dbgs() << getPassName() << '\n'); - auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); - return runTIs(MMI, MF, TIs); -} +//===- ARMSLSHardening.cpp - Harden Straight Line Missspeculation ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass to insert code to mitigate against side channel +// vulnerabilities that may happen under straight line miss-speculation. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/IndirectThunks.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "arm-sls-hardening" + +#define ARM_SLS_HARDENING_NAME "ARM sls hardening pass" + +namespace { + +class ARMSLSHardening : public MachineFunctionPass { +public: + const TargetInstrInfo *TII; + const ARMSubtarget *ST; + + static char ID; + + ARMSLSHardening() : MachineFunctionPass(ID) { + initializeARMSLSHardeningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + StringRef getPassName() const override { return ARM_SLS_HARDENING_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const; + bool hardenIndirectCalls(MachineBasicBlock &MBB) const; + MachineBasicBlock & + ConvertIndirectCallToIndirectJump(MachineBasicBlock &MBB, + MachineBasicBlock::iterator) const; +}; + +} // end anonymous namespace + +char ARMSLSHardening::ID = 0; + +INITIALIZE_PASS(ARMSLSHardening, "arm-sls-hardening", + ARM_SLS_HARDENING_NAME, false, false) + +static void insertSpeculationBarrier(const ARMSubtarget *ST, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool AlwaysUseISBDSB = false) { + assert(MBBI != MBB.begin() && + "Must not insert SpeculationBarrierEndBB as only instruction in MBB."); + assert(std::prev(MBBI)->isBarrier() && + "SpeculationBarrierEndBB must only follow unconditional control flow " + "instructions."); + assert(std::prev(MBBI)->isTerminator() && + "SpeculationBarrierEndBB must only follow terminators."); + const TargetInstrInfo *TII = ST->getInstrInfo(); + assert(ST->hasDataBarrier() || ST->hasSB()); + bool ProduceSB = ST->hasSB() && !AlwaysUseISBDSB; + unsigned BarrierOpc = + ProduceSB ? (ST->isThumb() ? ARM::t2SpeculationBarrierSBEndBB + : ARM::SpeculationBarrierSBEndBB) + : (ST->isThumb() ? 
ARM::t2SpeculationBarrierISBDSBEndBB + : ARM::SpeculationBarrierISBDSBEndBB); + if (MBBI == MBB.end() || !isSpeculationBarrierEndBBOpcode(MBBI->getOpcode())) + BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc)); +} + +bool ARMSLSHardening::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget<ARMSubtarget>(); + TII = MF.getSubtarget().getInstrInfo(); + + bool Modified = false; + for (auto &MBB : MF) { + Modified |= hardenReturnsAndBRs(MBB); + Modified |= hardenIndirectCalls(MBB); + } + + return Modified; +} + +bool ARMSLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const { + if (!ST->hardenSlsRetBr()) + return false; + assert(!ST->isThumb1Only()); + bool Modified = false; + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end(); + MachineBasicBlock::iterator NextMBBI; + for (; MBBI != E; MBBI = NextMBBI) { + MachineInstr &MI = *MBBI; + NextMBBI = std::next(MBBI); + if (isIndirectControlFlowNotComingBack(MI)) { + assert(MI.isTerminator()); + assert(!TII->isPredicated(MI)); + insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc()); + Modified = true; + } + } + return Modified; +} + +static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; + +static const struct ThunkNameRegMode { + const char* Name; + Register Reg; + bool isThumb; +} SLSBLRThunks[] = { + {"__llvm_slsblr_thunk_arm_r0", ARM::R0, false}, + {"__llvm_slsblr_thunk_arm_r1", ARM::R1, false}, + {"__llvm_slsblr_thunk_arm_r2", ARM::R2, false}, + {"__llvm_slsblr_thunk_arm_r3", ARM::R3, false}, + {"__llvm_slsblr_thunk_arm_r4", ARM::R4, false}, + {"__llvm_slsblr_thunk_arm_r5", ARM::R5, false}, + {"__llvm_slsblr_thunk_arm_r6", ARM::R6, false}, + {"__llvm_slsblr_thunk_arm_r7", ARM::R7, false}, + {"__llvm_slsblr_thunk_arm_r8", ARM::R8, false}, + {"__llvm_slsblr_thunk_arm_r9", ARM::R9, false}, + {"__llvm_slsblr_thunk_arm_r10", ARM::R10, false}, + {"__llvm_slsblr_thunk_arm_r11", ARM::R11, false}, + {"__llvm_slsblr_thunk_arm_sp", ARM::SP, false}, + {"__llvm_slsblr_thunk_arm_pc", ARM::PC, false}, + {"__llvm_slsblr_thunk_thumb_r0", ARM::R0, true}, + {"__llvm_slsblr_thunk_thumb_r1", ARM::R1, true}, + {"__llvm_slsblr_thunk_thumb_r2", ARM::R2, true}, + {"__llvm_slsblr_thunk_thumb_r3", ARM::R3, true}, + {"__llvm_slsblr_thunk_thumb_r4", ARM::R4, true}, + {"__llvm_slsblr_thunk_thumb_r5", ARM::R5, true}, + {"__llvm_slsblr_thunk_thumb_r6", ARM::R6, true}, + {"__llvm_slsblr_thunk_thumb_r7", ARM::R7, true}, + {"__llvm_slsblr_thunk_thumb_r8", ARM::R8, true}, + {"__llvm_slsblr_thunk_thumb_r9", ARM::R9, true}, + {"__llvm_slsblr_thunk_thumb_r10", ARM::R10, true}, + {"__llvm_slsblr_thunk_thumb_r11", ARM::R11, true}, + {"__llvm_slsblr_thunk_thumb_sp", ARM::SP, true}, + {"__llvm_slsblr_thunk_thumb_pc", ARM::PC, true}, +}; + +namespace { +struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> { + const char *getThunkPrefix() { return SLSBLRNamePrefix; } + bool mayUseThunk(const MachineFunction &MF) { + // FIXME: This could also check if there are any indirect calls in the + // function to more accurately reflect if a thunk will be needed. + return MF.getSubtarget<ARMSubtarget>().hardenSlsBlr(); + } + void insertThunks(MachineModuleInfo &MMI); + void populateThunk(MachineFunction &MF); +}; +} // namespace + +void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) { + // FIXME: It probably would be possible to filter which thunks to produce + // based on which registers are actually used in indirect calls in this + // function. But would that be a worthwhile optimization? 
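The loop that follows creates one thunk function per table entry; the names follow a fixed "__llvm_slsblr_thunk_<arm|thumb>_<reg>" scheme. A small hypothetical helper (not part of the pass) showing how such a name is composed from the table's conventions:

#include <string>

static std::string slsBlrThunkName(bool IsThumb, const std::string &RegName) {
  // Mirrors the entries of the SLSBLRThunks table above,
  // e.g. slsBlrThunkName(true, "r4") == "__llvm_slsblr_thunk_thumb_r4".
  return std::string("__llvm_slsblr_thunk_") +
         (IsThumb ? "thumb_" : "arm_") + RegName;
}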
+ for (auto T : SLSBLRThunks) + createThunkFunction(MMI, T.Name); +} + +void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) { + // FIXME: How to better communicate Register number, rather than through + // name and lookup table? + assert(MF.getName().startswith(getThunkPrefix())); + auto ThunkIt = llvm::find_if( + SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); }); + assert(ThunkIt != std::end(SLSBLRThunks)); + Register ThunkReg = ThunkIt->Reg; + bool isThumb = ThunkIt->isThumb; + + const TargetInstrInfo *TII = MF.getSubtarget<ARMSubtarget>().getInstrInfo(); + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + + // These thunks need to consist of the following instructions: + // __llvm_slsblr_thunk_(arm/thumb)_rN: + // bx rN + // barrierInsts + Entry->addLiveIn(ThunkReg); + if (isThumb) + BuildMI(Entry, DebugLoc(), TII->get(ARM::tBX)) + .addReg(ThunkReg) + .add(predOps(ARMCC::AL)); + else + BuildMI(Entry, DebugLoc(), TII->get(ARM::BX)) + .addReg(ThunkReg); + + // Make sure the thunks do not make use of the SB extension in case there is + // a function somewhere that will call to it that for some reason disabled + // the SB extension locally on that function, even though it's enabled for + // the module otherwise. Therefore set AlwaysUseISBSDB to true. + insertSpeculationBarrier(&MF.getSubtarget<ARMSubtarget>(), *Entry, + Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/); +} + +MachineBasicBlock &ARMSLSHardening::ConvertIndirectCallToIndirectJump( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + // Transform an indirect call to an indirect jump as follows: + // Before: + // |-----------------------------| + // | ... | + // | instI | + // | BLX rN | + // | instJ | + // | ... | + // |-----------------------------| + // + // After: + // |---------- -------------------------| + // | ... | + // | instI | + // | *call* __llvm_slsblr_thunk_mode_xN | + // | instJ | + // | ... | + // |--------------------------------------| + // + // __llvm_slsblr_thunk_mode_xN: + // |-----------------------------| + // | BX rN | + // | barrierInsts | + // |-----------------------------| + // + // The __llvm_slsblr_thunk_mode_xN thunks are created by the + // SLSBLRThunkInserter. + // This function merely needs to transform an indirect call to a direct call + // to __llvm_slsblr_thunk_xN. + MachineInstr &IndirectCall = *MBBI; + assert(isIndirectCall(IndirectCall) && !IndirectCall.isReturn()); + int RegOpIdxOnIndirectCall = -1; + bool isThumb; + switch (IndirectCall.getOpcode()) { + case ARM::BLX: // !isThumb2 + case ARM::BLX_noip: // !isThumb2 + isThumb = false; + RegOpIdxOnIndirectCall = 0; + break; + case ARM::tBLXr: // isThumb2 + case ARM::tBLXr_noip: // isThumb2 + isThumb = true; + RegOpIdxOnIndirectCall = 2; + break; + default: + llvm_unreachable("unhandled Indirect Call"); + } + + Register Reg = IndirectCall.getOperand(RegOpIdxOnIndirectCall).getReg(); + // Since linkers are allowed to clobber R12 on function calls, the above + // mitigation only works if the original indirect call instruction was not + // using R12. Code generation before must make sure that no indirect call + // using R12 was produced if the mitigation is enabled. + // Also, the transformation is incorrect if the indirect call uses LR, so + // also have to avoid that. 
+ assert(Reg != ARM::R12 && Reg != ARM::LR); + bool RegIsKilled = IndirectCall.getOperand(RegOpIdxOnIndirectCall).isKill(); + + DebugLoc DL = IndirectCall.getDebugLoc(); + + MachineFunction &MF = *MBBI->getMF(); + auto ThunkIt = llvm::find_if(SLSBLRThunks, [Reg, isThumb](auto T) { + return T.Reg == Reg && T.isThumb == isThumb; + }); + assert(ThunkIt != std::end(SLSBLRThunks)); + Module *M = MF.getFunction().getParent(); + const GlobalValue *GV = cast<GlobalValue>(M->getNamedValue(ThunkIt->Name)); + + MachineInstr *BL = + isThumb ? BuildMI(MBB, MBBI, DL, TII->get(ARM::tBL)) + .addImm(IndirectCall.getOperand(0).getImm()) + .addReg(IndirectCall.getOperand(1).getReg()) + .addGlobalAddress(GV) + : BuildMI(MBB, MBBI, DL, TII->get(ARM::BL)).addGlobalAddress(GV); + + // Now copy the implicit operands from IndirectCall to BL and copy other + // necessary info. + // However, both IndirectCall and BL instructions implictly use SP and + // implicitly define LR. Blindly copying implicit operands would result in SP + // and LR operands to be present multiple times. While this may not be too + // much of an issue, let's avoid that for cleanliness, by removing those + // implicit operands from the BL created above before we copy over all + // implicit operands from the IndirectCall. + int ImpLROpIdx = -1; + int ImpSPOpIdx = -1; + for (unsigned OpIdx = BL->getNumExplicitOperands(); + OpIdx < BL->getNumOperands(); OpIdx++) { + MachineOperand Op = BL->getOperand(OpIdx); + if (!Op.isReg()) + continue; + if (Op.getReg() == ARM::LR && Op.isDef()) + ImpLROpIdx = OpIdx; + if (Op.getReg() == ARM::SP && !Op.isDef()) + ImpSPOpIdx = OpIdx; + } + assert(ImpLROpIdx != -1); + assert(ImpSPOpIdx != -1); + int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx); + int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx); + BL->RemoveOperand(FirstOpIdxToRemove); + BL->RemoveOperand(SecondOpIdxToRemove); + // Now copy over the implicit operands from the original IndirectCall + BL->copyImplicitOps(MF, IndirectCall); + MF.moveCallSiteInfo(&IndirectCall, BL); + // Also add the register called in the IndirectCall as being used in the + // called thunk. + BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/, + RegIsKilled /*isKill*/)); + // Remove IndirectCallinstruction + MBB.erase(MBBI); + return MBB; +} + +bool ARMSLSHardening::hardenIndirectCalls(MachineBasicBlock &MBB) const { + if (!ST->hardenSlsBlr()) + return false; + bool Modified = false; + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator NextMBBI; + for (; MBBI != E; MBBI = NextMBBI) { + MachineInstr &MI = *MBBI; + NextMBBI = std::next(MBBI); + // Tail calls are both indirect calls and "returns". + // They are also indirect jumps, so should be handled by sls-harden-retbr, + // rather than sls-harden-blr. 
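The split described in the comment that follows can be summarised as two predicates: an indirect tail call is also a return and an indirect jump, so it is covered by the retbr-style barrier insertion, while only plain indirect calls are rewritten into calls to the __llvm_slsblr_thunk_* thunks. A rough sketch using generic MachineInstr queries, not this pass's exact isIndirectCall helper:

static bool coveredByRetBrHardening(const MachineInstr &MI) {
  // Hardened by inserting a speculation barrier after the terminator.
  return MI.isReturn() || MI.isIndirectBranch();
}

static bool coveredByBlrHardening(const MachineInstr &MI) {
  // Hardened by redirecting the call through a thunk; isCall() is a
  // simplified stand-in for the pass's indirect-call check.
  return MI.isCall() && !MI.isReturn();
}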
+ if (isIndirectCall(MI) && !MI.isReturn()) { + ConvertIndirectCallToIndirectJump(MBB, MBBI); + Modified = true; + } + } + return Modified; +} + + + +FunctionPass *llvm::createARMSLSHardeningPass() { + return new ARMSLSHardening(); +} + +namespace { +class ARMIndirectThunks : public MachineFunctionPass { +public: + static char ID; + + ARMIndirectThunks() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "ARM Indirect Thunks"; } + + bool doInitialization(Module &M) override; + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineModuleInfoWrapperPass>(); + AU.addPreserved<MachineModuleInfoWrapperPass>(); + } + +private: + std::tuple<SLSBLRThunkInserter> TIs; + + // FIXME: When LLVM moves to C++17, these can become folds + template <typename... ThunkInserterT> + static void initTIs(Module &M, + std::tuple<ThunkInserterT...> &ThunkInserters) { + (void)std::initializer_list<int>{ + (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...}; + } + template <typename... ThunkInserterT> + static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF, + std::tuple<ThunkInserterT...> &ThunkInserters) { + bool Modified = false; + (void)std::initializer_list<int>{ + Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...}; + return Modified; + } +}; + +} // end anonymous namespace + +char ARMIndirectThunks::ID = 0; + +FunctionPass *llvm::createARMIndirectThunks() { + return new ARMIndirectThunks(); +} + +bool ARMIndirectThunks::doInitialization(Module &M) { + initTIs(M, TIs); + return false; +} + +bool ARMIndirectThunks::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << getPassName() << '\n'); + auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); + return runTIs(MMI, MF, TIs); +} diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMSchedule.td b/contrib/libs/llvm12/lib/Target/ARM/ARMSchedule.td index 53a2a6fec5..503a0fbd96 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMSchedule.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMSchedule.td @@ -151,61 +151,61 @@ def : PredicateProlog<[{ (void)STI; }]>; -def IsPredicated : CheckFunctionPredicateWithTII< - "ARM_MC::isPredicated", - "isPredicated" ->; -def IsPredicatedPred : MCSchedPredicate<IsPredicated>; - -def IsCPSRDefined : CheckFunctionPredicateWithTII< - "ARM_MC::isCPSRDefined", - "ARMBaseInstrInfo::isCPSRDefined" ->; - -def IsCPSRDefinedPred : MCSchedPredicate<IsCPSRDefined>; - -let FunctionMapper = "ARM_AM::getAM2ShiftOpc" in { - class CheckAM2NoShift<int n> : CheckImmOperand_s<n, "ARM_AM::no_shift">; - class CheckAM2ShiftLSL<int n> : CheckImmOperand_s<n, "ARM_AM::lsl">; -} - -let FunctionMapper = "ARM_AM::getAM2Op" in { - class CheckAM2OpAdd<int n> : CheckImmOperand_s<n, "ARM_AM::add"> {} - class CheckAM2OpSub<int n> : CheckImmOperand_s<n, "ARM_AM::sub"> {} -} - -let FunctionMapper = "ARM_AM::getAM2Offset" in { - class CheckAM2Offset<int n, int of> : CheckImmOperand<n, of> {} -} - -def IsLDMBaseRegInList : CheckFunctionPredicate< - "ARM_MC::isLDMBaseRegInList", "ARM_MC::isLDMBaseRegInList" ->; - -let FunctionMapper = "ARM_AM::getAM3Op" in { - class CheckAM3OpSub<int n> : CheckImmOperand_s<n, "ARM_AM::sub"> {} -} - -// LDM, base reg in list -def IsLDMBaseRegInListPred : MCSchedPredicate<IsLDMBaseRegInList>; - -class IsRegPCPred<int n> : MCSchedPredicate<CheckRegOperand<n, PC>>; - -class BranchWriteRes<int lat, int uops, 
list<ProcResourceKind> resl, - list<int> rcl, SchedWriteRes wr> : - SchedWriteRes<!listconcat(wr.ProcResources, resl)> { - let Latency = !add(wr.Latency, lat); - let ResourceCycles = !listconcat(wr.ResourceCycles, rcl); - let NumMicroOps = !add(wr.NumMicroOps, uops); - SchedWriteRes BaseWr = wr; -} - -class CheckBranchForm<int n, BranchWriteRes br> : - SchedWriteVariant<[ - SchedVar<IsRegPCPred<n>, [br]>, - SchedVar<NoSchedPred, [br.BaseWr]> - ]>; +def IsPredicated : CheckFunctionPredicateWithTII< + "ARM_MC::isPredicated", + "isPredicated" +>; +def IsPredicatedPred : MCSchedPredicate<IsPredicated>; +def IsCPSRDefined : CheckFunctionPredicateWithTII< + "ARM_MC::isCPSRDefined", + "ARMBaseInstrInfo::isCPSRDefined" +>; + +def IsCPSRDefinedPred : MCSchedPredicate<IsCPSRDefined>; + +let FunctionMapper = "ARM_AM::getAM2ShiftOpc" in { + class CheckAM2NoShift<int n> : CheckImmOperand_s<n, "ARM_AM::no_shift">; + class CheckAM2ShiftLSL<int n> : CheckImmOperand_s<n, "ARM_AM::lsl">; +} + +let FunctionMapper = "ARM_AM::getAM2Op" in { + class CheckAM2OpAdd<int n> : CheckImmOperand_s<n, "ARM_AM::add"> {} + class CheckAM2OpSub<int n> : CheckImmOperand_s<n, "ARM_AM::sub"> {} +} + +let FunctionMapper = "ARM_AM::getAM2Offset" in { + class CheckAM2Offset<int n, int of> : CheckImmOperand<n, of> {} +} + +def IsLDMBaseRegInList : CheckFunctionPredicate< + "ARM_MC::isLDMBaseRegInList", "ARM_MC::isLDMBaseRegInList" +>; + +let FunctionMapper = "ARM_AM::getAM3Op" in { + class CheckAM3OpSub<int n> : CheckImmOperand_s<n, "ARM_AM::sub"> {} +} + +// LDM, base reg in list +def IsLDMBaseRegInListPred : MCSchedPredicate<IsLDMBaseRegInList>; + +class IsRegPCPred<int n> : MCSchedPredicate<CheckRegOperand<n, PC>>; + +class BranchWriteRes<int lat, int uops, list<ProcResourceKind> resl, + list<int> rcl, SchedWriteRes wr> : + SchedWriteRes<!listconcat(wr.ProcResources, resl)> { + let Latency = !add(wr.Latency, lat); + let ResourceCycles = !listconcat(wr.ResourceCycles, rcl); + let NumMicroOps = !add(wr.NumMicroOps, uops); + SchedWriteRes BaseWr = wr; +} + +class CheckBranchForm<int n, BranchWriteRes br> : + SchedWriteVariant<[ + SchedVar<IsRegPCPred<n>, [br]>, + SchedVar<NoSchedPred, [br.BaseWr]> + ]>; + //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for ARM // diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57.td index 0c610a4839..fe8c220db4 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57.td @@ -21,47 +21,47 @@ // Therefore, IssueWidth is set to the narrower of the two at three, while still // modeling the machine as out-of-order. -def IsCPSRDefinedAndPredicated : CheckAll<[IsCPSRDefined, IsPredicated]>; +def IsCPSRDefinedAndPredicated : CheckAll<[IsCPSRDefined, IsPredicated]>; def IsCPSRDefinedAndPredicatedPred : - MCSchedPredicate<IsCPSRDefinedAndPredicated>; + MCSchedPredicate<IsCPSRDefinedAndPredicated>; // Cortex A57 rev. 
r1p0 or later (false = r0px) -def IsR1P0AndLaterPred : MCSchedPredicate<FalsePred>; +def IsR1P0AndLaterPred : MCSchedPredicate<FalsePred>; -def IsLdrAm3RegOffPred : MCSchedPredicate<CheckInvalidRegOperand<2>>; -def IsLdrAm3RegOffPredX2 : MCSchedPredicate<CheckInvalidRegOperand<3>>; -def IsLdrAm3RegOffPredX3 : MCSchedPredicate<CheckInvalidRegOperand<4>>; +def IsLdrAm3RegOffPred : MCSchedPredicate<CheckInvalidRegOperand<2>>; +def IsLdrAm3RegOffPredX2 : MCSchedPredicate<CheckInvalidRegOperand<3>>; +def IsLdrAm3RegOffPredX3 : MCSchedPredicate<CheckInvalidRegOperand<4>>; // If Addrmode3 contains "minus register" -class Am3NegativeRegOffset<int n> : MCSchedPredicate<CheckAll<[ - CheckValidRegOperand<n>, - CheckAM3OpSub<!add(n, 1)>]>>; - -def IsLdrAm3NegRegOffPred : Am3NegativeRegOffset<2>; -def IsLdrAm3NegRegOffPredX2 : Am3NegativeRegOffset<3>; -def IsLdrAm3NegRegOffPredX3 : Am3NegativeRegOffset<4>; - +class Am3NegativeRegOffset<int n> : MCSchedPredicate<CheckAll<[ + CheckValidRegOperand<n>, + CheckAM3OpSub<!add(n, 1)>]>>; + +def IsLdrAm3NegRegOffPred : Am3NegativeRegOffset<2>; +def IsLdrAm3NegRegOffPredX2 : Am3NegativeRegOffset<3>; +def IsLdrAm3NegRegOffPredX3 : Am3NegativeRegOffset<4>; + // Load, scaled register offset, not plus LSL2 -class ScaledRegNotPlusLsl2<int n> : CheckNot< - CheckAny<[ - CheckAM2NoShift<n>, - CheckAll<[ - CheckAM2OpAdd<n>, - CheckAM2ShiftLSL<n>, - CheckAM2Offset<n, 2> - ]> - ]> - >; - -def IsLdstsoScaledNotOptimalPredX0 : MCSchedPredicate<ScaledRegNotPlusLsl2<2>>; -def IsLdstsoScaledNotOptimalPred : MCSchedPredicate<ScaledRegNotPlusLsl2<3>>; -def IsLdstsoScaledNotOptimalPredX2 : MCSchedPredicate<ScaledRegNotPlusLsl2<4>>; - -def IsLdstsoScaledPredX2 : MCSchedPredicate<CheckNot<CheckAM2NoShift<4>>>; - -def IsLdstsoMinusRegPredX0 : MCSchedPredicate<CheckAM2OpSub<2>>; -def IsLdstsoMinusRegPred : MCSchedPredicate<CheckAM2OpSub<3>>; -def IsLdstsoMinusRegPredX2 : MCSchedPredicate<CheckAM2OpSub<4>>; +class ScaledRegNotPlusLsl2<int n> : CheckNot< + CheckAny<[ + CheckAM2NoShift<n>, + CheckAll<[ + CheckAM2OpAdd<n>, + CheckAM2ShiftLSL<n>, + CheckAM2Offset<n, 2> + ]> + ]> + >; + +def IsLdstsoScaledNotOptimalPredX0 : MCSchedPredicate<ScaledRegNotPlusLsl2<2>>; +def IsLdstsoScaledNotOptimalPred : MCSchedPredicate<ScaledRegNotPlusLsl2<3>>; +def IsLdstsoScaledNotOptimalPredX2 : MCSchedPredicate<ScaledRegNotPlusLsl2<4>>; + +def IsLdstsoScaledPredX2 : MCSchedPredicate<CheckNot<CheckAM2NoShift<4>>>; + +def IsLdstsoMinusRegPredX0 : MCSchedPredicate<CheckAM2OpSub<2>>; +def IsLdstsoMinusRegPred : MCSchedPredicate<CheckAM2OpSub<3>>; +def IsLdstsoMinusRegPredX2 : MCSchedPredicate<CheckAM2OpSub<4>>; class A57WriteLMOpsListType<list<SchedWriteRes> writes> { list <SchedWriteRes> Writes = writes; @@ -173,29 +173,29 @@ def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>; def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>; -// Check branch forms of ALU ops: -// check reg 0 for ARM_AM::PC -// if so adds 2 cyc to latency, 1 uop, 1 res cycle for A57UnitB -class A57BranchForm<SchedWriteRes non_br> : - BranchWriteRes<2, 1, [A57UnitB], [1], non_br>; - +// Check branch forms of ALU ops: +// check reg 0 for ARM_AM::PC +// if so adds 2 cyc to latency, 1 uop, 1 res cycle for A57UnitB +class A57BranchForm<SchedWriteRes non_br> : + BranchWriteRes<2, 1, [A57UnitB], [1], non_br>; + // shift by register, conditional or unconditional // TODO: according to the doc, conditional uses I0/I1, unconditional uses M // Why more complex instruction uses more simple pipeline? // May be an error in doc. 
def A57WriteALUsr : SchedWriteVariant<[ - SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>, - SchedVar<NoSchedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]> + SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>, + SchedVar<NoSchedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]> ]>; def A57WriteALUSsr : SchedWriteVariant<[ - SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>, - SchedVar<NoSchedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]> + SchedVar<IsPredicatedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1I>>]>, + SchedVar<NoSchedPred, [CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>]> ]>; def A57ReadALUsr : SchedReadVariant<[ SchedVar<IsPredicatedPred, [ReadDefault]>, SchedVar<NoSchedPred, [ReadDefault]> ]>; -def : SchedAlias<WriteALUsi, CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>>; +def : SchedAlias<WriteALUsi, CheckBranchForm<0, A57BranchForm<A57Write_2cyc_1M>>>; def : SchedAlias<WriteALUsr, A57WriteALUsr>; def : SchedAlias<WriteALUSsr, A57WriteALUSsr>; def : SchedAlias<ReadALUsr, A57ReadALUsr>; @@ -271,11 +271,11 @@ def : ReadAdvance<ReadMUL, 0>; // from similar μops, allowing a typical sequence of multiply-accumulate μops // to issue one every 1 cycle (sched advance = 2). def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; } -def A57WriteMLAL : SchedWriteVariant<[ - SchedVar<IsCPSRDefinedPred, [A57Write_5cyc_1I_1M]>, - SchedVar<NoSchedPred, [A57Write_4cyc_1M]> -]>; - +def A57WriteMLAL : SchedWriteVariant<[ + SchedVar<IsCPSRDefinedPred, [A57Write_5cyc_1I_1M]>, + SchedVar<NoSchedPred, [A57Write_4cyc_1M]> +]>; + def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>; def : InstRW<[A57WriteMLA], @@ -470,11 +470,11 @@ def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG", "LDRB_POST_REG", "LDR(B?)T_POST$")>; def A57WriteLdrTRegPost : SchedWriteVariant<[ - SchedVar<IsLdstsoScaledPredX2, [A57Write_4cyc_1I_1L_1M]>, + SchedVar<IsLdstsoScaledPredX2, [A57Write_4cyc_1I_1L_1M]>, SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]> ]>; def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[ - SchedVar<IsLdstsoScaledPredX2, [A57WrBackThree]>, + SchedVar<IsLdstsoScaledPredX2, [A57WrBackThree]>, SchedVar<NoSchedPred, [A57WrBackTwo]> ]>; // 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L" @@ -510,12 +510,12 @@ def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>; // --- Load multiple instructions --- foreach NumAddr = 1-8 in { - def A57LMAddrPred#NumAddr : MCSchedPredicate<CheckAny<[ - CheckNumOperands<!add(!shl(NumAddr, 1), 2)>, - CheckNumOperands<!add(!shl(NumAddr, 1), 3)>]>>; - def A57LMAddrUpdPred#NumAddr : MCSchedPredicate<CheckAny<[ - CheckNumOperands<!add(!shl(NumAddr, 1), 3)>, - CheckNumOperands<!add(!shl(NumAddr, 1), 4)>]>>; + def A57LMAddrPred#NumAddr : MCSchedPredicate<CheckAny<[ + CheckNumOperands<!add(!shl(NumAddr, 1), 2)>, + CheckNumOperands<!add(!shl(NumAddr, 1), 3)>]>>; + def A57LMAddrUpdPred#NumAddr : MCSchedPredicate<CheckAny<[ + CheckNumOperands<!add(!shl(NumAddr, 1), 3)>, + CheckNumOperands<!add(!shl(NumAddr, 1), 4)>]>>; } def A57LDMOpsListNoregin : A57WriteLMOpsListType< @@ -571,20 +571,20 @@ def A57LDMOpsList_Upd : A57WriteLMOpsListType< A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>; def A57WriteLDM_Upd : SchedWriteVariant<[ - SchedVar<A57LMAddrUpdPred1, A57LDMOpsList_Upd.Writes[0-2]>, - SchedVar<A57LMAddrUpdPred2, 
A57LDMOpsList_Upd.Writes[0-4]>, - SchedVar<A57LMAddrUpdPred3, A57LDMOpsList_Upd.Writes[0-6]>, - SchedVar<A57LMAddrUpdPred4, A57LDMOpsList_Upd.Writes[0-8]>, - SchedVar<A57LMAddrUpdPred5, A57LDMOpsList_Upd.Writes[0-10]>, - SchedVar<A57LMAddrUpdPred6, A57LDMOpsList_Upd.Writes[0-12]>, - SchedVar<A57LMAddrUpdPred7, A57LDMOpsList_Upd.Writes[0-14]>, - SchedVar<A57LMAddrUpdPred8, A57LDMOpsList_Upd.Writes[0-16]>, - SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]> + SchedVar<A57LMAddrUpdPred1, A57LDMOpsList_Upd.Writes[0-2]>, + SchedVar<A57LMAddrUpdPred2, A57LDMOpsList_Upd.Writes[0-4]>, + SchedVar<A57LMAddrUpdPred3, A57LDMOpsList_Upd.Writes[0-6]>, + SchedVar<A57LMAddrUpdPred4, A57LDMOpsList_Upd.Writes[0-8]>, + SchedVar<A57LMAddrUpdPred5, A57LDMOpsList_Upd.Writes[0-10]>, + SchedVar<A57LMAddrUpdPred6, A57LDMOpsList_Upd.Writes[0-12]>, + SchedVar<A57LMAddrUpdPred7, A57LDMOpsList_Upd.Writes[0-14]>, + SchedVar<A57LMAddrUpdPred8, A57LDMOpsList_Upd.Writes[0-16]>, + SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]> ]> { let Variadic=1; } def A57WriteLDM : SchedWriteVariant<[ - SchedVar<IsLDMBaseRegInListPred, [A57WriteLDMreginlist]>, - SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]> + SchedVar<IsLDMBaseRegInListPred, [A57WriteLDMreginlist]>, + SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]> ]> { let Variadic=1; } def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>; @@ -1194,7 +1194,7 @@ def : InstRW<[A57Write_5cyc_1V], (instregex // --- 3.16 ASIMD Miscellaneous Instructions --- // ASIMD bitwise insert -def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL", "VBSP")>; +def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL", "VBSP")>; // ASIMD count def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>; @@ -1483,7 +1483,7 @@ def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>; // ----------------------------------------------------------------------------- // Common definitions def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; } -def : SchedAlias<WriteALU, CheckBranchForm<0, A57BranchForm<A57Write_1cyc_1I>>>; +def : SchedAlias<WriteALU, CheckBranchForm<0, A57BranchForm<A57Write_1cyc_1I>>>; def : SchedAlias<WriteBr, A57Write_1cyc_1B>; def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57WriteRes.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57WriteRes.td index 531b10bc5c..3ed917682c 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57WriteRes.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA57WriteRes.td @@ -36,16 +36,16 @@ def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20; let ResourceCycles = [20]; } def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; } -def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; - let ResourceCycles = [1]; } -def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; - let ResourceCycles = [1]; } +def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; + let ResourceCycles = [1]; } +def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; + let ResourceCycles = [1]; } def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; } def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; } def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; } def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; } -def 
A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; - let ResourceCycles = [1]; } +def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; + let ResourceCycles = [1]; } def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32; let ResourceCycles = [32]; } def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; @@ -71,7 +71,7 @@ foreach Lat = 4-16 in { } } -def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; } +def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; } def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; } def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; } diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA9.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA9.td index be7017a7b4..dfda6c6b4b 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA9.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleA9.td @@ -2525,8 +2525,8 @@ def : ReadAdvance<ReadFPMAC, 0>; def : InstRW< [WriteALU], (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr", "BICrr")>; -def : InstRW< [WriteALUsi], (instrs ANDrsi, ORRrsi, EORrsi, BICrsi)>; -def : InstRW< [WriteALUsr], (instrs ANDrsr, ORRrsr, EORrsr, BICrsr)>; +def : InstRW< [WriteALUsi], (instrs ANDrsi, ORRrsi, EORrsi, BICrsi)>; +def : InstRW< [WriteALUsr], (instrs ANDrsr, ORRrsr, EORrsr, BICrsr)>; def : SchedAlias<WriteCMP, A9WriteALU>; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td index 12296ad092..c5e1d32e8d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td @@ -1,488 +1,488 @@ -//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor. -// -//===----------------------------------------------------------------------===// - -def CortexM7Model : SchedMachineModel { - let IssueWidth = 2; // Dual issue for most instructions. - let MicroOpBufferSize = 0; // The Cortex-M7 is in-order. - let LoadLatency = 2; // Best case for load-use case. - let MispredictPenalty = 4; // Mispredict cost for forward branches is 6, - // but 4 works better - let CompleteModel = 0; -} - -//===--------------------------------------------------------------------===// -// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP -// pipe. The stages relevant to scheduling are as follows: -// -// EX1: address generation shifts -// EX2: fast load data ALUs FP operation -// EX3: slow load data integer writeback FP operation -// EX4: store data FP writeback -// -// There are shifters in both EX1 and EX2, and some instructions can be -// flexibly allocated between them. EX2 is used as the "zero" point -// for scheduling, so simple ALU operations executing in EX2 will have -// ReadAdvance<0> (the default) for their source operands and Latency = 1. 
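// Aside: a minimal sketch of the convention described above, using hypothetical
// names that are not defined in this file. With EX2 as the zero point, an
// operand that must be ready at EX1 is modelled with a negative read advance,
// and the effective operand latency a consumer sees is the producer's Latency
// minus the read Advance.
//
//   def HypoWriteEX2 : SchedWriteRes<[]> { let Latency = 1; } // result timed relative to EX2
//   def HypoReadEX1  : SchedReadAdvance<-1>; // needed at EX1: sees 1 - (-1) = 2 cycles
//   def HypoReadEX3  : SchedReadAdvance<1>;  // needed at EX3: sees 1 - 1 = 0 cycles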
- -def M7UnitLoad : ProcResource<2> { let BufferSize = 0; } -def M7UnitStore : ProcResource<1> { let BufferSize = 0; } -def M7UnitALU : ProcResource<2>; -def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; } -def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; } -def M7UnitMAC : ProcResource<1> { let BufferSize = 0; } -def M7UnitBranch : ProcResource<1> { let BufferSize = 0; } -def M7UnitVFP : ProcResource<1> { let BufferSize = 0; } -def M7UnitVPort : ProcResource<2> { let BufferSize = 0; } -def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; } - -//===---------------------------------------------------------------------===// -// Subtarget-specific SchedWrite types with map ProcResources and set latency. - -let SchedModel = CortexM7Model in { - -def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; } - -// Basic ALU with shifts. -let Latency = 1 in { - def : WriteRes<WriteALUsi, [M7UnitALU, M7UnitShift1]>; - def : WriteRes<WriteALUsr, [M7UnitALU, M7UnitShift1]>; - def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>; -} - -// Compares. -def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; } -def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; } -def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; } - -// Multiplies. -let Latency = 2 in { - def : WriteRes<WriteMUL16, [M7UnitMAC]>; - def : WriteRes<WriteMUL32, [M7UnitMAC]>; - def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>; - def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; } -} - -// Multiply-accumulates. -let Latency = 2 in { - def : WriteRes<WriteMAC16, [M7UnitMAC]>; - def : WriteRes<WriteMAC32, [M7UnitMAC]>; - def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; } - def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; } -} - -// Divisions. -// These cannot be dual-issued with any instructions. -def : WriteRes<WriteDIV, [M7UnitALU]> { - let Latency = 7; - let SingleIssue = 1; -} - -// Loads/Stores. -def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; } -def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; } -def : WriteRes<WriteST, [M7UnitStore]> { let Latency = 2; } - -// Branches. -def : WriteRes<WriteBr, [M7UnitBranch]> { let Latency = 2; } -def : WriteRes<WriteBrL, [M7UnitBranch]> { let Latency = 2; } -def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; } - -// Noop. -def : WriteRes<WriteNoop, []> { let Latency = 0; } - -//===---------------------------------------------------------------------===// -// Sched definitions for floating-point instructions -// -// Floating point conversions. -def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } -def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; } - -// The FP pipeline has a latency of 3 cycles. -// ALU operations (32/64-bit). These go down the FP pipeline. -def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } -def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { - let Latency = 4; - let BeginGroup = 1; -} - -// Multiplication -def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } -def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { - let Latency = 7; - let BeginGroup = 1; -} - -// Multiply-accumulate. FPMAC goes down the FP Pipeline. -def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; } -def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { - let Latency = 11; - let BeginGroup = 1; -} - -// Division. 
Effective scheduling latency is 3, though real latency is larger -def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; } -def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { - let Latency = 30; - let BeginGroup = 1; -} - -// Square-root. Effective scheduling latency is 3; real latency is larger -def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; } -def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { - let Latency = 30; - let BeginGroup = 1; -} - -def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {} - -// Not used for M7, but needing definitions anyway -def : WriteRes<WriteVLD1, []>; -def : WriteRes<WriteVLD2, []>; -def : WriteRes<WriteVLD3, []>; -def : WriteRes<WriteVLD4, []>; -def : WriteRes<WriteVST1, []>; -def : WriteRes<WriteVST2, []>; -def : WriteRes<WriteVST3, []>; -def : WriteRes<WriteVST4, []>; - -def M7SingleIssue : SchedWriteRes<[]> { - let SingleIssue = 1; - let NumMicroOps = 0; -} -def M7Slot0Only : SchedWriteRes<[]> { - let BeginGroup = 1; - let NumMicroOps = 0; -} - -// What pipeline stage operands need to be ready for depending on -// where they come from. -def : ReadAdvance<ReadALUsr, 0>; -def : ReadAdvance<ReadMUL, 0>; -def : ReadAdvance<ReadMAC, 1>; -def : ReadAdvance<ReadALU, 0>; -def : ReadAdvance<ReadFPMUL, 0>; -def : ReadAdvance<ReadFPMAC, 3>; -def M7Read_ISS : SchedReadAdvance<-1>; // operands needed at EX1 -def M7Read_EX2 : SchedReadAdvance<1>; // operands needed at EX3 -def M7Read_EX3 : SchedReadAdvance<2>; // operands needed at EX4 - -// Non general purpose instructions may not be dual issued. These -// use both issue units. -def M7NonGeneralPurpose : SchedWriteRes<[]> { - // Assume that these will go down the main ALU pipeline. - // In reality, many look likely to stall the whole pipeline. - let Latency = 3; - let SingleIssue = 1; -} - -// List the non general purpose instructions. -def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT", - "t2MSR", "t2DMB", "t2DSB", "t2ISB", - "t2HVC", "t2SMC", "t2UDF", "ERET", - "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>; - -//===---------------------------------------------------------------------===// -// Sched definitions for load/store -// -// Mark whether the loads/stores must be single-issue -// Address operands are needed earlier -// Data operands are needed later - -def M7BaseUpdate : SchedWriteRes<[]> { - let Latency = 0; // Update is bypassable out of EX1 - let NumMicroOps = 0; -} -def M7LoadLatency1 : SchedWriteRes<[]> { - let Latency = 1; - let NumMicroOps = 0; -} -def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; } - -// Byte and half-word loads should have greater latency than other loads. -// So should load exclusive. - -def : InstRW<[M7SlowLoad], - (instregex "t2LDR(B|H|SB|SH)pc")>; -def : InstRW<[M7SlowLoad, M7Read_ISS], - (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i", - "tLDR(B|H)i")>; -def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS], - (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>; -def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS], - (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>; - -// Exclusive loads/stores cannot be dual-issued -def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS], - (instregex "t2LDREX$")>; -def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS], - (instregex "t2LDREX(B|H)")>; -def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS], - (instregex "t2STREX(B|H)?$")>; - -// Load/store multiples cannot be dual-issued. 
Note that default scheduling -// occurs around read/write times of individual registers in the list; read -// time for STM cannot be overridden because it is a variadic source operand. - -def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS], - (instregex "(t|t2)LDM(DB|IA)$")>; -def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS], - (instregex "(t|t2)STM(DB|IA)$")>; -def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS], - (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>; -def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS], - (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>; - -// Load/store doubles cannot be dual-issued. - -def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, - M7Read_EX2, M7Read_EX2, M7Read_ISS], - (instregex "t2STRD_(PRE|POST)")>; -def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS], - (instregex "t2STRDi")>; -def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS], - (instregex "t2LDRD_(PRE|POST)")>; -def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS], - (instregex "t2LDRDi")>; - -// Word load / preload -def : InstRW<[WriteLd], - (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>; -def : InstRW<[WriteLd, M7Read_ISS], - (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>; -def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS], - (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>; -def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS], - (instregex "t2LDR_(POST|PRE)")>; - -// Stores -def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS], - (instregex "t2STR(B|H)?_(POST|PRE)")>; -def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS], - (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>; -def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS], - (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>; - -// TBB/TBH - single-issue only; takes two cycles to issue - -def M7TableLoad : SchedWriteRes<[M7UnitLoad]> { - let NumMicroOps = 2; - let SingleIssue = 1; -} - -def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>; - -// VFP loads and stores - -def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; } -def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> { - let Latency = 2; - let SingleIssue = 1; -} -def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>; -def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> { - let SingleIssue = 1; -} - -def : InstRW<[M7LoadSP, M7Read_ISS], (instregex "VLDR(S|H)$")>; -def : InstRW<[M7LoadDP, M7Read_ISS], (instregex "VLDRD$")>; -def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>; -def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>; - -// Load/store multiples cannot be dual-issued. - -def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS], - (instregex "VLDM(S|D|Q)(DB|IA)$")>; -def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS], - (instregex "VSTM(S|D|Q)(DB|IA)$")>; -def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS], - (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>; -def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS], - (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>; - -//===---------------------------------------------------------------------===// -// Sched definitions for ALU -// - -// Shifted ALU operands are read a cycle early. 
-def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>; - -def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS], - (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$", - "t2(SUB|CMP|CMNz|TEQ|TST)rs$", - "t2MOVsr(a|l)")>; -def : InstRW<[WriteALUsi, M7Read_ISS], - (instregex "t2MVNs")>; - -// Treat pure shift operations (except for RRX) as if they used the EX1 -// shifter but have timing as if they used the EX2 shifter as they usually -// can choose the EX2 shifter when needed. Will miss a few dual-issue cases, -// but the results prove to be better than trying to get them exact. - -def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>; -def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>; - -// Instructions that use the shifter, but have normal timing. - -def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>; - -// Instructions which are slot zero only but otherwise normal. - -def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>; - -// MAC operations that don't have SchedRW set. - -def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>; - -// Divides are special because they stall for their latency, and so look like a -// single-cycle as far as scheduling opportunities go. By putting WriteALU -// first, we make the operand latency 1, but keep the instruction latency 7. - -def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>; - -// DSP extension operations - -def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> { - let Latency = 1; - let BeginGroup = 1; -} -def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> { - let Latency = 2; - let BeginGroup = 1; -} -def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { - let Latency = 1; - let BeginGroup = 1; -} -def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { - let Latency = 0; // Bypassable out of EX1 - let BeginGroup = 1; -} -def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { - let Latency = 2; - let BeginGroup = 1; -} - -def : InstRW<[M7WriteShSIMD2, M7Read_ISS], - (instregex "t2(S|U)SAT")>; -def : InstRW<[M7WriteSIMD1, ReadALU], - (instregex "(t|t2)(S|U)XT(B|H)")>; -def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU], - (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)", - "t2SEL")>; -def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU], - (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>; -def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS], - (instregex "t2QD(ADD|SUB)")>; -def : InstRW<[M7WriteShSIMD0, M7Read_ISS], - (instregex "t2(RBIT|REV)", "tREV")>; -def : InstRW<[M7WriteShSIMD1, M7Read_ISS], - (instregex "t2(SBFX|UBFX)")>; -def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS], - (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>; -def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2], - (instregex "t2USADA8")>; - -// MSR/MRS -def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>; - -//===---------------------------------------------------------------------===// -// Sched definitions for FP operations -// - -// Effective scheduling latency is really 3 for nearly all FP operations, -// even if their true latency is higher. -def M7WriteVFPLatOverride : SchedWriteRes<[]> { - let Latency = 3; - let NumMicroOps = 0; -} -def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> { - let Latency = 3; - let NumMicroOps = 0; -} - -// Instructions which are missing default schedules. 
-def : InstRW<[WriteFPALU32], - (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64], - (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>; - -// VCMP -def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; } -def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> { - let Latency = 0; - let BeginGroup = 1; -} -def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>; -def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>; - - // VMRS/VMSR -def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; } -def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; } -def : InstRW<[M7VMRS], (instregex "FMSTAT")>; -def : InstRW<[M7VMSR], (instregex "VMSR")>; - -// VSEL cannot bypass in its implied $cpsr operand; model as earlier read -def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS], - (instregex "VSEL.*S$")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only, - ReadALU, ReadALU, M7Read_ISS], - (instregex "VSEL.*D$")>; - -// VMOV -def : InstRW<[WriteFPMOV], - (instregex "VMOV(H|S)$", "FCONST(H|S)")>; -def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], - (instregex "VMOVD$")>; -def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], - (instregex "FCONSTD")>; -def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue], - (instregex "VMOV(DRR|RRD|RRS|SRR)")>; - -// Larger-latency overrides. - -def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64], - (instregex "V(MUL|NMUL)D")>; -def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64], - (instregex "V(ADD|SUB)D")>; - -// Multiply-accumulate. Chained SP timing is correct; rest need overrides -// Double-precision chained MAC stalls the pipeline behind it for 3 cycles, -// making it appear to have 3 cycle latency for scheduling. - -def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, - ReadFPMAC, ReadFPMUL, ReadFPMUL], - (instregex "V(N)?ML(A|S)D$")>; - -// Single-precision fused MACs look like latency 5 with advance of 2. - -def M7WriteVFPLatOverride5 : SchedWriteRes<[]> { - let Latency = 5; - let NumMicroOps = 0; -} -def M7ReadFPMAC2 : SchedReadAdvance<2>; - -def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32, - M7ReadFPMAC2, ReadFPMUL, ReadFPMUL], - (instregex "VF(N)?M(A|S)S$")>; - -// Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making -// it appear to have 3 cycle latency for scheduling. - -def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, - ReadFPMAC, ReadFPMUL, ReadFPMUL], - (instregex "VF(N)?M(A|S)D$")>; - -} // SchedModel = CortexM7Model +//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor. 
+// +//===----------------------------------------------------------------------===// + +def CortexM7Model : SchedMachineModel { + let IssueWidth = 2; // Dual issue for most instructions. + let MicroOpBufferSize = 0; // The Cortex-M7 is in-order. + let LoadLatency = 2; // Best case for load-use case. + let MispredictPenalty = 4; // Mispredict cost for forward branches is 6, + // but 4 works better + let CompleteModel = 0; +} + +//===--------------------------------------------------------------------===// +// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP +// pipe. The stages relevant to scheduling are as follows: +// +// EX1: address generation shifts +// EX2: fast load data ALUs FP operation +// EX3: slow load data integer writeback FP operation +// EX4: store data FP writeback +// +// There are shifters in both EX1 and EX2, and some instructions can be +// flexibly allocated between them. EX2 is used as the "zero" point +// for scheduling, so simple ALU operations executing in EX2 will have +// ReadAdvance<0> (the default) for their source operands and Latency = 1. + +def M7UnitLoad : ProcResource<2> { let BufferSize = 0; } +def M7UnitStore : ProcResource<1> { let BufferSize = 0; } +def M7UnitALU : ProcResource<2>; +def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; } +def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; } +def M7UnitMAC : ProcResource<1> { let BufferSize = 0; } +def M7UnitBranch : ProcResource<1> { let BufferSize = 0; } +def M7UnitVFP : ProcResource<1> { let BufferSize = 0; } +def M7UnitVPort : ProcResource<2> { let BufferSize = 0; } +def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; } + +//===---------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types with map ProcResources and set latency. + +let SchedModel = CortexM7Model in { + +def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; } + +// Basic ALU with shifts. +let Latency = 1 in { + def : WriteRes<WriteALUsi, [M7UnitALU, M7UnitShift1]>; + def : WriteRes<WriteALUsr, [M7UnitALU, M7UnitShift1]>; + def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>; +} + +// Compares. +def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; } +def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; } +def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; } + +// Multiplies. +let Latency = 2 in { + def : WriteRes<WriteMUL16, [M7UnitMAC]>; + def : WriteRes<WriteMUL32, [M7UnitMAC]>; + def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>; + def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; } +} + +// Multiply-accumulates. +let Latency = 2 in { + def : WriteRes<WriteMAC16, [M7UnitMAC]>; + def : WriteRes<WriteMAC32, [M7UnitMAC]>; + def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; } + def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; } +} + +// Divisions. +// These cannot be dual-issued with any instructions. +def : WriteRes<WriteDIV, [M7UnitALU]> { + let Latency = 7; + let SingleIssue = 1; +} + +// Loads/Stores. +def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; } +def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; } +def : WriteRes<WriteST, [M7UnitStore]> { let Latency = 2; } + +// Branches. +def : WriteRes<WriteBr, [M7UnitBranch]> { let Latency = 2; } +def : WriteRes<WriteBrL, [M7UnitBranch]> { let Latency = 2; } +def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; } + +// Noop. 
+def : WriteRes<WriteNoop, []> { let Latency = 0; } + +//===---------------------------------------------------------------------===// +// Sched definitions for floating-point instructions +// +// Floating point conversions. +def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } +def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; } + +// The FP pipeline has a latency of 3 cycles. +// ALU operations (32/64-bit). These go down the FP pipeline. +def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } +def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { + let Latency = 4; + let BeginGroup = 1; +} + +// Multiplication +def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } +def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { + let Latency = 7; + let BeginGroup = 1; +} + +// Multiply-accumulate. FPMAC goes down the FP Pipeline. +def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; } +def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { + let Latency = 11; + let BeginGroup = 1; +} + +// Division. Effective scheduling latency is 3, though real latency is larger +def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; } +def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { + let Latency = 30; + let BeginGroup = 1; +} + +// Square-root. Effective scheduling latency is 3; real latency is larger +def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; } +def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { + let Latency = 30; + let BeginGroup = 1; +} + +def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {} + +// Not used for M7, but needing definitions anyway +def : WriteRes<WriteVLD1, []>; +def : WriteRes<WriteVLD2, []>; +def : WriteRes<WriteVLD3, []>; +def : WriteRes<WriteVLD4, []>; +def : WriteRes<WriteVST1, []>; +def : WriteRes<WriteVST2, []>; +def : WriteRes<WriteVST3, []>; +def : WriteRes<WriteVST4, []>; + +def M7SingleIssue : SchedWriteRes<[]> { + let SingleIssue = 1; + let NumMicroOps = 0; +} +def M7Slot0Only : SchedWriteRes<[]> { + let BeginGroup = 1; + let NumMicroOps = 0; +} + +// What pipeline stage operands need to be ready for depending on +// where they come from. +def : ReadAdvance<ReadALUsr, 0>; +def : ReadAdvance<ReadMUL, 0>; +def : ReadAdvance<ReadMAC, 1>; +def : ReadAdvance<ReadALU, 0>; +def : ReadAdvance<ReadFPMUL, 0>; +def : ReadAdvance<ReadFPMAC, 3>; +def M7Read_ISS : SchedReadAdvance<-1>; // operands needed at EX1 +def M7Read_EX2 : SchedReadAdvance<1>; // operands needed at EX3 +def M7Read_EX3 : SchedReadAdvance<2>; // operands needed at EX4 + +// Non general purpose instructions may not be dual issued. These +// use both issue units. +def M7NonGeneralPurpose : SchedWriteRes<[]> { + // Assume that these will go down the main ALU pipeline. + // In reality, many look likely to stall the whole pipeline. + let Latency = 3; + let SingleIssue = 1; +} + +// List the non general purpose instructions. 
+def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT", + "t2MSR", "t2DMB", "t2DSB", "t2ISB", + "t2HVC", "t2SMC", "t2UDF", "ERET", + "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>; + +//===---------------------------------------------------------------------===// +// Sched definitions for load/store +// +// Mark whether the loads/stores must be single-issue +// Address operands are needed earlier +// Data operands are needed later + +def M7BaseUpdate : SchedWriteRes<[]> { + let Latency = 0; // Update is bypassable out of EX1 + let NumMicroOps = 0; +} +def M7LoadLatency1 : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; +} +def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; } + +// Byte and half-word loads should have greater latency than other loads. +// So should load exclusive. + +def : InstRW<[M7SlowLoad], + (instregex "t2LDR(B|H|SB|SH)pc")>; +def : InstRW<[M7SlowLoad, M7Read_ISS], + (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i", + "tLDR(B|H)i")>; +def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS], + (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>; +def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS], + (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>; + +// Exclusive loads/stores cannot be dual-issued +def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS], + (instregex "t2LDREX$")>; +def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS], + (instregex "t2LDREX(B|H)")>; +def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS], + (instregex "t2STREX(B|H)?$")>; + +// Load/store multiples cannot be dual-issued. Note that default scheduling +// occurs around read/write times of individual registers in the list; read +// time for STM cannot be overridden because it is a variadic source operand. + +def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS], + (instregex "(t|t2)LDM(DB|IA)$")>; +def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS], + (instregex "(t|t2)STM(DB|IA)$")>; +def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS], + (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>; +def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS], + (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>; + +// Load/store doubles cannot be dual-issued. 
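// Aside: a hedged sketch (hypothetical names, not used elsewhere) of the
// zero-micro-op "marker" writes that the InstRW lists below rely on. A
// SchedWriteRes with NumMicroOps = 0 adds no micro-ops or resources by itself;
// appending it to an InstRW either gives an extra def (such as the
// base-register writeback) its own latency, or simply attaches flags such as
// SingleIssue to the instruction.
//
//   def HypoNoDualIssue : SchedWriteRes<[]> {
//     let SingleIssue = 1;   // the whole instruction must issue alone
//     let NumMicroOps = 0;   // contributes no micro-ops of its own
//   }
//   def : InstRW<[WriteLd, HypoNoDualIssue], (instregex "HYPO_LDRD")>;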
+ +def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, + M7Read_EX2, M7Read_EX2, M7Read_ISS], + (instregex "t2STRD_(PRE|POST)")>; +def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS], + (instregex "t2STRDi")>; +def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS], + (instregex "t2LDRD_(PRE|POST)")>; +def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS], + (instregex "t2LDRDi")>; + +// Word load / preload +def : InstRW<[WriteLd], + (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>; +def : InstRW<[WriteLd, M7Read_ISS], + (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>; +def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS], + (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>; +def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS], + (instregex "t2LDR_(POST|PRE)")>; + +// Stores +def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS], + (instregex "t2STR(B|H)?_(POST|PRE)")>; +def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS], + (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>; +def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS], + (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>; + +// TBB/TBH - single-issue only; takes two cycles to issue + +def M7TableLoad : SchedWriteRes<[M7UnitLoad]> { + let NumMicroOps = 2; + let SingleIssue = 1; +} + +def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>; + +// VFP loads and stores + +def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; } +def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> { + let Latency = 2; + let SingleIssue = 1; +} +def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>; +def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> { + let SingleIssue = 1; +} + +def : InstRW<[M7LoadSP, M7Read_ISS], (instregex "VLDR(S|H)$")>; +def : InstRW<[M7LoadDP, M7Read_ISS], (instregex "VLDRD$")>; +def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>; +def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>; + +// Load/store multiples cannot be dual-issued. + +def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS], + (instregex "VLDM(S|D|Q)(DB|IA)$")>; +def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS], + (instregex "VSTM(S|D|Q)(DB|IA)$")>; +def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS], + (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>; +def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS], + (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>; + +//===---------------------------------------------------------------------===// +// Sched definitions for ALU +// + +// Shifted ALU operands are read a cycle early. +def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>; + +def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS], + (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$", + "t2(SUB|CMP|CMNz|TEQ|TST)rs$", + "t2MOVsr(a|l)")>; +def : InstRW<[WriteALUsi, M7Read_ISS], + (instregex "t2MVNs")>; + +// Treat pure shift operations (except for RRX) as if they used the EX1 +// shifter but have timing as if they used the EX2 shifter as they usually +// can choose the EX2 shifter when needed. Will miss a few dual-issue cases, +// but the results prove to be better than trying to get them exact. + +def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>; +def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>; + +// Instructions that use the shifter, but have normal timing. 
+ +def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>; + +// Instructions which are slot zero only but otherwise normal. + +def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>; + +// MAC operations that don't have SchedRW set. + +def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>; + +// Divides are special because they stall for their latency, and so look like a +// single-cycle as far as scheduling opportunities go. By putting WriteALU +// first, we make the operand latency 1, but keep the instruction latency 7. + +def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>; + +// DSP extension operations + +def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> { + let Latency = 1; + let BeginGroup = 1; +} +def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> { + let Latency = 2; + let BeginGroup = 1; +} +def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { + let Latency = 1; + let BeginGroup = 1; +} +def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { + let Latency = 0; // Bypassable out of EX1 + let BeginGroup = 1; +} +def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { + let Latency = 2; + let BeginGroup = 1; +} + +def : InstRW<[M7WriteShSIMD2, M7Read_ISS], + (instregex "t2(S|U)SAT")>; +def : InstRW<[M7WriteSIMD1, ReadALU], + (instregex "(t|t2)(S|U)XT(B|H)")>; +def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU], + (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)", + "t2SEL")>; +def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU], + (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>; +def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS], + (instregex "t2QD(ADD|SUB)")>; +def : InstRW<[M7WriteShSIMD0, M7Read_ISS], + (instregex "t2(RBIT|REV)", "tREV")>; +def : InstRW<[M7WriteShSIMD1, M7Read_ISS], + (instregex "t2(SBFX|UBFX)")>; +def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS], + (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>; +def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2], + (instregex "t2USADA8")>; + +// MSR/MRS +def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>; + +//===---------------------------------------------------------------------===// +// Sched definitions for FP operations +// + +// Effective scheduling latency is really 3 for nearly all FP operations, +// even if their true latency is higher. +def M7WriteVFPLatOverride : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} +def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> { + let Latency = 3; + let NumMicroOps = 0; +} + +// Instructions which are missing default schedules. 
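// Aside: a sketch of how these InstRW overrides work; the opcode regex below
// is hypothetical. Inside the enclosing SchedModel, an InstRW rebinds the
// scheduling information for every opcode matching its regexes: the listed
// writes map to the instruction's defs in order, and any trailing reads map
// to its source operands.
//
//   def : InstRW<[WriteFPALU32, M7Read_ISS], (instregex "^HYPO_VFOO")>;
//   // def 0 gets WriteFPALU32's latency and resources; source 0 is read at EX1.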
+def : InstRW<[WriteFPALU32], + (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64], + (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>; + +// VCMP +def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; } +def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> { + let Latency = 0; + let BeginGroup = 1; +} +def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>; +def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>; + + // VMRS/VMSR +def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; } +def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; } +def : InstRW<[M7VMRS], (instregex "FMSTAT")>; +def : InstRW<[M7VMSR], (instregex "VMSR")>; + +// VSEL cannot bypass in its implied $cpsr operand; model as earlier read +def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS], + (instregex "VSEL.*S$")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only, + ReadALU, ReadALU, M7Read_ISS], + (instregex "VSEL.*D$")>; + +// VMOV +def : InstRW<[WriteFPMOV], + (instregex "VMOV(H|S)$", "FCONST(H|S)")>; +def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], + (instregex "VMOVD$")>; +def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], + (instregex "FCONSTD")>; +def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue], + (instregex "VMOV(DRR|RRD|RRS|SRR)")>; + +// Larger-latency overrides. + +def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64], + (instregex "V(MUL|NMUL)D")>; +def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64], + (instregex "V(ADD|SUB)D")>; + +// Multiply-accumulate. Chained SP timing is correct; rest need overrides +// Double-precision chained MAC stalls the pipeline behind it for 3 cycles, +// making it appear to have 3 cycle latency for scheduling. + +def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, + ReadFPMAC, ReadFPMUL, ReadFPMUL], + (instregex "V(N)?ML(A|S)D$")>; + +// Single-precision fused MACs look like latency 5 with advance of 2. + +def M7WriteVFPLatOverride5 : SchedWriteRes<[]> { + let Latency = 5; + let NumMicroOps = 0; +} +def M7ReadFPMAC2 : SchedReadAdvance<2>; + +def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32, + M7ReadFPMAC2, ReadFPMUL, ReadFPMUL], + (instregex "VF(N)?M(A|S)S$")>; + +// Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making +// it appear to have 3 cycle latency for scheduling. 
+ +def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, + ReadFPMAC, ReadFPMUL, ReadFPMUL], + (instregex "VF(N)?M(A|S)D$")>; + +} // SchedModel = CortexM7Model diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleR52.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleR52.td index 466acec6f7..aabce817a9 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleR52.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleR52.td @@ -787,8 +787,8 @@ def : InstRW<[R52Write2FPALU_F3, R52Read_F2, R52Read_F2], (instregex "(VAND|VBIC def : InstRW<[R52WriteFPALU_F3, R52Read_F2], (instregex "VBICi(v4i16|v2i32)")>; def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>; -def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)d")>; -def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)q")>; +def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)d")>; +def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)q")>; def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleSwift.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleSwift.td index d66b3065c7..ef2bde2a0d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleSwift.td +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleSwift.td @@ -558,8 +558,8 @@ let SchedModel = SwiftModel in { (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL", "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi", "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST", - "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF", "VBIT", - "VBSL", "VBSP", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>; + "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF", "VBIT", + "VBSL", "VBSP", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>; def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VEXT", "VREV16", "VREV32", "VREV64")>; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.cpp index 5cb608b74a..c49135d536 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.cpp @@ -97,9 +97,9 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle, bool MinSize) - : ARMGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), - UseMulOps(UseFusedMulOps), CPUString(CPU), OptMinSize(MinSize), - IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), + : ARMGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), + UseMulOps(UseFusedMulOps), CPUString(CPU), OptMinSize(MinSize), + IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. @@ -185,7 +185,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { else ArchFS = std::string(FS); } - ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, ArchFS); + ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, ArchFS); // FIXME: This used enable V6T2 support implicitly for Thumb2 mode. // Assert this for now to make the change obvious. 
@@ -237,7 +237,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { switch (IT) { case DefaultIT: - RestrictIT = hasV8Ops() && !hasMinSize(); + RestrictIT = hasV8Ops() && !hasMinSize(); break; case RestrictedIT: RestrictIT = true; @@ -294,13 +294,13 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { case CortexA76: case CortexA77: case CortexA78: - case CortexA78C: + case CortexA78C: case CortexR4: case CortexR4F: case CortexR5: case CortexR7: case CortexM3: - case CortexM7: + case CortexM7: case CortexR52: case CortexX1: break; @@ -316,8 +316,8 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { PreISelOperandLatencyAdjustment = 1; break; case NeoverseN1: - case NeoverseN2: - case NeoverseV1: + case NeoverseN2: + case NeoverseV1: break; case Swift: MaxInterleaveFactor = 2; diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.h b/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.h index fd9b94fdaa..a6335c6984 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMSubtarget.h @@ -63,11 +63,11 @@ protected: CortexA76, CortexA77, CortexA78, - CortexA78C, + CortexA78C, CortexA8, CortexA9, CortexM3, - CortexM7, + CortexM7, CortexR4, CortexR4F, CortexR5, @@ -78,8 +78,8 @@ protected: Krait, Kryo, NeoverseN1, - NeoverseN2, - NeoverseV1, + NeoverseN2, + NeoverseV1, Swift }; enum ARMProcClassEnum { @@ -167,7 +167,7 @@ protected: bool HasV8_4aOps = false; bool HasV8_5aOps = false; bool HasV8_6aOps = false; - bool HasV8_7aOps = false; + bool HasV8_7aOps = false; bool HasV8MBaselineOps = false; bool HasV8MMainlineOps = false; bool HasV8_1MMainlineOps = false; @@ -466,13 +466,13 @@ protected: /// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1. bool NegativeImmediates = true; - /// Harden against Straight Line Speculation for Returns and Indirect - /// Branches. - bool HardenSlsRetBr = false; - - /// Harden against Straight Line Speculation for indirect calls. - bool HardenSlsBlr = false; - + /// Harden against Straight Line Speculation for Returns and Indirect + /// Branches. + bool HardenSlsRetBr = false; + + /// Harden against Straight Line Speculation for indirect calls. + bool HardenSlsBlr = false; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. Align stackAlignment = Align(4); @@ -538,7 +538,7 @@ public: /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. 
@@ -606,7 +606,7 @@ public: bool hasV8_4aOps() const { return HasV8_4aOps; } bool hasV8_5aOps() const { return HasV8_5aOps; } bool hasV8_6aOps() const { return HasV8_6aOps; } - bool hasV8_7aOps() const { return HasV8_7aOps; } + bool hasV8_7aOps() const { return HasV8_7aOps; } bool hasV8MBaselineOps() const { return HasV8MBaselineOps; } bool hasV8MMainlineOps() const { return HasV8MMainlineOps; } bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; } @@ -627,7 +627,7 @@ public: bool isCortexA15() const { return ARMProcFamily == CortexA15; } bool isSwift() const { return ARMProcFamily == Swift; } bool isCortexM3() const { return ARMProcFamily == CortexM3; } - bool isCortexM7() const { return ARMProcFamily == CortexM7; } + bool isCortexM7() const { return ARMProcFamily == CortexM7; } bool isLikeA9() const { return isCortexA9() || isCortexA15() || isKrait(); } bool isCortexR5() const { return ARMProcFamily == CortexR5; } bool isKrait() const { return ARMProcFamily == Krait; } @@ -915,9 +915,9 @@ public: bool ignoreCSRForAllocationOrder(const MachineFunction &MF, unsigned PhysReg) const override; unsigned getGPRAllocationOrder(const MachineFunction &MF) const; - - bool hardenSlsRetBr() const { return HardenSlsRetBr; } - bool hardenSlsBlr() const { return HardenSlsBlr; } + + bool hardenSlsRetBr() const { return HardenSlsRetBr; } + bool hardenSlsBlr() const { return HardenSlsBlr; } }; } // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.cpp index 237ef54c83..c4841aabdf 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.cpp @@ -99,9 +99,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { initializeMVEVPTOptimisationsPass(Registry); initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); - initializeARMBlockPlacementPass(Registry); + initializeARMBlockPlacementPass(Registry); initializeMVEGatherScatterLoweringPass(Registry); - initializeARMSLSHardeningPass(Registry); + initializeARMSLSHardeningPass(Registry); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -253,7 +253,7 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, // ARM supports the MachineOutliner. setMachineOutliner(true); - setSupportsDefaultOutlining(true); + setSupportsDefaultOutlining(true); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; @@ -263,10 +263,10 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); Attribute FSAttr = F.getFnAttribute("target-features"); - std::string CPU = - CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; - std::string FS = - FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; + std::string CPU = + CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; + std::string FS = + FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; // FIXME: This is related to the code below to reset the target options, // we need to know whether or not the soft float flag is set on the @@ -409,8 +409,8 @@ void ARMPassConfig::addIRPasses() { // ldrex/strex loops to simplify this, but it needs tidying up. 
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass( - SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true), - [this](const Function &F) { + SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true), + [this](const Function &F) { const auto &ST = this->TM->getSubtarget<ARMSubtarget>(F); return ST.hasAnyDataBarrier() && !ST.isThumb1Only(); })); @@ -472,7 +472,7 @@ bool ARMPassConfig::addInstSelector() { } bool ARMPassConfig::addIRTranslator() { - addPass(new IRTranslator(getOptLevel())); + addPass(new IRTranslator(getOptLevel())); return false; } @@ -540,9 +540,9 @@ void ARMPassConfig::addPreSched2() { addPass(&PostMachineSchedulerID); addPass(&PostRASchedulerID); } - - addPass(createARMIndirectThunks()); - addPass(createARMSLSHardeningPass()); + + addPass(createARMIndirectThunks()); + addPass(createARMSLSHardeningPass()); } void ARMPassConfig::addPreEmitPass() { @@ -553,11 +553,11 @@ void ARMPassConfig::addPreEmitPass() { return MF.getSubtarget<ARMSubtarget>().isThumb2(); })); - // Don't optimize barriers or block placement at -O0. - if (getOptLevel() != CodeGenOpt::None) { - addPass(createARMBlockPlacementPass()); + // Don't optimize barriers or block placement at -O0. + if (getOptLevel() != CodeGenOpt::None) { + addPass(createARMBlockPlacementPass()); addPass(createARMOptimizeBarriersPass()); - } + } } void ARMPassConfig::addPreEmitPass2() { diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.h b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.h index 8428092bf1..d9f5d40eb1 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetMachine.h @@ -72,12 +72,12 @@ public: } bool targetSchedulesPostRAScheduling() const override { return true; }; - - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } + + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } }; /// ARM/Thumb little endian target machine. 
diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp index 8901934013..e4e4252041 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,18 +20,18 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/KnownBits.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/InstCombine/InstCombiner.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <cassert> @@ -50,38 +50,38 @@ static cl::opt<bool> DisableLowOverheadLoops( "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); -static cl::opt<bool> - AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), - cl::desc("Enable the generation of WLS loops")); - +static cl::opt<bool> + AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), + cl::desc("Enable the generation of WLS loops")); + extern cl::opt<TailPredication::Mode> EnableTailPredication; extern cl::opt<bool> EnableMaskedGatherScatters; -extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; - -/// Convert a vector load intrinsic into a simple llvm load instruction. -/// This is beneficial when the underlying object being addressed comes -/// from a constant, since we get constant-folding for free. -static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, - InstCombiner::BuilderTy &Builder) { - auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); - - if (!IntrAlign) - return nullptr; - - unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign - ? MemAlign - : IntrAlign->getLimitedValue(); - - if (!isPowerOf2_32(Alignment)) - return nullptr; - - auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), - PointerType::get(II.getType(), 0)); - return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); -} - +extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor; + +/// Convert a vector load intrinsic into a simple llvm load instruction. +/// This is beneficial when the underlying object being addressed comes +/// from a constant, since we get constant-folding for free. +static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, + InstCombiner::BuilderTy &Builder) { + auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1)); + + if (!IntrAlign) + return nullptr; + + unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign + ? 
MemAlign + : IntrAlign->getLimitedValue(); + + if (!isPowerOf2_32(Alignment)) + return nullptr; + + auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), + PointerType::get(II.getType(), 0)); + return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); +} + bool ARMTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -114,138 +114,138 @@ bool ARMTTIImpl::shouldFavorPostInc() const { return false; } -Optional<Instruction *> -ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { - using namespace PatternMatch; - Intrinsic::ID IID = II.getIntrinsicID(); - switch (IID) { - default: - break; - case Intrinsic::arm_neon_vld1: { - Align MemAlign = - getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()); - if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { - return IC.replaceInstUsesWith(II, V); - } - break; - } - - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: { - Align MemAlign = - getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, - &IC.getAssumptionCache(), &IC.getDominatorTree()); - unsigned AlignArg = II.getNumArgOperands() - 1; - Value *AlignArgOp = II.getArgOperand(AlignArg); - MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); - if (Align && *Align < MemAlign) { - return IC.replaceOperand( - II, AlignArg, - ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), - false)); - } - break; - } - - case Intrinsic::arm_mve_pred_i2v: { - Value *Arg = II.getArgOperand(0); - Value *ArgArg; - if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( - PatternMatch::m_Value(ArgArg))) && - II.getType() == ArgArg->getType()) { - return IC.replaceInstUsesWith(II, ArgArg); - } - Constant *XorMask; - if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( - PatternMatch::m_Value(ArgArg)), - PatternMatch::m_Constant(XorMask))) && - II.getType() == ArgArg->getType()) { - if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { - if (CI->getValue().trunc(16).isAllOnesValue()) { - auto TrueVector = IC.Builder.CreateVectorSplat( - cast<FixedVectorType>(II.getType())->getNumElements(), - IC.Builder.getTrue()); - return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); - } - } - } - KnownBits ScalarKnown(32); - if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), - ScalarKnown, 0)) { - return &II; - } - break; - } - case Intrinsic::arm_mve_pred_v2i: { - Value *Arg = II.getArgOperand(0); - Value *ArgArg; - if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( - PatternMatch::m_Value(ArgArg)))) { - return IC.replaceInstUsesWith(II, ArgArg); - } - if (!II.getMetadata(LLVMContext::MD_range)) { - Type *IntTy32 = Type::getInt32Ty(II.getContext()); - Metadata *M[] = { - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))}; - II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); - return &II; - } - break; - } - 
case Intrinsic::arm_mve_vadc: - case Intrinsic::arm_mve_vadc_predicated: { - unsigned CarryOp = - (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; - assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && - "Bad type for intrinsic!"); - - KnownBits CarryKnown(32); - if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), - CarryKnown)) { - return &II; - } - break; - } - case Intrinsic::arm_mve_vmldava: { - Instruction *I = cast<Instruction>(&II); - if (I->hasOneUse()) { - auto *User = cast<Instruction>(*I->user_begin()); - Value *OpZ; - if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && - match(I->getOperand(3), m_Zero())) { - Value *OpX = I->getOperand(4); - Value *OpY = I->getOperand(5); - Type *OpTy = OpX->getType(); - - IC.Builder.SetInsertPoint(User); - Value *V = - IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, - {I->getOperand(0), I->getOperand(1), - I->getOperand(2), OpZ, OpX, OpY}); - - IC.replaceInstUsesWith(*User, V); - return IC.eraseInstFromFunction(*User); - } - } - return None; - } - } - return None; -} - +Optional<Instruction *> +ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + using namespace PatternMatch; + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::arm_neon_vld1: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + } + + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + unsigned AlignArg = II.getNumArgOperands() - 1; + Value *AlignArgOp = II.getArgOperand(AlignArg); + MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue(); + if (Align && *Align < MemAlign) { + return IC.replaceOperand( + II, AlignArg, + ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), + false)); + } + break; + } + + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( + PatternMatch::m_Value(ArgArg))) && + II.getType() == ArgArg->getType()) { + return IC.replaceInstUsesWith(II, ArgArg); + } + Constant *XorMask; + if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>( + PatternMatch::m_Value(ArgArg)), + PatternMatch::m_Constant(XorMask))) && + II.getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = IC.Builder.CreateVectorSplat( + cast<FixedVectorType>(II.getType())->getNumElements(), + IC.Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) { + return &II; + } + break; + } + 
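The arm_mve_vmldava fold above relies on a simple algebraic fact; this scalar sketch (illustrative only, not the full MVE semantics) makes it explicit: a multiply-accumulate that starts from a zero accumulator and whose only use is an add of some value Z can absorb Z as its initial accumulator.

#include <cstddef>
#include <cstdint>
#include <vector>

int32_t mladavaSketch(int32_t acc, const std::vector<int32_t> &x,
                      const std::vector<int32_t> &y) {
  for (std::size_t i = 0; i < x.size() && i < y.size(); ++i)
    acc += x[i] * y[i];   // dot-product style accumulate
  return acc;             // hence  z + mladavaSketch(0, x, y) == mladavaSketch(z, x, y)
}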
case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>( + PatternMatch::m_Value(ArgArg)))) { + return IC.replaceInstUsesWith(II, ArgArg); + } + if (!II.getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))}; + II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); + return &II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; + assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) { + return &II; + } + break; + } + case Intrinsic::arm_mve_vmldava: { + Instruction *I = cast<Instruction>(&II); + if (I->hasOneUse()) { + auto *User = cast<Instruction>(*I->user_begin()); + Value *OpZ; + if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) && + match(I->getOperand(3), m_Zero())) { + Value *OpX = I->getOperand(4); + Value *OpY = I->getOperand(5); + Type *OpTy = OpX->getType(); + + IC.Builder.SetInsertPoint(User); + Value *V = + IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy}, + {I->getOperand(0), I->getOperand(1), + I->getOperand(2), OpZ, OpX, OpY}); + + IC.replaceInstUsesWith(*User, V); + return IC.eraseInstFromFunction(*User); + } + } + return None; + } + } + return None; +} + int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); @@ -289,43 +289,43 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, return 1; } -// Checks whether Inst is part of a min(max()) or max(min()) pattern -// that will match to an SSAT instruction -static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { - Value *LHS, *RHS; - ConstantInt *C; - SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; - - if (InstSPF == SPF_SMAX && - PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && - C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) { - - auto isSSatMin = [&](Value *MinInst) { - if (isa<SelectInst>(MinInst)) { - Value *MinLHS, *MinRHS; - ConstantInt *MinC; - SelectPatternFlavor MinSPF = - matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; - if (MinSPF == SPF_SMIN && - PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && - MinC->getValue() == ((-Imm) - 1)) - return true; - } - return false; - }; - - if (isSSatMin(Inst->getOperand(1)) || - (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || - isSSatMin(*(++Inst->user_begin()))))) - return true; - } - return false; -} - -int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind, - Instruction *Inst) { +// Checks whether Inst is part of a min(max()) or max(min()) pattern +// that will match to an SSAT instruction +static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { + Value *LHS, *RHS; + ConstantInt *C; + SelectPatternFlavor InstSPF = matchSelectPattern(Inst, LHS, RHS).Flavor; + + if (InstSPF == SPF_SMAX && + PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) && + C->getValue() == Imm && Imm.isNegative() && 
(-Imm).isPowerOf2()) { + + auto isSSatMin = [&](Value *MinInst) { + if (isa<SelectInst>(MinInst)) { + Value *MinLHS, *MinRHS; + ConstantInt *MinC; + SelectPatternFlavor MinSPF = + matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor; + if (MinSPF == SPF_SMIN && + PatternMatch::match(MinRHS, PatternMatch::m_ConstantInt(MinC)) && + MinC->getValue() == ((-Imm) - 1)) + return true; + } + return false; + }; + + if (isSSatMin(Inst->getOperand(1)) || + (Inst->hasNUses(2) && (isSSatMin(*Inst->user_begin()) || + isSSatMin(*(++Inst->user_begin()))))) + return true; + } + return false; +} + +int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind, + Instruction *Inst) { // Division by a constant can be turned into multiplication, but only if we // know it's constant. So it's not so much that the immediate is cheap (it's // not), but that the alternative is worse. @@ -364,33 +364,33 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Opcode == Instruction::Xor && Imm.isAllOnesValue()) return 0; - // Ensures negative constant of min(max()) or max(min()) patterns that - // match to SSAT instructions don't get hoisted - if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && - Ty->getIntegerBitWidth() <= 32) { - if (isSSATMinMaxPattern(Inst, Imm) || - (isa<ICmpInst>(Inst) && Inst->hasOneUse() && - isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) - return 0; - } - + // Ensures negative constant of min(max()) or max(min()) patterns that + // match to SSAT instructions don't get hoisted + if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) && + Ty->getIntegerBitWidth() <= 32) { + if (isSSATMinMaxPattern(Inst, Imm) || + (isa<ICmpInst>(Inst) && Inst->hasOneUse() && + isSSATMinMaxPattern(cast<Instruction>(*Inst->user_begin()), Imm))) + return 0; + } + return getIntImmCost(Imm, Ty, CostKind); } -int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { - if (CostKind == TTI::TCK_RecipThroughput && - (ST->hasNEON() || ST->hasMVEIntegerOps())) { - // FIXME: The vectorizer is highly sensistive to the cost of these - // instructions, which suggests that it may be using the costs incorrectly. - // But, for now, just make them free to avoid performance regressions for - // vector targets. - return 0; - } - return BaseT::getCFInstrCost(Opcode, CostKind); -} - +int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { + if (CostKind == TTI::TCK_RecipThroughput && + (ST->hasNEON() || ST->hasMVEIntegerOps())) { + // FIXME: The vectorizer is highly sensistive to the cost of these + // instructions, which suggests that it may be using the costs incorrectly. + // But, for now, just make them free to avoid performance regressions for + // vector targets. + return 0; + } + return BaseT::getCFInstrCost(Opcode, CostKind); +} + int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -402,35 +402,35 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Cost == 0 ? 
0 : 1; return Cost; }; - auto IsLegalFPType = [this](EVT VT) { - EVT EltVT = VT.getScalarType(); - return (EltVT == MVT::f32 && ST->hasVFP2Base()) || - (EltVT == MVT::f64 && ST->hasFP64()) || - (EltVT == MVT::f16 && ST->hasFullFP16()); - }; + auto IsLegalFPType = [this](EVT VT) { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 && ST->hasVFP2Base()) || + (EltVT == MVT::f64 && ST->hasFP64()) || + (EltVT == MVT::f16 && ST->hasFullFP16()); + }; EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost( - BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); - - // Extending masked load/Truncating masked stores is expensive because we - // currently don't split them. This means that we'll likely end up - // loading/storing each element individually (hence the high cost). - if ((ST->hasMVEIntegerOps() && - (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || - Opcode == Instruction::SExt)) || - (ST->hasMVEFloatOps() && - (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && - IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) - if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) - return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); - - // The extend of other kinds of load is free - if (CCH == TTI::CastContextHint::Normal || - CCH == TTI::CastContextHint::Masked) { + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + + // Extending masked load/Truncating masked stores is expensive because we + // currently don't split them. This means that we'll likely end up + // loading/storing each element individually (hence the high cost). + if ((ST->hasMVEIntegerOps() && + (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt || + Opcode == Instruction::SExt)) || + (ST->hasMVEFloatOps() && + (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) && + IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))) + if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128) + return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor(); + + // The extend of other kinds of load is free + if (CCH == TTI::CastContextHint::Normal || + CCH == TTI::CastContextHint::Masked) { static const TypeConversionCostTblEntry LoadConversionTbl[] = { {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0}, {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0}, @@ -485,31 +485,31 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } - // The truncate of a store is free. This is the mirror of extends above. - static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { + // The truncate of a store is free. This is the mirror of extends above. 
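As a plain-C++ reference for what isSSATMinMaxPattern above recognises (a hedged sketch, not the matcher itself): a signed clamp to [-2^(k-1), 2^(k-1)-1] written as max(min(x, hi), lo). A single SSAT instruction implements this clamp with the bounds encoded in the instruction, which is why the negative constant should not be hoisted into a register.

#include <algorithm>
#include <cstdint>

int32_t ssatSketch(int32_t x, unsigned k) {  // saturate to k bits, 1 <= k <= 32
  const int32_t hi = static_cast<int32_t>((1u << (k - 1)) - 1);
  const int32_t lo = -hi - 1;
  return std::max(std::min(x, hi), lo);      // the min/max pair SSAT replaces
}

For instance, ssatSketch(x, 8) clamps to [-128, 127], exactly the range an ssat with a #8 saturation width provides.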
+ static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = { {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, - {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, + {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1}, {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, }; if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { if (const auto *Entry = - ConvertCostTableLookup(MVEStoreConversionTbl, ISD, - SrcTy.getSimpleVT(), DstTy.getSimpleVT())) + ConvertCostTableLookup(MVEStoreConversionTbl, ISD, + SrcTy.getSimpleVT(), DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } - static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { + static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = { {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, }; if (SrcTy.isVector() && ST->hasMVEFloatOps()) { if (const auto *Entry = - ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, - SrcTy.getSimpleVT(), DstTy.getSimpleVT())) + ConvertCostTableLookup(MVEFStoreConversionTbl, ISD, + SrcTy.getSimpleVT(), DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } } @@ -746,24 +746,24 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (SrcTy.isFixedLengthVector()) Lanes = SrcTy.getVectorNumElements(); - if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) + if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)) return Lanes; else return Lanes * CallCost; } - if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && - SrcTy.isFixedLengthVector()) { - // Treat a truncate with larger than legal source (128bits for MVE) as - // expensive, 2 instructions per lane. - if ((SrcTy.getScalarType() == MVT::i8 || - SrcTy.getScalarType() == MVT::i16 || - SrcTy.getScalarType() == MVT::i32) && - SrcTy.getSizeInBits() > 128 && - SrcTy.getSizeInBits() > DstTy.getSizeInBits()) - return SrcTy.getVectorNumElements() * 2; - } - + if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() && + SrcTy.isFixedLengthVector()) { + // Treat a truncate with larger than legal source (128bits for MVE) as + // expensive, 2 instructions per lane. + if ((SrcTy.getScalarType() == MVT::i8 || + SrcTy.getScalarType() == MVT::i16 || + SrcTy.getScalarType() == MVT::i32) && + SrcTy.getSizeInBits() > 128 && + SrcTy.getSizeInBits() > DstTy.getSizeInBits()) + return SrcTy.getVectorNumElements() * 2; + } + // Scalar integer conversion costs. static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. @@ -787,7 +787,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, ? ST->getMVEVectorCostFactor() : 1; return AdjustCost( - BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, @@ -827,37 +827,37 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, } int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { - int ISD = TLI->InstructionOpcodeToISD(Opcode); - - // Thumb scalar code size cost for select. 
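The conversion-cost tables above all follow the same table-driven shape; the sketch below shows that lookup pattern in isolation (the struct and field names are illustrative stand-ins, not the LLVM cost-table types).

#include <cstddef>
#include <optional>

struct ConversionCostEntry {
  int Opcode;      // e.g. a TRUNCATE or SIGN_EXTEND tag
  int DstType;     // destination value type tag
  int SrcType;     // source value type tag
  unsigned Cost;   // 0 means "folds into the load/store for free"
};

template <std::size_t N>
std::optional<unsigned>
lookupConversionCost(const ConversionCostEntry (&table)[N], int opcode,
                     int dstType, int srcType) {
  for (const auto &entry : table)
    if (entry.Opcode == opcode && entry.DstType == dstType &&
        entry.SrcType == srcType)
      return entry.Cost;
  return std::nullopt;  // caller falls back to the generic cost model
}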
- if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && - ST->isThumb() && !ValTy->isVectorTy()) { - // Assume expensive structs. - if (TLI->getValueType(DL, ValTy, true) == MVT::Other) - return TTI::TCC_Expensive; - - // Select costs can vary because they: - // - may require one or more conditional mov (including an IT), - // - can't operate directly on immediates, - // - require live flags, which we can't copy around easily. - int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; - - // Possible IT instruction for Thumb2, or more for Thumb1. - ++Cost; - - // i1 values may need rematerialising by using mov immediates and/or - // flag setting instructions. - if (ValTy->isIntegerTy(1)) - ++Cost; - - return Cost; - } - + int ISD = TLI->InstructionOpcodeToISD(Opcode); + + // Thumb scalar code size cost for select. + if (CostKind == TTI::TCK_CodeSize && ISD == ISD::SELECT && + ST->isThumb() && !ValTy->isVectorTy()) { + // Assume expensive structs. + if (TLI->getValueType(DL, ValTy, true) == MVT::Other) + return TTI::TCC_Expensive; + + // Select costs can vary because they: + // - may require one or more conditional mov (including an IT), + // - can't operate directly on immediates, + // - require live flags, which we can't copy around easily. + int Cost = TLI->getTypeLegalizationCost(DL, ValTy).first; + + // Possible IT instruction for Thumb2, or more for Thumb1. + ++Cost; + + // i1 values may need rematerialising by using mov immediates and/or + // flag setting instructions. + if (ValTy->isIntegerTy(1)) + ++Cost; + + return Cost; + } + // On NEON a vector select gets lowered to vbsl. - if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) { + if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) { // Lowering of some vector selects is currently far from perfect. static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, @@ -878,15 +878,15 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, return LT.first; } - // Default to cheap (throughput/size of 1 instruction) but adjust throughput - // for "multiple beats" potentially needed by MVE instructions. - int BaseCost = 1; - if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && - ValTy->isVectorTy()) - BaseCost = ST->getMVEVectorCostFactor(); - - return BaseCost * - BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); + // Default to cheap (throughput/size of 1 instruction) but adjust throughput + // for "multiple beats" potentially needed by MVE instructions. + int BaseCost = 1; + if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && + ValTy->isVectorTy()) + BaseCost = ST->getMVEVectorCostFactor(); + + return BaseCost * + BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -968,85 +968,85 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); } -/// Given a memcpy/memset/memmove instruction, return the number of memory -/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a -/// call is used. 
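The NEON select costs above model a vbsl-style lowering. As a per-lane illustration (a sketch assuming 32-bit lanes, not the actual codegen), a vector select under a mask is a pair of ANDs and an OR per vector register:

#include <array>
#include <cstddef>
#include <cstdint>

std::array<uint32_t, 4> bitwiseSelect(const std::array<uint32_t, 4> &mask,
                                      const std::array<uint32_t, 4> &ifTrue,
                                      const std::array<uint32_t, 4> &ifFalse) {
  std::array<uint32_t, 4> result{};
  for (std::size_t i = 0; i < result.size(); ++i)
    result[i] = (ifTrue[i] & mask[i]) | (ifFalse[i] & ~mask[i]);  // vbsl per lane
  return result;
}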
-int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { - MemOp MOp; - unsigned DstAddrSpace = ~0u; - unsigned SrcAddrSpace = ~0u; - const Function *F = I->getParent()->getParent(); - - if (const auto *MC = dyn_cast<MemTransferInst>(I)) { - ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); - // If 'size' is not a constant, a library call will be generated. - if (!C) - return -1; - - const unsigned Size = C->getValue().getZExtValue(); - const Align DstAlign = *MC->getDestAlign(); - const Align SrcAlign = *MC->getSourceAlign(); - - MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, - /*IsVolatile*/ false); - DstAddrSpace = MC->getDestAddressSpace(); - SrcAddrSpace = MC->getSourceAddressSpace(); - } - else if (const auto *MS = dyn_cast<MemSetInst>(I)) { - ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); - // If 'size' is not a constant, a library call will be generated. - if (!C) - return -1; - - const unsigned Size = C->getValue().getZExtValue(); - const Align DstAlign = *MS->getDestAlign(); - - MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, - /*IsZeroMemset*/ false, /*IsVolatile*/ false); - DstAddrSpace = MS->getDestAddressSpace(); - } - else - llvm_unreachable("Expected a memcpy/move or memset!"); - - unsigned Limit, Factor = 2; - switch(I->getIntrinsicID()) { - case Intrinsic::memcpy: - Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); - break; - case Intrinsic::memmove: - Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); - break; - case Intrinsic::memset: - Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); - Factor = 1; - break; - default: - llvm_unreachable("Expected a memcpy/move or memset!"); - } - +/// Given a memcpy/memset/memmove instruction, return the number of memory +/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a +/// call is used. +int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { + MemOp MOp; + unsigned DstAddrSpace = ~0u; + unsigned SrcAddrSpace = ~0u; + const Function *F = I->getParent()->getParent(); + + if (const auto *MC = dyn_cast<MemTransferInst>(I)) { + ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength()); + // If 'size' is not a constant, a library call will be generated. + if (!C) + return -1; + + const unsigned Size = C->getValue().getZExtValue(); + const Align DstAlign = *MC->getDestAlign(); + const Align SrcAlign = *MC->getSourceAlign(); + + MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, + /*IsVolatile*/ false); + DstAddrSpace = MC->getDestAddressSpace(); + SrcAddrSpace = MC->getSourceAddressSpace(); + } + else if (const auto *MS = dyn_cast<MemSetInst>(I)) { + ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength()); + // If 'size' is not a constant, a library call will be generated. 
+ if (!C) + return -1; + + const unsigned Size = C->getValue().getZExtValue(); + const Align DstAlign = *MS->getDestAlign(); + + MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign, + /*IsZeroMemset*/ false, /*IsVolatile*/ false); + DstAddrSpace = MS->getDestAddressSpace(); + } + else + llvm_unreachable("Expected a memcpy/move or memset!"); + + unsigned Limit, Factor = 2; + switch(I->getIntrinsicID()) { + case Intrinsic::memcpy: + Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize()); + break; + case Intrinsic::memmove: + Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); + break; + case Intrinsic::memset: + Limit = TLI->getMaxStoresPerMemset(F->hasMinSize()); + Factor = 1; + break; + default: + llvm_unreachable("Expected a memcpy/move or memset!"); + } + // MemOps will be poplulated with a list of data types that needs to be // loaded and stored. That's why we multiply the number of elements by 2 to // get the cost for this memcpy. - std::vector<EVT> MemOps; + std::vector<EVT> MemOps; if (getTLI()->findOptimalMemOpLowering( - MemOps, Limit, MOp, DstAddrSpace, - SrcAddrSpace, F->getAttributes())) - return MemOps.size() * Factor; + MemOps, Limit, MOp, DstAddrSpace, + SrcAddrSpace, F->getAttributes())) + return MemOps.size() * Factor; // If we can't find an optimal memop lowering, return the default cost - return -1; -} - -int ARMTTIImpl::getMemcpyCost(const Instruction *I) { - int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); - - // To model the cost of a library call, we assume 1 for the call, and - // 3 for the argument setup. - if (NumOps == -1) - return 4; - return NumOps; + return -1; } +int ARMTTIImpl::getMemcpyCost(const Instruction *I) { + int NumOps = getNumMemOps(cast<IntrinsicInst>(I)); + + // To model the cost of a library call, we assume 1 for the call, and + // 3 for the argument setup. + if (NumOps == -1) + return 4; + return NumOps; +} + int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp) { if (ST->hasNEON()) { @@ -1149,21 +1149,21 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { - int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { - // Make operations on i1 relatively expensive as this often involves - // combining predicates. AND and XOR should be easier to handle with IT - // blocks. - switch (ISDOpcode) { - default: - break; - case ISD::AND: - case ISD::XOR: - return 2; - case ISD::OR: - return 3; - } - } + int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); + if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) { + // Make operations on i1 relatively expensive as this often involves + // combining predicates. AND and XOR should be easier to handle with IT + // blocks. + switch (ISDOpcode) { + default: + break; + case ISD::AND: + case ISD::XOR: + return 2; + case ISD::OR: + return 3; + } + } std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -1259,12 +1259,12 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, if (LooksLikeAFreeShift()) return 0; - // Default to cheap (throughput/size of 1 instruction) but adjust throughput - // for "multiple beats" potentially needed by MVE instructions. 
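The getNumMemOps/getMemcpyCost pair above can be summarised by a much simpler model; the sketch below is that summary under stated assumptions (fixed chunk size, no alignment handling), not the findOptimalMemOpLowering query: a constant-size copy that fits within the store limit costs a load and a store per chunk, anything else is priced as a library call.

#include <cstdint>
#include <optional>

int memcpyCostSketch(std::optional<uint64_t> constantSize, unsigned storeLimit,
                     unsigned chunkBytes = 4) {
  if (!constantSize)
    return 4;  // non-constant size: 1 for the call + 3 for argument setup
  const uint64_t chunks = (*constantSize + chunkBytes - 1) / chunkBytes;
  if (chunks > storeLimit)
    return 4;  // too large to expand inline, also modelled as a call
  return static_cast<int>(chunks) * 2;  // one load and one store per chunk
}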
- int BaseCost = 1; - if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && - Ty->isVectorTy()) - BaseCost = ST->getMVEVectorCostFactor(); + // Default to cheap (throughput/size of 1 instruction) but adjust throughput + // for "multiple beats" potentially needed by MVE instructions. + int BaseCost = 1; + if (CostKind != TTI::TCK_CodeSize && ST->hasMVEIntegerOps() && + Ty->isVectorTy()) + BaseCost = ST->getMVEVectorCostFactor(); // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost, // without treating floats as more expensive that scalars or increasing the @@ -1331,24 +1331,24 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, CostKind, I); } -unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind) { - if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) - return ST->getMVEVectorCostFactor(); - if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) - return ST->getMVEVectorCostFactor(); - } - if (!isa<FixedVectorType>(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); - // Scalar cost, which is currently very high due to the efficiency of the - // generated code. - return cast<FixedVectorType>(Src)->getNumElements() * 8; -} - +unsigned ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, + Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind) { + if (ST->hasMVEIntegerOps()) { + if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) + return ST->getMVEVectorCostFactor(); + if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) + return ST->getMVEVectorCostFactor(); + } + if (!isa<FixedVectorType>(Src)) + return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, + CostKind); + // Scalar cost, which is currently very high due to the efficiency of the + // generated code. + return cast<FixedVectorType>(Src)->getNumElements() * 8; +} + int ARMTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -1379,8 +1379,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost( // promoted differently). The cost of 2 here is then a load and vrev or // vmovn. if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 && - VecTy->isIntOrIntVectorTy() && - DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64) + VecTy->isIntOrIntVectorTy() && + DL.getTypeSizeInBits(SubVecTy).getFixedSize() <= 64) return 2 * BaseCost; } @@ -1413,13 +1413,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, // multiplied by the number of elements being loaded. This is possibly very // conservative, but even so we still end up vectorising loops because the // cost per iteration for many loops is lower than for scalar loops. - unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor(); + unsigned VectorCost = NumElems * LT.first * ST->getMVEVectorCostFactor(); // The scalarization cost should be a lot higher. We use the number of vector // elements plus the scalarization overhead. 
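The scalar fallback being priced above corresponds to expanding a masked or gathering access element by element. The following is an illustrative scalarised masked load (plain C++, not the generated IR) showing why that fallback is charged so heavily: every lane turns into its own test, branch and scalar load.

#include <cstddef>
#include <vector>

std::vector<int> scalarisedMaskedLoad(const int *base,
                                      const std::vector<bool> &mask,
                                      int passthru) {
  std::vector<int> out(mask.size(), passthru);
  for (std::size_t i = 0; i < mask.size(); ++i)
    if (mask[i])        // per-lane compare + branch + load
      out[i] = base[i];
  return out;
}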
unsigned ScalarCost = NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {}); - if (EltSize < 8 || Alignment < EltSize / 8) + if (EltSize < 8 || Alignment < EltSize / 8) return ScalarCost; unsigned ExtSize = EltSize; @@ -1488,92 +1488,92 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, return ScalarCost; } -int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind) { - EVT ValVT = TLI->getValueType(DL, ValTy); - int ISD = TLI->InstructionOpcodeToISD(Opcode); - if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); - - static const CostTblEntry CostTblAdd[]{ - {ISD::ADD, MVT::v16i8, 1}, - {ISD::ADD, MVT::v8i16, 1}, - {ISD::ADD, MVT::v4i32, 1}, - }; - if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) - return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first; - - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); -} - -InstructionCost -ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, - Type *ResTy, VectorType *ValTy, - TTI::TargetCostKind CostKind) { - EVT ValVT = TLI->getValueType(DL, ValTy); - EVT ResVT = TLI->getValueType(DL, ResTy); - if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); - if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) || - (LT.second == MVT::v8i16 && - ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) || - (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64)) - return ST->getMVEVectorCostFactor() * LT.first; - } - - return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, - CostKind); -} - -int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind) { - switch (ICA.getID()) { - case Intrinsic::get_active_lane_mask: - // Currently we make a somewhat optimistic assumption that - // active_lane_mask's are always free. In reality it may be freely folded - // into a tail predicated loop, expanded into a VCPT or expanded into a lot - // of add/icmp code. We may need to improve this in the future, but being - // able to detect if it is free or not involves looking at a lot of other - // code. We currently assume that the vectorizer inserted these, and knew - // what it was doing in adding one. - if (ST->hasMVEIntegerOps()) - return 0; - break; - case Intrinsic::sadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::uadd_sat: - case Intrinsic::usub_sat: { - if (!ST->hasMVEIntegerOps()) - break; - // Get the Return type, either directly of from ICA.ReturnType and ICA.VF. - Type *VT = ICA.getReturnType(); - if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar()) - VT = VectorType::get(VT, ICA.getVectorFactor()); - - std::pair<int, MVT> LT = - TLI->getTypeLegalizationCost(DL, VT); - if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || - LT.second == MVT::v16i8) { - // This is a base cost of 1 for the vadd, plus 3 extract shifts if we - // need to extend the type, as it uses shr(qadd(shl, shl)). - unsigned Instrs = LT.second.getScalarSizeInBits() == - ICA.getReturnType()->getScalarSizeInBits() - ? 
1 - : 4; - return LT.first * ST->getMVEVectorCostFactor() * Instrs; - } - break; - } - } - - return BaseT::getIntrinsicInstrCost(ICA, CostKind); -} - +int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind) { + EVT ValVT = TLI->getValueType(DL, ValTy); + int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + static const CostTblEntry CostTblAdd[]{ + {ISD::ADD, MVT::v16i8, 1}, + {ISD::ADD, MVT::v8i16, 1}, + {ISD::ADD, MVT::v4i32, 1}, + }; + if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) + return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first; + + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, + CostKind); +} + +InstructionCost +ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, + Type *ResTy, VectorType *ValTy, + TTI::TargetCostKind CostKind) { + EVT ValVT = TLI->getValueType(DL, ValTy); + EVT ResVT = TLI->getValueType(DL, ResTy); + if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) || + (LT.second == MVT::v8i16 && + ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) || + (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64)) + return ST->getMVEVectorCostFactor() * LT.first; + } + + return BaseT::getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, ValTy, + CostKind); +} + +int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { + switch (ICA.getID()) { + case Intrinsic::get_active_lane_mask: + // Currently we make a somewhat optimistic assumption that + // active_lane_mask's are always free. In reality it may be freely folded + // into a tail predicated loop, expanded into a VCPT or expanded into a lot + // of add/icmp code. We may need to improve this in the future, but being + // able to detect if it is free or not involves looking at a lot of other + // code. We currently assume that the vectorizer inserted these, and knew + // what it was doing in adding one. + if (ST->hasMVEIntegerOps()) + return 0; + break; + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: { + if (!ST->hasMVEIntegerOps()) + break; + // Get the Return type, either directly of from ICA.ReturnType and ICA.VF. + Type *VT = ICA.getReturnType(); + if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar()) + VT = VectorType::get(VT, ICA.getVectorFactor()); + + std::pair<int, MVT> LT = + TLI->getTypeLegalizationCost(DL, VT); + if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 || + LT.second == MVT::v16i8) { + // This is a base cost of 1 for the vadd, plus 3 extract shifts if we + // need to extend the type, as it uses shr(qadd(shl, shl)). + unsigned Instrs = LT.second.getScalarSizeInBits() == + ICA.getReturnType()->getScalarSizeInBits() + ? 
1 + : 4; + return LT.first * ST->getMVEVectorCostFactor() * Instrs; + } + break; + } + } + + return BaseT::getIntrinsicInstrCost(ICA, CostKind); +} + bool ARMTTIImpl::isLoweredToCall(const Function *F) { if (!F->isIntrinsic()) BaseT::isLoweredToCall(F); @@ -1635,93 +1635,93 @@ bool ARMTTIImpl::isLoweredToCall(const Function *F) { return BaseT::isLoweredToCall(F); } -bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { - unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); - EVT VT = TLI->getValueType(DL, I.getType(), true); - if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) - return true; - - // Check if an intrinsic will be lowered to a call and assume that any - // other CallInst will generate a bl. - if (auto *Call = dyn_cast<CallInst>(&I)) { - if (auto *II = dyn_cast<IntrinsicInst>(Call)) { - switch(II->getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memset: - case Intrinsic::memmove: - return getNumMemOps(II) == -1; - default: - if (const Function *F = Call->getCalledFunction()) - return isLoweredToCall(F); - } - } - return true; - } - - // FPv5 provides conversions between integer, double-precision, - // single-precision, and half-precision formats. - switch (I.getOpcode()) { - default: - break; - case Instruction::FPToSI: - case Instruction::FPToUI: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::FPTrunc: - case Instruction::FPExt: - return !ST->hasFPARMv8Base(); - } - - // FIXME: Unfortunately the approach of checking the Operation Action does - // not catch all cases of Legalization that use library calls. Our - // Legalization step categorizes some transformations into library calls as - // Custom, Expand or even Legal when doing type legalization. So for now - // we have to special case for instance the SDIV of 64bit integers and the - // use of floating point emulation. - if (VT.isInteger() && VT.getSizeInBits() >= 64) { - switch (ISD) { - default: - break; - case ISD::SDIV: - case ISD::UDIV: - case ISD::SREM: - case ISD::UREM: - case ISD::SDIVREM: - case ISD::UDIVREM: - return true; - } - } - - // Assume all other non-float operations are supported. - if (!VT.isFloatingPoint()) - return false; - - // We'll need a library call to handle most floats when using soft. - if (TLI->useSoftFloat()) { - switch (I.getOpcode()) { - default: - return true; - case Instruction::Alloca: - case Instruction::Load: - case Instruction::Store: - case Instruction::Select: - case Instruction::PHI: - return false; - } - } - - // We'll need a libcall to perform double precision operations on a single - // precision only FPU. - if (I.getType()->isDoubleTy() && !ST->hasFP64()) - return true; - - // Likewise for half precision arithmetic. - if (I.getType()->isHalfTy() && !ST->hasFullFP16()) - return true; - - return false; -} - +bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { + unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode()); + EVT VT = TLI->getValueType(DL, I.getType(), true); + if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall) + return true; + + // Check if an intrinsic will be lowered to a call and assume that any + // other CallInst will generate a bl. 
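For the saturating-add intrinsics costed above, this is the scalar semantics being bought for a single vqadd on legal MVE vector types (a reference sketch, not the lowering):

#include <cstdint>
#include <limits>

int32_t saturatingAdd32(int32_t a, int32_t b) {
  const int64_t wide = static_cast<int64_t>(a) + static_cast<int64_t>(b);
  if (wide > std::numeric_limits<int32_t>::max())
    return std::numeric_limits<int32_t>::max();
  if (wide < std::numeric_limits<int32_t>::min())
    return std::numeric_limits<int32_t>::min();
  return static_cast<int32_t>(wide);
}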
+ if (auto *Call = dyn_cast<CallInst>(&I)) { + if (auto *II = dyn_cast<IntrinsicInst>(Call)) { + switch(II->getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + return getNumMemOps(II) == -1; + default: + if (const Function *F = Call->getCalledFunction()) + return isLoweredToCall(F); + } + } + return true; + } + + // FPv5 provides conversions between integer, double-precision, + // single-precision, and half-precision formats. + switch (I.getOpcode()) { + default: + break; + case Instruction::FPToSI: + case Instruction::FPToUI: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + return !ST->hasFPARMv8Base(); + } + + // FIXME: Unfortunately the approach of checking the Operation Action does + // not catch all cases of Legalization that use library calls. Our + // Legalization step categorizes some transformations into library calls as + // Custom, Expand or even Legal when doing type legalization. So for now + // we have to special case for instance the SDIV of 64bit integers and the + // use of floating point emulation. + if (VT.isInteger() && VT.getSizeInBits() >= 64) { + switch (ISD) { + default: + break; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: + return true; + } + } + + // Assume all other non-float operations are supported. + if (!VT.isFloatingPoint()) + return false; + + // We'll need a library call to handle most floats when using soft. + if (TLI->useSoftFloat()) { + switch (I.getOpcode()) { + default: + return true; + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Store: + case Instruction::Select: + case Instruction::PHI: + return false; + } + } + + // We'll need a libcall to perform double precision operations on a single + // precision only FPU. + if (I.getType()->isDoubleTy() && !ST->hasFP64()) + return true; + + // Likewise for half precision arithmetic. + if (I.getType()->isHalfTy() && !ST->hasFullFP16()) + return true; + + return false; +} + bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, @@ -1762,7 +1762,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, switch (Call->getIntrinsicID()) { default: break; - case Intrinsic::start_loop_iterations: + case Intrinsic::start_loop_iterations: case Intrinsic::test_set_loop_iterations: case Intrinsic::loop_decrement: case Intrinsic::loop_decrement_reg: @@ -1773,24 +1773,24 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, }; // Scan the instructions to see if there's any that we know will turn into a - // call or if this loop is already a low-overhead loop or will become a tail - // predicated loop. - bool IsTailPredLoop = false; + // call or if this loop is already a low-overhead loop or will become a tail + // predicated loop. 
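maybeLoweredToCall above asks whether an instruction will survive as inline code or become a runtime helper call. The classification below is an assumption-laden sketch of that idea (the opcode strings, the helper-name comment and the feature flags are illustrative, not the TargetLowering query):

#include <string>

bool likelyBecomesLibcall(const std::string &opcode, unsigned bitWidth,
                          bool hasHWDivide, bool useSoftFloat) {
  if ((opcode == "sdiv" || opcode == "udiv" || opcode == "srem" ||
       opcode == "urem") && bitWidth >= 64)
    return true;  // 64-bit division/remainder goes to a runtime helper
  if ((opcode == "sdiv" || opcode == "udiv") && !hasHWDivide)
    return true;  // no hardware divider on this core
  if (useSoftFloat &&
      (opcode == "fadd" || opcode == "fmul" || opcode == "fdiv"))
    return true;  // soft-float arithmetic is a library call
  return false;
}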
+ bool IsTailPredLoop = false; auto ScanLoop = [&](Loop *L) { for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { - if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) || - isa<InlineAsm>(I)) { + if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) || + isa<InlineAsm>(I)) { LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n"); return false; } - if (auto *II = dyn_cast<IntrinsicInst>(&I)) - IsTailPredLoop |= - II->getIntrinsicID() == Intrinsic::get_active_lane_mask || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || - II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + IsTailPredLoop |= + II->getIntrinsicID() == Intrinsic::get_active_lane_mask || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 || + II->getIntrinsicID() == Intrinsic::arm_mve_vctp64; } } return true; @@ -1811,7 +1811,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, LLVMContext &C = L->getHeader()->getContext(); HWLoopInfo.CounterInReg = true; HWLoopInfo.IsNestingLegal = false; - HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; + HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop; HWLoopInfo.CountType = Type::getInt32Ty(C); HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); return true; @@ -1859,28 +1859,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const LoopAccessInfo *LAI) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); - // If there are live-out values, it is probably a reduction. We can predicate - // most reduction operations freely under MVE using a combination of - // prefer-predicated-reduction-select and inloop reductions. We limit this to - // floating point and integer reductions, but don't check for operators - // specifically here. If the value ends up not being a reduction (and so the - // vectorizer cannot tailfold the loop), we should fall back to standard - // vectorization automatically. + // If there are live-out values, it is probably a reduction. We can predicate + // most reduction operations freely under MVE using a combination of + // prefer-predicated-reduction-select and inloop reductions. We limit this to + // floating point and integer reductions, but don't check for operators + // specifically here. If the value ends up not being a reduction (and so the + // vectorizer cannot tailfold the loop), we should fall back to standard + // vectorization automatically. 
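The loop scan above reduces to a simple predicate; here it is in isolation (assuming a stripped-down instruction summary rather than real IR): any instruction that may become a call or is inline asm disqualifies the loop from becoming a low-overhead hardware loop, while VCTP-style intrinsics merely mark it as a future tail-predicated loop.

#include <vector>

struct InstSummary {
  bool maybeCall;    // would lower to a bl or a libcall
  bool isInlineAsm;
  bool isVCTPLike;   // get.active.lane.mask or arm.mve.vctp*
};

bool scanLoopForHardwareLoop(const std::vector<InstSummary> &body,
                             bool &isTailPredLoop) {
  isTailPredLoop = false;
  for (const InstSummary &I : body) {
    if (I.maybeCall || I.isInlineAsm)
      return false;                  // calls clobber LR and the loop state
    isTailPredLoop |= I.isVCTPLike;  // remember: no entry test (WLS) needed
  }
  return true;
}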
SmallVector< Instruction *, 8 > LiveOuts; LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); - bool ReductionsDisabled = + bool ReductionsDisabled = EnableTailPredication == TailPredication::EnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabledNoReductions; for (auto *I : LiveOuts) { - if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && - !I->getType()->isHalfTy()) { - LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " + if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && + !I->getType()->isHalfTy()) { + LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " "live-out value\n"); return false; } - if (ReductionsDisabled) { - LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); + if (ReductionsDisabled) { + LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); return false; } } @@ -1910,35 +1910,35 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1); int64_t NextStride = getPtrStride(PSE, Ptr, L); - if (NextStride == 1) { - // TODO: for now only allow consecutive strides of 1. We could support - // other strides as long as it is uniform, but let's keep it simple - // for now. + if (NextStride == 1) { + // TODO: for now only allow consecutive strides of 1. We could support + // other strides as long as it is uniform, but let's keep it simple + // for now. continue; - } else if (NextStride == -1 || - (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || - (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { - LLVM_DEBUG(dbgs() - << "Consecutive strides of 2 found, vld2/vstr2 can't " - "be tail-predicated\n."); + } else if (NextStride == -1 || + (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) || + (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) { + LLVM_DEBUG(dbgs() + << "Consecutive strides of 2 found, vld2/vstr2 can't " + "be tail-predicated\n."); return false; - // TODO: don't tail predicate if there is a reversed load? - } else if (EnableMaskedGatherScatters) { - // Gather/scatters do allow loading from arbitrary strides, at - // least if they are loop invariant. - // TODO: Loop variant strides should in theory work, too, but - // this requires further testing. - const SCEV *PtrScev = - replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); - if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { - const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); - if (PSE.getSE()->isLoopInvariant(Step, L)) - continue; - } + // TODO: don't tail predicate if there is a reversed load? + } else if (EnableMaskedGatherScatters) { + // Gather/scatters do allow loading from arbitrary strides, at + // least if they are loop invariant. + // TODO: Loop variant strides should in theory work, too, but + // this requires further testing. 
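Restating the stride rules above as a small standalone predicate (a hedged approximation; the real check reasons about SCEVs and loop invariance rather than plain integers): unit strides are fine to tail-predicate, interleaving-shaped strides are rejected, and anything else is only acceptable when it can become a gather/scatter.

#include <optional>

enum class StrideVerdict { TailPredicate, Reject, OnlyViaGatherScatter };

StrideVerdict classifyStride(std::optional<long long> constStride,
                             int maxInterleaveFactor,
                             bool gatherScattersEnabled) {
  if (constStride && *constStride == 1)
    return StrideVerdict::TailPredicate;        // contiguous access
  if (constStride &&
      (*constStride == -1 ||
       (*constStride == 2 && maxInterleaveFactor >= 2) ||
       (*constStride == 4 && maxInterleaveFactor >= 4)))
    return StrideVerdict::Reject;               // vld2/vld4-style, not predicable
  if (gatherScattersEnabled)
    return StrideVerdict::OnlyViaGatherScatter; // needs a loop-invariant stride
  return StrideVerdict::Reject;
}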
+ const SCEV *PtrScev = + replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr); + if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) { + const SCEV *Step = AR->getStepRecurrence(*PSE.getSE()); + if (PSE.getSE()->isLoopInvariant(Step, L)) + continue; + } } - LLVM_DEBUG(dbgs() << "Bad stride found, can't " - "tail-predicate\n."); - return false; + LLVM_DEBUG(dbgs() << "Bad stride found, can't " + "tail-predicate\n."); + return false; } } } @@ -1971,7 +1971,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, return false; } - assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); + assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected"); HardwareLoopInfo HWLoopInfo(L); if (!HWLoopInfo.canAnalyze(*LI)) { @@ -2039,10 +2039,10 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (ST->hasBranchPredictor() && L->getNumBlocks() > 4) return; - // Don't unroll vectorized loops, including the remainder loop - if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) - return; - + // Don't unroll vectorized loops, including the remainder loop + if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) + return; + // Scan the loop: don't unroll loops with calls as this could prevent // inlining. unsigned Cost = 0; @@ -2061,9 +2061,9 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, return; } - SmallVector<const Value*, 4> Operands(I.operand_values()); - Cost += - getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); + SmallVector<const Value*, 4> Operands(I.operand_values()); + Cost += + getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); } } @@ -2092,24 +2092,24 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return ST->hasMVEIntegerOps(); } - -bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const { - if (!ST->hasMVEIntegerOps()) - return false; - - unsigned ScalarBits = Ty->getScalarSizeInBits(); - switch (Opcode) { - case Instruction::Add: - return ScalarBits <= 64; - default: - return false; - } -} - -bool ARMTTIImpl::preferPredicatedReductionSelect( - unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { - if (!ST->hasMVEIntegerOps()) - return false; - return true; -} + +bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + if (!ST->hasMVEIntegerOps()) + return false; + + unsigned ScalarBits = Ty->getScalarSizeInBits(); + switch (Opcode) { + case Instruction::Add: + return ScalarBits <= 64; + default: + return false; + } +} + +bool ARMTTIImpl::preferPredicatedReductionSelect( + unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { + if (!ST->hasMVEIntegerOps()) + return false; + return true; +} diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.h index 7f045080e3..257e325a28 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/libs/llvm12/lib/Target/ARM/ARMTargetTransformInfo.h @@ -113,9 +113,9 @@ public: return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } - Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, - IntrinsicInst &II) const; - + Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// \name Scalar TTI Implementations /// @{ @@ -126,8 +126,8 @@ public: int getIntImmCost(const APInt &Imm, Type *Ty, 
TTI::TargetCostKind CostKind); int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty, TTI::TargetCostKind CostKind, - Instruction *Inst = nullptr); + Type *Ty, TTI::TargetCostKind CostKind, + Instruction *Inst = nullptr); /// @} @@ -181,31 +181,31 @@ public: int getMemcpyCost(const Instruction *I); - int getNumMemOps(const IntrinsicInst *I) const; - + int getNumMemOps(const IntrinsicInst *I) const; + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp); bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; - bool preferInLoopReduction(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const; - - bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const; + bool preferInLoopReduction(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; - bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; - int getCFInstrCost(unsigned Opcode, - TTI::TargetCostKind CostKind); + bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } + int getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); @@ -229,10 +229,10 @@ public: TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, - TTI::TargetCostKind CostKind); - + unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind); + int getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, @@ -244,17 +244,17 @@ public: Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind); - InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, - Type *ResTy, VectorType *ValTy, - TTI::TargetCostKind CostKind); - - int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind); - - bool maybeLoweredToCall(Instruction &I); + int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind); + InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, + Type *ResTy, VectorType *ValTy, + TTI::TargetCostKind CostKind); + + int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + + bool maybeLoweredToCall(Instruction &I); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, diff --git a/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 52577d75dd..b65cfc3811 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ 
b/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6239,9 +6239,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) { StringRef IDVal = Parser.getTok().getIdentifier(); const auto &Prefix = - llvm::find_if(PrefixEntries, [&IDVal](const PrefixEntry &PE) { - return PE.Spelling == IDVal; - }); + llvm::find_if(PrefixEntries, [&IDVal](const PrefixEntry &PE) { + return PE.Spelling == IDVal; + }); if (Prefix == std::end(PrefixEntries)) { Error(Parser.getTok().getLoc(), "unexpected prefix in operand"); return true; @@ -10307,14 +10307,14 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, !HasWideQualifier) { // The operands aren't the same for tMOV[S]r... (no cc_out) MCInst TmpInst; - unsigned Op = Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr; - TmpInst.setOpcode(Op); + unsigned Op = Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr; + TmpInst.setOpcode(Op); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); - if (Op == ARM::tMOVr) { - TmpInst.addOperand(Inst.getOperand(2)); - TmpInst.addOperand(Inst.getOperand(3)); - } + if (Op == ARM::tMOVr) { + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + } Inst = TmpInst; return true; } @@ -10599,12 +10599,12 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { (isThumb() && !hasV8Ops())) return Match_InvalidOperand; break; - case ARM::t2TBB: - case ARM::t2TBH: - // Rn = sp is only allowed with ARMv8-A - if (!hasV8Ops() && (Inst.getOperand(0).getReg() == ARM::SP)) - return Match_RequiresV8; - break; + case ARM::t2TBB: + case ARM::t2TBH: + // Rn = sp is only allowed with ARMv8-A + if (!hasV8Ops() && (Inst.getOperand(0).getReg() == ARM::SP)) + return Match_RequiresV8; + break; default: break; } @@ -11135,8 +11135,8 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) { bool WasThumb = isThumb(); Triple T; MCSubtargetInfo &STI = copySTI(); - STI.setDefaultFeatures("", /*TuneCPU*/ "", - ("+" + ARM::getArchName(ID)).str()); + STI.setDefaultFeatures("", /*TuneCPU*/ "", + ("+" + ARM::getArchName(ID)).str()); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); FixModeAfterArchChange(WasThumb, L); @@ -11249,7 +11249,7 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { bool WasThumb = isThumb(); MCSubtargetInfo &STI = copySTI(); - STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, ""); + STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, ""); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); FixModeAfterArchChange(WasThumb, L); diff --git a/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ya.make b/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ya.make index 572d301570..ed9aa8099d 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ya.make +++ b/contrib/libs/llvm12/lib/Target/ARM/AsmParser/ya.make @@ -12,20 +12,20 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/MC/MCParser - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc - contrib/libs/llvm12/lib/Target/ARM/TargetInfo - contrib/libs/llvm12/lib/Target/ARM/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/MC/MCParser + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc + contrib/libs/llvm12/lib/Target/ARM/TargetInfo + contrib/libs/llvm12/lib/Target/ARM/Utils ) ADDINCL( - 
${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM/AsmParser + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM/AsmParser ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 8ea323a9ce..7953681421 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -860,8 +860,8 @@ ARMDisassembler::AddThumbPredicate(MCInst &MI) const { VCCPos + 2, MCOI::TIED_TO); assert(TiedOp >= 0 && "Inactive register in vpred_r is not tied to an output!"); - // Copy the operand to ensure it's not invalidated when MI grows. - MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp))); + // Copy the operand to ensure it's not invalidated when MI grows. + MI.insert(VCCI, MCOperand(MI.getOperand(TiedOp))); } } else if (VCC != ARMVCC::None) { Check(S, SoftFail); @@ -4530,14 +4530,14 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val, static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - const FeatureBitset &FeatureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + const FeatureBitset &FeatureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); unsigned Rm = fieldFromInstruction(Insn, 0, 4); - if (Rn == 13 && !FeatureBits[ARM::HasV8Ops]) S = MCDisassembler::SoftFail; + if (Rn == 13 && !FeatureBits[ARM::HasV8Ops]) S = MCDisassembler::SoftFail; if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) diff --git a/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ya.make b/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ya.make index f8ce0c24d9..660cfd1063 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ya.make +++ b/contrib/libs/llvm12/lib/Target/ARM/Disassembler/ya.make @@ -12,19 +12,19 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/MC/MCDisassembler - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc - contrib/libs/llvm12/lib/Target/ARM/TargetInfo - contrib/libs/llvm12/lib/Target/ARM/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/MC/MCDisassembler + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc + contrib/libs/llvm12/lib/Target/ARM/TargetInfo + contrib/libs/llvm12/lib/Target/ARM/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM/Disassembler + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM/Disassembler ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h index 8459b4ff2a..07376848c4 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h +++ 
b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h @@ -205,20 +205,20 @@ namespace ARM_AM { return V; } - /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained - /// by two SOImmVal, that -V = First + Second. - /// "R+V" can be optimized to (sub (sub R, First), Second). - /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second). - inline bool isSOImmTwoPartValNeg(unsigned V) { - unsigned First; - if (!isSOImmTwoPartVal(-V)) - return false; - // Return false if ~(-First) is not a SoImmval. - First = getSOImmTwoPartFirst(-V); - First = ~(-First); - return !(rotr32(~255U, getSOImmValRotate(First)) & First); - } - + /// isSOImmTwoPartValNeg - Return true if the specified value can be obtained + /// by two SOImmVal, that -V = First + Second. + /// "R+V" can be optimized to (sub (sub R, First), Second). + /// "R=V" can be optimized to (sub (mvn R, ~(-First)), Second). + inline bool isSOImmTwoPartValNeg(unsigned V) { + unsigned First; + if (!isSOImmTwoPartVal(-V)) + return false; + // Return false if ~(-First) is not a SoImmval. + First = getSOImmTwoPartFirst(-V); + First = ~(-First); + return !(rotr32(~255U, getSOImmValRotate(First)) & First); + } + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed /// by a left shift. Returns the shift amount to use. inline unsigned getThumbImmValShift(unsigned Imm) { @@ -687,18 +687,18 @@ namespace ARM_AM { return getFP16Imm(FPImm.bitcastToAPInt()); } - /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding - /// for it. Otherwise return -1 like getFP16Imm. - inline int getFP32FP16Imm(const APInt &Imm) { - if (Imm.getActiveBits() > 16) - return -1; - return ARM_AM::getFP16Imm(Imm.trunc(16)); - } - - inline int getFP32FP16Imm(const APFloat &FPImm) { - return getFP32FP16Imm(FPImm.bitcastToAPInt()); - } - + /// If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding + /// for it. Otherwise return -1 like getFP16Imm. + inline int getFP32FP16Imm(const APInt &Imm) { + if (Imm.getActiveBits() > 16) + return -1; + return ARM_AM::getFP16Imm(Imm.trunc(16)); + } + + inline int getFP32FP16Imm(const APFloat &FPImm) { + return getFP32FP16Imm(FPImm.bitcastToAPInt()); + } + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit /// floating-point value. If the value cannot be represented as an 8-bit /// floating-point value, then return -1. 
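Editorial aside, not part of the patch above: the isSOImmTwoPartValNeg helper restored in this hunk rests on the ARM rule that a data-processing immediate is an 8-bit value rotated right by an even amount, and that some constants can only be materialized as the sum of two such immediates. The self-contained sketch below re-derives both checks by brute force; rotr32, isSOImm and isSOImmTwoPart here are simplified stand-ins for the real ARM_AM helpers, and the test constant 0x1FE00 is my own illustrative choice.

    #include <cassert>
    #include <cstdint>

    // Rotate a 32-bit value right by Amt bits (Amt taken modulo 32).
    static uint32_t rotr32(uint32_t Val, unsigned Amt) {
      Amt &= 31u;
      return Amt ? (Val >> Amt) | (Val << (32u - Amt)) : Val;
    }

    // Rotate left; used to test the "imm8 rotated right by an even amount" form.
    static uint32_t rotl32(uint32_t Val, unsigned Amt) {
      Amt &= 31u;
      return Amt ? (Val << Amt) | (Val >> (32u - Amt)) : Val;
    }

    // Brute-force single-immediate test: V is encodable if some even
    // left-rotation of it fits in 8 bits.
    static bool isSOImm(uint32_t V) {
      for (unsigned Rot = 0; Rot < 32; Rot += 2)
        if (rotl32(V, Rot) <= 0xFFu)
          return true;
      return false;
    }

    // Brute-force two-part test: V needs exactly two shifter-operand
    // immediates, First + Second.
    static bool isSOImmTwoPart(uint32_t V, uint32_t &First, uint32_t &Second) {
      if (isSOImm(V))
        return false; // a single immediate already suffices
      for (unsigned Rot = 0; Rot < 32; Rot += 2) {
        // The bits of V that fall inside one rotated imm8 window.
        uint32_t Chunk = V & rotr32(0xFFu, Rot);
        if (Chunk && isSOImm(V - Chunk)) {
          First = Chunk;
          Second = V - Chunk;
          return true;
        }
      }
      return false;
    }

    int main() {
      // 0x1FE00 is not a single ARM immediate, but it splits into
      // 0x10000 + 0xFE00, so "add r0, r1, #0x1FE00" can be lowered as two adds.
      uint32_t First = 0, Second = 0;
      assert(!isSOImm(0x1FE00u));
      assert(isSOImmTwoPart(0x1FE00u, First, Second));
      assert(First + Second == 0x1FE00u);
      return 0;
    }

This is the same idea the negated variant in the hunk applies to -V, so that "R + V" can be rewritten as (sub (sub R, First), Second).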
diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index b02aef3c33..697eeab4e5 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1010,7 +1010,7 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { case ARM::fixup_t2_condbranch: case ARM::fixup_t2_uncondbranch: case ARM::fixup_t2_pcrel_10: - case ARM::fixup_t2_pcrel_9: + case ARM::fixup_t2_pcrel_9: case ARM::fixup_t2_adr_pcrel_12: case ARM::fixup_arm_thumb_bl: case ARM::fixup_arm_thumb_blx: diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index ecd96114e8..5599eaaf2f 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -254,7 +254,7 @@ namespace ARMII { MO_OPTION_MASK = 0x3, /// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the - /// reference is actually to the ".refptr.FOO" symbol. This is used for + /// reference is actually to the ".refptr.FOO" symbol. This is used for /// stub symbols on windows. MO_COFFSTUB = 0x4, diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h index d975d799e0..ac75bf3fca 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -30,7 +30,7 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; // Autogenerated by tblgen. - std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O); virtual bool printAliasInstr(const MCInst *MI, uint64_t Address, diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 40e8e244e3..a26944a38f 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -87,7 +87,7 @@ void ARMCOFFMCAsmInfoMicrosoft::anchor() { } ARMCOFFMCAsmInfoMicrosoft::ARMCOFFMCAsmInfoMicrosoft() { AlignmentIsInBytes = false; - SupportsDebugInformation = true; + SupportsDebugInformation = true; ExceptionsType = ExceptionHandling::WinEH; PrivateGlobalPrefix = "$M"; PrivateLabelPrefix = "$M"; diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 774f2507b8..3da71ade87 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -11,13 +11,13 @@ //===----------------------------------------------------------------------===// #include "ARMMCTargetDesc.h" -#include "ARMAddressingModes.h" +#include "ARMAddressingModes.h" #include "ARMBaseInfo.h" #include "ARMInstPrinter.h" #include "ARMMCAsmInfo.h" #include "TargetInfo/ARMTargetInfo.h" #include "llvm/ADT/Triple.h" -#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" #include 
"llvm/MC/MCELFStreamer.h" @@ -182,23 +182,23 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { return ARMArchFeature; } -bool ARM_MC::isPredicated(const MCInst &MI, const MCInstrInfo *MCII) { - const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); - int PredOpIdx = Desc.findFirstPredOperandIdx(); - return PredOpIdx != -1 && MI.getOperand(PredOpIdx).getImm() != ARMCC::AL; -} - -bool ARM_MC::isCPSRDefined(const MCInst &MI, const MCInstrInfo *MCII) { - const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); - for (unsigned I = 0; I < MI.getNumOperands(); ++I) { - const MCOperand &MO = MI.getOperand(I); - if (MO.isReg() && MO.getReg() == ARM::CPSR && - Desc.OpInfo[I].isOptionalDef()) - return true; - } - return false; -} - +bool ARM_MC::isPredicated(const MCInst &MI, const MCInstrInfo *MCII) { + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + int PredOpIdx = Desc.findFirstPredOperandIdx(); + return PredOpIdx != -1 && MI.getOperand(PredOpIdx).getImm() != ARMCC::AL; +} + +bool ARM_MC::isCPSRDefined(const MCInst &MI, const MCInstrInfo *MCII) { + const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MCOperand &MO = MI.getOperand(I); + if (MO.isReg() && MO.getReg() == ARM::CPSR && + Desc.OpInfo[I].isOptionalDef()) + return true; + } + return false; +} + MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU); @@ -209,7 +209,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, ArchFS = std::string(FS); } - return createARMMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS); + return createARMMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS); } static MCInstrInfo *createARMMCInstrInfo() { @@ -218,120 +218,120 @@ static MCInstrInfo *createARMMCInstrInfo() { return X; } -void ARM_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { - // Mapping from CodeView to MC register id. 
- static const struct { - codeview::RegisterId CVReg; - MCPhysReg Reg; - } RegMap[] = { - {codeview::RegisterId::ARM_R0, ARM::R0}, - {codeview::RegisterId::ARM_R1, ARM::R1}, - {codeview::RegisterId::ARM_R2, ARM::R2}, - {codeview::RegisterId::ARM_R3, ARM::R3}, - {codeview::RegisterId::ARM_R4, ARM::R4}, - {codeview::RegisterId::ARM_R5, ARM::R5}, - {codeview::RegisterId::ARM_R6, ARM::R6}, - {codeview::RegisterId::ARM_R7, ARM::R7}, - {codeview::RegisterId::ARM_R8, ARM::R8}, - {codeview::RegisterId::ARM_R9, ARM::R9}, - {codeview::RegisterId::ARM_R10, ARM::R10}, - {codeview::RegisterId::ARM_R11, ARM::R11}, - {codeview::RegisterId::ARM_R12, ARM::R12}, - {codeview::RegisterId::ARM_SP, ARM::SP}, - {codeview::RegisterId::ARM_LR, ARM::LR}, - {codeview::RegisterId::ARM_PC, ARM::PC}, - {codeview::RegisterId::ARM_CPSR, ARM::CPSR}, - {codeview::RegisterId::ARM_FPSCR, ARM::FPSCR}, - {codeview::RegisterId::ARM_FPEXC, ARM::FPEXC}, - {codeview::RegisterId::ARM_FS0, ARM::S0}, - {codeview::RegisterId::ARM_FS1, ARM::S1}, - {codeview::RegisterId::ARM_FS2, ARM::S2}, - {codeview::RegisterId::ARM_FS3, ARM::S3}, - {codeview::RegisterId::ARM_FS4, ARM::S4}, - {codeview::RegisterId::ARM_FS5, ARM::S5}, - {codeview::RegisterId::ARM_FS6, ARM::S6}, - {codeview::RegisterId::ARM_FS7, ARM::S7}, - {codeview::RegisterId::ARM_FS8, ARM::S8}, - {codeview::RegisterId::ARM_FS9, ARM::S9}, - {codeview::RegisterId::ARM_FS10, ARM::S10}, - {codeview::RegisterId::ARM_FS11, ARM::S11}, - {codeview::RegisterId::ARM_FS12, ARM::S12}, - {codeview::RegisterId::ARM_FS13, ARM::S13}, - {codeview::RegisterId::ARM_FS14, ARM::S14}, - {codeview::RegisterId::ARM_FS15, ARM::S15}, - {codeview::RegisterId::ARM_FS16, ARM::S16}, - {codeview::RegisterId::ARM_FS17, ARM::S17}, - {codeview::RegisterId::ARM_FS18, ARM::S18}, - {codeview::RegisterId::ARM_FS19, ARM::S19}, - {codeview::RegisterId::ARM_FS20, ARM::S20}, - {codeview::RegisterId::ARM_FS21, ARM::S21}, - {codeview::RegisterId::ARM_FS22, ARM::S22}, - {codeview::RegisterId::ARM_FS23, ARM::S23}, - {codeview::RegisterId::ARM_FS24, ARM::S24}, - {codeview::RegisterId::ARM_FS25, ARM::S25}, - {codeview::RegisterId::ARM_FS26, ARM::S26}, - {codeview::RegisterId::ARM_FS27, ARM::S27}, - {codeview::RegisterId::ARM_FS28, ARM::S28}, - {codeview::RegisterId::ARM_FS29, ARM::S29}, - {codeview::RegisterId::ARM_FS30, ARM::S30}, - {codeview::RegisterId::ARM_FS31, ARM::S31}, - {codeview::RegisterId::ARM_ND0, ARM::D0}, - {codeview::RegisterId::ARM_ND1, ARM::D1}, - {codeview::RegisterId::ARM_ND2, ARM::D2}, - {codeview::RegisterId::ARM_ND3, ARM::D3}, - {codeview::RegisterId::ARM_ND4, ARM::D4}, - {codeview::RegisterId::ARM_ND5, ARM::D5}, - {codeview::RegisterId::ARM_ND6, ARM::D6}, - {codeview::RegisterId::ARM_ND7, ARM::D7}, - {codeview::RegisterId::ARM_ND8, ARM::D8}, - {codeview::RegisterId::ARM_ND9, ARM::D9}, - {codeview::RegisterId::ARM_ND10, ARM::D10}, - {codeview::RegisterId::ARM_ND11, ARM::D11}, - {codeview::RegisterId::ARM_ND12, ARM::D12}, - {codeview::RegisterId::ARM_ND13, ARM::D13}, - {codeview::RegisterId::ARM_ND14, ARM::D14}, - {codeview::RegisterId::ARM_ND15, ARM::D15}, - {codeview::RegisterId::ARM_ND16, ARM::D16}, - {codeview::RegisterId::ARM_ND17, ARM::D17}, - {codeview::RegisterId::ARM_ND18, ARM::D18}, - {codeview::RegisterId::ARM_ND19, ARM::D19}, - {codeview::RegisterId::ARM_ND20, ARM::D20}, - {codeview::RegisterId::ARM_ND21, ARM::D21}, - {codeview::RegisterId::ARM_ND22, ARM::D22}, - {codeview::RegisterId::ARM_ND23, ARM::D23}, - {codeview::RegisterId::ARM_ND24, ARM::D24}, - {codeview::RegisterId::ARM_ND25, ARM::D25}, 
- {codeview::RegisterId::ARM_ND26, ARM::D26}, - {codeview::RegisterId::ARM_ND27, ARM::D27}, - {codeview::RegisterId::ARM_ND28, ARM::D28}, - {codeview::RegisterId::ARM_ND29, ARM::D29}, - {codeview::RegisterId::ARM_ND30, ARM::D30}, - {codeview::RegisterId::ARM_ND31, ARM::D31}, - {codeview::RegisterId::ARM_NQ0, ARM::Q0}, - {codeview::RegisterId::ARM_NQ1, ARM::Q1}, - {codeview::RegisterId::ARM_NQ2, ARM::Q2}, - {codeview::RegisterId::ARM_NQ3, ARM::Q3}, - {codeview::RegisterId::ARM_NQ4, ARM::Q4}, - {codeview::RegisterId::ARM_NQ5, ARM::Q5}, - {codeview::RegisterId::ARM_NQ6, ARM::Q6}, - {codeview::RegisterId::ARM_NQ7, ARM::Q7}, - {codeview::RegisterId::ARM_NQ8, ARM::Q8}, - {codeview::RegisterId::ARM_NQ9, ARM::Q9}, - {codeview::RegisterId::ARM_NQ10, ARM::Q10}, - {codeview::RegisterId::ARM_NQ11, ARM::Q11}, - {codeview::RegisterId::ARM_NQ12, ARM::Q12}, - {codeview::RegisterId::ARM_NQ13, ARM::Q13}, - {codeview::RegisterId::ARM_NQ14, ARM::Q14}, - {codeview::RegisterId::ARM_NQ15, ARM::Q15}, - }; - for (unsigned I = 0; I < array_lengthof(RegMap); ++I) - MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); -} - +void ARM_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { + // Mapping from CodeView to MC register id. + static const struct { + codeview::RegisterId CVReg; + MCPhysReg Reg; + } RegMap[] = { + {codeview::RegisterId::ARM_R0, ARM::R0}, + {codeview::RegisterId::ARM_R1, ARM::R1}, + {codeview::RegisterId::ARM_R2, ARM::R2}, + {codeview::RegisterId::ARM_R3, ARM::R3}, + {codeview::RegisterId::ARM_R4, ARM::R4}, + {codeview::RegisterId::ARM_R5, ARM::R5}, + {codeview::RegisterId::ARM_R6, ARM::R6}, + {codeview::RegisterId::ARM_R7, ARM::R7}, + {codeview::RegisterId::ARM_R8, ARM::R8}, + {codeview::RegisterId::ARM_R9, ARM::R9}, + {codeview::RegisterId::ARM_R10, ARM::R10}, + {codeview::RegisterId::ARM_R11, ARM::R11}, + {codeview::RegisterId::ARM_R12, ARM::R12}, + {codeview::RegisterId::ARM_SP, ARM::SP}, + {codeview::RegisterId::ARM_LR, ARM::LR}, + {codeview::RegisterId::ARM_PC, ARM::PC}, + {codeview::RegisterId::ARM_CPSR, ARM::CPSR}, + {codeview::RegisterId::ARM_FPSCR, ARM::FPSCR}, + {codeview::RegisterId::ARM_FPEXC, ARM::FPEXC}, + {codeview::RegisterId::ARM_FS0, ARM::S0}, + {codeview::RegisterId::ARM_FS1, ARM::S1}, + {codeview::RegisterId::ARM_FS2, ARM::S2}, + {codeview::RegisterId::ARM_FS3, ARM::S3}, + {codeview::RegisterId::ARM_FS4, ARM::S4}, + {codeview::RegisterId::ARM_FS5, ARM::S5}, + {codeview::RegisterId::ARM_FS6, ARM::S6}, + {codeview::RegisterId::ARM_FS7, ARM::S7}, + {codeview::RegisterId::ARM_FS8, ARM::S8}, + {codeview::RegisterId::ARM_FS9, ARM::S9}, + {codeview::RegisterId::ARM_FS10, ARM::S10}, + {codeview::RegisterId::ARM_FS11, ARM::S11}, + {codeview::RegisterId::ARM_FS12, ARM::S12}, + {codeview::RegisterId::ARM_FS13, ARM::S13}, + {codeview::RegisterId::ARM_FS14, ARM::S14}, + {codeview::RegisterId::ARM_FS15, ARM::S15}, + {codeview::RegisterId::ARM_FS16, ARM::S16}, + {codeview::RegisterId::ARM_FS17, ARM::S17}, + {codeview::RegisterId::ARM_FS18, ARM::S18}, + {codeview::RegisterId::ARM_FS19, ARM::S19}, + {codeview::RegisterId::ARM_FS20, ARM::S20}, + {codeview::RegisterId::ARM_FS21, ARM::S21}, + {codeview::RegisterId::ARM_FS22, ARM::S22}, + {codeview::RegisterId::ARM_FS23, ARM::S23}, + {codeview::RegisterId::ARM_FS24, ARM::S24}, + {codeview::RegisterId::ARM_FS25, ARM::S25}, + {codeview::RegisterId::ARM_FS26, ARM::S26}, + {codeview::RegisterId::ARM_FS27, ARM::S27}, + {codeview::RegisterId::ARM_FS28, ARM::S28}, + {codeview::RegisterId::ARM_FS29, ARM::S29}, + 
{codeview::RegisterId::ARM_FS30, ARM::S30}, + {codeview::RegisterId::ARM_FS31, ARM::S31}, + {codeview::RegisterId::ARM_ND0, ARM::D0}, + {codeview::RegisterId::ARM_ND1, ARM::D1}, + {codeview::RegisterId::ARM_ND2, ARM::D2}, + {codeview::RegisterId::ARM_ND3, ARM::D3}, + {codeview::RegisterId::ARM_ND4, ARM::D4}, + {codeview::RegisterId::ARM_ND5, ARM::D5}, + {codeview::RegisterId::ARM_ND6, ARM::D6}, + {codeview::RegisterId::ARM_ND7, ARM::D7}, + {codeview::RegisterId::ARM_ND8, ARM::D8}, + {codeview::RegisterId::ARM_ND9, ARM::D9}, + {codeview::RegisterId::ARM_ND10, ARM::D10}, + {codeview::RegisterId::ARM_ND11, ARM::D11}, + {codeview::RegisterId::ARM_ND12, ARM::D12}, + {codeview::RegisterId::ARM_ND13, ARM::D13}, + {codeview::RegisterId::ARM_ND14, ARM::D14}, + {codeview::RegisterId::ARM_ND15, ARM::D15}, + {codeview::RegisterId::ARM_ND16, ARM::D16}, + {codeview::RegisterId::ARM_ND17, ARM::D17}, + {codeview::RegisterId::ARM_ND18, ARM::D18}, + {codeview::RegisterId::ARM_ND19, ARM::D19}, + {codeview::RegisterId::ARM_ND20, ARM::D20}, + {codeview::RegisterId::ARM_ND21, ARM::D21}, + {codeview::RegisterId::ARM_ND22, ARM::D22}, + {codeview::RegisterId::ARM_ND23, ARM::D23}, + {codeview::RegisterId::ARM_ND24, ARM::D24}, + {codeview::RegisterId::ARM_ND25, ARM::D25}, + {codeview::RegisterId::ARM_ND26, ARM::D26}, + {codeview::RegisterId::ARM_ND27, ARM::D27}, + {codeview::RegisterId::ARM_ND28, ARM::D28}, + {codeview::RegisterId::ARM_ND29, ARM::D29}, + {codeview::RegisterId::ARM_ND30, ARM::D30}, + {codeview::RegisterId::ARM_ND31, ARM::D31}, + {codeview::RegisterId::ARM_NQ0, ARM::Q0}, + {codeview::RegisterId::ARM_NQ1, ARM::Q1}, + {codeview::RegisterId::ARM_NQ2, ARM::Q2}, + {codeview::RegisterId::ARM_NQ3, ARM::Q3}, + {codeview::RegisterId::ARM_NQ4, ARM::Q4}, + {codeview::RegisterId::ARM_NQ5, ARM::Q5}, + {codeview::RegisterId::ARM_NQ6, ARM::Q6}, + {codeview::RegisterId::ARM_NQ7, ARM::Q7}, + {codeview::RegisterId::ARM_NQ8, ARM::Q8}, + {codeview::RegisterId::ARM_NQ9, ARM::Q9}, + {codeview::RegisterId::ARM_NQ10, ARM::Q10}, + {codeview::RegisterId::ARM_NQ11, ARM::Q11}, + {codeview::RegisterId::ARM_NQ12, ARM::Q12}, + {codeview::RegisterId::ARM_NQ13, ARM::Q13}, + {codeview::RegisterId::ARM_NQ14, ARM::Q14}, + {codeview::RegisterId::ARM_NQ15, ARM::Q15}, + }; + for (unsigned I = 0; I < array_lengthof(RegMap); ++I) + MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); +} + static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) { MCRegisterInfo *X = new MCRegisterInfo(); InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC); - ARM_MC::initLLVMToCVRegMapping(X); + ARM_MC::initLLVMToCVRegMapping(X); return X; } diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 5a0874f0ef..a84576e757 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -41,22 +41,22 @@ class raw_pwrite_stream; namespace ARM_MC { std::string ParseARMTriple(const Triple &TT, StringRef CPU); -void initLLVMToCVRegMapping(MCRegisterInfo *MRI); - -bool isPredicated(const MCInst &MI, const MCInstrInfo *MCII); -bool isCPSRDefined(const MCInst &MI, const MCInstrInfo *MCII); - -template<class Inst> -bool isLDMBaseRegInList(const Inst &MI) { - auto BaseReg = MI.getOperand(0).getReg(); - for (unsigned I = 1, E = MI.getNumOperands(); I < E; ++I) { - const auto &Op = MI.getOperand(I); - if (Op.isReg() && Op.getReg() == BaseReg) - return true; - } 
- return false; -} - +void initLLVMToCVRegMapping(MCRegisterInfo *MRI); + +bool isPredicated(const MCInst &MI, const MCInstrInfo *MCII); +bool isCPSRDefined(const MCInst &MI, const MCInstrInfo *MCII); + +template<class Inst> +bool isLDMBaseRegInList(const Inst &MI) { + auto BaseReg = MI.getOperand(0).getReg(); + for (unsigned I = 1, E = MI.getNumOperands(); I < E; ++I) { + const auto &Op = MI.getOperand(I); + if (Op.isReg() && Op.getReg() == BaseReg) + return true; + } + return false; +} + /// Create a ARM MCSubtargetInfo instance. This is exposed so Asm parser, etc. /// do not need to go through TargetRegistry. MCSubtargetInfo *createARMMCSubtargetInfo(const Triple &TT, StringRef CPU, diff --git a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ya.make b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ya.make index b92b47d057..0256e1fdac 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ya.make +++ b/contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc/ya.make @@ -12,20 +12,20 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/BinaryFormat - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/MC/MCDisassembler - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/ARM/TargetInfo - contrib/libs/llvm12/lib/Target/ARM/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/BinaryFormat + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/MC/MCDisassembler + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/ARM/TargetInfo + contrib/libs/llvm12/lib/Target/ARM/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/ARM/MVEGatherScatterLowering.cpp b/contrib/libs/llvm12/lib/Target/ARM/MVEGatherScatterLowering.cpp index 56823735e2..0b6cdee512 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -44,10 +44,10 @@ using namespace llvm; -#define DEBUG_TYPE "arm-mve-gather-scatter-lowering" +#define DEBUG_TYPE "arm-mve-gather-scatter-lowering" cl::opt<bool> EnableMaskedGatherScatters( - "enable-arm-maskedgatscat", cl::Hidden, cl::init(true), + "enable-arm-maskedgatscat", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked gathers and scatters")); namespace { @@ -84,7 +84,7 @@ private: // Check for a getelementptr and deduce base and offsets from it, on success // returning the base directly and the offsets indirectly using the Offsets // argument - Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP, + Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP, IRBuilder<> &Builder); // Compute the scale of this gather/scatter instruction int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize); @@ -132,11 +132,11 @@ private: Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr, Value *Ptr, unsigned TypeScale, IRBuilder<> &Builder); - - // Optimise the base and offsets of the given address - bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI); - // Try to fold consecutive geps together into one - Value 
*foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder); + + // Optimise the base and offsets of the given address + bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI); + // Try to fold consecutive geps together into one + Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder); // Check whether these offsets could be moved out of the loop they're in bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI); // Pushes the given add out of the loop @@ -172,49 +172,49 @@ bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements, return false; } -static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) { - // Offsets that are not of type <N x i32> are sign extended by the - // getelementptr instruction, and MVE gathers/scatters treat the offset as - // unsigned. Thus, if the element size is smaller than 32, we can only allow - // positive offsets - i.e., the offsets are not allowed to be variables we - // can't look into. - // Additionally, <N x i32> offsets have to either originate from a zext of a - // vector with element types smaller or equal the type of the gather we're - // looking at, or consist of constants that we can check are small enough - // to fit into the gather type. - // Thus we check that 0 < value < 2^TargetElemSize. - unsigned TargetElemSize = 128 / TargetElemCount; - unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType()) - ->getElementType() - ->getScalarSizeInBits(); - if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) { - Constant *ConstOff = dyn_cast<Constant>(Offsets); - if (!ConstOff) - return false; - int64_t TargetElemMaxSize = (1ULL << TargetElemSize); - auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) { - ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem); - if (!OConst) - return false; - int SExtValue = OConst->getSExtValue(); - if (SExtValue >= TargetElemMaxSize || SExtValue < 0) - return false; - return true; - }; - if (isa<FixedVectorType>(ConstOff->getType())) { - for (unsigned i = 0; i < TargetElemCount; i++) { - if (!CheckValueSize(ConstOff->getAggregateElement(i))) - return false; - } - } else { - if (!CheckValueSize(ConstOff)) - return false; - } - } - return true; -} - -Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty, +static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) { + // Offsets that are not of type <N x i32> are sign extended by the + // getelementptr instruction, and MVE gathers/scatters treat the offset as + // unsigned. Thus, if the element size is smaller than 32, we can only allow + // positive offsets - i.e., the offsets are not allowed to be variables we + // can't look into. + // Additionally, <N x i32> offsets have to either originate from a zext of a + // vector with element types smaller or equal the type of the gather we're + // looking at, or consist of constants that we can check are small enough + // to fit into the gather type. + // Thus we check that 0 < value < 2^TargetElemSize. 
+ unsigned TargetElemSize = 128 / TargetElemCount; + unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType()) + ->getElementType() + ->getScalarSizeInBits(); + if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) { + Constant *ConstOff = dyn_cast<Constant>(Offsets); + if (!ConstOff) + return false; + int64_t TargetElemMaxSize = (1ULL << TargetElemSize); + auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) { + ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem); + if (!OConst) + return false; + int SExtValue = OConst->getSExtValue(); + if (SExtValue >= TargetElemMaxSize || SExtValue < 0) + return false; + return true; + }; + if (isa<FixedVectorType>(ConstOff->getType())) { + for (unsigned i = 0; i < TargetElemCount; i++) { + if (!CheckValueSize(ConstOff->getAggregateElement(i))) + return false; + } + } else { + if (!CheckValueSize(ConstOff)) + return false; + } + } + return true; +} + +Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP, IRBuilder<> &Builder) { if (!GEP) { @@ -225,43 +225,43 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty, LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found." << " Looking at intrinsic for base + vector of offsets\n"); Value *GEPPtr = GEP->getPointerOperand(); - Offsets = GEP->getOperand(1); - if (GEPPtr->getType()->isVectorTy() || - !isa<FixedVectorType>(Offsets->getType())) + Offsets = GEP->getOperand(1); + if (GEPPtr->getType()->isVectorTy() || + !isa<FixedVectorType>(Offsets->getType())) return nullptr; - + if (GEP->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many" << " operands. Expanding.\n"); return nullptr; } Offsets = GEP->getOperand(1); - unsigned OffsetsElemCount = - cast<FixedVectorType>(Offsets->getType())->getNumElements(); + unsigned OffsetsElemCount = + cast<FixedVectorType>(Offsets->getType())->getNumElements(); // Paranoid check whether the number of parallel lanes is the same - assert(Ty->getNumElements() == OffsetsElemCount); - - ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets); - if (ZextOffs) + assert(Ty->getNumElements() == OffsetsElemCount); + + ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets); + if (ZextOffs) Offsets = ZextOffs->getOperand(0); - FixedVectorType *OffsetType = cast<FixedVectorType>(Offsets->getType()); - - // If the offsets are already being zext-ed to <N x i32>, that relieves us of - // having to make sure that they won't overflow. - if (!ZextOffs || cast<FixedVectorType>(ZextOffs->getDestTy()) - ->getElementType() - ->getScalarSizeInBits() != 32) - if (!checkOffsetSize(Offsets, OffsetsElemCount)) - return nullptr; - - // The offset sizes have been checked; if any truncating or zext-ing is - // required to fix them, do that now + FixedVectorType *OffsetType = cast<FixedVectorType>(Offsets->getType()); + + // If the offsets are already being zext-ed to <N x i32>, that relieves us of + // having to make sure that they won't overflow. 
+ if (!ZextOffs || cast<FixedVectorType>(ZextOffs->getDestTy()) + ->getElementType() + ->getScalarSizeInBits() != 32) + if (!checkOffsetSize(Offsets, OffsetsElemCount)) + return nullptr; + + // The offset sizes have been checked; if any truncating or zext-ing is + // required to fix them, do that now if (Ty != Offsets->getType()) { - if ((Ty->getElementType()->getScalarSizeInBits() < - OffsetType->getElementType()->getScalarSizeInBits())) { - Offsets = Builder.CreateTrunc(Offsets, Ty); + if ((Ty->getElementType()->getScalarSizeInBits() < + OffsetType->getElementType()->getScalarSizeInBits())) { + Offsets = Builder.CreateTrunc(Offsets, Ty); } else { - Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty)); + Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty)); } } // If none of the checks failed, return the gep's base pointer @@ -476,8 +476,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); Value *Offsets; - Value *BasePtr = - checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -617,8 +617,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset( GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); Value *Offsets; - Value *BasePtr = - checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -941,7 +941,7 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, int IncrementingBlock = -1; for (int i = 0; i < 2; i++) - if (auto *Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) + if (auto *Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) if (Op->getOpcode() == Instruction::Add && (Op->getOperand(0) == Phi || Op->getOperand(1) == Phi)) IncrementingBlock = i; @@ -960,8 +960,8 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, // Get the value that is added to/multiplied with the phi Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp); - if (IncrementPerRound->getType() != OffsSecondOperand->getType() || - !L->isLoopInvariant(OffsSecondOperand)) + if (IncrementPerRound->getType() != OffsSecondOperand->getType() || + !L->isLoopInvariant(OffsSecondOperand)) // Something has gone wrong, abort return false; @@ -1029,128 +1029,128 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, return true; } -static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, - IRBuilder<> &Builder) { - // Splat the non-vector value to a vector of the given type - if the value is - // a constant (and its value isn't too big), we can even use this opportunity - // to scale it to the size of the vector elements - auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) { - ConstantInt *Const; - if ((Const = dyn_cast<ConstantInt>(NonVectorVal)) && - VT->getElementType() != NonVectorVal->getType()) { - unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits(); - uint64_t N = Const->getZExtValue(); - if (N < (unsigned)(1 << (TargetElemSize - 1))) { - NonVectorVal = Builder.CreateVectorSplat( - VT->getNumElements(), Builder.getIntN(TargetElemSize, N)); - return; 
- } - } - NonVectorVal = - Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal); - }; - - FixedVectorType *XElType = dyn_cast<FixedVectorType>(X->getType()); - FixedVectorType *YElType = dyn_cast<FixedVectorType>(Y->getType()); - // If one of X, Y is not a vector, we have to splat it in order - // to add the two of them. - if (XElType && !YElType) { - FixSummands(XElType, Y); - YElType = cast<FixedVectorType>(Y->getType()); - } else if (YElType && !XElType) { - FixSummands(YElType, X); - XElType = cast<FixedVectorType>(X->getType()); - } - assert(XElType && YElType && "Unknown vector types"); - // Check that the summands are of compatible types - if (XElType != YElType) { - LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n"); - return nullptr; - } - - if (XElType->getElementType()->getScalarSizeInBits() != 32) { - // Check that by adding the vectors we do not accidentally - // create an overflow - Constant *ConstX = dyn_cast<Constant>(X); - Constant *ConstY = dyn_cast<Constant>(Y); - if (!ConstX || !ConstY) - return nullptr; - unsigned TargetElemSize = 128 / XElType->getNumElements(); - for (unsigned i = 0; i < XElType->getNumElements(); i++) { - ConstantInt *ConstXEl = - dyn_cast<ConstantInt>(ConstX->getAggregateElement(i)); - ConstantInt *ConstYEl = - dyn_cast<ConstantInt>(ConstY->getAggregateElement(i)); - if (!ConstXEl || !ConstYEl || - ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= - (unsigned)(1 << (TargetElemSize - 1))) - return nullptr; - } - } - - Value *Add = Builder.CreateAdd(X, Y); - - FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType()); - if (checkOffsetSize(Add, GEPType->getNumElements())) - return Add; - else - return nullptr; -} - -Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP, - Value *&Offsets, - IRBuilder<> &Builder) { - Value *GEPPtr = GEP->getPointerOperand(); - Offsets = GEP->getOperand(1); - // We only merge geps with constant offsets, because only for those - // we can make sure that we do not cause an overflow - if (!isa<Constant>(Offsets)) - return nullptr; - GetElementPtrInst *BaseGEP; - if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) { - // Merge the two geps into one - Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder); - if (!BaseBasePtr) - return nullptr; - Offsets = - CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder); - if (Offsets == nullptr) - return nullptr; - return BaseBasePtr; - } - return GEPPtr; -} - -bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB, - LoopInfo *LI) { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address); - if (!GEP) - return false; - bool Changed = false; - if (GEP->hasOneUse() && - dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) { - IRBuilder<> Builder(GEP->getContext()); - Builder.SetInsertPoint(GEP); - Builder.SetCurrentDebugLocation(GEP->getDebugLoc()); - Value *Offsets; - Value *Base = foldGEP(GEP, Offsets, Builder); - // We only want to merge the geps if there is a real chance that they can be - // used by an MVE gather; thus the offset has to have the correct size - // (always i32 if it is not of vector type) and the base has to be a - // pointer. 
- if (Offsets && Base && Base != GEP) { - PointerType *BaseType = cast<PointerType>(Base->getType()); - GetElementPtrInst *NewAddress = GetElementPtrInst::Create( - BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP); - GEP->replaceAllUsesWith(NewAddress); - GEP = NewAddress; - Changed = true; - } - } - Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI); - return Changed; -} - +static Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, + IRBuilder<> &Builder) { + // Splat the non-vector value to a vector of the given type - if the value is + // a constant (and its value isn't too big), we can even use this opportunity + // to scale it to the size of the vector elements + auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) { + ConstantInt *Const; + if ((Const = dyn_cast<ConstantInt>(NonVectorVal)) && + VT->getElementType() != NonVectorVal->getType()) { + unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits(); + uint64_t N = Const->getZExtValue(); + if (N < (unsigned)(1 << (TargetElemSize - 1))) { + NonVectorVal = Builder.CreateVectorSplat( + VT->getNumElements(), Builder.getIntN(TargetElemSize, N)); + return; + } + } + NonVectorVal = + Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal); + }; + + FixedVectorType *XElType = dyn_cast<FixedVectorType>(X->getType()); + FixedVectorType *YElType = dyn_cast<FixedVectorType>(Y->getType()); + // If one of X, Y is not a vector, we have to splat it in order + // to add the two of them. + if (XElType && !YElType) { + FixSummands(XElType, Y); + YElType = cast<FixedVectorType>(Y->getType()); + } else if (YElType && !XElType) { + FixSummands(YElType, X); + XElType = cast<FixedVectorType>(X->getType()); + } + assert(XElType && YElType && "Unknown vector types"); + // Check that the summands are of compatible types + if (XElType != YElType) { + LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n"); + return nullptr; + } + + if (XElType->getElementType()->getScalarSizeInBits() != 32) { + // Check that by adding the vectors we do not accidentally + // create an overflow + Constant *ConstX = dyn_cast<Constant>(X); + Constant *ConstY = dyn_cast<Constant>(Y); + if (!ConstX || !ConstY) + return nullptr; + unsigned TargetElemSize = 128 / XElType->getNumElements(); + for (unsigned i = 0; i < XElType->getNumElements(); i++) { + ConstantInt *ConstXEl = + dyn_cast<ConstantInt>(ConstX->getAggregateElement(i)); + ConstantInt *ConstYEl = + dyn_cast<ConstantInt>(ConstY->getAggregateElement(i)); + if (!ConstXEl || !ConstYEl || + ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= + (unsigned)(1 << (TargetElemSize - 1))) + return nullptr; + } + } + + Value *Add = Builder.CreateAdd(X, Y); + + FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType()); + if (checkOffsetSize(Add, GEPType->getNumElements())) + return Add; + else + return nullptr; +} + +Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP, + Value *&Offsets, + IRBuilder<> &Builder) { + Value *GEPPtr = GEP->getPointerOperand(); + Offsets = GEP->getOperand(1); + // We only merge geps with constant offsets, because only for those + // we can make sure that we do not cause an overflow + if (!isa<Constant>(Offsets)) + return nullptr; + GetElementPtrInst *BaseGEP; + if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) { + // Merge the two geps into one + Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder); + if (!BaseBasePtr) + return nullptr; + Offsets = + 
CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder); + if (Offsets == nullptr) + return nullptr; + return BaseBasePtr; + } + return GEPPtr; +} + +bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB, + LoopInfo *LI) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address); + if (!GEP) + return false; + bool Changed = false; + if (GEP->hasOneUse() && + dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) { + IRBuilder<> Builder(GEP->getContext()); + Builder.SetInsertPoint(GEP); + Builder.SetCurrentDebugLocation(GEP->getDebugLoc()); + Value *Offsets; + Value *Base = foldGEP(GEP, Offsets, Builder); + // We only want to merge the geps if there is a real chance that they can be + // used by an MVE gather; thus the offset has to have the correct size + // (always i32 if it is not of vector type) and the base has to be a + // pointer. + if (Offsets && Base && Base != GEP) { + PointerType *BaseType = cast<PointerType>(Base->getType()); + GetElementPtrInst *NewAddress = GetElementPtrInst::Create( + BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP); + GEP->replaceAllUsesWith(NewAddress); + GEP = NewAddress; + Changed = true; + } + } + Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI); + return Changed; +} + bool MVEGatherScatterLowering::runOnFunction(Function &F) { if (!EnableMaskedGatherScatters) return false; @@ -1166,18 +1166,18 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) { bool Changed = false; for (BasicBlock &BB : F) { - Changed |= SimplifyInstructionsInBlock(&BB); - + Changed |= SimplifyInstructionsInBlock(&BB); + for (Instruction &I : BB) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::masked_gather && - isa<FixedVectorType>(II->getType())) { + if (II && II->getIntrinsicID() == Intrinsic::masked_gather && + isa<FixedVectorType>(II->getType())) { Gathers.push_back(II); - Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI); - } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter && - isa<FixedVectorType>(II->getArgOperand(0)->getType())) { + Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI); + } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter && + isa<FixedVectorType>(II->getArgOperand(0)->getType())) { Scatters.push_back(II); - Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI); + Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI); } } } diff --git a/contrib/libs/llvm12/lib/Target/ARM/MVETailPredUtils.h b/contrib/libs/llvm12/lib/Target/ARM/MVETailPredUtils.h index 9ab5d92729..1bb23cc725 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MVETailPredUtils.h +++ b/contrib/libs/llvm12/lib/Target/ARM/MVETailPredUtils.h @@ -1,157 +1,157 @@ -//===-- MVETailPredUtils.h - Tail predication utility functions -*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains utility functions for low overhead and tail predicated -// loops, shared between the ARMLowOverheadLoops pass and anywhere else that -// needs them. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H -#define LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H - -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/TargetInstrInfo.h" - -namespace llvm { - -static inline unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) { - switch (Opcode) { - default: - llvm_unreachable("unhandled vctp opcode"); - break; - case ARM::MVE_VCTP8: - return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; - case ARM::MVE_VCTP16: - return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; - case ARM::MVE_VCTP32: - return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; - case ARM::MVE_VCTP64: - return IsDoLoop ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; - } - return 0; -} - -static inline unsigned getTailPredVectorWidth(unsigned Opcode) { - switch (Opcode) { - default: - llvm_unreachable("unhandled vctp opcode"); - case ARM::MVE_VCTP8: - return 16; - case ARM::MVE_VCTP16: - return 8; - case ARM::MVE_VCTP32: - return 4; - case ARM::MVE_VCTP64: - return 2; - } - return 0; -} - -static inline bool isVCTP(const MachineInstr *MI) { - switch (MI->getOpcode()) { - default: - break; - case ARM::MVE_VCTP8: - case ARM::MVE_VCTP16: - case ARM::MVE_VCTP32: - case ARM::MVE_VCTP64: - return true; - } - return false; -} - -static inline bool isLoopStart(MachineInstr &MI) { - return MI.getOpcode() == ARM::t2DoLoopStart || - MI.getOpcode() == ARM::t2DoLoopStartTP || - MI.getOpcode() == ARM::t2WhileLoopStart; -} - -// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a -// beq that branches to the exit branch. -inline void RevertWhileLoopStart(MachineInstr *MI, const TargetInstrInfo *TII, - unsigned BrOpc = ARM::t2Bcc) { - MachineBasicBlock *MBB = MI->getParent(); - - // Cmp - MachineInstrBuilder MIB = - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); - MIB.add(MI->getOperand(0)); - MIB.addImm(0); - MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::NoRegister); - - // Branch - MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); - MIB.add(MI->getOperand(1)); // branch target - MIB.addImm(ARMCC::EQ); // condition code - MIB.addReg(ARM::CPSR); - - MI->eraseFromParent(); -} - -inline void RevertDoLoopStart(MachineInstr *MI, const TargetInstrInfo *TII) { - MachineBasicBlock *MBB = MI->getParent(); - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::tMOVr)) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)) - .add(predOps(ARMCC::AL)); - - MI->eraseFromParent(); -} - -inline void RevertLoopDec(MachineInstr *MI, const TargetInstrInfo *TII, - bool SetFlags = false) { - MachineBasicBlock *MBB = MI->getParent(); - - MachineInstrBuilder MIB = - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); - MIB.add(MI->getOperand(0)); - MIB.add(MI->getOperand(1)); - MIB.add(MI->getOperand(2)); - MIB.addImm(ARMCC::AL); - MIB.addReg(0); - - if (SetFlags) { - MIB.addReg(ARM::CPSR); - MIB->getOperand(5).setIsDef(true); - } else - MIB.addReg(0); - - MI->eraseFromParent(); -} - -// Generate a subs, or sub and cmp, and a branch instead of an LE. 
-inline void RevertLoopEnd(MachineInstr *MI, const TargetInstrInfo *TII, - unsigned BrOpc = ARM::t2Bcc, bool SkipCmp = false) { - MachineBasicBlock *MBB = MI->getParent(); - - // Create cmp - if (!SkipCmp) { - MachineInstrBuilder MIB = - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); - MIB.add(MI->getOperand(0)); - MIB.addImm(0); - MIB.addImm(ARMCC::AL); - MIB.addReg(ARM::NoRegister); - } - - // Create bne - MachineInstrBuilder MIB = - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); - MIB.add(MI->getOperand(1)); // branch target - MIB.addImm(ARMCC::NE); // condition code - MIB.addReg(ARM::CPSR); - MI->eraseFromParent(); -} - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H +//===-- MVETailPredUtils.h - Tail predication utility functions -*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains utility functions for low overhead and tail predicated +// loops, shared between the ARMLowOverheadLoops pass and anywhere else that +// needs them. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H +#define LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H + +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +namespace llvm { + +static inline unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) { + switch (Opcode) { + default: + llvm_unreachable("unhandled vctp opcode"); + break; + case ARM::MVE_VCTP8: + return IsDoLoop ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; + case ARM::MVE_VCTP16: + return IsDoLoop ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; + case ARM::MVE_VCTP32: + return IsDoLoop ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; + case ARM::MVE_VCTP64: + return IsDoLoop ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; + } + return 0; +} + +static inline unsigned getTailPredVectorWidth(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("unhandled vctp opcode"); + case ARM::MVE_VCTP8: + return 16; + case ARM::MVE_VCTP16: + return 8; + case ARM::MVE_VCTP32: + return 4; + case ARM::MVE_VCTP64: + return 2; + } + return 0; +} + +static inline bool isVCTP(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + break; + case ARM::MVE_VCTP8: + case ARM::MVE_VCTP16: + case ARM::MVE_VCTP32: + case ARM::MVE_VCTP64: + return true; + } + return false; +} + +static inline bool isLoopStart(MachineInstr &MI) { + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2DoLoopStartTP || + MI.getOpcode() == ARM::t2WhileLoopStart; +} + +// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a +// beq that branches to the exit branch. 
+inline void RevertWhileLoopStart(MachineInstr *MI, const TargetInstrInfo *TII, + unsigned BrOpc = ARM::t2Bcc) { + MachineBasicBlock *MBB = MI->getParent(); + + // Cmp + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); + MIB.add(MI->getOperand(0)); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); + + // Branch + MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); + MIB.add(MI->getOperand(1)); // branch target + MIB.addImm(ARMCC::EQ); // condition code + MIB.addReg(ARM::CPSR); + + MI->eraseFromParent(); +} + +inline void RevertDoLoopStart(MachineInstr *MI, const TargetInstrInfo *TII) { + MachineBasicBlock *MBB = MI->getParent(); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::tMOVr)) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .add(predOps(ARMCC::AL)); + + MI->eraseFromParent(); +} + +inline void RevertLoopDec(MachineInstr *MI, const TargetInstrInfo *TII, + bool SetFlags = false) { + MachineBasicBlock *MBB = MI->getParent(); + + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); + MIB.add(MI->getOperand(0)); + MIB.add(MI->getOperand(1)); + MIB.add(MI->getOperand(2)); + MIB.addImm(ARMCC::AL); + MIB.addReg(0); + + if (SetFlags) { + MIB.addReg(ARM::CPSR); + MIB->getOperand(5).setIsDef(true); + } else + MIB.addReg(0); + + MI->eraseFromParent(); +} + +// Generate a subs, or sub and cmp, and a branch instead of an LE. +inline void RevertLoopEnd(MachineInstr *MI, const TargetInstrInfo *TII, + unsigned BrOpc = ARM::t2Bcc, bool SkipCmp = false) { + MachineBasicBlock *MBB = MI->getParent(); + + // Create cmp + if (!SkipCmp) { + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri)); + MIB.add(MI->getOperand(0)); + MIB.addImm(0); + MIB.addImm(ARMCC::AL); + MIB.addReg(ARM::NoRegister); + } + + // Create bne + MachineInstrBuilder MIB = + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); + MIB.add(MI->getOperand(1)); // branch target + MIB.addImm(ARMCC::NE); // condition code + MIB.addReg(ARM::CPSR); + MI->eraseFromParent(); +} + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_ARM_MVETAILPREDUTILS_H diff --git a/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp b/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp index cccac55952..94e71f1d60 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MVETailPredication.cpp @@ -22,13 +22,13 @@ /// The HardwareLoops pass inserts intrinsics identifying loops that the /// backend will attempt to convert into a low-overhead loop. The vectorizer is /// responsible for generating a vectorized loop in which the lanes are -/// predicated upon an get.active.lane.mask intrinsic. This pass looks at these -/// get.active.lane.mask intrinsic and attempts to convert them to VCTP -/// instructions. This will be picked up by the ARM Low-overhead loop pass later -/// in the backend, which performs the final transformation to a DLSTP or WLSTP -/// tail-predicated loop. -// -//===----------------------------------------------------------------------===// +/// predicated upon an get.active.lane.mask intrinsic. This pass looks at these +/// get.active.lane.mask intrinsic and attempts to convert them to VCTP +/// instructions. This will be picked up by the ARM Low-overhead loop pass later +/// in the backend, which performs the final transformation to a DLSTP or WLSTP +/// tail-predicated loop. 
+// +//===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMSubtarget.h" @@ -47,7 +47,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -57,8 +57,8 @@ using namespace llvm; #define DESC "Transform predicated vector loops to use MVE tail predication" cl::opt<TailPredication::Mode> EnableTailPredication( - "tail-predication", cl::desc("MVE tail-predication pass options"), - cl::init(TailPredication::Enabled), + "tail-predication", cl::desc("MVE tail-predication pass options"), + cl::init(TailPredication::Enabled), cl::values(clEnumValN(TailPredication::Disabled, "disabled", "Don't tail-predicate loops"), clEnumValN(TailPredication::EnabledNoReductions, @@ -103,18 +103,18 @@ public: bool runOnLoop(Loop *L, LPPassManager&) override; private: - /// Perform the relevant checks on the loop and convert active lane masks if - /// possible. - bool TryConvertActiveLaneMask(Value *TripCount); + /// Perform the relevant checks on the loop and convert active lane masks if + /// possible. + bool TryConvertActiveLaneMask(Value *TripCount); - /// Perform several checks on the arguments of @llvm.get.active.lane.mask - /// intrinsic. E.g., check that the loop induction variable and the element - /// count are of the form we expect, and also perform overflow checks for - /// the new expressions that are created. - bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); + /// Perform several checks on the arguments of @llvm.get.active.lane.mask + /// intrinsic. E.g., check that the loop induction variable and the element + /// count are of the form we expect, and also perform overflow checks for + /// the new expressions that are created. + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Insert the intrinsic to represent the effect of tail predication. - void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside @@ -155,7 +155,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { continue; Intrinsic::ID ID = Call->getIntrinsicID(); - if (ID == Intrinsic::start_loop_iterations || + if (ID == Intrinsic::start_loop_iterations || ID == Intrinsic::test_set_loop_iterations) return cast<IntrinsicInst>(&I); } @@ -174,23 +174,23 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { return false; } - LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); - bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); + bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); - return Changed; + return Changed; } // The active lane intrinsic has this form: // -// @llvm.get.active.lane.mask(IV, TC) +// @llvm.get.active.lane.mask(IV, TC) // // Here we perform checks that this intrinsic behaves as expected, // which means: // -// 1) Check that the TripCount (TC) belongs to this loop (originally). 
-// 2) The element count (TC) needs to be sufficiently large that the decrement -// of element counter doesn't overflow, which means that we need to prove: +// 1) Check that the TripCount (TC) belongs to this loop (originally). +// 2) The element count (TC) needs to be sufficiently large that the decrement +// of element counter doesn't overflow, which means that we need to prove: // ceil(ElementCount / VectorWidth) >= TripCount // by rounding up ElementCount up: // ((ElementCount + (VectorWidth - 1)) / VectorWidth @@ -199,122 +199,122 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { // 3) The IV must be an induction phi with an increment equal to the // vector width. bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, - Value *TripCount) { + Value *TripCount) { bool ForceTailPredication = EnableTailPredication == TailPredication::ForceEnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabled; - Value *ElemCount = ActiveLaneMask->getOperand(1); - bool Changed = false; - if (!L->makeLoopInvariant(ElemCount, Changed)) - return false; - - auto *EC= SE->getSCEV(ElemCount); - auto *TC = SE->getSCEV(TripCount); - int VectorWidth = - cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); - if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) - return false; - ConstantInt *ConstElemCount = nullptr; - - // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to - // this loop. The scalar tripcount corresponds the number of elements - // processed by the loop, so we will refer to that from this point on. - if (!SE->isLoopInvariant(EC, L)) { - LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); + Value *ElemCount = ActiveLaneMask->getOperand(1); + bool Changed = false; + if (!L->makeLoopInvariant(ElemCount, Changed)) + return false; + + auto *EC= SE->getSCEV(ElemCount); + auto *TC = SE->getSCEV(TripCount); + int VectorWidth = + cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); + if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) + return false; + ConstantInt *ConstElemCount = nullptr; + + // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to + // this loop. The scalar tripcount corresponds the number of elements + // processed by the loop, so we will refer to that from this point on. + if (!SE->isLoopInvariant(EC, L)) { + LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); return false; } - if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) { - ConstantInt *TC = dyn_cast<ConstantInt>(TripCount); - if (!TC) { - LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " - "set.loop.iterations\n"); - return false; - } - - // Calculate 2 tripcount values and check that they are consistent with - // each other. The TripCount for a predicated vector loop body is - // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we - // work it out here. - uint64_t TC1 = TC->getZExtValue(); - uint64_t TC2 = - (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; - - // If the tripcount values are inconsistent, we can't insert the VCTP and - // trigger tail-predication; keep the intrinsic as a get.active.lane.mask - // and legalize this. 
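// [Editor's note: illustrative sketch only, not part of the diff above or of
// the upstream LLVM sources.] The constant-tripcount path compares TC1, the
// iteration count carried by set.loop.iterations, against TC2, the rounded-up
// division ceil(ElementCount / VectorWidth) computed as
// (ElementCount + VectorWidth - 1) / VectorWidth. A minimal standalone
// demonstration of that arithmetic, with made-up numbers:
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t ElementCount = 1002; // scalar elements the loop processes
  const uint64_t VectorWidth = 4;     // lanes handled per vector iteration
  const uint64_t TC1 = 251;           // value set.loop.iterations would carry
  // Same rounding-up division the pass uses to compute TC2.
  const uint64_t TC2 = (ElementCount + VectorWidth - 1) / VectorWidth;
  // If TC1 and TC2 disagree, the pass refuses to insert the VCTP and keeps
  // the get.active.lane.mask intrinsic as-is.
  assert(TC1 == TC2);
  return 0;
}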
- if (TC1 != TC2) { - LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " - << TC1 << " from set.loop.iterations, and " - << TC2 << " from get.active.lane.mask\n"); - return false; - } - } else if (!ForceTailPredication) { - // 2) We need to prove that the sub expression that we create in the - // tail-predicated loop body, which calculates the remaining elements to be - // processed, is non-negative, i.e. it doesn't overflow: - // - // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0 - // - // This is true if: - // - // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth - // - // which what we will be using here. - // - auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); - // ElementCount + (VW-1): - auto *ECPlusVWMinus1 = SE->getAddExpr(EC, - SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); - - // Ceil = ElementCount + (VW-1) / VW - auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW); - - // Prevent unused variable warnings with TC - (void)TC; - LLVM_DEBUG( - dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; - dbgs() << "ARM TP: - TripCount = "; TC->dump(); - dbgs() << "ARM TP: - ElemCount = "; EC->dump(); - dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; - dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump(); - ); - - // As an example, almost all the tripcount expressions (produced by the - // vectoriser) look like this: - // - // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4) - // - // and "ElementCount + (VW-1) / VW": - // - // Ceil = ((3 + %N) /u 4) - // - // Check for equality of TC and Ceil by calculating SCEV expression - // TC - Ceil and test it for zero. - // - bool Zero = SE->getMinusSCEV( - SE->getBackedgeTakenCount(L), - SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW), - SE->getNegativeSCEV(VW)), - VW)) - ->isZero(); - - if (!Zero) { - LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); - return false; - } + if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) { + ConstantInt *TC = dyn_cast<ConstantInt>(TripCount); + if (!TC) { + LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " + "set.loop.iterations\n"); + return false; + } + + // Calculate 2 tripcount values and check that they are consistent with + // each other. The TripCount for a predicated vector loop body is + // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we + // work it out here. + uint64_t TC1 = TC->getZExtValue(); + uint64_t TC2 = + (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; + + // If the tripcount values are inconsistent, we can't insert the VCTP and + // trigger tail-predication; keep the intrinsic as a get.active.lane.mask + // and legalize this. + if (TC1 != TC2) { + LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " + << TC1 << " from set.loop.iterations, and " + << TC2 << " from get.active.lane.mask\n"); + return false; + } + } else if (!ForceTailPredication) { + // 2) We need to prove that the sub expression that we create in the + // tail-predicated loop body, which calculates the remaining elements to be + // processed, is non-negative, i.e. it doesn't overflow: + // + // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0 + // + // This is true if: + // + // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth + // + // which what we will be using here. 
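// [Editor's note: illustrative sketch only, not part of the diff or of the
// upstream LLVM sources.] For a non-constant element count the pass asks SCEV
// to prove that the loop's backedge-taken count equals
// ((Ceil * VW) - VW) / VW, where Ceil = (ElementCount + VW - 1) / VW, i.e.
// that the vector trip count is exactly Ceil. The integer identity behind
// that zero-check, spelled out in plain arithmetic (VW = 4 is an assumption
// of this sketch):
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t VW = 4; // vector width
  for (uint64_t N = 1; N <= 1000; ++N) {            // N stands in for ElementCount
    const uint64_t Ceil = (N + VW - 1) / VW;        // rounded-up trip count
    const uint64_t Reduced = (Ceil * VW - VW) / VW; // the SCEV expression above
    // The expression always reduces to Ceil - 1, which is what the
    // backedge-taken count must equal for the trip count to be Ceil.
    assert(Reduced == Ceil - 1);
  }
  return 0;
}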
+ // + auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); + // ElementCount + (VW-1): + auto *ECPlusVWMinus1 = SE->getAddExpr(EC, + SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); + + // Ceil = ElementCount + (VW-1) / VW + auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW); + + // Prevent unused variable warnings with TC + (void)TC; + LLVM_DEBUG( + dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; + dbgs() << "ARM TP: - TripCount = "; TC->dump(); + dbgs() << "ARM TP: - ElemCount = "; EC->dump(); + dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; + dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump(); + ); + + // As an example, almost all the tripcount expressions (produced by the + // vectoriser) look like this: + // + // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4) + // + // and "ElementCount + (VW-1) / VW": + // + // Ceil = ((3 + %N) /u 4) + // + // Check for equality of TC and Ceil by calculating SCEV expression + // TC - Ceil and test it for zero. + // + bool Zero = SE->getMinusSCEV( + SE->getBackedgeTakenCount(L), + SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW), + SE->getNegativeSCEV(VW)), + VW)) + ->isZero(); + + if (!Zero) { + LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); + return false; + } } - // 3) Find out if IV is an induction phi. Note that we can't use Loop + // 3) Find out if IV is an induction phi. Note that we can't use Loop // helpers here to get the induction variable, because the hardware loop is - // no longer in loopsimplify form, and also the hwloop intrinsic uses a - // different counter. Using SCEV, we check that the induction is of the + // no longer in loopsimplify form, and also the hwloop intrinsic uses a + // different counter. Using SCEV, we check that the induction is of the // form i = i + 4, where the increment must be equal to the VectorWidth. 
auto *IV = ActiveLaneMask->getOperand(0); auto *IVExpr = SE->getSCEV(IV); auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr); - + if (!AddExpr) { LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump()); return false; @@ -324,11 +324,11 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n"); return false; } - auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0)); - if (!Base || !Base->isZero()) { - LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n"); - return false; - } + auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0)); + if (!Base || !Base->isZero()) { + LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n"); + return false; + } auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1)); if (!Step) { LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: "; @@ -339,29 +339,29 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, if (VectorWidth == StepValue) return true; - LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue - << " doesn't match vector width " << VectorWidth << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue + << " doesn't match vector width " << VectorWidth << "\n"); return false; } void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, - Value *TripCount) { + Value *TripCount) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); - unsigned VectorWidth = - cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); + unsigned VectorWidth = + cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); // Insert a phi to count the number of elements processed by the loop. - Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); + Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); + Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); - // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and - // thus represent the effect of tail predication. + // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and + // thus represent the effect of tail predication. 
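// [Editor's note: rough sketch of the IR shape this rewrite produces, not part
// of the diff; the value names are invented and unrelated details are
// abbreviated.] For a <4 x i1> mask the loop body goes from
//
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %iv, i32 %ElemCount)
//
// to something like
//
//   vector.body:
//     %remaining = phi i32 [ %ElemCount, %preheader ], [ %next, %vector.body ]
//     %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %remaining)
//     ...
//     %next = sub i32 %remaining, 4   ; VectorWidth elements consumed per iteration
//
// which ARMLowOverheadLoops can later recognise when forming a DLSTP/WLSTP
// tail-predicated loop.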
Builder.SetInsertPoint(ActiveLaneMask); - ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); + ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; switch (VectorWidth) { @@ -390,36 +390,36 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } -bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { - SmallVector<IntrinsicInst *, 4> ActiveLaneMasks; - for (auto *BB : L->getBlocks()) - for (auto &I : *BB) - if (auto *Int = dyn_cast<IntrinsicInst>(&I)) - if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) - ActiveLaneMasks.push_back(Int); - - if (ActiveLaneMasks.empty()) +bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { + SmallVector<IntrinsicInst *, 4> ActiveLaneMasks; + for (auto *BB : L->getBlocks()) + for (auto &I : *BB) + if (auto *Int = dyn_cast<IntrinsicInst>(&I)) + if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) + ActiveLaneMasks.push_back(Int); + + if (ActiveLaneMasks.empty()) return false; LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - for (auto *ActiveLaneMask : ActiveLaneMasks) { + for (auto *ActiveLaneMask : ActiveLaneMasks) { LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { + if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); - InsertVCTPIntrinsic(ActiveLaneMask, TripCount); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount); } - // Remove dead instructions and now dead phis. - for (auto *II : ActiveLaneMasks) - RecursivelyDeleteTriviallyDeadInstructions(II); - for (auto I : L->blocks()) - DeleteDeadPHIs(I); + // Remove dead instructions and now dead phis. 
+ for (auto *II : ActiveLaneMasks) + RecursivelyDeleteTriviallyDeadInstructions(II); + for (auto I : L->blocks()) + DeleteDeadPHIs(I); return true; } diff --git a/contrib/libs/llvm12/lib/Target/ARM/MVEVPTBlockPass.cpp b/contrib/libs/llvm12/lib/Target/ARM/MVEVPTBlockPass.cpp index c7f451cba1..89183c16ac 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MVEVPTBlockPass.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -107,12 +107,12 @@ static bool StepOverPredicatedInstrs(MachineBasicBlock::instr_iterator &Iter, NumInstrsSteppedOver = 0; while (Iter != EndIter) { - if (Iter->isDebugInstr()) { - // Skip debug instructions - ++Iter; - continue; - } - + if (Iter->isDebugInstr()) { + // Skip debug instructions + ++Iter; + continue; + } + NextPred = getVPTInstrPredicate(*Iter, PredReg); assert(NextPred != ARMVCC::Else && "VPT block pass does not expect Else preds"); @@ -176,8 +176,8 @@ CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, LLVM_DEBUG(for (MachineBasicBlock::instr_iterator AddedInstIter = std::next(BlockBeg); AddedInstIter != Iter; ++AddedInstIter) { - if (AddedInstIter->isDebugInstr()) - continue; + if (AddedInstIter->isDebugInstr()) + continue; dbgs() << " adding: "; AddedInstIter->dump(); }); @@ -205,7 +205,7 @@ CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, if (!IsVPRDefinedOrKilledByBlock(Iter, VPNOTBlockEndIter)) break; - LLVM_DEBUG(dbgs() << " removing VPNOT: "; Iter->dump()); + LLVM_DEBUG(dbgs() << " removing VPNOT: "; Iter->dump()); // Record the new size of the block BlockSize += ElseInstCnt; @@ -219,9 +219,9 @@ CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, // Note that we are using "Iter" to iterate over the block so we can update // it at the same time. for (; Iter != VPNOTBlockEndIter; ++Iter) { - if (Iter->isDebugInstr()) - continue; - + if (Iter->isDebugInstr()) + continue; + // Find the register in which the predicate is int OpIdx = findFirstVPTPredOperandIdx(*Iter); assert(OpIdx != -1); @@ -281,27 +281,27 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { MIBuilder.add(VCMP->getOperand(1)); MIBuilder.add(VCMP->getOperand(2)); MIBuilder.add(VCMP->getOperand(3)); - - // We need to remove any kill flags between the original VCMP and the new - // insertion point. - for (MachineInstr &MII : - make_range(VCMP->getIterator(), MI->getIterator())) { - MII.clearRegisterKills(VCMP->getOperand(1).getReg(), TRI); - MII.clearRegisterKills(VCMP->getOperand(2).getReg(), TRI); - } - + + // We need to remove any kill flags between the original VCMP and the new + // insertion point. + for (MachineInstr &MII : + make_range(VCMP->getIterator(), MI->getIterator())) { + MII.clearRegisterKills(VCMP->getOperand(1).getReg(), TRI); + MII.clearRegisterKills(VCMP->getOperand(2).getReg(), TRI); + } + VCMP->eraseFromParent(); } else { MIBuilder = BuildMI(Block, MI, DL, TII->get(ARM::MVE_VPST)); MIBuilder.addImm((uint64_t)BlockMask); } - // Erase all dead instructions (VPNOT's). Do that now so that they do not - // mess with the bundle creation. - for (MachineInstr *DeadMI : DeadInstructions) - DeadMI->eraseFromParent(); - DeadInstructions.clear(); - + // Erase all dead instructions (VPNOT's). Do that now so that they do not + // mess with the bundle creation. 
+ for (MachineInstr *DeadMI : DeadInstructions) + DeadMI->eraseFromParent(); + DeadInstructions.clear(); + finalizeBundle( Block, MachineBasicBlock::instr_iterator(MIBuilder.getInstr()), MBIter); diff --git a/contrib/libs/llvm12/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/contrib/libs/llvm12/lib/Target/ARM/MVEVPTOptimisationsPass.cpp index 00e4449769..70fb8c5383 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/MVEVPTOptimisationsPass.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/MVEVPTOptimisationsPass.cpp @@ -6,28 +6,28 @@ // //===----------------------------------------------------------------------===// // -/// \file This pass does a few optimisations related to Tail predicated loops -/// and MVE VPT blocks before register allocation is performed. For VPT blocks -/// the goal is to maximize the sizes of the blocks that will be created by the -/// MVE VPT Block Insertion pass (which runs after register allocation). For -/// tail predicated loops we transform the loop into something that will -/// hopefully make the backend ARMLowOverheadLoops pass's job easier. -/// +/// \file This pass does a few optimisations related to Tail predicated loops +/// and MVE VPT blocks before register allocation is performed. For VPT blocks +/// the goal is to maximize the sizes of the blocks that will be created by the +/// MVE VPT Block Insertion pass (which runs after register allocation). For +/// tail predicated loops we transform the loop into something that will +/// hopefully make the backend ARMLowOverheadLoops pass's job easier. +/// //===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMBaseInfo.h" -#include "MVETailPredUtils.h" +#include "MVETailPredUtils.h" #include "Thumb2InstrInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/InitializePasses.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include <cassert> @@ -35,11 +35,11 @@ using namespace llvm; #define DEBUG_TYPE "arm-mve-vpt-opts" -static cl::opt<bool> -MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden, - cl::desc("Enable merging Loop End and Dec instructions."), - cl::init(true)); - +static cl::opt<bool> +MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden, + cl::desc("Enable merging Loop End and Dec instructions."), + cl::init(true)); + namespace { class MVEVPTOptimisations : public MachineFunctionPass { public: @@ -51,315 +51,315 @@ public: bool runOnMachineFunction(MachineFunction &Fn) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineLoopInfo>(); - AU.addPreserved<MachineLoopInfo>(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); - MachineFunctionPass::getAnalysisUsage(AU); - } - + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + StringRef getPassName() const override { - return "ARM MVE TailPred and VPT Optimisation Pass"; + return "ARM MVE TailPred and VPT 
Optimisation Pass"; } private: - bool MergeLoopEnd(MachineLoop *ML); - bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT); + bool MergeLoopEnd(MachineLoop *ML); + bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT); MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User, Register Target); bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB); bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB); - bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT); - bool ConvertVPSEL(MachineBasicBlock &MBB); + bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT); + bool ConvertVPSEL(MachineBasicBlock &MBB); }; char MVEVPTOptimisations::ID = 0; } // end anonymous namespace -INITIALIZE_PASS_BEGIN(MVEVPTOptimisations, DEBUG_TYPE, - "ARM MVE TailPred and VPT Optimisations pass", false, - false) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(MVEVPTOptimisations, DEBUG_TYPE, - "ARM MVE TailPred and VPT Optimisations pass", false, false) - -static MachineInstr *LookThroughCOPY(MachineInstr *MI, - MachineRegisterInfo *MRI) { - while (MI && MI->getOpcode() == TargetOpcode::COPY && - MI->getOperand(1).getReg().isVirtual()) - MI = MRI->getVRegDef(MI->getOperand(1).getReg()); - return MI; -} - -// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and -// corresponding PHI that make up a low overhead loop. Only handles 'do' loops -// at the moment, returning a t2DoLoopStart in LoopStart. -static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI, - MachineInstr *&LoopStart, MachineInstr *&LoopPhi, - MachineInstr *&LoopDec, MachineInstr *&LoopEnd) { - MachineBasicBlock *Header = ML->getHeader(); - MachineBasicBlock *Latch = ML->getLoopLatch(); - if (!Header || !Latch) { - LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n"); - return false; - } - - // Find the loop end from the terminators. - LoopEnd = nullptr; - for (auto &T : Latch->terminators()) { - if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) { - LoopEnd = &T; - break; - } - if (T.getOpcode() == ARM::t2LoopEndDec && - T.getOperand(2).getMBB() == Header) { - LoopEnd = &T; - break; - } - } - if (!LoopEnd) { - LLVM_DEBUG(dbgs() << " no LoopEnd\n"); - return false; - } - LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd); - - // Find the dec from the use of the end. There may be copies between - // instructions. We expect the loop to loop like: - // $vs = t2DoLoopStart ... - // loop: - // $vp = phi [ $vs ], [ $vd ] - // ... - // $vd = t2LoopDec $vp - // ... 
- // t2LoopEnd $vd, loop - if (LoopEnd->getOpcode() == ARM::t2LoopEndDec) - LoopDec = LoopEnd; - else { - LoopDec = - LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI); - if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { - LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n"); - return false; - } - } - LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec); - - LoopPhi = - LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI); - if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI || - LoopPhi->getNumOperands() != 5 || - (LoopPhi->getOperand(2).getMBB() != Latch && - LoopPhi->getOperand(4).getMBB() != Latch)) { - LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n"); - return false; - } - LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi); - - Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch - ? LoopPhi->getOperand(3).getReg() - : LoopPhi->getOperand(1).getReg(); - LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI); - if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) { - LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n"); - return false; - } - LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart); - - return true; -} - -// This function converts loops with t2LoopEnd and t2LoopEnd instructions into -// a single t2LoopEndDec instruction. To do that it needs to make sure that LR -// will be valid to be used for the low overhead loop, which means nothing else -// is using LR (especially calls) and there are no superfluous copies in the -// loop. The t2LoopEndDec is a branching terminator that produces a value (the -// decrement) around the loop edge, which means we need to be careful that they -// will be valid to allocate without any spilling. -bool MVEVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { - if (!MergeEndDec) - return false; - - LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName() - << "\n"); - - MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; - if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) - return false; - - // Check if there is an illegal instruction (a call) in the low overhead loop - // and if so revert it now before we get any further. - for (MachineBasicBlock *MBB : ML->blocks()) { - for (MachineInstr &MI : *MBB) { - if (MI.isCall()) { - LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI); - RevertDoLoopStart(LoopStart, TII); - RevertLoopDec(LoopDec, TII); - RevertLoopEnd(LoopEnd, TII); - return true; - } - } - } - - // Remove any copies from the loop, to ensure the phi that remains is both - // simpler and contains no extra uses. Because t2LoopEndDec is a terminator - // that cannot spill, we need to be careful what remains in the loop. - Register PhiReg = LoopPhi->getOperand(0).getReg(); - Register DecReg = LoopDec->getOperand(0).getReg(); - Register StartReg = LoopStart->getOperand(0).getReg(); - // Ensure the uses are expected, and collect any copies we want to remove. 
- SmallVector<MachineInstr *, 4> Copies; - auto CheckUsers = [&Copies](Register BaseReg, - ArrayRef<MachineInstr *> ExpectedUsers, - MachineRegisterInfo *MRI) { - SmallVector<Register, 4> Worklist; - Worklist.push_back(BaseReg); - while (!Worklist.empty()) { - Register Reg = Worklist.pop_back_val(); - for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { - if (count(ExpectedUsers, &MI)) - continue; - if (MI.getOpcode() != TargetOpcode::COPY || - !MI.getOperand(0).getReg().isVirtual()) { - LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI); - return false; - } - Worklist.push_back(MI.getOperand(0).getReg()); - Copies.push_back(&MI); - } - } - return true; - }; - if (!CheckUsers(PhiReg, {LoopDec}, MRI) || - !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) || - !CheckUsers(StartReg, {LoopPhi}, MRI)) - return false; - - MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass); - MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass); - MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass); - - if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) { - LoopPhi->getOperand(3).setReg(StartReg); - LoopPhi->getOperand(1).setReg(DecReg); - } else { - LoopPhi->getOperand(1).setReg(StartReg); - LoopPhi->getOperand(3).setReg(DecReg); - } - - // Replace the loop dec and loop end as a single instruction. - MachineInstrBuilder MI = - BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(), - TII->get(ARM::t2LoopEndDec), DecReg) - .addReg(PhiReg) - .add(LoopEnd->getOperand(1)); - (void)MI; - LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr()); - - LoopDec->eraseFromParent(); - LoopEnd->eraseFromParent(); - for (auto *MI : Copies) - MI->eraseFromParent(); - return true; -} - -// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP -// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP -// instruction, making the backend ARMLowOverheadLoops passes job of finding the -// VCTP operand much simpler. -bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, - MachineDominatorTree *DT) { - LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop " - << ML->getHeader()->getName() << "\n"); - - // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's - // in the loop. - MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; - if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) - return false; - if (LoopDec != LoopEnd) - return false; - - SmallVector<MachineInstr *, 4> VCTPs; - for (MachineBasicBlock *BB : ML->blocks()) - for (MachineInstr &MI : *BB) - if (isVCTP(&MI)) - VCTPs.push_back(&MI); - - if (VCTPs.empty()) { - LLVM_DEBUG(dbgs() << " no VCTPs\n"); - return false; - } - - // Check all VCTPs are the same. - MachineInstr *FirstVCTP = *VCTPs.begin(); - for (MachineInstr *VCTP : VCTPs) { - LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP); - if (VCTP->getOpcode() != FirstVCTP->getOpcode() || - VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) { - LLVM_DEBUG(dbgs() << " VCTP's are not identical\n"); - return false; - } - } - - // Check for the register being used can be setup before the loop. We expect - // this to be: - // $vx = ... - // loop: - // $vp = PHI [ $vx ], [ $vd ] - // .. - // $vpr = VCTP $vp - // .. - // $vd = t2SUBri $vp, #n - // .. 
- Register CountReg = FirstVCTP->getOperand(1).getReg(); - if (!CountReg.isVirtual()) { - LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n"); - return false; - } - MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI); - if (!Phi || Phi->getOpcode() != TargetOpcode::PHI || - Phi->getNumOperands() != 5 || - (Phi->getOperand(2).getMBB() != ML->getLoopLatch() && - Phi->getOperand(4).getMBB() != ML->getLoopLatch())) { - LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n"); - return false; - } - CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch() - ? Phi->getOperand(3).getReg() - : Phi->getOperand(1).getReg(); - - // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of - // the preheader and add the new CountReg to it. We attempt to place it late - // in the preheader, but may need to move that earlier based on uses. - MachineBasicBlock *MBB = LoopStart->getParent(); - MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator(); - for (MachineInstr &Use : - MRI->use_instructions(LoopStart->getOperand(0).getReg())) - if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) || - !DT->dominates(ML->getHeader(), Use.getParent())) { - LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n"); - return false; - } - - MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), - TII->get(ARM::t2DoLoopStartTP)) - .add(LoopStart->getOperand(0)) - .add(LoopStart->getOperand(1)) - .addReg(CountReg); - (void)MI; - LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with " - << *MI.getInstr()); - MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass); - LoopStart->eraseFromParent(); - - return true; -} - +INITIALIZE_PASS_BEGIN(MVEVPTOptimisations, DEBUG_TYPE, + "ARM MVE TailPred and VPT Optimisations pass", false, + false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(MVEVPTOptimisations, DEBUG_TYPE, + "ARM MVE TailPred and VPT Optimisations pass", false, false) + +static MachineInstr *LookThroughCOPY(MachineInstr *MI, + MachineRegisterInfo *MRI) { + while (MI && MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getReg().isVirtual()) + MI = MRI->getVRegDef(MI->getOperand(1).getReg()); + return MI; +} + +// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and +// corresponding PHI that make up a low overhead loop. Only handles 'do' loops +// at the moment, returning a t2DoLoopStart in LoopStart. +static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI, + MachineInstr *&LoopStart, MachineInstr *&LoopPhi, + MachineInstr *&LoopDec, MachineInstr *&LoopEnd) { + MachineBasicBlock *Header = ML->getHeader(); + MachineBasicBlock *Latch = ML->getLoopLatch(); + if (!Header || !Latch) { + LLVM_DEBUG(dbgs() << " no Loop Latch or Header\n"); + return false; + } + + // Find the loop end from the terminators. + LoopEnd = nullptr; + for (auto &T : Latch->terminators()) { + if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) { + LoopEnd = &T; + break; + } + if (T.getOpcode() == ARM::t2LoopEndDec && + T.getOperand(2).getMBB() == Header) { + LoopEnd = &T; + break; + } + } + if (!LoopEnd) { + LLVM_DEBUG(dbgs() << " no LoopEnd\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop end: " << *LoopEnd); + + // Find the dec from the use of the end. There may be copies between + // instructions. We expect the loop to loop like: + // $vs = t2DoLoopStart ... + // loop: + // $vp = phi [ $vs ], [ $vd ] + // ... 
+ // $vd = t2LoopDec $vp + // ... + // t2LoopEnd $vd, loop + if (LoopEnd->getOpcode() == ARM::t2LoopEndDec) + LoopDec = LoopEnd; + else { + LoopDec = + LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI); + if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) { + LLVM_DEBUG(dbgs() << " didn't find LoopDec where we expected!\n"); + return false; + } + } + LLVM_DEBUG(dbgs() << " found loop dec: " << *LoopDec); + + LoopPhi = + LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI); + if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI || + LoopPhi->getNumOperands() != 5 || + (LoopPhi->getOperand(2).getMBB() != Latch && + LoopPhi->getOperand(4).getMBB() != Latch)) { + LLVM_DEBUG(dbgs() << " didn't find PHI where we expected!\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop phi: " << *LoopPhi); + + Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch + ? LoopPhi->getOperand(3).getReg() + : LoopPhi->getOperand(1).getReg(); + LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI); + if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) { + LLVM_DEBUG(dbgs() << " didn't find Start where we expected!\n"); + return false; + } + LLVM_DEBUG(dbgs() << " found loop start: " << *LoopStart); + + return true; +} + +// This function converts loops with t2LoopEnd and t2LoopEnd instructions into +// a single t2LoopEndDec instruction. To do that it needs to make sure that LR +// will be valid to be used for the low overhead loop, which means nothing else +// is using LR (especially calls) and there are no superfluous copies in the +// loop. The t2LoopEndDec is a branching terminator that produces a value (the +// decrement) around the loop edge, which means we need to be careful that they +// will be valid to allocate without any spilling. +bool MVEVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { + if (!MergeEndDec) + return false; + + LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName() + << "\n"); + + MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; + if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) + return false; + + // Check if there is an illegal instruction (a call) in the low overhead loop + // and if so revert it now before we get any further. + for (MachineBasicBlock *MBB : ML->blocks()) { + for (MachineInstr &MI : *MBB) { + if (MI.isCall()) { + LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI); + RevertDoLoopStart(LoopStart, TII); + RevertLoopDec(LoopDec, TII); + RevertLoopEnd(LoopEnd, TII); + return true; + } + } + } + + // Remove any copies from the loop, to ensure the phi that remains is both + // simpler and contains no extra uses. Because t2LoopEndDec is a terminator + // that cannot spill, we need to be careful what remains in the loop. + Register PhiReg = LoopPhi->getOperand(0).getReg(); + Register DecReg = LoopDec->getOperand(0).getReg(); + Register StartReg = LoopStart->getOperand(0).getReg(); + // Ensure the uses are expected, and collect any copies we want to remove. 
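// [Editor's note: sketch of the intended rewrite, not part of the diff; the
// operand shapes are paraphrased from the comments and BuildMI calls nearby.]
// Before MergeLoopEnd the loop edge carries two instructions:
//
//   %vd:gprlr = t2LoopDec %vp:gprlr, 1
//   t2LoopEnd %vd, %loop.header
//
// Afterwards they are folded into the single branching terminator
//
//   %vd:gprlr = t2LoopEndDec %vp:gprlr, %loop.header
//
// which the later ARMLowOverheadLoops pass is expected to lower to an LE-style
// low-overhead loop.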
+ SmallVector<MachineInstr *, 4> Copies; + auto CheckUsers = [&Copies](Register BaseReg, + ArrayRef<MachineInstr *> ExpectedUsers, + MachineRegisterInfo *MRI) { + SmallVector<Register, 4> Worklist; + Worklist.push_back(BaseReg); + while (!Worklist.empty()) { + Register Reg = Worklist.pop_back_val(); + for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { + if (count(ExpectedUsers, &MI)) + continue; + if (MI.getOpcode() != TargetOpcode::COPY || + !MI.getOperand(0).getReg().isVirtual()) { + LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI); + return false; + } + Worklist.push_back(MI.getOperand(0).getReg()); + Copies.push_back(&MI); + } + } + return true; + }; + if (!CheckUsers(PhiReg, {LoopDec}, MRI) || + !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) || + !CheckUsers(StartReg, {LoopPhi}, MRI)) + return false; + + MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass); + MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass); + MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass); + + if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) { + LoopPhi->getOperand(3).setReg(StartReg); + LoopPhi->getOperand(1).setReg(DecReg); + } else { + LoopPhi->getOperand(1).setReg(StartReg); + LoopPhi->getOperand(3).setReg(DecReg); + } + + // Replace the loop dec and loop end as a single instruction. + MachineInstrBuilder MI = + BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(), + TII->get(ARM::t2LoopEndDec), DecReg) + .addReg(PhiReg) + .add(LoopEnd->getOperand(1)); + (void)MI; + LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr()); + + LoopDec->eraseFromParent(); + LoopEnd->eraseFromParent(); + for (auto *MI : Copies) + MI->eraseFromParent(); + return true; +} + +// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP +// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP +// instruction, making the backend ARMLowOverheadLoops passes job of finding the +// VCTP operand much simpler. +bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML, + MachineDominatorTree *DT) { + LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop " + << ML->getHeader()->getName() << "\n"); + + // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's + // in the loop. + MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec; + if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd)) + return false; + if (LoopDec != LoopEnd) + return false; + + SmallVector<MachineInstr *, 4> VCTPs; + for (MachineBasicBlock *BB : ML->blocks()) + for (MachineInstr &MI : *BB) + if (isVCTP(&MI)) + VCTPs.push_back(&MI); + + if (VCTPs.empty()) { + LLVM_DEBUG(dbgs() << " no VCTPs\n"); + return false; + } + + // Check all VCTPs are the same. + MachineInstr *FirstVCTP = *VCTPs.begin(); + for (MachineInstr *VCTP : VCTPs) { + LLVM_DEBUG(dbgs() << " with VCTP " << *VCTP); + if (VCTP->getOpcode() != FirstVCTP->getOpcode() || + VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) { + LLVM_DEBUG(dbgs() << " VCTP's are not identical\n"); + return false; + } + } + + // Check for the register being used can be setup before the loop. We expect + // this to be: + // $vx = ... + // loop: + // $vp = PHI [ $vx ], [ $vd ] + // .. + // $vpr = VCTP $vp + // .. + // $vd = t2SUBri $vp, #n + // .. 
+ Register CountReg = FirstVCTP->getOperand(1).getReg(); + if (!CountReg.isVirtual()) { + LLVM_DEBUG(dbgs() << " cannot determine VCTP PHI\n"); + return false; + } + MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI); + if (!Phi || Phi->getOpcode() != TargetOpcode::PHI || + Phi->getNumOperands() != 5 || + (Phi->getOperand(2).getMBB() != ML->getLoopLatch() && + Phi->getOperand(4).getMBB() != ML->getLoopLatch())) { + LLVM_DEBUG(dbgs() << " cannot determine VCTP Count\n"); + return false; + } + CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch() + ? Phi->getOperand(3).getReg() + : Phi->getOperand(1).getReg(); + + // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of + // the preheader and add the new CountReg to it. We attempt to place it late + // in the preheader, but may need to move that earlier based on uses. + MachineBasicBlock *MBB = LoopStart->getParent(); + MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator(); + for (MachineInstr &Use : + MRI->use_instructions(LoopStart->getOperand(0).getReg())) + if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) || + !DT->dominates(ML->getHeader(), Use.getParent())) { + LLVM_DEBUG(dbgs() << " InsertPt could not be a terminator!\n"); + return false; + } + + MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), + TII->get(ARM::t2DoLoopStartTP)) + .add(LoopStart->getOperand(0)) + .add(LoopStart->getOperand(1)) + .addReg(CountReg); + (void)MI; + LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with " + << *MI.getInstr()); + MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass); + LoopStart->eraseFromParent(); + + return true; +} + // Returns true if Opcode is any VCMP Opcode. static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; } @@ -650,7 +650,7 @@ bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) { } for (MachineInstr *DeadInstruction : DeadInstructions) - DeadInstruction->eraseFromParent(); + DeadInstruction->eraseFromParent(); return Modified; } @@ -724,160 +724,160 @@ bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) { } for (MachineInstr *DeadInstruction : DeadInstructions) - DeadInstruction->eraseFromParent(); - - return !DeadInstructions.empty(); -} - -bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, - MachineDominatorTree *DT) { - // Scan through the block, looking for instructions that use constants moves - // into VPR that are the negative of one another. These are expected to be - // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant - // mask is kept it or and VPNOT's of it are added or reused as we scan through - // the function. - unsigned LastVPTImm = 0; - Register LastVPTReg = 0; - SmallSet<MachineInstr *, 4> DeadInstructions; - - for (MachineInstr &Instr : MBB.instrs()) { - // Look for predicated MVE instructions. - int PIdx = llvm::findFirstVPTPredOperandIdx(Instr); - if (PIdx == -1) - continue; - Register VPR = Instr.getOperand(PIdx + 1).getReg(); - if (!VPR.isVirtual()) - continue; - - // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr. - MachineInstr *Copy = MRI->getVRegDef(VPR); - if (!Copy || Copy->getOpcode() != TargetOpcode::COPY || - !Copy->getOperand(1).getReg().isVirtual() || - MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) { - LastVPTReg = 0; - continue; - } - Register GPR = Copy->getOperand(1).getReg(); - - // Find the Immediate used by the copy. 
- auto getImm = [&](Register GPR) -> unsigned { - MachineInstr *Def = MRI->getVRegDef(GPR); - if (Def && (Def->getOpcode() == ARM::t2MOVi || - Def->getOpcode() == ARM::t2MOVi16)) - return Def->getOperand(1).getImm(); - return -1U; - }; - unsigned Imm = getImm(GPR); - if (Imm == -1U) { - LastVPTReg = 0; - continue; - } - - unsigned NotImm = ~Imm & 0xffff; - if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) { - Instr.getOperand(PIdx + 1).setReg(LastVPTReg); - if (MRI->use_empty(VPR)) { - DeadInstructions.insert(Copy); - if (MRI->hasOneUse(GPR)) - DeadInstructions.insert(MRI->getVRegDef(GPR)); - } - LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr); - } else if (LastVPTReg != 0 && LastVPTImm == NotImm) { - // We have found the not of a previous constant. Create a VPNot of the - // earlier predicate reg and use it instead of the copy. - Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass); - auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(), - TII->get(ARM::MVE_VPNOT), NewVPR) - .addReg(LastVPTReg); - addUnpredicatedMveVpredNOp(VPNot); - - // Use the new register and check if the def is now dead. - Instr.getOperand(PIdx + 1).setReg(NewVPR); - if (MRI->use_empty(VPR)) { - DeadInstructions.insert(Copy); - if (MRI->hasOneUse(GPR)) - DeadInstructions.insert(MRI->getVRegDef(GPR)); - } - LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at " - << Instr); - VPR = NewVPR; - } - - LastVPTImm = Imm; - LastVPTReg = VPR; - } - - for (MachineInstr *DI : DeadInstructions) - DI->eraseFromParent(); - - return !DeadInstructions.empty(); -} - -// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a -// somewhat blunt approximation to allow tail predicated with vpsel -// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly -// different semantics under tail predication. Until that is modelled we just -// convert to a VMOVT (via a predicated VORR) instead. -bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { - bool HasVCTP = false; - SmallVector<MachineInstr *, 4> DeadInstructions; - - for (MachineInstr &MI : MBB.instrs()) { - if (isVCTP(&MI)) { - HasVCTP = true; - continue; - } - - if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL) - continue; - - MachineInstrBuilder MIBuilder = - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR)) - .add(MI.getOperand(0)) - .add(MI.getOperand(1)) - .add(MI.getOperand(1)) - .addImm(ARMVCC::Then) - .add(MI.getOperand(4)) - .add(MI.getOperand(2)); - // Silence unused variable warning in release builds. - (void)MIBuilder; - LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump(); - dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump()); - DeadInstructions.push_back(&MI); - } - - for (MachineInstr *DeadInstruction : DeadInstructions) - DeadInstruction->eraseFromParent(); + DeadInstruction->eraseFromParent(); return !DeadInstructions.empty(); } +bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, + MachineDominatorTree *DT) { + // Scan through the block, looking for instructions that use constants moves + // into VPR that are the negative of one another. These are expected to be + // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant + // mask is kept it or and VPNOT's of it are added or reused as we scan through + // the function. + unsigned LastVPTImm = 0; + Register LastVPTReg = 0; + SmallSet<MachineInstr *, 4> DeadInstructions; + + for (MachineInstr &Instr : MBB.instrs()) { + // Look for predicated MVE instructions. 
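// [Editor's note: illustrative sketch only, not part of the diff or of the
// upstream LLVM sources.] ReplaceConstByVPNOTs keys on 16-bit VPR masks that
// are bitwise complements of one another: if the previously seen predicate
// constant was Imm, a later constant equal to ~Imm & 0xffff can reuse the old
// register through a single MVE_VPNOT instead of another constant move. The
// mask arithmetic, with a made-up value:
#include <cassert>

int main() {
  const unsigned Imm = 0x5555;           // hypothetical earlier VPR mask
  const unsigned NotImm = ~Imm & 0xffff; // the same computation the pass does
  assert(NotImm == 0xAAAA);              // every predicate lane flipped
  assert((Imm ^ NotImm) == 0xffff);      // the two masks cover all 16 bits
  return 0;
}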
+ int PIdx = llvm::findFirstVPTPredOperandIdx(Instr); + if (PIdx == -1) + continue; + Register VPR = Instr.getOperand(PIdx + 1).getReg(); + if (!VPR.isVirtual()) + continue; + + // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr. + MachineInstr *Copy = MRI->getVRegDef(VPR); + if (!Copy || Copy->getOpcode() != TargetOpcode::COPY || + !Copy->getOperand(1).getReg().isVirtual() || + MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) { + LastVPTReg = 0; + continue; + } + Register GPR = Copy->getOperand(1).getReg(); + + // Find the Immediate used by the copy. + auto getImm = [&](Register GPR) -> unsigned { + MachineInstr *Def = MRI->getVRegDef(GPR); + if (Def && (Def->getOpcode() == ARM::t2MOVi || + Def->getOpcode() == ARM::t2MOVi16)) + return Def->getOperand(1).getImm(); + return -1U; + }; + unsigned Imm = getImm(GPR); + if (Imm == -1U) { + LastVPTReg = 0; + continue; + } + + unsigned NotImm = ~Imm & 0xffff; + if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) { + Instr.getOperand(PIdx + 1).setReg(LastVPTReg); + if (MRI->use_empty(VPR)) { + DeadInstructions.insert(Copy); + if (MRI->hasOneUse(GPR)) + DeadInstructions.insert(MRI->getVRegDef(GPR)); + } + LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr); + } else if (LastVPTReg != 0 && LastVPTImm == NotImm) { + // We have found the not of a previous constant. Create a VPNot of the + // earlier predicate reg and use it instead of the copy. + Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass); + auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(), + TII->get(ARM::MVE_VPNOT), NewVPR) + .addReg(LastVPTReg); + addUnpredicatedMveVpredNOp(VPNot); + + // Use the new register and check if the def is now dead. + Instr.getOperand(PIdx + 1).setReg(NewVPR); + if (MRI->use_empty(VPR)) { + DeadInstructions.insert(Copy); + if (MRI->hasOneUse(GPR)) + DeadInstructions.insert(MRI->getVRegDef(GPR)); + } + LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at " + << Instr); + VPR = NewVPR; + } + + LastVPTImm = Imm; + LastVPTReg = VPR; + } + + for (MachineInstr *DI : DeadInstructions) + DI->eraseFromParent(); + + return !DeadInstructions.empty(); +} + +// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a +// somewhat blunt approximation to allow tail predicated with vpsel +// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly +// different semantics under tail predication. Until that is modelled we just +// convert to a VMOVT (via a predicated VORR) instead. +bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { + bool HasVCTP = false; + SmallVector<MachineInstr *, 4> DeadInstructions; + + for (MachineInstr &MI : MBB.instrs()) { + if (isVCTP(&MI)) { + HasVCTP = true; + continue; + } + + if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL) + continue; + + MachineInstrBuilder MIBuilder = + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)) + .addImm(ARMVCC::Then) + .add(MI.getOperand(4)) + .add(MI.getOperand(2)); + // Silence unused variable warning in release builds. 
+ (void)MIBuilder; + LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump(); + dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump()); + DeadInstructions.push_back(&MI); + } + + for (MachineInstr *DeadInstruction : DeadInstructions) + DeadInstruction->eraseFromParent(); + + return !DeadInstructions.empty(); +} + bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { const ARMSubtarget &STI = static_cast<const ARMSubtarget &>(Fn.getSubtarget()); - if (!STI.isThumb2() || !STI.hasLOB()) + if (!STI.isThumb2() || !STI.hasLOB()) return false; TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); MRI = &Fn.getRegInfo(); - MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>(); - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>(); + MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" << "********** Function: " << Fn.getName() << '\n'); bool Modified = false; - for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) { - Modified |= MergeLoopEnd(ML); - Modified |= ConvertTailPredLoop(ML, DT); - } - + for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) { + Modified |= MergeLoopEnd(ML); + Modified |= ConvertTailPredLoop(ML, DT); + } + for (MachineBasicBlock &MBB : Fn) { - Modified |= ReplaceConstByVPNOTs(MBB, DT); + Modified |= ReplaceConstByVPNOTs(MBB, DT); Modified |= ReplaceVCMPsByVPNOTs(MBB); Modified |= ReduceOldVCCRValueUses(MBB); - Modified |= ConvertVPSEL(MBB); + Modified |= ConvertVPSEL(MBB); } LLVM_DEBUG(dbgs() << "**************************************\n"); diff --git a/contrib/libs/llvm12/lib/Target/ARM/TargetInfo/ya.make b/contrib/libs/llvm12/lib/Target/ARM/TargetInfo/ya.make index 089e7bf206..3f7fdcb6de 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/TargetInfo/ya.make +++ b/contrib/libs/llvm12/lib/Target/ARM/TargetInfo/ya.make @@ -12,13 +12,13 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/lib/Support + contrib/libs/llvm12 + contrib/libs/llvm12/lib/Support ) ADDINCL( - contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM/TargetInfo + contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM/TargetInfo ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.cpp index d728572e28..9dd389f440 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -12,7 +12,7 @@ #include "Thumb2InstrInfo.h" #include "ARMMachineFunctionInfo.h" -#include "ARMSubtarget.h" +#include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -39,11 +39,11 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden, cl::desc("Use old-style Thumb2 if-conversion heuristics"), cl::init(false)); -static cl::opt<bool> -PreferNoCSEL("prefer-no-csel", cl::Hidden, - cl::desc("Prefer predicated Move to CSEL"), - cl::init(false)); - +static cl::opt<bool> +PreferNoCSEL("prefer-no-csel", cl::Hidden, + cl::desc("Prefer predicated Move to CSEL"), + cl::init(false)); + Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} @@ -124,31 +124,31 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, return getITInstrPredicate(*MBBI, 
PredReg) == ARMCC::AL; } -MachineInstr * -Thumb2InstrInfo::optimizeSelect(MachineInstr &MI, - SmallPtrSetImpl<MachineInstr *> &SeenMIs, - bool PreferFalse) const { - // Try to use the base optimizeSelect, which uses canFoldIntoMOVCC to fold the - // MOVCC into another instruction. If that fails on 8.1-M fall back to using a - // CSEL. - MachineInstr *RV = ARMBaseInstrInfo::optimizeSelect(MI, SeenMIs, PreferFalse); - if (!RV && getSubtarget().hasV8_1MMainlineOps() && !PreferNoCSEL) { - Register DestReg = MI.getOperand(0).getReg(); - - if (!DestReg.isVirtual()) - return nullptr; - - MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - get(ARM::t2CSEL), DestReg) - .add(MI.getOperand(2)) - .add(MI.getOperand(1)) - .add(MI.getOperand(3)); - SeenMIs.insert(NewMI); - return NewMI; - } - return RV; -} - +MachineInstr * +Thumb2InstrInfo::optimizeSelect(MachineInstr &MI, + SmallPtrSetImpl<MachineInstr *> &SeenMIs, + bool PreferFalse) const { + // Try to use the base optimizeSelect, which uses canFoldIntoMOVCC to fold the + // MOVCC into another instruction. If that fails on 8.1-M fall back to using a + // CSEL. + MachineInstr *RV = ARMBaseInstrInfo::optimizeSelect(MI, SeenMIs, PreferFalse); + if (!RV && getSubtarget().hasV8_1MMainlineOps() && !PreferNoCSEL) { + Register DestReg = MI.getOperand(0).getReg(); + + if (!DestReg.isVirtual()) + return nullptr; + + MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + get(ARM::t2CSEL), DestReg) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)) + .add(MI.getOperand(3)); + SeenMIs.insert(NewMI); + return NewMI; + } + return RV; +} + void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, @@ -258,22 +258,22 @@ void Thumb2InstrInfo::expandLoadStackGuard( expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12); } -MachineInstr *Thumb2InstrInfo::commuteInstructionImpl(MachineInstr &MI, - bool NewMI, - unsigned OpIdx1, - unsigned OpIdx2) const { - switch (MI.getOpcode()) { - case ARM::MVE_VMAXNMAf16: - case ARM::MVE_VMAXNMAf32: - case ARM::MVE_VMINNMAf16: - case ARM::MVE_VMINNMAf32: - // Don't allow predicated instructions to be commuted. - if (getVPTInstrPredicate(MI) != ARMVCC::None) - return nullptr; - } - return ARMBaseInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); -} - +MachineInstr *Thumb2InstrInfo::commuteInstructionImpl(MachineInstr &MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + switch (MI.getOpcode()) { + case ARM::MVE_VMAXNMAf16: + case ARM::MVE_VMAXNMAf32: + case ARM::MVE_VMINNMAf16: + case ARM::MVE_VMINNMAf32: + // Don't allow predicated instructions to be commuted. 
+ if (getVPTInstrPredicate(MI) != ARMVCC::None) + return nullptr; + } + return ARMBaseInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); +} + void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const DebugLoc &dl, Register DestReg, diff --git a/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.h index 808167bfdc..6fda236159 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.h +++ b/contrib/libs/llvm12/lib/Target/ARM/Thumb2InstrInfo.h @@ -60,14 +60,14 @@ public: /// const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } - MachineInstr *optimizeSelect(MachineInstr &MI, - SmallPtrSetImpl<MachineInstr *> &SeenMIs, - bool) const override; - - MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, - unsigned OpIdx1, - unsigned OpIdx2) const override; - + MachineInstr *optimizeSelect(MachineInstr &MI, + SmallPtrSetImpl<MachineInstr *> &SeenMIs, + bool) const override; + + MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; }; diff --git a/contrib/libs/llvm12/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/libs/llvm12/lib/Target/ARM/Thumb2SizeReduction.cpp index 0f7e190386..a200a5cf35 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/libs/llvm12/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -43,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "thumb2-reduce-size" +#define DEBUG_TYPE "thumb2-reduce-size" #define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass" STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); diff --git a/contrib/libs/llvm12/lib/Target/ARM/Utils/ya.make b/contrib/libs/llvm12/lib/Target/ARM/Utils/ya.make index 7a980b708c..fed79316b8 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/Utils/ya.make +++ b/contrib/libs/llvm12/lib/Target/ARM/Utils/ya.make @@ -12,15 +12,15 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Support + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Support ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM/Utils + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM/Utils ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/ARM/ya.make b/contrib/libs/llvm12/lib/Target/ARM/ya.make index 9551f9f11b..7387bc4532 100644 --- a/contrib/libs/llvm12/lib/Target/ARM/ya.make +++ b/contrib/libs/llvm12/lib/Target/ARM/ya.make @@ -12,28 +12,28 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Analysis - contrib/libs/llvm12/lib/CodeGen - contrib/libs/llvm12/lib/CodeGen/AsmPrinter - contrib/libs/llvm12/lib/CodeGen/GlobalISel - contrib/libs/llvm12/lib/CodeGen/SelectionDAG - contrib/libs/llvm12/lib/IR - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target - contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc - contrib/libs/llvm12/lib/Target/ARM/TargetInfo - contrib/libs/llvm12/lib/Target/ARM/Utils - 
contrib/libs/llvm12/lib/Transforms/CFGuard - contrib/libs/llvm12/lib/Transforms/Scalar - contrib/libs/llvm12/lib/Transforms/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Analysis + contrib/libs/llvm12/lib/CodeGen + contrib/libs/llvm12/lib/CodeGen/AsmPrinter + contrib/libs/llvm12/lib/CodeGen/GlobalISel + contrib/libs/llvm12/lib/CodeGen/SelectionDAG + contrib/libs/llvm12/lib/IR + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target + contrib/libs/llvm12/lib/Target/ARM/MCTargetDesc + contrib/libs/llvm12/lib/Target/ARM/TargetInfo + contrib/libs/llvm12/lib/Target/ARM/Utils + contrib/libs/llvm12/lib/Transforms/CFGuard + contrib/libs/llvm12/lib/Transforms/Scalar + contrib/libs/llvm12/lib/Transforms/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM - contrib/libs/llvm12/lib/Target/ARM + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/ARM + contrib/libs/llvm12/lib/Target/ARM ) NO_COMPILER_WARNINGS() @@ -46,7 +46,7 @@ SRCS( ARMBaseInstrInfo.cpp ARMBaseRegisterInfo.cpp ARMBasicBlockInfo.cpp - ARMBlockPlacement.cpp + ARMBlockPlacement.cpp ARMCallLowering.cpp ARMCallingConv.cpp ARMConstantIslandPass.cpp @@ -69,7 +69,7 @@ SRCS( ARMParallelDSP.cpp ARMRegisterBankInfo.cpp ARMRegisterInfo.cpp - ARMSLSHardening.cpp + ARMSLSHardening.cpp ARMSelectionDAGInfo.cpp ARMSubtarget.cpp ARMTargetMachine.cpp |