author | shadchin <shadchin@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:44:30 +0300 |
commit | 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (patch) | |
tree | 012bb94d777798f1f56ac1cec429509766d05181 /contrib/libs/llvm12/lib/Target/AArch64 | |
parent | 6751af0b0c1b952fede40b19b71da8025b5d8bcf (diff) | |
download | ydb-2598ef1d0aee359b4b6d5fdd1758916d5907d04f.tar.gz | |
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/AArch64')
85 files changed, 14997 insertions, 14997 deletions
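
The insertion and deletion counts above match exactly, which is consistent with the commit message: the authorship restoration removes each affected line and re-adds it unchanged, so the diff below is effectively whitespace/annotation-only. A minimal sketch of how one might double-check that per file, assuming a local clone of the ydb repository with `git` on PATH (the commit hash and path filter are taken from the header above):

```python
# Sketch: verify that every file touched by this commit has matching
# insertion/deletion counts, i.e. each changed line was removed and re-added.
import subprocess

COMMIT = "2598ef1d0aee359b4b6d5fdd1758916d5907d04f"
PATH_FILTER = "contrib/libs/llvm12/lib/Target/AArch64"

# --numstat prints "added<TAB>deleted<TAB>path" per file; --format= suppresses
# the commit header so only the numstat lines remain.
out = subprocess.run(
    ["git", "show", "--numstat", "--format=", COMMIT, "--", PATH_FILTER],
    capture_output=True, text=True, check=True,
).stdout

for line in out.splitlines():
    if not line.strip():
        continue
    added, deleted, path = line.split("\t", 2)
    if added != deleted:
        print(f"asymmetric change: +{added} -{deleted} {path}")
```

If no line is printed, every file in the filtered diff was changed symmetrically, matching the "14997 insertions, 14997 deletions" summary.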
diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64.h index d2170a99e0..88d25e474e 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64.h @@ -58,10 +58,10 @@ ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, AArch64Subtarget &, AArch64RegisterBankInfo &); -FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone); -FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone); -FunctionPass *createAArch64PostLegalizerLowering(); -FunctionPass *createAArch64PostSelectOptimize(); +FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone); +FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone); +FunctionPass *createAArch64PostLegalizerLowering(); +FunctionPass *createAArch64PostSelectOptimize(); FunctionPass *createAArch64StackTaggingPass(bool IsOptNone); FunctionPass *createAArch64StackTaggingPreRAPass(); @@ -82,8 +82,8 @@ void initializeAArch64LoadStoreOptPass(PassRegistry&); void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); -void initializeAArch64PostLegalizerLoweringPass(PassRegistry &); -void initializeAArch64PostSelectOptimizePass(PassRegistry &); +void initializeAArch64PostLegalizerLoweringPass(PassRegistry &); +void initializeAArch64PostSelectOptimizePass(PassRegistry &); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64.td index 762855207d..385216a208 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64.td @@ -61,9 +61,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">; -def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", - "Enable out of line atomics to support LSE instructions">; - +def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", + "Enable out of line atomics to support LSE instructions">; + def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true", "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">; @@ -75,12 +75,12 @@ def FeatureLOR : SubtargetFeature< "lor", "HasLOR", "true", "Enables ARM v8.1 Limited Ordering Regions extension">; -def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2", - "true", "Enable RW operand CONTEXTIDR_EL2" >; - -def FeatureVH : SubtargetFeature<"vh", "HasVH", "true", - "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >; +def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2", + "true", "Enable RW operand CONTEXTIDR_EL2" >; +def FeatureVH : SubtargetFeature<"vh", "HasVH", "true", + "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >; + def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable ARMv8 PMUv3 Performance Monitors extension">; @@ -218,10 +218,10 @@ def FeatureArithmeticCbzFusion : SubtargetFeature< "arith-cbz-fusion", "HasArithmeticCbzFusion", "true", "CPU 
fuses arithmetic + cbz/cbnz operations">; -def FeatureCmpBccFusion : SubtargetFeature< - "cmp-bcc-fusion", "HasCmpBccFusion", "true", - "CPU fuses cmp+bcc operations">; - +def FeatureCmpBccFusion : SubtargetFeature< + "cmp-bcc-fusion", "HasCmpBccFusion", "true", + "CPU fuses cmp+bcc operations">; + def FeatureFuseAddress : SubtargetFeature< "fuse-address", "HasFuseAddress", "true", "CPU fuses address generation and memory operations">; @@ -265,8 +265,8 @@ def FeatureDotProd : SubtargetFeature< "dotprod", "HasDotProd", "true", "Enable dot product support">; -def FeaturePAuth : SubtargetFeature< - "pauth", "HasPAuth", "true", +def FeaturePAuth : SubtargetFeature< + "pauth", "HasPAuth", "true", "Enable v8.3-A Pointer Authentication extension">; def FeatureJS : SubtargetFeature< @@ -320,8 +320,8 @@ def FeatureTLB_RMI : SubtargetFeature< "tlb-rmi", "HasTLB_RMI", "true", "Enable v8.4-A TLB Range and Maintenance Instructions">; -def FeatureFlagM : SubtargetFeature< - "flagm", "HasFlagM", "true", +def FeatureFlagM : SubtargetFeature< + "flagm", "HasFlagM", "true", "Enable v8.4-A Flag Manipulation Instructions">; // 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset @@ -404,24 +404,24 @@ def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32", def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64", "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; -def FeatureXS : SubtargetFeature<"xs", "HasXS", - "true", "Enable Armv8.7-A limited-TLB-maintenance instruction">; - -def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT", - "true", "Enable Armv8.7-A WFET and WFIT instruction">; - -def FeatureHCX : SubtargetFeature< - "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register">; - -def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64", - "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">; - -def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE", - "true", "Enable Branch Record Buffer Extension">; - -def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF", - "true", "Enable extra register in the Statistical Profiling Extension">; - +def FeatureXS : SubtargetFeature<"xs", "HasXS", + "true", "Enable Armv8.7-A limited-TLB-maintenance instruction">; + +def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT", + "true", "Enable Armv8.7-A WFET and WFIT instruction">; + +def FeatureHCX : SubtargetFeature< + "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register">; + +def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64", + "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">; + +def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE", + "true", "Enable Branch Record Buffer Extension">; + +def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF", + "true", "Enable extra register in the Statistical Profiling Extension">; + def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", "true", "Enable fine grained virtualization traps extension">; @@ -442,14 +442,14 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>; def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", - "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePAuth, + "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePAuth, FeatureJS, FeatureCCIDX, FeatureComplxNum]>; def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true", "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, - FeatureNV, FeatureMPAM, FeatureDIT, 
+ FeatureNV, FeatureMPAM, FeatureDIT, FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI, - FeatureFlagM, FeatureRCPC_IMMO]>; + FeatureFlagM, FeatureRCPC_IMMO]>; def HasV8_5aOps : SubtargetFeature< "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", @@ -462,26 +462,26 @@ def HasV8_6aOps : SubtargetFeature< [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; -def HasV8_7aOps : SubtargetFeature< - "v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions", - [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>; - -def HasV8_0rOps : SubtargetFeature< - "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions", - [//v8.1 - FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2, - //v8.2 - FeaturePerfMon, FeatureRAS, FeaturePsUAO, FeatureSM4, - FeatureSHA3, FeatureCCPP, FeatureFullFP16, FeaturePAN_RWV, - //v8.3 - FeatureComplxNum, FeatureCCIDX, FeatureJS, - FeaturePAuth, FeatureRCPC, - //v8.4 - FeatureDotProd, FeatureFP16FML, FeatureTRACEV8_4, - FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, - //v8.5 - FeatureSSBS, FeaturePredRes, FeatureSB, FeatureSpecRestrict]>; - +def HasV8_7aOps : SubtargetFeature< + "v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions", + [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>; + +def HasV8_0rOps : SubtargetFeature< + "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions", + [//v8.1 + FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2, + //v8.2 + FeaturePerfMon, FeatureRAS, FeaturePsUAO, FeatureSM4, + FeatureSHA3, FeatureCCPP, FeatureFullFP16, FeaturePAN_RWV, + //v8.3 + FeatureComplxNum, FeatureCCIDX, FeatureJS, + FeaturePAuth, FeatureRCPC, + //v8.4 + FeatureDotProd, FeatureFP16FML, FeatureTRACEV8_4, + FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, + //v8.5 + FeatureSSBS, FeaturePredRes, FeatureSB, FeatureSpecRestrict]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -543,11 +543,11 @@ def SVEUnsupported : AArch64Unsupported { } def PAUnsupported : AArch64Unsupported { - let F = [HasPAuth]; + let F = [HasPAuth]; } include "AArch64SchedA53.td" -include "AArch64SchedA55.td" +include "AArch64SchedA55.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" include "AArch64SchedFalkor.td" @@ -557,9 +557,9 @@ include "AArch64SchedExynosM4.td" include "AArch64SchedExynosM5.td" include "AArch64SchedThunderX.td" include "AArch64SchedThunderX2T99.td" -include "AArch64SchedA64FX.td" +include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" -include "AArch64SchedTSV110.td" +include "AArch64SchedTSV110.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", [ @@ -619,9 +619,9 @@ def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", FeatureDotProd, FeatureFPARMv8, FeatureFullFP16, - FeatureFuseAddress, - FeatureFuseAES, - FeatureFuseLiterals, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseLiterals, FeatureNEON, FeatureRAS, FeatureRCPC, @@ -634,7 +634,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", FeatureCrypto, FeatureFPARMv8, FeatureFuseAES, - FeatureFuseLiterals, + FeatureFuseLiterals, FeatureNEON, FeaturePerfMon ]>; @@ -666,7 +666,7 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", 
"Cortex-A76 ARM processors", [ HasV8_2aOps, FeatureFPARMv8, - FeatureFuseAES, + FeatureFuseAES, FeatureNEON, FeatureRCPC, FeatureCrypto, @@ -678,9 +678,9 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", [ HasV8_2aOps, - FeatureCmpBccFusion, + FeatureCmpBccFusion, FeatureFPARMv8, - FeatureFuseAES, + FeatureFuseAES, FeatureNEON, FeatureRCPC, FeatureCrypto, FeatureFullFP16, @@ -691,7 +691,7 @@ def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", [ HasV8_2aOps, - FeatureCmpBccFusion, + FeatureCmpBccFusion, FeatureCrypto, FeatureFPARMv8, FeatureFuseAES, @@ -704,39 +704,39 @@ def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", FeatureSSBS, FeatureDotProd]>; -def ProcA78C : SubtargetFeature<"cortex-a78c", "ARMProcFamily", - "CortexA78C", - "Cortex-A78C ARM processors", [ - HasV8_2aOps, - FeatureCmpBccFusion, - FeatureCrypto, - FeatureDotProd, - FeatureFlagM, - FeatureFP16FML, - FeatureFPARMv8, - FeatureFullFP16, - FeatureFuseAES, - FeatureNEON, - FeaturePAuth, - FeaturePerfMon, - FeaturePostRAScheduler, - FeatureRCPC, - FeatureSPE, - FeatureSSBS]>; - -def ProcR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", - "CortexR82", - "Cortex-R82 ARM Processors", [ - FeaturePostRAScheduler, - // TODO: crypto and FuseAES - // All other features are implied by v8_0r ops: - HasV8_0rOps, - ]>; - +def ProcA78C : SubtargetFeature<"cortex-a78c", "ARMProcFamily", + "CortexA78C", + "Cortex-A78C ARM processors", [ + HasV8_2aOps, + FeatureCmpBccFusion, + FeatureCrypto, + FeatureDotProd, + FeatureFlagM, + FeatureFP16FML, + FeatureFPARMv8, + FeatureFullFP16, + FeatureFuseAES, + FeatureNEON, + FeaturePAuth, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureRCPC, + FeatureSPE, + FeatureSSBS]>; + +def ProcR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", + "CortexR82", + "Cortex-R82 ARM Processors", [ + FeaturePostRAScheduler, + // TODO: crypto and FuseAES + // All other features are implied by v8_0r ops: + HasV8_0rOps, + ]>; + def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", "Cortex-X1 ARM processors", [ HasV8_2aOps, - FeatureCmpBccFusion, + FeatureCmpBccFusion, FeatureCrypto, FeatureFPARMv8, FeatureFuseAES, @@ -758,10 +758,10 @@ def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", FeatureFullFP16, FeatureSVE, FeaturePostRAScheduler, - FeatureComplxNum, - FeatureAggressiveFMA, - FeatureArithmeticBccFusion, - FeaturePredictableSelectIsExpensive + FeatureComplxNum, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeaturePredictableSelectIsExpensive ]>; def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel", @@ -868,38 +868,38 @@ def ProcAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", HasV8_4aOps ]>; -def ProcAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", - "Apple A14", [ - FeatureAggressiveFMA, - FeatureAlternateSExtLoadCVTF32Pattern, - FeatureAltFPCmp, - FeatureArithmeticBccFusion, - FeatureArithmeticCbzFusion, - FeatureCrypto, - FeatureDisableLatencySchedHeuristic, - FeatureFPARMv8, - FeatureFRInt3264, - FeatureFuseAddress, - FeatureFuseAES, - FeatureFuseArithmeticLogic, - FeatureFuseCCSelect, - FeatureFuseCryptoEOR, - FeatureFuseLiterals, - FeatureNEON, - FeaturePerfMon, - FeatureSpecRestrict, - FeatureSSBS, - FeatureSB, - FeaturePredRes, - FeatureCacheDeepPersist, - FeatureZCRegMove, - FeatureZCZeroing, - FeatureFullFP16, - 
FeatureFP16FML, - FeatureSHA3, - HasV8_4aOps - ]>; - +def ProcAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", + "Apple A14", [ + FeatureAggressiveFMA, + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAltFPCmp, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFRInt3264, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseArithmeticLogic, + FeatureFuseCCSelect, + FeatureFuseCryptoEOR, + FeatureFuseLiterals, + FeatureNEON, + FeaturePerfMon, + FeatureSpecRestrict, + FeatureSSBS, + FeatureSB, + FeaturePredRes, + FeatureCacheDeepPersist, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + FeatureFP16FML, + FeatureSHA3, + HasV8_4aOps + ]>; + def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", [FeatureCRC, @@ -993,38 +993,38 @@ def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", FeatureSSBS, ]>; -def ProcNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", - "NeoverseN2", - "Neoverse N2 ARM processors", [ - HasV8_5aOps, - FeatureBF16, - FeatureETE, - FeatureMatMulInt8, - FeatureMTE, - FeatureSVE2, - FeatureSVE2BitPerm, - FeatureTRBE]>; - -def ProcNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", - "NeoverseV1", - "Neoverse V1 ARM processors", [ - HasV8_4aOps, - FeatureBF16, - FeatureCacheDeepPersist, - FeatureCrypto, - FeatureFPARMv8, - FeatureFP16FML, - FeatureFullFP16, - FeatureFuseAES, - FeatureMatMulInt8, - FeatureNEON, - FeaturePerfMon, - FeaturePostRAScheduler, - FeatureRandGen, - FeatureSPE, - FeatureSSBS, - FeatureSVE]>; - +def ProcNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", + "NeoverseN2", + "Neoverse N2 ARM processors", [ + HasV8_5aOps, + FeatureBF16, + FeatureETE, + FeatureMatMulInt8, + FeatureMTE, + FeatureSVE2, + FeatureSVE2BitPerm, + FeatureTRBE]>; + +def ProcNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", + "NeoverseV1", + "Neoverse V1 ARM processors", [ + HasV8_4aOps, + FeatureBF16, + FeatureCacheDeepPersist, + FeatureCrypto, + FeatureFPARMv8, + FeatureFP16FML, + FeatureFullFP16, + FeatureFuseAES, + FeatureMatMulInt8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureRandGen, + FeatureSPE, + FeatureSSBS, + FeatureSVE]>; + def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ FeatureCrypto, @@ -1065,7 +1065,7 @@ def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily", FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureLSE, - FeaturePAuth, + FeaturePAuth, FeatureUseAA, FeatureBalanceFPOps, FeaturePerfMon, @@ -1147,7 +1147,7 @@ def : ProcessorModel<"generic", NoSchedModel, [ def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : ProcessorModel<"cortex-a55", CortexA55Model, [ProcA55]>; +def : ProcessorModel<"cortex-a55", CortexA55Model, [ProcA55]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>; def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>; @@ -1158,13 +1158,13 @@ def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>; def : ProcessorModel<"cortex-a78", CortexA57Model, 
[ProcA78]>; -def : ProcessorModel<"cortex-a78c", CortexA57Model, [ProcA78C]>; -def : ProcessorModel<"cortex-r82", CortexA55Model, [ProcR82]>; +def : ProcessorModel<"cortex-a78c", CortexA57Model, [ProcA78C]>; +def : ProcessorModel<"cortex-r82", CortexA55Model, [ProcR82]>; def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; -def : ProcessorModel<"neoverse-n2", CortexA57Model, [ProcNeoverseN2]>; -def : ProcessorModel<"neoverse-v1", CortexA57Model, [ProcNeoverseV1]>; +def : ProcessorModel<"neoverse-n2", CortexA57Model, [ProcNeoverseN2]>; +def : ProcessorModel<"neoverse-v1", CortexA57Model, [ProcNeoverseV1]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>; def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>; @@ -1180,7 +1180,7 @@ def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; // Marvell ThunderX3T110 Processors. def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>; -def : ProcessorModel<"tsv110", TSV110Model, [ProcTSV110]>; +def : ProcessorModel<"tsv110", TSV110Model, [ProcTSV110]>; // Support cyclone as an alias for apple-a7 so we can still LTO old bitcode. def : ProcessorModel<"cyclone", CycloneModel, [ProcAppleA7]>; @@ -1193,7 +1193,7 @@ def : ProcessorModel<"apple-a10", CycloneModel, [ProcAppleA10]>; def : ProcessorModel<"apple-a11", CycloneModel, [ProcAppleA11]>; def : ProcessorModel<"apple-a12", CycloneModel, [ProcAppleA12]>; def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>; -def : ProcessorModel<"apple-a14", CycloneModel, [ProcAppleA14]>; +def : ProcessorModel<"apple-a14", CycloneModel, [ProcAppleA14]>; // watch CPUs. def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>; @@ -1203,7 +1203,7 @@ def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>; def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>; // Fujitsu A64FX -def : ProcessorModel<"a64fx", A64FXModel, [ProcA64FX]>; +def : ProcessorModel<"a64fx", A64FXModel, [ProcA64FX]>; // Nvidia Carmel def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index c996d2df8c..74fd2411f4 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -123,7 +123,7 @@ static bool isFPR64(unsigned Reg, unsigned SubReg, } // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 -// copy instruction. Return nullptr if the instruction is not a copy. +// copy instruction. Return nullptr if the instruction is not a copy. 
static MachineOperand *getSrcFromCopy(MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned &SubReg) { diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64AsmPrinter.cpp index a0c5498ee6..419af6785c 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -32,7 +32,7 @@ #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/FaultMaps.h" +#include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -55,7 +55,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" +#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -71,13 +71,13 @@ namespace { class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; - FaultMaps FM; + FaultMaps FM; const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), - SM(*this), FM(*this) {} + SM(*this), FM(*this) {} StringRef getPassName() const override { return "AArch64 Assembly Printer"; } @@ -92,15 +92,15 @@ public: void emitFunctionEntryLabel() override; - void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI); + void LowerJumpTableDest(MCStreamer &OutStreamer, const MachineInstr &MI); void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); - void LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerFAULTING_OP(const MachineInstr &MI); + void LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + void LowerFAULTING_OP(const MachineInstr &MI); void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI); void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); @@ -195,24 +195,24 @@ void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) { return; // Assemble feature flags that may require creation of a note section. - unsigned Flags = 0; - if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( - M.getModuleFlag("branch-target-enforcement"))) - if (BTE->getZExtValue()) - Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; + unsigned Flags = 0; + if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("branch-target-enforcement"))) + if (BTE->getZExtValue()) + Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; - if (const auto *Sign = mdconst::extract_or_null<ConstantInt>( - M.getModuleFlag("sign-return-address"))) - if (Sign->getZExtValue()) - Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; + if (const auto *Sign = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address"))) + if (Sign->getZExtValue()) + Flags |= ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; if (Flags == 0) return; // Emit a .note.gnu.property section with the flags. 
- if (auto *TS = static_cast<AArch64TargetStreamer *>( - OutStreamer->getTargetStreamer())) - TS->emitNoteSection(Flags); + if (auto *TS = static_cast<AArch64TargetStreamer *>( + OutStreamer->getTargetStreamer())) + TS->emitNoteSection(Flags); } void AArch64AsmPrinter::emitFunctionHeaderComment() { @@ -303,7 +303,7 @@ void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) { std::string SymName = "__hwasan_check_x" + utostr(Reg - AArch64::X0) + "_" + utostr(AccessInfo); if (IsShort) - SymName += "_short_v2"; + SymName += "_short_v2"; Sym = OutContext.getOrCreateSymbol(SymName); } @@ -320,7 +320,7 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { assert(TT.isOSBinFormatELF()); std::unique_ptr<MCSubtargetInfo> STI( TM.getTarget().createMCSubtargetInfo(TT.str(), "", "")); - assert(STI && "Unable to create subtarget info"); + assert(STI && "Unable to create subtarget info"); MCSymbol *HwasanTagMismatchV1Sym = OutContext.getOrCreateSymbol("__hwasan_tag_mismatch"); @@ -340,15 +340,15 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { IsShort ? HwasanTagMismatchV2Ref : HwasanTagMismatchV1Ref; MCSymbol *Sym = P.second; - bool HasMatchAllTag = - (AccessInfo >> HWASanAccessInfo::HasMatchAllShift) & 1; - uint8_t MatchAllTag = - (AccessInfo >> HWASanAccessInfo::MatchAllShift) & 0xff; - unsigned Size = - 1 << ((AccessInfo >> HWASanAccessInfo::AccessSizeShift) & 0xf); - bool CompileKernel = - (AccessInfo >> HWASanAccessInfo::CompileKernelShift) & 1; - + bool HasMatchAllTag = + (AccessInfo >> HWASanAccessInfo::HasMatchAllShift) & 1; + uint8_t MatchAllTag = + (AccessInfo >> HWASanAccessInfo::MatchAllShift) & 0xff; + unsigned Size = + 1 << ((AccessInfo >> HWASanAccessInfo::AccessSizeShift) & 0xf); + bool CompileKernel = + (AccessInfo >> HWASanAccessInfo::CompileKernelShift) & 1; + OutStreamer->SwitchSection(OutContext.getELFSection( ".text.hot", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, @@ -359,21 +359,21 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden); OutStreamer->emitLabel(Sym); - OutStreamer->emitInstruction(MCInstBuilder(AArch64::SBFMXri) + OutStreamer->emitInstruction(MCInstBuilder(AArch64::SBFMXri) .addReg(AArch64::X16) .addReg(Reg) .addImm(4) .addImm(55), *STI); OutStreamer->emitInstruction( - MCInstBuilder(AArch64::LDRBBroX) - .addReg(AArch64::W16) - .addReg(IsShort ? AArch64::X20 : AArch64::X9) - .addReg(AArch64::X16) - .addImm(0) - .addImm(0), - *STI); - OutStreamer->emitInstruction( + MCInstBuilder(AArch64::LDRBBroX) + .addReg(AArch64::W16) + .addReg(IsShort ? 
AArch64::X20 : AArch64::X9) + .addReg(AArch64::X16) + .addImm(0) + .addImm(0), + *STI); + OutStreamer->emitInstruction( MCInstBuilder(AArch64::SUBSXrs) .addReg(AArch64::XZR) .addReg(AArch64::X16) @@ -393,26 +393,26 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI); OutStreamer->emitLabel(HandleMismatchOrPartialSym); - if (HasMatchAllTag) { - OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri) - .addReg(AArch64::X16) - .addReg(Reg) - .addImm(56) - .addImm(63), - *STI); - OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSXri) - .addReg(AArch64::XZR) - .addReg(AArch64::X16) - .addImm(MatchAllTag) - .addImm(0), - *STI); - OutStreamer->emitInstruction( - MCInstBuilder(AArch64::Bcc) - .addImm(AArch64CC::EQ) - .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), - *STI); - } - + if (HasMatchAllTag) { + OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri) + .addReg(AArch64::X16) + .addReg(Reg) + .addImm(56) + .addImm(63), + *STI); + OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSXri) + .addReg(AArch64::XZR) + .addReg(AArch64::X16) + .addImm(MatchAllTag) + .addImm(0), + *STI); + OutStreamer->emitInstruction( + MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::EQ) + .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), + *STI); + } + if (IsShort) { OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWri) .addReg(AArch64::WZR) @@ -501,40 +501,40 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { .addImm(0), *STI); OutStreamer->emitInstruction( - MCInstBuilder(AArch64::MOVZXi) - .addReg(AArch64::X1) - .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask) - .addImm(0), + MCInstBuilder(AArch64::MOVZXi) + .addReg(AArch64::X1) + .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask) + .addImm(0), *STI); - - if (CompileKernel) { - // The Linux kernel's dynamic loader doesn't support GOT relative - // relocations, but it doesn't support late binding either, so just call - // the function directly. - OutStreamer->emitInstruction( - MCInstBuilder(AArch64::B).addExpr(HwasanTagMismatchRef), *STI); - } else { - // Intentionally load the GOT entry and branch to it, rather than possibly - // late binding the function, which may clobber the registers before we - // have a chance to save them. - OutStreamer->emitInstruction( - MCInstBuilder(AArch64::ADRP) - .addReg(AArch64::X16) - .addExpr(AArch64MCExpr::create( - HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE, - OutContext)), - *STI); - OutStreamer->emitInstruction( - MCInstBuilder(AArch64::LDRXui) - .addReg(AArch64::X16) - .addReg(AArch64::X16) - .addExpr(AArch64MCExpr::create( - HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12, - OutContext)), - *STI); - OutStreamer->emitInstruction( - MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI); - } + + if (CompileKernel) { + // The Linux kernel's dynamic loader doesn't support GOT relative + // relocations, but it doesn't support late binding either, so just call + // the function directly. + OutStreamer->emitInstruction( + MCInstBuilder(AArch64::B).addExpr(HwasanTagMismatchRef), *STI); + } else { + // Intentionally load the GOT entry and branch to it, rather than possibly + // late binding the function, which may clobber the registers before we + // have a chance to save them. 
+ OutStreamer->emitInstruction( + MCInstBuilder(AArch64::ADRP) + .addReg(AArch64::X16) + .addExpr(AArch64MCExpr::create( + HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE, + OutContext)), + *STI); + OutStreamer->emitInstruction( + MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X16) + .addReg(AArch64::X16) + .addExpr(AArch64MCExpr::create( + HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12, + OutContext)), + *STI); + OutStreamer->emitInstruction( + MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI); + } } } @@ -550,11 +550,11 @@ void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols); } - - // Emit stack and fault map information. + + // Emit stack and fault map information. emitStackMaps(SM); - FM.serializeToFaultMapSection(); - + FM.serializeToFaultMapSection(); + } void AArch64AsmPrinter::EmitLOHs() { @@ -647,8 +647,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterInfo *RI = STI->getRegisterInfo(); Register Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); - if (!RI->regsOverlap(RegToPrint, Reg)) - return true; + if (!RI->regsOverlap(RegToPrint, Reg)) + return true; O << AArch64InstPrinter::getRegisterName(RegToPrint, AltName); return false; } @@ -809,24 +809,24 @@ void AArch64AsmPrinter::emitJumpTableInfo() { emitAlignment(Align(Size)); OutStreamer->emitLabel(GetJTISymbol(JTI)); - const MCSymbol *BaseSym = AArch64FI->getJumpTableEntryPCRelSymbol(JTI); - const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext); - - for (auto *JTBB : JTBBs) { - const MCExpr *Value = - MCSymbolRefExpr::create(JTBB->getSymbol(), OutContext); - - // Each entry is: - // .byte/.hword (LBB - Lbase)>>2 - // or plain: - // .word LBB - Lbase - Value = MCBinaryExpr::createSub(Value, Base, OutContext); - if (Size != 4) - Value = MCBinaryExpr::createLShr( - Value, MCConstantExpr::create(2, OutContext), OutContext); - - OutStreamer->emitValue(Value, Size); - } + const MCSymbol *BaseSym = AArch64FI->getJumpTableEntryPCRelSymbol(JTI); + const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext); + + for (auto *JTBB : JTBBs) { + const MCExpr *Value = + MCSymbolRefExpr::create(JTBB->getSymbol(), OutContext); + + // Each entry is: + // .byte/.hword (LBB - Lbase)>>2 + // or plain: + // .word LBB - Lbase + Value = MCBinaryExpr::createSub(Value, Base, OutContext); + if (Size != 4) + Value = MCBinaryExpr::createLShr( + Value, MCConstantExpr::create(2, OutContext), OutContext); + + OutStreamer->emitValue(Value, Size); + } } } @@ -851,9 +851,9 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { /// /// adr xDest, .LBB0_0 /// ldrb wScratch, [xTable, xEntry] (with "lsl #1" for ldrh). 
-/// add xDest, xDest, xScratch (with "lsl #2" for smaller entries) -void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer, - const llvm::MachineInstr &MI) { +/// add xDest, xDest, xScratch (with "lsl #2" for smaller entries) +void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer, + const llvm::MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); Register ScratchReg = MI.getOperand(1).getReg(); Register ScratchRegW = @@ -861,50 +861,50 @@ void AArch64AsmPrinter::LowerJumpTableDest(llvm::MCStreamer &OutStreamer, Register TableReg = MI.getOperand(2).getReg(); Register EntryReg = MI.getOperand(3).getReg(); int JTIdx = MI.getOperand(4).getIndex(); - int Size = AArch64FI->getJumpTableEntrySize(JTIdx); + int Size = AArch64FI->getJumpTableEntrySize(JTIdx); // This has to be first because the compression pass based its reachability // calculations on the start of the JumpTableDest instruction. auto Label = MF->getInfo<AArch64FunctionInfo>()->getJumpTableEntryPCRelSymbol(JTIdx); - - // If we don't already have a symbol to use as the base, use the ADR - // instruction itself. - if (!Label) { - Label = MF->getContext().createTempSymbol(); - AArch64FI->setJumpTableEntryInfo(JTIdx, Size, Label); - OutStreamer.emitLabel(Label); - } - - auto LabelExpr = MCSymbolRefExpr::create(Label, MF->getContext()); + + // If we don't already have a symbol to use as the base, use the ADR + // instruction itself. + if (!Label) { + Label = MF->getContext().createTempSymbol(); + AArch64FI->setJumpTableEntryInfo(JTIdx, Size, Label); + OutStreamer.emitLabel(Label); + } + + auto LabelExpr = MCSymbolRefExpr::create(Label, MF->getContext()); EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR) .addReg(DestReg) - .addExpr(LabelExpr)); + .addExpr(LabelExpr)); // Load the number of instruction-steps to offset from the label. - unsigned LdrOpcode; - switch (Size) { - case 1: LdrOpcode = AArch64::LDRBBroX; break; - case 2: LdrOpcode = AArch64::LDRHHroX; break; - case 4: LdrOpcode = AArch64::LDRSWroX; break; - default: - llvm_unreachable("Unknown jump table size"); - } - + unsigned LdrOpcode; + switch (Size) { + case 1: LdrOpcode = AArch64::LDRBBroX; break; + case 2: LdrOpcode = AArch64::LDRHHroX; break; + case 4: LdrOpcode = AArch64::LDRSWroX; break; + default: + llvm_unreachable("Unknown jump table size"); + } + EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode) - .addReg(Size == 4 ? ScratchReg : ScratchRegW) + .addReg(Size == 4 ? ScratchReg : ScratchRegW) .addReg(TableReg) .addReg(EntryReg) .addImm(0) - .addImm(Size == 1 ? 0 : 1)); + .addImm(Size == 1 ? 0 : 1)); - // Add to the already materialized base label address, multiplying by 4 if - // compressed. + // Add to the already materialized base label address, multiplying by 4 if + // compressed. EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs) .addReg(DestReg) .addReg(DestReg) .addReg(ScratchReg) - .addImm(Size == 4 ? 0 : 2)); + .addImm(Size == 4 ? 
0 : 2)); } void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, @@ -982,83 +982,83 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } -void AArch64AsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - StatepointOpers SOpers(&MI); - if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { - assert(PatchBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); - for (unsigned i = 0; i < PatchBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); - } else { - // Lower call target and choose correct opcode - const MachineOperand &CallTarget = SOpers.getCallTarget(); - MCOperand CallTargetMCOp; - unsigned CallOpcode; - switch (CallTarget.getType()) { - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - MCInstLowering.lowerOperand(CallTarget, CallTargetMCOp); - CallOpcode = AArch64::BL; - break; - case MachineOperand::MO_Immediate: - CallTargetMCOp = MCOperand::createImm(CallTarget.getImm()); - CallOpcode = AArch64::BL; - break; - case MachineOperand::MO_Register: - CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); - CallOpcode = AArch64::BLR; - break; - default: - llvm_unreachable("Unsupported operand type in statepoint call target"); - break; - } - - EmitToStreamer(OutStreamer, - MCInstBuilder(CallOpcode).addOperand(CallTargetMCOp)); - } - - auto &Ctx = OutStreamer.getContext(); - MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer.emitLabel(MILabel); - SM.recordStatepoint(*MILabel, MI); -} - -void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) { - // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>, - // <opcode>, <operands> - - Register DefRegister = FaultingMI.getOperand(0).getReg(); - FaultMaps::FaultKind FK = - static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm()); - MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); - unsigned Opcode = FaultingMI.getOperand(3).getImm(); - unsigned OperandsBeginIdx = 4; - - auto &Ctx = OutStreamer->getContext(); - MCSymbol *FaultingLabel = Ctx.createTempSymbol(); - OutStreamer->emitLabel(FaultingLabel); - - assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!"); - FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel); - - MCInst MI; - MI.setOpcode(Opcode); - - if (DefRegister != (Register)0) - MI.addOperand(MCOperand::createReg(DefRegister)); - - for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx, - E = FaultingMI.operands_end(); - I != E; ++I) { - MCOperand Dest; - lowerOperand(*I, Dest); - MI.addOperand(Dest); - } - - OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); - OutStreamer->emitInstruction(MI, getSubtargetInfo()); -} - +void AArch64AsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + StatepointOpers SOpers(&MI); + if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { + assert(PatchBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + for (unsigned i = 0; i < PatchBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); + } else { + // Lower call target and choose correct opcode + const MachineOperand &CallTarget = SOpers.getCallTarget(); + MCOperand CallTargetMCOp; + unsigned CallOpcode; + switch (CallTarget.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + 
MCInstLowering.lowerOperand(CallTarget, CallTargetMCOp); + CallOpcode = AArch64::BL; + break; + case MachineOperand::MO_Immediate: + CallTargetMCOp = MCOperand::createImm(CallTarget.getImm()); + CallOpcode = AArch64::BL; + break; + case MachineOperand::MO_Register: + CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); + CallOpcode = AArch64::BLR; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } + + EmitToStreamer(OutStreamer, + MCInstBuilder(CallOpcode).addOperand(CallTargetMCOp)); + } + + auto &Ctx = OutStreamer.getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer.emitLabel(MILabel); + SM.recordStatepoint(*MILabel, MI); +} + +void AArch64AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI) { + // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>, + // <opcode>, <operands> + + Register DefRegister = FaultingMI.getOperand(0).getReg(); + FaultMaps::FaultKind FK = + static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm()); + MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); + unsigned Opcode = FaultingMI.getOperand(3).getImm(); + unsigned OperandsBeginIdx = 4; + + auto &Ctx = OutStreamer->getContext(); + MCSymbol *FaultingLabel = Ctx.createTempSymbol(); + OutStreamer->emitLabel(FaultingLabel); + + assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!"); + FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel); + + MCInst MI; + MI.setOpcode(Opcode); + + if (DefRegister != (Register)0) + MI.addOperand(MCOperand::createReg(DefRegister)); + + for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx, + E = FaultingMI.operands_end(); + I != E; ++I) { + MCOperand Dest; + lowerOperand(*I, Dest); + MI.addOperand(Dest); + } + + OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); + OutStreamer->emitInstruction(MI, getSubtargetInfo()); +} + void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { @@ -1272,28 +1272,28 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Adrp); MCInst Ldr; - if (STI->isTargetILP32()) { - Ldr.setOpcode(AArch64::LDRWui); - Ldr.addOperand(MCOperand::createReg(AArch64::W1)); - } else { - Ldr.setOpcode(AArch64::LDRXui); - Ldr.addOperand(MCOperand::createReg(AArch64::X1)); - } + if (STI->isTargetILP32()) { + Ldr.setOpcode(AArch64::LDRWui); + Ldr.addOperand(MCOperand::createReg(AArch64::W1)); + } else { + Ldr.setOpcode(AArch64::LDRXui); + Ldr.addOperand(MCOperand::createReg(AArch64::X1)); + } Ldr.addOperand(MCOperand::createReg(AArch64::X0)); Ldr.addOperand(SymTLSDescLo12); Ldr.addOperand(MCOperand::createImm(0)); EmitToStreamer(*OutStreamer, Ldr); MCInst Add; - if (STI->isTargetILP32()) { - Add.setOpcode(AArch64::ADDWri); - Add.addOperand(MCOperand::createReg(AArch64::W0)); - Add.addOperand(MCOperand::createReg(AArch64::W0)); - } else { - Add.setOpcode(AArch64::ADDXri); - Add.addOperand(MCOperand::createReg(AArch64::X0)); - Add.addOperand(MCOperand::createReg(AArch64::X0)); - } + if (STI->isTargetILP32()) { + Add.setOpcode(AArch64::ADDWri); + Add.addOperand(MCOperand::createReg(AArch64::W0)); + Add.addOperand(MCOperand::createReg(AArch64::W0)); + } else { + Add.setOpcode(AArch64::ADDXri); + Add.addOperand(MCOperand::createReg(AArch64::X0)); + Add.addOperand(MCOperand::createReg(AArch64::X0)); + } Add.addOperand(SymTLSDescLo12); 
Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0))); EmitToStreamer(*OutStreamer, Add); @@ -1313,10 +1313,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { return; } - case AArch64::JumpTableDest32: + case AArch64::JumpTableDest32: case AArch64::JumpTableDest16: case AArch64::JumpTableDest8: - LowerJumpTableDest(*OutStreamer, *MI); + LowerJumpTableDest(*OutStreamer, *MI); return; case AArch64::FMOVH0: @@ -1331,12 +1331,12 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { case TargetOpcode::PATCHPOINT: return LowerPATCHPOINT(*OutStreamer, SM, *MI); - case TargetOpcode::STATEPOINT: - return LowerSTATEPOINT(*OutStreamer, SM, *MI); - - case TargetOpcode::FAULTING_OP: - return LowerFAULTING_OP(*MI); - + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(*OutStreamer, SM, *MI); + + case TargetOpcode::FAULTING_OP: + return LowerFAULTING_OP(*MI); + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: LowerPATCHABLE_FUNCTION_ENTER(*MI); return; @@ -1381,14 +1381,14 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { return; case AArch64::SEH_SaveRegP: - if (MI->getOperand(1).getImm() == 30 && MI->getOperand(0).getImm() >= 19 && - MI->getOperand(0).getImm() <= 28) { - assert((MI->getOperand(0).getImm() - 19) % 2 == 0 && - "Register paired with LR must be odd"); - TS->EmitARM64WinCFISaveLRPair(MI->getOperand(0).getImm(), - MI->getOperand(2).getImm()); - return; - } + if (MI->getOperand(1).getImm() == 30 && MI->getOperand(0).getImm() >= 19 && + MI->getOperand(0).getImm() <= 28) { + assert((MI->getOperand(0).getImm() - 19) % 2 == 0 && + "Register paired with LR must be odd"); + TS->EmitARM64WinCFISaveLRPair(MI->getOperand(0).getImm(), + MI->getOperand(2).getImm()); + return; + } assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) && "Non-consecutive registers not allowed for save_regp"); TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(), diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64BranchTargets.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64BranchTargets.cpp index d3b5166585..12a4c8ce9d 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -16,7 +16,7 @@ // //===----------------------------------------------------------------------===// -#include "AArch64MachineFunctionInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -58,13 +58,13 @@ FunctionPass *llvm::createAArch64BranchTargetsPass() { } bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { - if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) + if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) return false; LLVM_DEBUG( dbgs() << "********** AArch64 Branch Targets **********\n" << "********** Function: " << MF.getName() << '\n'); - const Function &F = MF.getFunction(); + const Function &F = MF.getFunction(); // LLVM does not consider basic blocks which are the targets of jump tables // to be address-taken (the address can't escape anywhere else), but they are diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64CallingConvention.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64CallingConvention.cpp index c51dd48cab..ab1a31e1e7 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64CallingConvention.cpp +++ 
b/contrib/libs/llvm12/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -42,51 +42,51 @@ static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2, static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, Align SlotAlign) { - if (LocVT.isScalableVector()) { - const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( - State.getMachineFunction().getSubtarget()); - const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); - - // We are about to reinvoke the CCAssignFn auto-generated handler. If we - // don't unset these flags we will get stuck in an infinite loop forever - // invoking the custom handler. - ArgFlags.setInConsecutiveRegs(false); - ArgFlags.setInConsecutiveRegsLast(false); - - // The calling convention for passing SVE tuples states that in the event - // we cannot allocate enough registers for the tuple we should still leave - // any remaining registers unallocated. However, when we call the - // CCAssignFn again we want it to behave as if all remaining registers are - // allocated. This will force the code to pass the tuple indirectly in - // accordance with the PCS. - bool RegsAllocated[8]; - for (int I = 0; I < 8; I++) { - RegsAllocated[I] = State.isAllocated(ZRegList[I]); - State.AllocateReg(ZRegList[I]); - } - - auto &It = PendingMembers[0]; - CCAssignFn *AssignFn = - TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false); - if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full, - ArgFlags, State)) - llvm_unreachable("Call operand has unhandled type"); - - // Return the flags to how they were before. - ArgFlags.setInConsecutiveRegs(true); - ArgFlags.setInConsecutiveRegsLast(true); - - // Return the register state back to how it was before, leaving any - // unallocated registers available for other smaller types. - for (int I = 0; I < 8; I++) - if (!RegsAllocated[I]) - State.DeallocateReg(ZRegList[I]); - - // All pending members have now been allocated - PendingMembers.clear(); - return true; - } - + if (LocVT.isScalableVector()) { + const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( + State.getMachineFunction().getSubtarget()); + const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); + + // We are about to reinvoke the CCAssignFn auto-generated handler. If we + // don't unset these flags we will get stuck in an infinite loop forever + // invoking the custom handler. + ArgFlags.setInConsecutiveRegs(false); + ArgFlags.setInConsecutiveRegsLast(false); + + // The calling convention for passing SVE tuples states that in the event + // we cannot allocate enough registers for the tuple we should still leave + // any remaining registers unallocated. However, when we call the + // CCAssignFn again we want it to behave as if all remaining registers are + // allocated. This will force the code to pass the tuple indirectly in + // accordance with the PCS. + bool RegsAllocated[8]; + for (int I = 0; I < 8; I++) { + RegsAllocated[I] = State.isAllocated(ZRegList[I]); + State.AllocateReg(ZRegList[I]); + } + + auto &It = PendingMembers[0]; + CCAssignFn *AssignFn = + TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false); + if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full, + ArgFlags, State)) + llvm_unreachable("Call operand has unhandled type"); + + // Return the flags to how they were before. 
+ ArgFlags.setInConsecutiveRegs(true); + ArgFlags.setInConsecutiveRegsLast(true); + + // Return the register state back to how it was before, leaving any + // unallocated registers available for other smaller types. + for (int I = 0; I < 8; I++) + if (!RegsAllocated[I]) + State.DeallocateReg(ZRegList[I]); + + // All pending members have now been allocated + PendingMembers.clear(); + return true; + } + unsigned Size = LocVT.getSizeInBits() / 8; const Align StackAlign = State.getMachineFunction().getDataLayout().getStackAlignment(); @@ -191,11 +191,11 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } - if (!LocVT.isScalableVector()) { - // Mark all regs in the class as unavailable - for (auto Reg : RegList) - State.AllocateReg(Reg); - } + if (!LocVT.isScalableVector()) { + // Mark all regs in the class as unavailable + for (auto Reg : RegList) + State.AllocateReg(Reg); + } const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64Combine.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64Combine.td index b1e714653f..03d92b8d50 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64Combine.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64Combine.td @@ -75,68 +75,68 @@ def ext: GICombineRule < // instruction. def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>; -def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">; -def vashr_vlshr_imm : GICombineRule< - (defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo), - (match (wip_match_opcode G_ASHR, G_LSHR):$root, - [{ return matchVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]), - (apply [{ applyVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]) ->; - -def form_duplane_matchdata : - GIDefMatchData<"std::pair<unsigned, int>">; -def form_duplane : GICombineRule < - (defs root:$root, form_duplane_matchdata:$matchinfo), - (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, - [{ return matchDupLane(*${root}, MRI, ${matchinfo}); }]), - (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }]) ->; - -def adjust_icmp_imm_matchdata : - GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">; -def adjust_icmp_imm : GICombineRule < - (defs root:$root, adjust_icmp_imm_matchdata:$matchinfo), - (match (wip_match_opcode G_ICMP):$root, - [{ return matchAdjustICmpImmAndPred(*${root}, MRI, ${matchinfo}); }]), - (apply [{ applyAdjustICmpImmAndPred(*${root}, ${matchinfo}, B, Observer); }]) ->; - -def icmp_lowering : GICombineGroup<[adjust_icmp_imm]>; - -def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple<unsigned, LLT, Register>">; -def extractvecelt_pairwise_add : GICombineRule< - (defs root:$root, extractvecelt_pairwise_add_matchdata:$matchinfo), - (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root, - [{ return matchExtractVecEltPairwiseAdd(*${root}, MRI, ${matchinfo}); }]), - (apply [{ applyExtractVecEltPairwiseAdd(*${root}, MRI, B, ${matchinfo}); }]) ->; - -def mul_const_matchdata : GIDefMatchData<"std::function<void(MachineIRBuilder&, Register)>">; -def mul_const : GICombineRule< - (defs root:$root, mul_const_matchdata:$matchinfo), - (match (wip_match_opcode G_MUL):$root, - [{ return matchAArch64MulConstCombine(*${root}, MRI, ${matchinfo}); }]), - (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }]) ->; - -// Post-legalization combines which should happen at all optimization levels. -// (E.g. 
ones that facilitate matching for the selector) For example, matching -// pseudos. -def AArch64PostLegalizerLoweringHelper - : GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper", - [shuffle_vector_pseudos, vashr_vlshr_imm, - icmp_lowering, form_duplane]> { - let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule"; -} - -// Post-legalization combines which are primarily optimizations. +def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">; +def vashr_vlshr_imm : GICombineRule< + (defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo), + (match (wip_match_opcode G_ASHR, G_LSHR):$root, + [{ return matchVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]) +>; + +def form_duplane_matchdata : + GIDefMatchData<"std::pair<unsigned, int>">; +def form_duplane : GICombineRule < + (defs root:$root, form_duplane_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchDupLane(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def adjust_icmp_imm_matchdata : + GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">; +def adjust_icmp_imm : GICombineRule < + (defs root:$root, adjust_icmp_imm_matchdata:$matchinfo), + (match (wip_match_opcode G_ICMP):$root, + [{ return matchAdjustICmpImmAndPred(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyAdjustICmpImmAndPred(*${root}, ${matchinfo}, B, Observer); }]) +>; + +def icmp_lowering : GICombineGroup<[adjust_icmp_imm]>; + +def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple<unsigned, LLT, Register>">; +def extractvecelt_pairwise_add : GICombineRule< + (defs root:$root, extractvecelt_pairwise_add_matchdata:$matchinfo), + (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root, + [{ return matchExtractVecEltPairwiseAdd(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyExtractVecEltPairwiseAdd(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def mul_const_matchdata : GIDefMatchData<"std::function<void(MachineIRBuilder&, Register)>">; +def mul_const : GICombineRule< + (defs root:$root, mul_const_matchdata:$matchinfo), + (match (wip_match_opcode G_MUL):$root, + [{ return matchAArch64MulConstCombine(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }]) +>; + +// Post-legalization combines which should happen at all optimization levels. +// (E.g. ones that facilitate matching for the selector) For example, matching +// pseudos. +def AArch64PostLegalizerLoweringHelper + : GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper", + [shuffle_vector_pseudos, vashr_vlshr_imm, + icmp_lowering, form_duplane]> { + let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule"; +} + +// Post-legalization combines which are primarily optimizations. 
def AArch64PostLegalizerCombinerHelper : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", - [copy_prop, erase_undef_store, combines_for_extload, - sext_trunc_sextload, - hoist_logic_op_with_same_opcode_hands, - redundant_and, xor_of_and_with_same_reg, - extractvecelt_pairwise_add, redundant_or, - mul_const]> { + [copy_prop, erase_undef_store, combines_for_extload, + sext_trunc_sextload, + hoist_logic_op_with_same_opcode_hands, + redundant_and, xor_of_and_with_same_reg, + extractvecelt_pairwise_add, redundant_or, + mul_const]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64CompressJumpTables.cpp index 2328a8b4de..d419598aaa 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -37,14 +37,14 @@ class AArch64CompressJumpTables : public MachineFunctionPass { MachineFunction *MF; SmallVector<int, 8> BlockInfo; - /// Returns the size in instructions of the block \p MBB, or None if we - /// couldn't get a safe upper bound. - Optional<int> computeBlockSize(MachineBasicBlock &MBB); - - /// Gather information about the function, returns false if we can't perform - /// this optimization for some reason. - bool scanFunction(); - + /// Returns the size in instructions of the block \p MBB, or None if we + /// couldn't get a safe upper bound. + Optional<int> computeBlockSize(MachineBasicBlock &MBB); + + /// Gather information about the function, returns false if we can't perform + /// this optimization for some reason. + bool scanFunction(); + bool compressJumpTable(MachineInstr &MI, int Offset); public: @@ -64,27 +64,27 @@ public: } }; char AArch64CompressJumpTables::ID = 0; -} // namespace +} // namespace INITIALIZE_PASS(AArch64CompressJumpTables, DEBUG_TYPE, "AArch64 compress jump tables pass", false, false) -Optional<int> -AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) { +Optional<int> +AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) { int Size = 0; - for (const MachineInstr &MI : MBB) { - // Inline asm may contain some directives like .bytes which we don't - // currently have the ability to parse accurately. To be safe, just avoid - // computing a size and bail out. - if (MI.getOpcode() == AArch64::INLINEASM || - MI.getOpcode() == AArch64::INLINEASM_BR) - return None; + for (const MachineInstr &MI : MBB) { + // Inline asm may contain some directives like .bytes which we don't + // currently have the ability to parse accurately. To be safe, just avoid + // computing a size and bail out. 
+ if (MI.getOpcode() == AArch64::INLINEASM || + MI.getOpcode() == AArch64::INLINEASM_BR) + return None; Size += TII->getInstSizeInBytes(MI); - } + } return Size; } -bool AArch64CompressJumpTables::scanFunction() { +bool AArch64CompressJumpTables::scanFunction() { BlockInfo.clear(); BlockInfo.resize(MF->getNumBlockIDs()); @@ -97,12 +97,12 @@ bool AArch64CompressJumpTables::scanFunction() { else AlignedOffset = alignTo(Offset, Alignment); BlockInfo[MBB.getNumber()] = AlignedOffset; - auto BlockSize = computeBlockSize(MBB); - if (!BlockSize) - return false; - Offset = AlignedOffset + *BlockSize; + auto BlockSize = computeBlockSize(MBB); + if (!BlockSize) + return false; + Offset = AlignedOffset + *BlockSize; } - return true; + return true; } bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI, @@ -121,7 +121,7 @@ bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI, int MaxOffset = std::numeric_limits<int>::min(), MinOffset = std::numeric_limits<int>::max(); MachineBasicBlock *MinBlock = nullptr; - for (auto *Block : JT.MBBs) { + for (auto *Block : JT.MBBs) { int BlockOffset = BlockInfo[Block->getNumber()]; assert(BlockOffset % 4 == 0 && "misaligned basic block"); @@ -141,14 +141,14 @@ bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI, } int Span = MaxOffset - MinOffset; - auto *AFI = MF->getInfo<AArch64FunctionInfo>(); + auto *AFI = MF->getInfo<AArch64FunctionInfo>(); if (isUInt<8>(Span / 4)) { AFI->setJumpTableEntryInfo(JTIdx, 1, MinBlock->getSymbol()); MI.setDesc(TII->get(AArch64::JumpTableDest8)); ++NumJT8; return true; - } - if (isUInt<16>(Span / 4)) { + } + if (isUInt<16>(Span / 4)) { AFI->setJumpTableEntryInfo(JTIdx, 2, MinBlock->getSymbol()); MI.setDesc(TII->get(AArch64::JumpTableDest16)); ++NumJT16; @@ -169,8 +169,8 @@ bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) { if (ST.force32BitJumpTables() && !MF->getFunction().hasMinSize()) return false; - if (!scanFunction()) - return false; + if (!scanFunction()) + return false; for (MachineBasicBlock &MBB : *MF) { int Offset = BlockInfo[MBB.getNumber()]; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index e57650ae60..1a8731883f 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -83,8 +83,8 @@ private: bool expandSVESpillFill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned Opc, unsigned N); - bool expandCALL_RVMARKER(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI); + bool expandCALL_RVMARKER(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); }; } // end anonymous namespace @@ -629,46 +629,46 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB, return true; } -bool AArch64ExpandPseudo::expandCALL_RVMARKER( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { - // Expand CALL_RVMARKER pseudo to a branch, followed by the special `mov x29, - // x29` marker. Mark the sequence as bundle, to avoid passes moving other code - // in between. - MachineInstr &MI = *MBBI; - - MachineInstr *OriginalCall; - MachineOperand &CallTarget = MI.getOperand(0); - assert((CallTarget.isGlobal() || CallTarget.isReg()) && - "invalid operand for regular call"); - unsigned Opc = CallTarget.isGlobal() ? 
AArch64::BL : AArch64::BLR; - OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); - OriginalCall->addOperand(CallTarget); - - unsigned RegMaskStartIdx = 1; - // Skip register arguments. Those are added during ISel, but are not - // needed for the concrete branch. - while (!MI.getOperand(RegMaskStartIdx).isRegMask()) { - assert(MI.getOperand(RegMaskStartIdx).isReg() && - "should only skip register operands"); - RegMaskStartIdx++; - } - for (; RegMaskStartIdx < MI.getNumOperands(); ++RegMaskStartIdx) - OriginalCall->addOperand(MI.getOperand(RegMaskStartIdx)); - - auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) - .addReg(AArch64::FP, RegState::Define) - .addReg(AArch64::XZR) - .addReg(AArch64::FP) - .addImm(0) - .getInstr(); - if (MI.shouldUpdateCallSiteInfo()) - MBB.getParent()->moveCallSiteInfo(&MI, Marker); - MI.eraseFromParent(); - finalizeBundle(MBB, OriginalCall->getIterator(), - std::next(Marker->getIterator())); - return true; -} - +bool AArch64ExpandPseudo::expandCALL_RVMARKER( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + // Expand CALL_RVMARKER pseudo to a branch, followed by the special `mov x29, + // x29` marker. Mark the sequence as bundle, to avoid passes moving other code + // in between. + MachineInstr &MI = *MBBI; + + MachineInstr *OriginalCall; + MachineOperand &CallTarget = MI.getOperand(0); + assert((CallTarget.isGlobal() || CallTarget.isReg()) && + "invalid operand for regular call"); + unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR; + OriginalCall = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); + OriginalCall->addOperand(CallTarget); + + unsigned RegMaskStartIdx = 1; + // Skip register arguments. Those are added during ISel, but are not + // needed for the concrete branch. + while (!MI.getOperand(RegMaskStartIdx).isRegMask()) { + assert(MI.getOperand(RegMaskStartIdx).isReg() && + "should only skip register operands"); + RegMaskStartIdx++; + } + for (; RegMaskStartIdx < MI.getNumOperands(); ++RegMaskStartIdx) + OriginalCall->addOperand(MI.getOperand(RegMaskStartIdx)); + + auto *Marker = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) + .addReg(AArch64::FP, RegState::Define) + .addReg(AArch64::XZR) + .addReg(AArch64::FP) + .addImm(0) + .getInstr(); + if (MI.shouldUpdateCallSiteInfo()) + MBB.getParent()->moveCallSiteInfo(&MI, Marker); + MI.eraseFromParent(); + finalizeBundle(MBB, OriginalCall->getIterator(), + std::next(Marker->getIterator())); + return true; +} + /// If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. 
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, @@ -1056,8 +1056,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); case AArch64::LDR_ZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); - case AArch64::BLR_RVMARKER: - return expandCALL_RVMARKER(MBB, MBBI); + case AArch64::BLR_RVMARKER: + return expandCALL_RVMARKER(MBB, MBBI); } return false; } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 209f9f7255..afd8765f45 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -54,7 +54,7 @@ using namespace llvm; -#define DEBUG_TYPE "aarch64-falkor-hwpf-fix" +#define DEBUG_TYPE "aarch64-falkor-hwpf-fix" STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); STATISTIC(NumCollisionsAvoided, @@ -146,7 +146,7 @@ bool FalkorMarkStridedAccesses::run() { bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { // Only mark strided loads in the inner-most loop - if (!L.isInnermost()) + if (!L.isInnermost()) return false; bool MadeChange = false; @@ -224,10 +224,10 @@ struct LoadInfo { char FalkorHWPFFix::ID = 0; -INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", "Falkor HW Prefetch Fix Late Phase", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", +INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late", "Falkor HW Prefetch Fix Late Phase", false, false) static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { @@ -830,7 +830,7 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { for (MachineLoop *I : LI) for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) // Only process inner-loops - if (L->isInnermost()) + if (L->isInnermost()) runOnLoop(**L, Fn); return Modified; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FastISel.cpp index 9801036653..b4e4233448 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FastISel.cpp @@ -3409,7 +3409,7 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. - if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) std::swap(LHS, RHS); // Simplify multiplies. @@ -3651,10 +3651,10 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); return true; - case Intrinsic::debugtrap: - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) - .addImm(0xF000); - return true; + case Intrinsic::debugtrap: + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) + .addImm(0xF000); + return true; case Intrinsic::sqrt: { Type *RetTy = II->getCalledFunction()->getReturnType(); @@ -3696,7 +3696,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { const Value *LHS = II->getArgOperand(0); const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. 
- if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && II->isCommutative()) std::swap(LHS, RHS); // Simplify multiplies. diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.cpp index 65ee501604..9aa8f7a804 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -175,10 +175,10 @@ static cl::opt<bool> StackTaggingMergeSetTag( cl::desc("merge settag instruction in function epilog"), cl::init(true), cl::Hidden); -static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", - cl::desc("sort stack allocations"), - cl::init(true), cl::Hidden); - +static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects", + cl::desc("sort stack allocations"), + cl::init(true), cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// Returns the argument pop size. @@ -249,7 +249,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { TargetStackID::Value AArch64FrameLowering::getStackIDForScalableVectors() const { - return TargetStackID::ScalableVector; + return TargetStackID::ScalableVector; } /// Returns the size of the fixed object area (allocated next to sp on entry) @@ -273,7 +273,7 @@ static unsigned getFixedObjectSize(const MachineFunction &MF, /// Returns the size of the entire SVE stackframe (calleesaves + spills). static StackOffset getSVEStackSize(const MachineFunction &MF) { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); + return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE()); } bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { @@ -365,15 +365,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(Amount), TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(Amount), TII); } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the // stack, we want to add it back if we have a reserved call frame. 
assert(CalleePopAmount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(-(int64_t)CalleePopAmount), TII); + StackOffset::getFixed(-(int64_t)CalleePopAmount), TII); } return MBB.erase(I); } @@ -413,8 +413,8 @@ static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const { int64_t NumBytes, NumVGScaledBytes; - AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes, - NumVGScaledBytes); + AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(OffsetFromSP, NumBytes, + NumVGScaledBytes); std::string CommentBuffer = "sp"; llvm::raw_string_ostream Comment(CommentBuffer); @@ -441,8 +441,8 @@ MCCFIInstruction AArch64FrameLowering::createCfaOffset( const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &OffsetFromDefCFA) const { int64_t NumBytes, NumVGScaledBytes; - AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( - OffsetFromDefCFA, NumBytes, NumVGScaledBytes); + AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( + OffsetFromDefCFA, NumBytes, NumVGScaledBytes); unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); @@ -496,14 +496,14 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( continue; StackOffset Offset; - if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) { + if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector) { AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - Offset = - StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - - StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); + Offset = + StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) - + StackOffset::getFixed(AFI->getCalleeSavedStackSize(MFI)); } else { - Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea()); + Offset = StackOffset::getFixed(MFI.getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea()); } unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) @@ -584,12 +584,12 @@ static bool windowsRequiresStackProbe(MachineFunction &MF, !F.hasFnAttribute("no-stack-arg-probe"); } -static bool needsWinCFI(const MachineFunction &MF) { - const Function &F = MF.getFunction(); - return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && - F.needsUnwindTableEntry(); -} - +static bool needsWinCFI(const MachineFunction &MF) { + const Function &F = MF.getFunction(); + return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && + F.needsUnwindTableEntry(); +} + bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( MachineFunction &MF, uint64_t StackBumpBytes) const { AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -600,18 +600,18 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (AFI->getLocalStackSize() == 0) return false; - // For WinCFI, if optimizing for size, prefer to not combine the stack bump - // (to force a stp with predecrement) to match the packed unwind format, - // provided that there actually are any callee saved registers to merge the - // decrement with. - // This is potentially marginally slower, but allows using the packed - // unwind format for functions that both have a local area and callee saved - // registers. Using the packed unwind format notably reduces the size of - // the unwind info. 
- if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 && - MF.getFunction().hasOptSize()) - return false; - + // For WinCFI, if optimizing for size, prefer to not combine the stack bump + // (to force a stp with predecrement) to match the packed unwind format, + // provided that there actually are any callee saved registers to merge the + // decrement with. + // This is potentially marginally slower, but allows using the packed + // unwind format for functions that both have a local area and callee saved + // registers. Using the packed unwind format notably reduces the size of + // the unwind info. + if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 && + MF.getFunction().hasOptSize()) + return false; + // 512 is the maximum immediate for stp/ldp that will be used for // callee-save save/restores if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes)) @@ -1051,16 +1051,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // to determine the end of the prologue. DebugLoc DL; - const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); - if (MFnI.shouldSignReturnAddress()) { - if (MFnI.shouldSignWithBKey()) { + const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); + if (MFnI.shouldSignReturnAddress()) { + if (MFnI.shouldSignWithBKey()) { BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP)) .setMIFlag(MachineInstr::FrameSetup); - } else { - BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP)) - .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP)) + .setMIFlag(MachineInstr::FrameSetup); } unsigned CFIIndex = @@ -1075,13 +1075,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; - // Set tagged base pointer to the requested stack slot. + // Set tagged base pointer to the requested stack slot. // Ideally it should match SP value after prologue. - Optional<int> TBPI = AFI->getTaggedBasePointerIndex(); - if (TBPI) - AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI)); - else - AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + Optional<int> TBPI = AFI->getTaggedBasePointerIndex(); + if (TBPI) + AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI)); + else + AFI->setTaggedBasePointerOffset(MFI.getStackSize()); const StackOffset &SVEStackSize = getSVEStackSize(MF); @@ -1108,8 +1108,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++NumRedZoneFunctions; } else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); if (!NeedsWinCFI && needsFrameMoves) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. 
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); @@ -1142,8 +1142,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (CombineSPBump) { assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( @@ -1167,7 +1167,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // For funclets the FP belongs to the containing function. if (!IsFunclet && HasFP) { // Only set up FP if we actually need to. - int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset(); + int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset(); if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -1177,8 +1177,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, - StackOffset::getFixed(FPOffset), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + StackOffset::getFixed(FPOffset), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { @@ -1288,7 +1288,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; CalleeSavesEnd = MBBI; - AllocateBefore = StackOffset::getScalable(CalleeSavedSize); + AllocateBefore = StackOffset::getScalable(CalleeSavedSize); AllocateAfter = SVEStackSize - AllocateBefore; } @@ -1320,8 +1320,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + StackOffset::getFixed(-NumBytes), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); @@ -1458,15 +1458,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // .cfi_offset w28, -32 if (HasFP) { - const int OffsetToFirstCalleeSaveFromFP = - AFI->getCalleeSaveBaseToFrameRecordOffset() - - AFI->getCalleeSavedStackSize(); - Register FramePtr = RegInfo->getFrameRegister(MF); - + const int OffsetToFirstCalleeSaveFromFP = + AFI->getCalleeSaveBaseToFrameRecordOffset() - + AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); + // Define the current CFA rule to use the provided FP. 
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP)); + MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1476,7 +1476,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, const TargetSubtargetInfo &STI = MF.getSubtarget(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); StackOffset TotalSize = - SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); + SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize()); CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize)); } else { // Encode the stack size of the leaf function. @@ -1496,8 +1496,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, static void InsertReturnAddressAuth(MachineFunction &MF, MachineBasicBlock &MBB) { - const auto &MFI = *MF.getInfo<AArch64FunctionInfo>(); - if (!MFI.shouldSignReturnAddress()) + const auto &MFI = *MF.getInfo<AArch64FunctionInfo>(); + if (!MFI.shouldSignReturnAddress()) return; const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); @@ -1511,16 +1511,16 @@ static void InsertReturnAddressAuth(MachineFunction &MF, // this instruction can safely used for any v8a architecture. // From v8.3a onwards there are optimised authenticate LR and return // instructions, namely RETA{A,B}, that can be used instead. - if (Subtarget.hasPAuth() && MBBI != MBB.end() && + if (Subtarget.hasPAuth() && MBBI != MBB.end() && MBBI->getOpcode() == AArch64::RET_ReallyLR) { BuildMI(MBB, MBBI, DL, - TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA)) + TII->get(MFI.shouldSignWithBKey() ? AArch64::RETAB : AArch64::RETAA)) .copyImplicitOps(*MBBI); MBB.erase(MBBI); } else { BuildMI( MBB, MBBI, DL, - TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP)) + TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP)) .setMIFlag(MachineInstr::FrameDestroy); } } @@ -1545,7 +1545,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, bool NeedsWinCFI = needsWinCFI(MF); bool HasWinCFI = false; bool IsFunclet = false; - auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); }); + auto WinCFI = make_scope_exit([&]() { assert(HasWinCFI == MF.hasWinCFI()); }); if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); @@ -1645,13 +1645,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NeedsWinCFI, &HasWinCFI); } - if (MF.hasWinCFI()) { - // If the prologue didn't contain any SEH opcodes and didn't set the - // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the - // EpilogStart - to avoid generating CFI for functions that don't need it. - // (And as we didn't generate any prologue at all, it would be asymmetrical - // to the epilogue.) By the end of the function, we assert that - // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption. + if (MF.hasWinCFI()) { + // If the prologue didn't contain any SEH opcodes and didn't set the + // MF.hasWinCFI() flag, assume the epilogue won't either, and skip the + // EpilogStart - to avoid generating CFI for functions that don't need it. + // (And as we didn't generate any prologue at all, it would be asymmetrical + // to the epilogue.) 
By the end of the function, we assert that + // HasWinCFI is equal to MF.hasWinCFI(), to verify this assumption. HasWinCFI = true; BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart)) .setMIFlag(MachineInstr::FrameDestroy); @@ -1663,10 +1663,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, if (CombineSPBump) { assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize), - TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, - &HasWinCFI); - if (HasWinCFI) + StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize), + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); + if (HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); @@ -1689,8 +1689,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, assert(IsSVECalleeSave(RestoreBegin) && IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - StackOffset CalleeSavedSizeAsOffset = - StackOffset::getScalable(CalleeSavedSize); + StackOffset CalleeSavedSizeAsOffset = + StackOffset::getScalable(CalleeSavedSize); DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; DeallocateAfter = CalleeSavedSizeAsOffset; } @@ -1703,15 +1703,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be reloaded. The code below will deallocate the stack space // space by moving FP -> SP. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, - StackOffset::getScalable(-CalleeSavedSize), TII, + StackOffset::getScalable(-CalleeSavedSize), TII, MachineInstr::FrameDestroy); } else { if (AFI->getSVECalleeSavedStackSize()) { // Deallocate the non-SVE locals first before we can deallocate (and // restore callee saves) from the SVE area. emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy); + StackOffset::getFixed(NumBytes), TII, + MachineInstr::FrameDestroy); NumBytes = 0; } @@ -1744,10 +1744,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(StackRestoreBytes), TII, + StackOffset::getFixed(StackRestoreBytes), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (Done) { - if (HasWinCFI) { + if (HasWinCFI) { BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); @@ -1763,14 +1763,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. 
if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { - emitFrameOffset( - MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()), - TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + emitFrameOffset( + MBB, LastPopI, DL, AArch64::SP, AArch64::FP, + StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()), + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); } else if (NumBytes) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(NumBytes), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); + StackOffset::getFixed(NumBytes), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI); // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save @@ -1791,10 +1791,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed((int64_t)AfterCSRPopSize), TII, + StackOffset::getFixed((int64_t)AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } - if (HasWinCFI) + if (HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); } @@ -1803,51 +1803,51 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, /// debug info. It's the same as what we use for resolving the code-gen /// references for now. FIXME: This can go wrong when references are /// SP-relative and simple call frames aren't used. -StackOffset -AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - Register &FrameReg) const { +StackOffset +AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const { return resolveFrameIndexReference( - MF, FI, FrameReg, - /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), - /*ForSimm=*/false); + MF, FI, FrameReg, + /*PreferFP=*/ + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + /*ForSimm=*/false); } -StackOffset -AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF, - int FI) const { - return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI)); +StackOffset +AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF, + int FI) const { + return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI)); } -static StackOffset getFPOffset(const MachineFunction &MF, - int64_t ObjectOffset) { +static StackOffset getFPOffset(const MachineFunction &MF, + int64_t ObjectOffset) { const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false); - int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo()); - int64_t FPAdjust = - CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset(); - return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust); + int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo()); + int64_t FPAdjust = + CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset(); + return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust); } -static StackOffset getStackOffset(const MachineFunction &MF, - int64_t 
ObjectOffset) { +static StackOffset getStackOffset(const MachineFunction &MF, + int64_t ObjectOffset) { const auto &MFI = MF.getFrameInfo(); - return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize()); + return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize()); } - // TODO: This function currently does not work for scalable vectors. + // TODO: This function currently does not work for scalable vectors. int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const { const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP - ? getFPOffset(MF, ObjectOffset).getFixed() - : getStackOffset(MF, ObjectOffset).getFixed(); + ? getFPOffset(MF, ObjectOffset).getFixed() + : getStackOffset(MF, ObjectOffset).getFixed(); } StackOffset AArch64FrameLowering::resolveFrameIndexReference( @@ -1856,7 +1856,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector; + bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector; return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } @@ -1870,8 +1870,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed(); - int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); + int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed(); + int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); @@ -1946,16 +1946,16 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( "non-argument/CSR objects cannot be accessed through the frame pointer"); if (isSVE) { - StackOffset FPOffset = - StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); - StackOffset SPOffset = - SVEStackSize + - StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), - ObjectOffset); + StackOffset FPOffset = + StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); + StackOffset SPOffset = + SVEStackSize + + StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), + ObjectOffset); // Always use the FP for SVE spills if available and beneficial. if (hasFP(MF) && - (SPOffset.getFixed() || - FPOffset.getScalable() < SPOffset.getScalable() || + (SPOffset.getFixed() || + FPOffset.getScalable() < SPOffset.getScalable() || RegInfo->needsStackRealignment(MF))) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; @@ -1974,7 +1974,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return StackOffset::getFixed(FPOffset) + ScalableOffset; + return StackOffset::getFixed(FPOffset) + ScalableOffset; } // Use the base pointer if we have one. 
@@ -1991,7 +1991,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return StackOffset::getFixed(Offset) + ScalableOffset; + return StackOffset::getFixed(Offset) + ScalableOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -2013,12 +2013,12 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) { } static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, - bool NeedsWinCFI, bool IsFirst) { + bool NeedsWinCFI, bool IsFirst) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind // opcodes for saves/restores of non-consectuve register pairs. - // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x, - // save_lrpair. + // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x, + // save_lrpair. // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling if (Reg2 == AArch64::FP) @@ -2027,14 +2027,14 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return false; if (Reg2 == Reg1 + 1) return false; - // If pairing a GPR with LR, the pair can be described by the save_lrpair - // opcode. If this is the first register pair, it would end up with a - // predecrement, but there's no save_lrpair_x opcode, so we can only do this - // if LR is paired with something else than the first register. - // The save_lrpair opcode requires the first register to be an odd one. - if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 && - (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst) - return false; + // If pairing a GPR with LR, the pair can be described by the save_lrpair + // opcode. If this is the first register pair, it would end up with a + // predecrement, but there's no save_lrpair_x opcode, so we can only do this + // if LR is paired with something else than the first register. + // The save_lrpair opcode requires the first register to be an odd one. + if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 && + (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst) + return false; return true; } @@ -2043,10 +2043,10 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, /// LR and FP need to be allocated together when the frame needs to save /// the frame-record. This means any other register pairing with LR is invalid. static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, - bool UsesWinAAPCS, bool NeedsWinCFI, - bool NeedsFrameRecord, bool IsFirst) { + bool UsesWinAAPCS, bool NeedsWinCFI, + bool NeedsFrameRecord, bool IsFirst) { if (UsesWinAAPCS) - return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst); + return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst); // If we need to store the frame record, don't pair any register // with LR other than FP. @@ -2110,22 +2110,22 @@ static void computeCalleeSaveRegisterPairs( (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); int ByteOffset = AFI->getCalleeSavedStackSize(); - int StackFillDir = -1; - int RegInc = 1; - unsigned FirstReg = 0; - if (NeedsWinCFI) { - // For WinCFI, fill the stack from the bottom up. - ByteOffset = 0; - StackFillDir = 1; - // As the CSI array is reversed to match PrologEpilogInserter, iterate - // backwards, to pair up registers starting from lower numbered registers. 
- RegInc = -1; - FirstReg = Count - 1; - } + int StackFillDir = -1; + int RegInc = 1; + unsigned FirstReg = 0; + if (NeedsWinCFI) { + // For WinCFI, fill the stack from the bottom up. + ByteOffset = 0; + StackFillDir = 1; + // As the CSI array is reversed to match PrologEpilogInserter, iterate + // backwards, to pair up registers starting from lower numbered registers. + RegInc = -1; + FirstReg = Count - 1; + } int ScalableByteOffset = AFI->getSVECalleeSavedStackSize(); - // When iterating backwards, the loop condition relies on unsigned wraparound. - for (unsigned i = FirstReg; i < Count; i += RegInc) { + // When iterating backwards, the loop condition relies on unsigned wraparound. + for (unsigned i = FirstReg; i < Count; i += RegInc) { RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); @@ -2143,20 +2143,20 @@ static void computeCalleeSaveRegisterPairs( llvm_unreachable("Unsupported register class."); // Add the next reg to the pair if it is in the same register class. - if (unsigned(i + RegInc) < Count) { - unsigned NextReg = CSI[i + RegInc].getReg(); - bool IsFirst = i == FirstReg; + if (unsigned(i + RegInc) < Count) { + unsigned NextReg = CSI[i + RegInc].getReg(); + bool IsFirst = i == FirstReg; switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, - NeedsWinCFI, NeedsFrameRecord, IsFirst)) + !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, + NeedsWinCFI, NeedsFrameRecord, IsFirst)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, - IsFirst)) + !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + IsFirst)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR128: @@ -2185,7 +2185,7 @@ static void computeCalleeSaveRegisterPairs( // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. assert((!RPI.isPaired() || - (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) && + (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) && "Out of order callee saved regs!"); assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || @@ -2207,73 +2207,73 @@ static void computeCalleeSaveRegisterPairs( "Callee-save registers not saved as adjacent register pair!"); RPI.FrameIdx = CSI[i].getFrameIdx(); - if (NeedsWinCFI && - RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair - RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); + if (NeedsWinCFI && + RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair + RPI.FrameIdx = CSI[i + RegInc].getFrameIdx(); int Scale = RPI.getScale(); - - int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset; - assert(OffsetPre % Scale == 0); - + + int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset; + assert(OffsetPre % Scale == 0); + if (RPI.isScalable()) - ScalableByteOffset += StackFillDir * Scale; + ScalableByteOffset += StackFillDir * Scale; else - ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale); + ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale); assert(!(RPI.isScalable() && RPI.isPaired()) && "Paired spill/fill instructions don't exist for SVE vectors"); // Round up size of non-pair to pair size if we need to pad the // callee-save area to ensure 16-byte alignment. 
- if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI && + if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI && !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) { - ByteOffset += 8 * StackFillDir; + ByteOffset += 8 * StackFillDir; assert(ByteOffset % 16 == 0); assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16)); - // A stack frame with a gap looks like this, bottom up: - // d9, d8. x21, gap, x20, x19. - // Set extra alignment on the x21 object (the only unpaired register) - // to create the gap above it. + // A stack frame with a gap looks like this, bottom up: + // d9, d8. x21, gap, x20, x19. + // Set extra alignment on the x21 object (the only unpaired register) + // to create the gap above it. MFI.setObjectAlignment(RPI.FrameIdx, Align(16)); } - int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset; - assert(OffsetPost % Scale == 0); - // If filling top down (default), we want the offset after incrementing it. - // If fillibg bootom up (WinCFI) we need the original offset. - int Offset = NeedsWinCFI ? OffsetPre : OffsetPost; + int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset; + assert(OffsetPost % Scale == 0); + // If filling top down (default), we want the offset after incrementing it. + // If fillibg bootom up (WinCFI) we need the original offset. + int Offset = NeedsWinCFI ? OffsetPre : OffsetPost; RPI.Offset = Offset / Scale; assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && "Offset out of bounds for LDP/STP immediate"); - // Save the offset to frame record so that the FP register can point to the - // innermost frame record (spilled FP and LR registers). - if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR && - RPI.Reg2 == AArch64::FP) || - (IsWindows && RPI.Reg1 == AArch64::FP && - RPI.Reg2 == AArch64::LR))) - AFI->setCalleeSaveBaseToFrameRecordOffset(Offset); - + // Save the offset to frame record so that the FP register can point to the + // innermost frame record (spilled FP and LR registers). + if (NeedsFrameRecord && ((!IsWindows && RPI.Reg1 == AArch64::LR && + RPI.Reg2 == AArch64::FP) || + (IsWindows && RPI.Reg1 == AArch64::FP && + RPI.Reg2 == AArch64::LR))) + AFI->setCalleeSaveBaseToFrameRecordOffset(Offset); + RegPairs.push_back(RPI); if (RPI.isPaired()) - i += RegInc; - } - if (NeedsWinCFI) { - // If we need an alignment gap in the stack, align the topmost stack - // object. A stack frame with a gap looks like this, bottom up: - // x19, d8. d9, gap. - // Set extra alignment on the topmost stack object (the first element in - // CSI, which goes top down), to create the gap above it. - if (AFI->hasCalleeSaveStackFreeSpace()) - MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16)); - // We iterated bottom up over the registers; flip RegPairs back to top - // down order. - std::reverse(RegPairs.begin(), RegPairs.end()); - } + i += RegInc; + } + if (NeedsWinCFI) { + // If we need an alignment gap in the stack, align the topmost stack + // object. A stack frame with a gap looks like this, bottom up: + // x19, d8. d9, gap. + // Set extra alignment on the topmost stack object (the first element in + // CSI, which goes top down), to create the gap above it. + if (AFI->hasCalleeSaveStackFreeSpace()) + MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16)); + // We iterated bottom up over the registers; flip RegPairs back to top + // down order. 
+ std::reverse(RegPairs.begin(), RegPairs.end()); + } } bool AArch64FrameLowering::spillCalleeSavedRegisters( @@ -2412,7 +2412,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Update the StackIDs of the SVE stack slots. MachineFrameInfo &MFI = MF.getFrameInfo(); if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) - MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector); + MFI.setStackID(RPI.FrameIdx, TargetStackID::ScalableVector); } return true; @@ -2704,21 +2704,21 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); } -bool AArch64FrameLowering::assignCalleeSavedSpillSlots( - MachineFunction &MF, const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI) const { - bool NeedsWinCFI = needsWinCFI(MF); - // To match the canonical windows frame layout, reverse the list of - // callee saved registers to get them laid out by PrologEpilogInserter - // in the right order. (PrologEpilogInserter allocates stack objects top - // down. Windows canonical prologs store higher numbered registers at - // the top, thus have the CSI array start from the highest registers.) - if (NeedsWinCFI) - std::reverse(CSI.begin(), CSI.end()); - // Let the generic code do the rest of the setup. - return false; -} - +bool AArch64FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + bool NeedsWinCFI = needsWinCFI(MF); + // To match the canonical windows frame layout, reverse the list of + // callee saved registers to get them laid out by PrologEpilogInserter + // in the right order. (PrologEpilogInserter allocates stack objects top + // down. Windows canonical prologs store higher numbered registers at + // the top, thus have the CSI array start from the highest registers.) + if (NeedsWinCFI) + std::reverse(CSI.begin(), CSI.end()); + // Let the generic code do the rest of the setup. + return false; +} + bool AArch64FrameLowering::enableStackSlotScavenging( const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -2761,7 +2761,7 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, #ifndef NDEBUG // First process all fixed stack objects. 
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - assert(MFI.getStackID(I) != TargetStackID::ScalableVector && + assert(MFI.getStackID(I) != TargetStackID::ScalableVector && "SVE vectors should never be passed on the stack by value, only by " "reference."); #endif @@ -2791,7 +2791,7 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, SmallVector<int, 8> ObjectsToAllocate; for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { unsigned StackID = MFI.getStackID(I); - if (StackID != TargetStackID::ScalableVector) + if (StackID != TargetStackID::ScalableVector) continue; if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) continue; @@ -2945,12 +2945,12 @@ void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { const int64_t kMaxOffset = 255 * 16; Register BaseReg = FrameReg; - int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed(); + int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed(); if (BaseRegOffsetBytes < kMinOffset || BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) { Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, - StackOffset::getFixed(BaseRegOffsetBytes), TII); + StackOffset::getFixed(BaseRegOffsetBytes), TII); BaseReg = ScratchReg; BaseRegOffsetBytes = 0; } @@ -3007,7 +3007,7 @@ void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { LoopI->setFlags(FrameRegUpdateFlags); int64_t ExtraBaseRegUpdate = - FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0; + FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0; if (LoopSize < Size) { assert(FrameRegUpdate); assert(Size - LoopSize == 16); @@ -3111,7 +3111,7 @@ void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, // realistically happens in function epilogue. Also, STGloop is expanded // before that pass. if (InsertI != MBB->end() && - canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size, + canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size, &TotalOffset)) { UpdateInstr = &*InsertI++; LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " @@ -3274,7 +3274,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP /// before the update. This is easily retrieved as it is exactly the offset /// that is set in processFunctionBeforeFrameFinalized. -StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( +StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, Register &FrameReg, bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -3282,7 +3282,7 @@ StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP( LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " << MFI.getObjectOffset(FI) << "\n"); FrameReg = AArch64::SP; - return StackOffset::getFixed(MFI.getObjectOffset(FI)); + return StackOffset::getFixed(MFI.getObjectOffset(FI)); } return getFrameIndexReference(MF, FI, FrameReg); @@ -3306,162 +3306,162 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize( return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(), getStackAlign()); } - -namespace { -struct FrameObject { - bool IsValid = false; - // Index of the object in MFI. - int ObjectIndex = 0; - // Group ID this object belongs to. - int GroupIndex = -1; - // This object should be placed first (closest to SP). 
- bool ObjectFirst = false; - // This object's group (which always contains the object with - // ObjectFirst==true) should be placed first. - bool GroupFirst = false; -}; - -class GroupBuilder { - SmallVector<int, 8> CurrentMembers; - int NextGroupIndex = 0; - std::vector<FrameObject> &Objects; - -public: - GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {} - void AddMember(int Index) { CurrentMembers.push_back(Index); } - void EndCurrentGroup() { - if (CurrentMembers.size() > 1) { - // Create a new group with the current member list. This might remove them - // from their pre-existing groups. That's OK, dealing with overlapping - // groups is too hard and unlikely to make a difference. - LLVM_DEBUG(dbgs() << "group:"); - for (int Index : CurrentMembers) { - Objects[Index].GroupIndex = NextGroupIndex; - LLVM_DEBUG(dbgs() << " " << Index); - } - LLVM_DEBUG(dbgs() << "\n"); - NextGroupIndex++; - } - CurrentMembers.clear(); - } -}; - -bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { - // Objects at a lower index are closer to FP; objects at a higher index are - // closer to SP. - // - // For consistency in our comparison, all invalid objects are placed - // at the end. This also allows us to stop walking when we hit the - // first invalid item after it's all sorted. - // - // The "first" object goes first (closest to SP), followed by the members of - // the "first" group. - // - // The rest are sorted by the group index to keep the groups together. - // Higher numbered groups are more likely to be around longer (i.e. untagged - // in the function epilogue and not at some earlier point). Place them closer - // to SP. - // - // If all else equal, sort by the object index to keep the objects in the - // original order. - return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex, - A.ObjectIndex) < - std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex, - B.ObjectIndex); -} -} // namespace - -void AArch64FrameLowering::orderFrameObjects( - const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { - if (!OrderFrameObjects || ObjectsToAllocate.empty()) - return; - - const MachineFrameInfo &MFI = MF.getFrameInfo(); - std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd()); - for (auto &Obj : ObjectsToAllocate) { - FrameObjects[Obj].IsValid = true; - FrameObjects[Obj].ObjectIndex = Obj; - } - - // Identify stack slots that are tagged at the same time. - GroupBuilder GB(FrameObjects); - for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (MI.isDebugInstr()) - continue; - int OpIndex; - switch (MI.getOpcode()) { - case AArch64::STGloop: - case AArch64::STZGloop: - OpIndex = 3; - break; - case AArch64::STGOffset: - case AArch64::STZGOffset: - case AArch64::ST2GOffset: - case AArch64::STZ2GOffset: - OpIndex = 1; - break; - default: - OpIndex = -1; - } - - int TaggedFI = -1; - if (OpIndex >= 0) { - const MachineOperand &MO = MI.getOperand(OpIndex); - if (MO.isFI()) { - int FI = MO.getIndex(); - if (FI >= 0 && FI < MFI.getObjectIndexEnd() && - FrameObjects[FI].IsValid) - TaggedFI = FI; - } - } - - // If this is a stack tagging instruction for a slot that is not part of a - // group yet, either start a new group or add it to the current one. - if (TaggedFI >= 0) - GB.AddMember(TaggedFI); - else - GB.EndCurrentGroup(); - } - // Groups should never span multiple basic blocks. 
- GB.EndCurrentGroup(); - } - - // If the function's tagged base pointer is pinned to a stack slot, we want to - // put that slot first when possible. This will likely place it at SP + 0, - // and save one instruction when generating the base pointer because IRG does - // not allow an immediate offset. - const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); - Optional<int> TBPI = AFI.getTaggedBasePointerIndex(); - if (TBPI) { - FrameObjects[*TBPI].ObjectFirst = true; - FrameObjects[*TBPI].GroupFirst = true; - int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex; - if (FirstGroupIndex >= 0) - for (FrameObject &Object : FrameObjects) - if (Object.GroupIndex == FirstGroupIndex) - Object.GroupFirst = true; - } - - llvm::stable_sort(FrameObjects, FrameObjectCompare); - - int i = 0; - for (auto &Obj : FrameObjects) { - // All invalid items are sorted at the end, so it's safe to stop. - if (!Obj.IsValid) - break; - ObjectsToAllocate[i++] = Obj.ObjectIndex; - } - - LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj - : FrameObjects) { - if (!Obj.IsValid) - break; - dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex; - if (Obj.ObjectFirst) - dbgs() << ", first"; - if (Obj.GroupFirst) - dbgs() << ", group-first"; - dbgs() << "\n"; - }); -} + +namespace { +struct FrameObject { + bool IsValid = false; + // Index of the object in MFI. + int ObjectIndex = 0; + // Group ID this object belongs to. + int GroupIndex = -1; + // This object should be placed first (closest to SP). + bool ObjectFirst = false; + // This object's group (which always contains the object with + // ObjectFirst==true) should be placed first. + bool GroupFirst = false; +}; + +class GroupBuilder { + SmallVector<int, 8> CurrentMembers; + int NextGroupIndex = 0; + std::vector<FrameObject> &Objects; + +public: + GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {} + void AddMember(int Index) { CurrentMembers.push_back(Index); } + void EndCurrentGroup() { + if (CurrentMembers.size() > 1) { + // Create a new group with the current member list. This might remove them + // from their pre-existing groups. That's OK, dealing with overlapping + // groups is too hard and unlikely to make a difference. + LLVM_DEBUG(dbgs() << "group:"); + for (int Index : CurrentMembers) { + Objects[Index].GroupIndex = NextGroupIndex; + LLVM_DEBUG(dbgs() << " " << Index); + } + LLVM_DEBUG(dbgs() << "\n"); + NextGroupIndex++; + } + CurrentMembers.clear(); + } +}; + +bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) { + // Objects at a lower index are closer to FP; objects at a higher index are + // closer to SP. + // + // For consistency in our comparison, all invalid objects are placed + // at the end. This also allows us to stop walking when we hit the + // first invalid item after it's all sorted. + // + // The "first" object goes first (closest to SP), followed by the members of + // the "first" group. + // + // The rest are sorted by the group index to keep the groups together. + // Higher numbered groups are more likely to be around longer (i.e. untagged + // in the function epilogue and not at some earlier point). Place them closer + // to SP. + // + // If all else equal, sort by the object index to keep the objects in the + // original order. 
+ return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex, + A.ObjectIndex) < + std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex, + B.ObjectIndex); +} +} // namespace + +void AArch64FrameLowering::orderFrameObjects( + const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const { + if (!OrderFrameObjects || ObjectsToAllocate.empty()) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd()); + for (auto &Obj : ObjectsToAllocate) { + FrameObjects[Obj].IsValid = true; + FrameObjects[Obj].ObjectIndex = Obj; + } + + // Identify stack slots that are tagged at the same time. + GroupBuilder GB(FrameObjects); + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.isDebugInstr()) + continue; + int OpIndex; + switch (MI.getOpcode()) { + case AArch64::STGloop: + case AArch64::STZGloop: + OpIndex = 3; + break; + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + OpIndex = 1; + break; + default: + OpIndex = -1; + } + + int TaggedFI = -1; + if (OpIndex >= 0) { + const MachineOperand &MO = MI.getOperand(OpIndex); + if (MO.isFI()) { + int FI = MO.getIndex(); + if (FI >= 0 && FI < MFI.getObjectIndexEnd() && + FrameObjects[FI].IsValid) + TaggedFI = FI; + } + } + + // If this is a stack tagging instruction for a slot that is not part of a + // group yet, either start a new group or add it to the current one. + if (TaggedFI >= 0) + GB.AddMember(TaggedFI); + else + GB.EndCurrentGroup(); + } + // Groups should never span multiple basic blocks. + GB.EndCurrentGroup(); + } + + // If the function's tagged base pointer is pinned to a stack slot, we want to + // put that slot first when possible. This will likely place it at SP + 0, + // and save one instruction when generating the base pointer because IRG does + // not allow an immediate offset. + const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); + Optional<int> TBPI = AFI.getTaggedBasePointerIndex(); + if (TBPI) { + FrameObjects[*TBPI].ObjectFirst = true; + FrameObjects[*TBPI].GroupFirst = true; + int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex; + if (FirstGroupIndex >= 0) + for (FrameObject &Object : FrameObjects) + if (Object.GroupIndex == FirstGroupIndex) + Object.GroupFirst = true; + } + + llvm::stable_sort(FrameObjects, FrameObjectCompare); + + int i = 0; + for (auto &Obj : FrameObjects) { + // All invalid items are sorted at the end, so it's safe to stop. 
+ if (!Obj.IsValid) + break; + ObjectsToAllocate[i++] = Obj.ObjectIndex; + } + + LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj + : FrameObjects) { + if (!Obj.IsValid) + break; + dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex; + if (Obj.ObjectFirst) + dbgs() << ", first"; + if (Obj.GroupFirst) + dbgs() << ", group-first"; + dbgs() << "\n"; + }); +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.h index 80079a9d98..b3a402de03 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Support/TypeSize.h" +#include "llvm/Support/TypeSize.h" #include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -41,8 +41,8 @@ public: bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; - StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, - Register &FrameReg) const override; + StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const override; StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP, bool ForSimm) const; @@ -67,11 +67,11 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - bool - assignCalleeSavedSpillSlots(MachineFunction &MF, - const TargetRegisterInfo *TRI, - std::vector<CalleeSavedInfo> &CSI) const override; - + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -94,12 +94,12 @@ public: unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; - StackOffset - getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - Register &FrameReg, - bool IgnoreSPUpdates) const override; - StackOffset getNonLocalFrameIndexReference(const MachineFunction &MF, - int FI) const override; + StackOffset + getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, + Register &FrameReg, + bool IgnoreSPUpdates) const override; + StackOffset getNonLocalFrameIndexReference(const MachineFunction &MF, + int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; bool isSupportedStackID(TargetStackID::Value ID) const override { @@ -107,7 +107,7 @@ public: default: return false; case TargetStackID::Default: - case TargetStackID::ScalableVector: + case TargetStackID::ScalableVector: case TargetStackID::NoAlloc: return true; } @@ -116,13 +116,13 @@ public: bool isStackIdSafeForLocalArea(unsigned StackId) const override { // We don't support putting SVE objects into the pre-allocated local // frame block at the moment. 
- return StackId != TargetStackID::ScalableVector; + return StackId != TargetStackID::ScalableVector; } - void - orderFrameObjects(const MachineFunction &MF, - SmallVectorImpl<int> &ObjectsToAllocate) const override; - + void + orderFrameObjects(const MachineFunction &MF, + SmallVectorImpl<int> &ObjectsToAllocate) const override; + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 94b5d7718d..a570f2d3b0 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "AArch64MachineFunctionInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/APSInt.h" @@ -191,16 +191,16 @@ public: return SelectSVELogicalImm(N, VT, Imm); } - template <MVT::SimpleValueType VT> - bool SelectSVEArithImm(SDValue N, SDValue &Imm) { - return SelectSVEArithImm(N, VT, Imm); - } - - template <unsigned Low, unsigned High, bool AllowSaturation = false> - bool SelectSVEShiftImm(SDValue N, SDValue &Imm) { - return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm); + template <MVT::SimpleValueType VT> + bool SelectSVEArithImm(SDValue N, SDValue &Imm) { + return SelectSVEArithImm(N, VT, Imm); } + template <unsigned Low, unsigned High, bool AllowSaturation = false> + bool SelectSVEShiftImm(SDValue N, SDValue &Imm) { + return SelectSVEShiftImm(N, Low, High, AllowSaturation, Imm); + } + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. template<signed Min, signed Max, signed Scale, bool Shift> bool SelectCntImm(SDValue N, SDValue &Imm) { @@ -329,10 +329,10 @@ private: bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); - bool SelectSVEShiftImm(SDValue N, uint64_t Low, uint64_t High, - bool AllowSaturation, SDValue &Imm); + bool SelectSVEShiftImm(SDValue N, uint64_t Low, uint64_t High, + bool AllowSaturation, SDValue &Imm); - bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm); + bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base, SDValue &Offset); }; @@ -1377,12 +1377,12 @@ void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - // Transfer memoperands. In the case of AArch64::LD64B, there won't be one, - // because it's too simple to have needed special treatment during lowering. - if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(N)) { - MachineMemOperand *MemOp = MemIntr->getMemOperand(); - CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp}); - } + // Transfer memoperands. In the case of AArch64::LD64B, there won't be one, + // because it's too simple to have needed special treatment during lowering. 
+ if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(N)) { + MachineMemOperand *MemOp = MemIntr->getMemOperand(); + CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp}); + } CurDAG->RemoveDeadNode(N); } @@ -3136,28 +3136,28 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) { return false; } -bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) { +bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm) { if (auto CNode = dyn_cast<ConstantSDNode>(N)) { - uint64_t ImmVal = CNode->getZExtValue(); - - switch (VT.SimpleTy) { - case MVT::i8: - ImmVal &= 0xFF; - break; - case MVT::i16: - ImmVal &= 0xFFFF; - break; - case MVT::i32: - ImmVal &= 0xFFFFFFFF; - break; - case MVT::i64: - break; - default: - llvm_unreachable("Unexpected type"); - } - + uint64_t ImmVal = CNode->getZExtValue(); + + switch (VT.SimpleTy) { + case MVT::i8: + ImmVal &= 0xFF; + break; + case MVT::i16: + ImmVal &= 0xFFFF; + break; + case MVT::i32: + ImmVal &= 0xFFFFFFFF; + break; + case MVT::i64: + break; + default: + llvm_unreachable("Unexpected type"); + } + if (ImmVal < 256) { - Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32); + Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32); return true; } } @@ -3201,30 +3201,30 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) { return false; } -// SVE shift intrinsics allow shift amounts larger than the element's bitwidth. -// Rather than attempt to normalise everything we can sometimes saturate the -// shift amount during selection. This function also allows for consistent -// isel patterns by ensuring the resulting "Imm" node is of the i32 type -// required by the instructions. -bool AArch64DAGToDAGISel::SelectSVEShiftImm(SDValue N, uint64_t Low, - uint64_t High, bool AllowSaturation, - SDValue &Imm) { +// SVE shift intrinsics allow shift amounts larger than the element's bitwidth. +// Rather than attempt to normalise everything we can sometimes saturate the +// shift amount during selection. This function also allows for consistent +// isel patterns by ensuring the resulting "Imm" node is of the i32 type +// required by the instructions. +bool AArch64DAGToDAGISel::SelectSVEShiftImm(SDValue N, uint64_t Low, + uint64_t High, bool AllowSaturation, + SDValue &Imm) { if (auto *CN = dyn_cast<ConstantSDNode>(N)) { uint64_t ImmVal = CN->getZExtValue(); - // Reject shift amounts that are too small. - if (ImmVal < Low) - return false; - - // Reject or saturate shift amounts that are too big. - if (ImmVal > High) { - if (!AllowSaturation) - return false; - ImmVal = High; + // Reject shift amounts that are too small. + if (ImmVal < Low) + return false; + + // Reject or saturate shift amounts that are too big. 
+ if (ImmVal > High) { + if (!AllowSaturation) + return false; + ImmVal = High; } - - Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32); - return true; + + Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i32); + return true; } return false; @@ -3833,9 +3833,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } break; - case Intrinsic::aarch64_ld64b: - SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0); - return; + case Intrinsic::aarch64_ld64b: + SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0); + return; } } break; case ISD::INTRINSIC_WO_CHAIN: { @@ -4854,8 +4854,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, return EVT(); ElementCount EC = PredVT.getVectorElementCount(); - EVT ScalarVT = - EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue()); + EVT ScalarVT = + EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue()); EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); return MemVT; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.cpp index c522ee7662..513c8932b3 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -112,76 +112,76 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, "optimization"), cl::init(true)); -// Temporary option added for the purpose of testing functionality added -// to DAGCombiner.cpp in D92230. It is expected that this can be removed -// in future when both implementations will be based off MGATHER rather -// than the GLD1 nodes added for the SVE gather load intrinsics. -static cl::opt<bool> -EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, - cl::desc("Combine extends of AArch64 masked " - "gather intrinsics"), - cl::init(true)); - +// Temporary option added for the purpose of testing functionality added +// to DAGCombiner.cpp in D92230. It is expected that this can be removed +// in future when both implementations will be based off MGATHER rather +// than the GLD1 nodes added for the SVE gather load intrinsics. +static cl::opt<bool> +EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, + cl::desc("Combine extends of AArch64 masked " + "gather intrinsics"), + cl::init(true)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; -static inline EVT getPackedSVEVectorVT(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - default: - llvm_unreachable("unexpected element type for vector"); - case MVT::i8: - return MVT::nxv16i8; - case MVT::i16: - return MVT::nxv8i16; - case MVT::i32: - return MVT::nxv4i32; - case MVT::i64: - return MVT::nxv2i64; - case MVT::f16: - return MVT::nxv8f16; - case MVT::f32: - return MVT::nxv4f32; - case MVT::f64: - return MVT::nxv2f64; - case MVT::bf16: - return MVT::nxv8bf16; - } -} - -// NOTE: Currently there's only a need to return integer vector types. If this -// changes then just add an extra "type" parameter. 
-static inline EVT getPackedSVEVectorVT(ElementCount EC) { - switch (EC.getKnownMinValue()) { - default: - llvm_unreachable("unexpected element count for vector"); - case 16: - return MVT::nxv16i8; - case 8: - return MVT::nxv8i16; - case 4: - return MVT::nxv4i32; - case 2: - return MVT::nxv2i64; - } -} - -static inline EVT getPromotedVTForPredicate(EVT VT) { - assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && - "Expected scalable predicate vector type!"); - switch (VT.getVectorMinNumElements()) { - default: - llvm_unreachable("unexpected element count for vector"); - case 2: - return MVT::nxv2i64; - case 4: - return MVT::nxv4i32; - case 8: - return MVT::nxv8i16; - case 16: - return MVT::nxv16i8; - } -} - +static inline EVT getPackedSVEVectorVT(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unexpected element type for vector"); + case MVT::i8: + return MVT::nxv16i8; + case MVT::i16: + return MVT::nxv8i16; + case MVT::i32: + return MVT::nxv4i32; + case MVT::i64: + return MVT::nxv2i64; + case MVT::f16: + return MVT::nxv8f16; + case MVT::f32: + return MVT::nxv4f32; + case MVT::f64: + return MVT::nxv2f64; + case MVT::bf16: + return MVT::nxv8bf16; + } +} + +// NOTE: Currently there's only a need to return integer vector types. If this +// changes then just add an extra "type" parameter. +static inline EVT getPackedSVEVectorVT(ElementCount EC) { + switch (EC.getKnownMinValue()) { + default: + llvm_unreachable("unexpected element count for vector"); + case 16: + return MVT::nxv16i8; + case 8: + return MVT::nxv8i16; + case 4: + return MVT::nxv4i32; + case 2: + return MVT::nxv2i64; + } +} + +static inline EVT getPromotedVTForPredicate(EVT VT) { + assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && + "Expected scalable predicate vector type!"); + switch (VT.getVectorMinNumElements()) { + default: + llvm_unreachable("unexpected element count for vector"); + case 2: + return MVT::nxv2i64; + case 4: + return MVT::nxv4i32; + case 8: + return MVT::nxv8i16; + case 16: + return MVT::nxv16i8; + } +} + /// Returns true if VT's elements occupy the lowest bit positions of its /// associated register class without any intervening space. /// @@ -194,42 +194,42 @@ static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock; } -// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading -// predicate and end with a passthru value matching the result type. 
-static bool isMergePassthruOpcode(unsigned Opc) { - switch (Opc) { - default: - return false; - case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: - case AArch64ISD::BSWAP_MERGE_PASSTHRU: - case AArch64ISD::CTLZ_MERGE_PASSTHRU: - case AArch64ISD::CTPOP_MERGE_PASSTHRU: - case AArch64ISD::DUP_MERGE_PASSTHRU: - case AArch64ISD::ABS_MERGE_PASSTHRU: - case AArch64ISD::NEG_MERGE_PASSTHRU: - case AArch64ISD::FNEG_MERGE_PASSTHRU: - case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: - case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: - case AArch64ISD::FCEIL_MERGE_PASSTHRU: - case AArch64ISD::FFLOOR_MERGE_PASSTHRU: - case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU: - case AArch64ISD::FRINT_MERGE_PASSTHRU: - case AArch64ISD::FROUND_MERGE_PASSTHRU: - case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: - case AArch64ISD::FTRUNC_MERGE_PASSTHRU: - case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: - case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: - case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: - case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: - case AArch64ISD::FCVTZU_MERGE_PASSTHRU: - case AArch64ISD::FCVTZS_MERGE_PASSTHRU: - case AArch64ISD::FSQRT_MERGE_PASSTHRU: - case AArch64ISD::FRECPX_MERGE_PASSTHRU: - case AArch64ISD::FABS_MERGE_PASSTHRU: - return true; - } -} - +// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading +// predicate and end with a passthru value matching the result type. +static bool isMergePassthruOpcode(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: + case AArch64ISD::BSWAP_MERGE_PASSTHRU: + case AArch64ISD::CTLZ_MERGE_PASSTHRU: + case AArch64ISD::CTPOP_MERGE_PASSTHRU: + case AArch64ISD::DUP_MERGE_PASSTHRU: + case AArch64ISD::ABS_MERGE_PASSTHRU: + case AArch64ISD::NEG_MERGE_PASSTHRU: + case AArch64ISD::FNEG_MERGE_PASSTHRU: + case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: + case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: + case AArch64ISD::FCEIL_MERGE_PASSTHRU: + case AArch64ISD::FFLOOR_MERGE_PASSTHRU: + case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU: + case AArch64ISD::FRINT_MERGE_PASSTHRU: + case AArch64ISD::FROUND_MERGE_PASSTHRU: + case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: + case AArch64ISD::FTRUNC_MERGE_PASSTHRU: + case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: + case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: + case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: + case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: + case AArch64ISD::FCVTZU_MERGE_PASSTHRU: + case AArch64ISD::FCVTZS_MERGE_PASSTHRU: + case AArch64ISD::FSQRT_MERGE_PASSTHRU: + case AArch64ISD::FRECPX_MERGE_PASSTHRU: + case AArch64ISD::FABS_MERGE_PASSTHRU: + return true; + } +} + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -263,8 +263,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); - if (Subtarget->hasBF16()) - addDRTypeForNEON(MVT::v4bf16); + if (Subtarget->hasBF16()) + addDRTypeForNEON(MVT::v4bf16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); @@ -273,8 +273,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); - if (Subtarget->hasBF16()) - addQRTypeForNEON(MVT::v8bf16); + if (Subtarget->hasBF16()) + addQRTypeForNEON(MVT::v8bf16); } if (Subtarget->hasSVE()) { @@ -303,7 +303,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, 
addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass); } - if (Subtarget->useSVEForFixedLengthVectors()) { + if (Subtarget->useSVEForFixedLengthVectors()) { for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) if (useSVEForFixedLengthVectorVT(VT)) addRegisterClass(VT, &AArch64::ZPRRegClass); @@ -334,9 +334,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MVT::nxv2f64 }) { setCondCodeAction(ISD::SETO, VT, Expand); setCondCodeAction(ISD::SETOLT, VT, Expand); - setCondCodeAction(ISD::SETLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); setCondCodeAction(ISD::SETOLE, VT, Expand); - setCondCodeAction(ISD::SETLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); setCondCodeAction(ISD::SETULT, VT, Expand); setCondCodeAction(ISD::SETULE, VT, Expand); setCondCodeAction(ISD::SETUGE, VT, Expand); @@ -402,12 +402,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, LibCall); + setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, LibCall); + setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, LibCall); + setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); @@ -415,7 +415,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, LibCall); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); @@ -451,10 +451,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); @@ -509,9 +509,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::i64, Custom); setOperationAction(ISD::CTPOP, MVT::i128, Custom); - setOperationAction(ISD::ABS, MVT::i32, Custom); - setOperationAction(ISD::ABS, MVT::i64, Custom); - + setOperationAction(ISD::ABS, MVT::i32, Custom); + setOperationAction(ISD::ABS, MVT::i64, Custom); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); for (MVT VT : 
MVT::fixedlen_vector_valuetypes()) { @@ -699,57 +699,57 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); - // Generate outline atomics library calls only if LSE was not specified for - // subtarget - if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall); - setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); - setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); -#define LCALLNAMES(A, B, N) \ - setLibcallName(A##N##_RELAX, #B #N "_relax"); \ - setLibcallName(A##N##_ACQ, #B #N "_acq"); \ - setLibcallName(A##N##_REL, #B #N "_rel"); \ - setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); -#define LCALLNAME4(A, B) \ - LCALLNAMES(A, B, 1) \ - LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) -#define LCALLNAME5(A, B) \ - LCALLNAMES(A, B, 1) \ - LCALLNAMES(A, B, 2) \ - LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) - LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) -#undef LCALLNAMES -#undef LCALLNAME4 -#undef LCALLNAME5 - } - + // Generate outline atomics library calls only if LSE was not specified for + // subtarget + if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall); + 
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); +#define LCALLNAMES(A, B, N) \ + setLibcallName(A##N##_RELAX, #B #N "_relax"); \ + setLibcallName(A##N##_ACQ, #B #N "_acq"); \ + setLibcallName(A##N##_REL, #B #N "_rel"); \ + setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); +#define LCALLNAME4(A, B) \ + LCALLNAMES(A, B, 1) \ + LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) +#define LCALLNAME5(A, B) \ + LCALLNAMES(A, B, 1) \ + LCALLNAMES(A, B, 2) \ + LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) + LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) +#undef LCALLNAMES +#undef LCALLNAME4 +#undef LCALLNAME5 + } + // 128-bit loads and stores can be done without expanding setOperationAction(ISD::LOAD, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::i128, Custom); @@ -839,8 +839,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Trap. setOperationAction(ISD::TRAP, MVT::Other, Legal); - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); - setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); // We combine OR nodes for bitfield operations. setTargetDAGCombine(ISD::OR); @@ -850,7 +850,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. 
setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::ABS); + setTargetDAGCombine(ISD::ABS); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); @@ -867,15 +867,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::MGATHER); - setTargetDAGCombine(ISD::MSCATTER); - + setTargetDAGCombine(ISD::MGATHER); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); @@ -884,8 +884,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::VECREDUCE_ADD); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::VECREDUCE_ADD); setTargetDAGCombine(ISD::GlobalAddress); @@ -1005,34 +1005,34 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); - // Saturates + // Saturates for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); - } + } - // Vector reductions + // Vector reductions for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { - if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { - setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); - - setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); - } - } - for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, - MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { - setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + + setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); + } } - setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); + for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, + MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + } + setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); @@ -1093,112 +1093,112 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // FIXME: Add custom lowering of MLOAD to 
handle different passthrus (not a // splat of 0 or undef) once vector selects supported in SVE codegen. See // D68877 for more details. - for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { - setOperationAction(ISD::BITREVERSE, VT, Custom); - setOperationAction(ISD::BSWAP, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::UINT_TO_FP, VT, Custom); - setOperationAction(ISD::SINT_TO_FP, VT, Custom); - setOperationAction(ISD::FP_TO_UINT, VT, Custom); - setOperationAction(ISD::FP_TO_SINT, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); - setOperationAction(ISD::MUL, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::SDIV, VT, Custom); - setOperationAction(ISD::UDIV, VT, Custom); - setOperationAction(ISD::SMIN, VT, Custom); - setOperationAction(ISD::UMIN, VT, Custom); - setOperationAction(ISD::SMAX, VT, Custom); - setOperationAction(ISD::UMAX, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::ABS, VT, Custom); - setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { + setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::BSWAP, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::UINT_TO_FP, VT, Custom); + setOperationAction(ISD::SINT_TO_FP, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); } - // Illegal unpacked integer 
vector types. - for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { + // Illegal unpacked integer vector types. + for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - } - - for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); - - // There are no legal MVT::nxv16f## based types. - if (VT != MVT::nxv16i1) { - setOperationAction(ISD::SINT_TO_FP, VT, Custom); - setOperationAction(ISD::UINT_TO_FP, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + } + + for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + + // There are no legal MVT::nxv16f## based types. + if (VT != MVT::nxv16i1) { + setOperationAction(ISD::SINT_TO_FP, VT, Custom); + setOperationAction(ISD::UINT_TO_FP, VT, Custom); } } - for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, - MVT::nxv4f32, MVT::nxv2f64}) { - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FDIV, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - setOperationAction(ISD::FMAXNUM, VT, Custom); - setOperationAction(ISD::FMINNUM, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FSUB, VT, Custom); - setOperationAction(ISD::FCEIL, VT, Custom); - setOperationAction(ISD::FFLOOR, VT, Custom); - setOperationAction(ISD::FNEARBYINT, VT, Custom); - setOperationAction(ISD::FRINT, VT, Custom); - setOperationAction(ISD::FROUND, VT, Custom); - setOperationAction(ISD::FROUNDEVEN, VT, Custom); - setOperationAction(ISD::FTRUNC, VT, Custom); - setOperationAction(ISD::FSQRT, VT, Custom); - setOperationAction(ISD::FABS, VT, Custom); - setOperationAction(ISD::FP_EXTEND, VT, Custom); - setOperationAction(ISD::FP_ROUND, VT, Custom); - setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); - } - - for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { - setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); - } - - setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); - - 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - + for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, + MVT::nxv4f32, MVT::nxv2f64}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FDIV, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + setOperationAction(ISD::FMAXNUM, VT, Custom); + setOperationAction(ISD::FMINNUM, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FSUB, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + setOperationAction(ISD::FNEARBYINT, VT, Custom); + setOperationAction(ISD::FRINT, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FROUNDEVEN, VT, Custom); + setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FSQRT, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + } + + for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + + setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + // NOTE: Currently this has to happen after computeRegisterProperties rather // than the preferred option of combining it with the addRegisterClass call. - if (Subtarget->useSVEForFixedLengthVectors()) { + if (Subtarget->useSVEForFixedLengthVectors()) { for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) if (useSVEForFixedLengthVectorVT(VT)) addTypeForFixedLengthSVE(VT); @@ -1216,61 +1216,61 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, VT, Custom); for (auto VT : {MVT::v8f16, MVT::v4f32}) setOperationAction(ISD::FP_ROUND, VT, Expand); - - // These operations are not supported on NEON but SVE can do them. 
- setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); - setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); - setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); - setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v1i64, Custom); - setOperationAction(ISD::MUL, MVT::v2i64, Custom); - setOperationAction(ISD::SDIV, MVT::v8i8, Custom); - setOperationAction(ISD::SDIV, MVT::v16i8, Custom); - setOperationAction(ISD::SDIV, MVT::v4i16, Custom); - setOperationAction(ISD::SDIV, MVT::v8i16, Custom); - setOperationAction(ISD::SDIV, MVT::v2i32, Custom); - setOperationAction(ISD::SDIV, MVT::v4i32, Custom); - setOperationAction(ISD::SDIV, MVT::v1i64, Custom); - setOperationAction(ISD::SDIV, MVT::v2i64, Custom); - setOperationAction(ISD::SMAX, MVT::v1i64, Custom); - setOperationAction(ISD::SMAX, MVT::v2i64, Custom); - setOperationAction(ISD::SMIN, MVT::v1i64, Custom); - setOperationAction(ISD::SMIN, MVT::v2i64, Custom); - setOperationAction(ISD::UDIV, MVT::v8i8, Custom); - setOperationAction(ISD::UDIV, MVT::v16i8, Custom); - setOperationAction(ISD::UDIV, MVT::v4i16, Custom); - setOperationAction(ISD::UDIV, MVT::v8i16, Custom); - setOperationAction(ISD::UDIV, MVT::v2i32, Custom); - setOperationAction(ISD::UDIV, MVT::v4i32, Custom); - setOperationAction(ISD::UDIV, MVT::v1i64, Custom); - setOperationAction(ISD::UDIV, MVT::v2i64, Custom); - setOperationAction(ISD::UMAX, MVT::v1i64, Custom); - setOperationAction(ISD::UMAX, MVT::v2i64, Custom); - setOperationAction(ISD::UMIN, MVT::v1i64, Custom); - setOperationAction(ISD::UMIN, MVT::v2i64, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); - - // Int operations with no NEON support. - for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, - MVT::v2i32, MVT::v4i32, MVT::v2i64}) { - setOperationAction(ISD::BITREVERSE, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); - } - - // FP operations with no NEON support. - for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, - MVT::v1f64, MVT::v2f64}) - setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); - - // Use SVE for vectors with more than 2 elements. - for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) - setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); + + // These operations are not supported on NEON but SVE can do them. 
+ setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v1i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); + setOperationAction(ISD::MUL, MVT::v1i64, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::SDIV, MVT::v8i8, Custom); + setOperationAction(ISD::SDIV, MVT::v16i8, Custom); + setOperationAction(ISD::SDIV, MVT::v4i16, Custom); + setOperationAction(ISD::SDIV, MVT::v8i16, Custom); + setOperationAction(ISD::SDIV, MVT::v2i32, Custom); + setOperationAction(ISD::SDIV, MVT::v4i32, Custom); + setOperationAction(ISD::SDIV, MVT::v1i64, Custom); + setOperationAction(ISD::SDIV, MVT::v2i64, Custom); + setOperationAction(ISD::SMAX, MVT::v1i64, Custom); + setOperationAction(ISD::SMAX, MVT::v2i64, Custom); + setOperationAction(ISD::SMIN, MVT::v1i64, Custom); + setOperationAction(ISD::SMIN, MVT::v2i64, Custom); + setOperationAction(ISD::UDIV, MVT::v8i8, Custom); + setOperationAction(ISD::UDIV, MVT::v16i8, Custom); + setOperationAction(ISD::UDIV, MVT::v4i16, Custom); + setOperationAction(ISD::UDIV, MVT::v8i16, Custom); + setOperationAction(ISD::UDIV, MVT::v2i32, Custom); + setOperationAction(ISD::UDIV, MVT::v4i32, Custom); + setOperationAction(ISD::UDIV, MVT::v1i64, Custom); + setOperationAction(ISD::UDIV, MVT::v2i64, Custom); + setOperationAction(ISD::UMAX, MVT::v1i64, Custom); + setOperationAction(ISD::UMAX, MVT::v2i64, Custom); + setOperationAction(ISD::UMIN, MVT::v1i64, Custom); + setOperationAction(ISD::UMIN, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom); + + // Int operations with no NEON support. + for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, + MVT::v2i32, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + } + + // FP operations with no NEON support. + for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, + MVT::v1f64, MVT::v2f64}) + setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + + // Use SVE for vectors with more than 2 elements. + for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); } } @@ -1342,7 +1342,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. if (VT.isFloatingPoint() && - VT.getVectorElementType() != MVT::bf16 && + VT.getVectorElementType() != MVT::bf16 && (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM}) @@ -1368,64 +1368,64 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Lower fixed length vector operations to scalable equivalents. 
- setOperationAction(ISD::ABS, VT, Custom); + setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); - setOperationAction(ISD::AND, VT, Custom); - setOperationAction(ISD::ANY_EXTEND, VT, Custom); - setOperationAction(ISD::BITREVERSE, VT, Custom); - setOperationAction(ISD::BSWAP, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::ANY_EXTEND, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::BSWAP, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); - setOperationAction(ISD::FCEIL, VT, Custom); - setOperationAction(ISD::FDIV, VT, Custom); - setOperationAction(ISD::FFLOOR, VT, Custom); - setOperationAction(ISD::FMA, VT, Custom); - setOperationAction(ISD::FMAXNUM, VT, Custom); - setOperationAction(ISD::FMINNUM, VT, Custom); - setOperationAction(ISD::FMUL, VT, Custom); - setOperationAction(ISD::FNEARBYINT, VT, Custom); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FRINT, VT, Custom); - setOperationAction(ISD::FROUND, VT, Custom); - setOperationAction(ISD::FSQRT, VT, Custom); - setOperationAction(ISD::FSUB, VT, Custom); - setOperationAction(ISD::FTRUNC, VT, Custom); + setOperationAction(ISD::FCEIL, VT, Custom); + setOperationAction(ISD::FDIV, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + setOperationAction(ISD::FMAXNUM, VT, Custom); + setOperationAction(ISD::FMINNUM, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FNEARBYINT, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FRINT, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FSQRT, VT, Custom); + setOperationAction(ISD::FSUB, VT, Custom); + setOperationAction(ISD::FTRUNC, VT, Custom); setOperationAction(ISD::LOAD, VT, Custom); - setOperationAction(ISD::MUL, VT, Custom); - setOperationAction(ISD::OR, VT, Custom); - setOperationAction(ISD::SDIV, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SIGN_EXTEND, VT, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); - setOperationAction(ISD::SMAX, VT, Custom); - setOperationAction(ISD::SMIN, VT, Custom); - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SIGN_EXTEND, VT, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); + setOperationAction(ISD::SMAX, VT, Custom); + setOperationAction(ISD::SMIN, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); - setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::UDIV, VT, Custom); - setOperationAction(ISD::UMAX, 
VT, Custom); - setOperationAction(ISD::UMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_AND, VT, Custom); - setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_OR, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::XOR, VT, Custom); - setOperationAction(ISD::ZERO_EXTEND, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::XOR, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND, VT, Custom); } void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { @@ -1597,7 +1597,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( KnownBits Known2; Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = KnownBits::commonBits(Known, Known2); break; } case AArch64ISD::LOADgot: @@ -1737,38 +1737,38 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::THREAD_POINTER) MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ADD_PRED) - MAKE_CASE(AArch64ISD::MUL_PRED) + MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::SDIV_PRED) - MAKE_CASE(AArch64ISD::SHL_PRED) - MAKE_CASE(AArch64ISD::SMAX_PRED) - MAKE_CASE(AArch64ISD::SMIN_PRED) - MAKE_CASE(AArch64ISD::SRA_PRED) - MAKE_CASE(AArch64ISD::SRL_PRED) - MAKE_CASE(AArch64ISD::SUB_PRED) + MAKE_CASE(AArch64ISD::SHL_PRED) + MAKE_CASE(AArch64ISD::SMAX_PRED) + MAKE_CASE(AArch64ISD::SMIN_PRED) + MAKE_CASE(AArch64ISD::SRA_PRED) + MAKE_CASE(AArch64ISD::SRL_PRED) + MAKE_CASE(AArch64ISD::SUB_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) - MAKE_CASE(AArch64ISD::UMAX_PRED) - MAKE_CASE(AArch64ISD::UMIN_PRED) - MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) - 
MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::UMAX_PRED) + MAKE_CASE(AArch64ISD::UMIN_PRED) + MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) MAKE_CASE(AArch64ISD::ADC) MAKE_CASE(AArch64ISD::SBC) @@ -1837,14 +1837,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::UADDV) MAKE_CASE(AArch64ISD::SRHADD) MAKE_CASE(AArch64ISD::URHADD) - MAKE_CASE(AArch64ISD::SHADD) - MAKE_CASE(AArch64ISD::UHADD) + MAKE_CASE(AArch64ISD::SHADD) + MAKE_CASE(AArch64ISD::UHADD) MAKE_CASE(AArch64ISD::SMINV) MAKE_CASE(AArch64ISD::UMINV) MAKE_CASE(AArch64ISD::SMAXV) MAKE_CASE(AArch64ISD::UMAXV) - MAKE_CASE(AArch64ISD::SADDV_PRED) - MAKE_CASE(AArch64ISD::UADDV_PRED) + MAKE_CASE(AArch64ISD::SADDV_PRED) + MAKE_CASE(AArch64ISD::UADDV_PRED) MAKE_CASE(AArch64ISD::SMAXV_PRED) MAKE_CASE(AArch64ISD::UMAXV_PRED) MAKE_CASE(AArch64ISD::SMINV_PRED) @@ -1862,16 +1862,16 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FADD_PRED) MAKE_CASE(AArch64ISD::FADDA_PRED) MAKE_CASE(AArch64ISD::FADDV_PRED) - MAKE_CASE(AArch64ISD::FDIV_PRED) + MAKE_CASE(AArch64ISD::FDIV_PRED) MAKE_CASE(AArch64ISD::FMA_PRED) MAKE_CASE(AArch64ISD::FMAXV_PRED) - MAKE_CASE(AArch64ISD::FMAXNM_PRED) + MAKE_CASE(AArch64ISD::FMAXNM_PRED) MAKE_CASE(AArch64ISD::FMAXNMV_PRED) MAKE_CASE(AArch64ISD::FMINV_PRED) - MAKE_CASE(AArch64ISD::FMINNM_PRED) + MAKE_CASE(AArch64ISD::FMINNM_PRED) MAKE_CASE(AArch64ISD::FMINNMV_PRED) - MAKE_CASE(AArch64ISD::FMUL_PRED) - MAKE_CASE(AArch64ISD::FSUB_PRED) + MAKE_CASE(AArch64ISD::FMUL_PRED) + MAKE_CASE(AArch64ISD::FSUB_PRED) MAKE_CASE(AArch64ISD::BIT) MAKE_CASE(AArch64ISD::CBZ) MAKE_CASE(AArch64ISD::CBNZ) @@ -1983,15 +1983,15 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::LDP) MAKE_CASE(AArch64ISD::STP) MAKE_CASE(AArch64ISD::STNP) - MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) - MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) - 
MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::INDEX_VECTOR) - MAKE_CASE(AArch64ISD::UABD) - MAKE_CASE(AArch64ISD::SABD) - MAKE_CASE(AArch64ISD::CALL_RVMARKER) + MAKE_CASE(AArch64ISD::UABD) + MAKE_CASE(AArch64ISD::SABD) + MAKE_CASE(AArch64ISD::CALL_RVMARKER) } #undef MAKE_CASE return nullptr; @@ -2079,7 +2079,7 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: - case TargetOpcode::STATEPOINT: + case TargetOpcode::STATEPOINT: return emitPatchPoint(MI, BB); case AArch64::CATCHRET: @@ -2905,9 +2905,9 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { return std::make_pair(Value, Overflow); } -SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) - return LowerToScalableOp(Op, DAG); +SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToScalableOp(Op, DAG); SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); @@ -3083,18 +3083,18 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isScalableVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); - + if (Op.getValueType().isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); - return SDValue(); + return SDValue(); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isScalableVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); - + if (Op.getValueType().isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); @@ -3108,7 +3108,7 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, return Op; } - return SDValue(); + return SDValue(); } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, @@ -3118,14 +3118,14 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); - - if (VT.isScalableVector()) { - unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT - ? AArch64ISD::FCVTZU_MERGE_PASSTHRU - : AArch64ISD::FCVTZS_MERGE_PASSTHRU; - return LowerToPredicatedOp(Op, DAG, Opcode); - } - + + if (VT.isScalableVector()) { + unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT + ? AArch64ISD::FCVTZU_MERGE_PASSTHRU + : AArch64ISD::FCVTZS_MERGE_PASSTHRU; + return LowerToPredicatedOp(Op, DAG, Opcode); + } + unsigned NumElts = InVT.getVectorNumElements(); // f16 conversions are promoted to f32 when full fp16 is not supported. 
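// Illustrative sketch, not from the patch: a scalar analogue of the
// VTSize < InVTSize branch that follows. When the FP source is wider than the
// requested integer result (for example f64 lanes converted to i32 lanes), the
// conversion is performed at the source width first and then truncated.
#include <cstdint>

static constexpr std::int32_t fpToNarrowInt(double Src) {
  const std::int64_t Wide = static_cast<std::int64_t>(Src); // convert at the wide width
  return static_cast<std::int32_t>(Wide);                   // then truncate to the result type
}

static_assert(fpToNarrowInt(41.9) == 41);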
@@ -3138,9 +3138,9 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); } - uint64_t VTSize = VT.getFixedSizeInBits(); - uint64_t InVTSize = InVT.getFixedSizeInBits(); - if (VTSize < InVTSize) { + uint64_t VTSize = VT.getFixedSizeInBits(); + uint64_t InVTSize = InVT.getFixedSizeInBits(); + if (VTSize < InVTSize) { SDLoc dl(Op); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), @@ -3148,7 +3148,7 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } - if (VTSize > InVTSize) { + if (VTSize > InVTSize) { SDLoc dl(Op); MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), @@ -3183,11 +3183,11 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, return Op; } - return SDValue(); + return SDValue(); } -SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. @@ -3195,38 +3195,38 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, SDLoc dl(Op); SDValue In = Op.getOperand(0); EVT InVT = In.getValueType(); - unsigned Opc = Op.getOpcode(); - bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; - - if (VT.isScalableVector()) { - if (InVT.getVectorElementType() == MVT::i1) { - // We can't directly extend an SVE predicate; extend it first. - unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - EVT CastVT = getPromotedVTForPredicate(InVT); - In = DAG.getNode(CastOpc, dl, CastVT, In); - return DAG.getNode(Opc, dl, VT, In); - } - - unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU - : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; - return LowerToPredicatedOp(Op, DAG, Opcode); - } - - uint64_t VTSize = VT.getFixedSizeInBits(); - uint64_t InVTSize = InVT.getFixedSizeInBits(); - if (VTSize < InVTSize) { + unsigned Opc = Op.getOpcode(); + bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; + + if (VT.isScalableVector()) { + if (InVT.getVectorElementType() == MVT::i1) { + // We can't directly extend an SVE predicate; extend it first. + unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + EVT CastVT = getPromotedVTForPredicate(InVT); + In = DAG.getNode(CastOpc, dl, CastVT, In); + return DAG.getNode(Opc, dl, VT, In); + } + + unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU + : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; + return LowerToPredicatedOp(Op, DAG, Opcode); + } + + uint64_t VTSize = VT.getFixedSizeInBits(); + uint64_t InVTSize = InVT.getFixedSizeInBits(); + if (VTSize < InVTSize) { MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); - In = DAG.getNode(Opc, dl, CastVT, In); + In = DAG.getNode(Opc, dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } - if (VTSize > InVTSize) { - unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + if (VTSize > InVTSize) { + unsigned CastOpc = IsSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); - return DAG.getNode(Opc, dl, VT, In); + return DAG.getNode(Opc, dl, VT, In); } return Op; @@ -3259,7 +3259,7 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, // fp128. if (Op.getValueType() != MVT::f128) return Op; - return SDValue(); + return SDValue(); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, @@ -3373,8 +3373,8 @@ static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, } static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) + if (N->getOpcode() == ISD::SIGN_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), @@ -3399,13 +3399,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::SIGN_EXTEND || - N->getOpcode() == ISD::ANY_EXTEND || + N->getOpcode() == ISD::ANY_EXTEND || isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::ZERO_EXTEND || - N->getOpcode() == ISD::ANY_EXTEND || + N->getOpcode() == ISD::ANY_EXTEND || isExtendedBUILD_VECTOR(N, DAG, false); } @@ -3454,15 +3454,15 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, return DAG.getMergeValues({AND, Chain}, dl); } -SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - // If SVE is available then i64 vector multiplications can also be made legal. - bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; - - if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON); - +SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If SVE is available then i64 vector multiplications can also be made legal. + bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64; + + if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED, OverrideNEON); + // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. assert(VT.is128BitVector() && VT.isInteger() && @@ -3623,77 +3623,77 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_ptrue: return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), Op.getOperand(1)); - case Intrinsic::aarch64_sve_clz: - return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_cnt: { - SDValue Data = Op.getOperand(3); - // CTPOP only supports integer operands. 
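// Illustrative sketch, not from the patch: the scalar idea behind the bitcast
// that follows. Population count is only defined on integer bits, so FP data is
// first reinterpreted as an integer of the same width (assumes C++20).
#include <bit>
#include <cstdint>

static constexpr int popcountOfDouble(double D) {
  const std::uint64_t Bits = std::bit_cast<std::uint64_t>(D); // same-width reinterpret
  return std::popcount(Bits);                                 // integer-only popcount
}

static_assert(popcountOfDouble(1.0) == 10); // 1.0 has bit pattern 0x3FF0000000000000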
- if (Data.getValueType().isFloatingPoint()) - Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); - return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Data, Op.getOperand(1)); - } + case Intrinsic::aarch64_sve_clz: + return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_cnt: { + SDValue Data = Op.getOperand(3); + // CTPOP only supports integer operands. + if (Data.getValueType().isFloatingPoint()) + Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data); + return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Data, Op.getOperand(1)); + } case Intrinsic::aarch64_sve_dupq_lane: return LowerDUPQLane(Op, DAG); case Intrinsic::aarch64_sve_convert_from_svbool: return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(), Op.getOperand(1)); - case Intrinsic::aarch64_sve_fneg: - return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frintp: - return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frintm: - return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frinti: - return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frintx: - return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frinta: - return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frintn: - return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frintz: - return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_ucvtf: - return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, - Op.getValueType(), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_scvtf: - return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, - Op.getValueType(), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_fcvtzu: - return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, - Op.getValueType(), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_fcvtzs: - return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, - Op.getValueType(), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_fsqrt: - return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_frecpx: - return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_fabs: - return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), 
Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_abs: - return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_neg: - return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_fneg: + return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frintp: + return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frintm: + return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frinti: + return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frintx: + return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frinta: + return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frintn: + return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frintz: + return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_ucvtf: + return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_scvtf: + return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_fcvtzu: + return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_fcvtzs: + return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_fsqrt: + return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frecpx: + return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_fabs: + return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_abs: + return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_neg: + return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_convert_to_svbool: { EVT OutVT = Op.getValueType(); EVT InVT = Op.getOperand(1).getValueType(); @@ -3719,49 +3719,49 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return 
DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), Op.getOperand(1), Scalar); } - case Intrinsic::aarch64_sve_rbit: - return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, - Op.getValueType(), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_revb: - return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sve_sxtb: - return DAG.getNode( - AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), - DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_sxth: - return DAG.getNode( - AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), - DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_sxtw: - return DAG.getNode( - AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), - DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_uxtb: - return DAG.getNode( - AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), - DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_uxth: - return DAG.getNode( - AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), - DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), - Op.getOperand(1)); - case Intrinsic::aarch64_sve_uxtw: - return DAG.getNode( - AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(3), - DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), - Op.getOperand(1)); + case Intrinsic::aarch64_sve_rbit: + return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl, + Op.getValueType(), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_revb: + return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_sxtb: + return DAG.getNode( + AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), + DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_sxth: + return DAG.getNode( + AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), + DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_sxtw: + return DAG.getNode( + AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), + DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uxtb: + return DAG.getNode( + AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), + DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uxth: + return DAG.getNode( + AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), + 
DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uxtw: + return DAG.getNode( + AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), + DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)), + Op.getOperand(1)); case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); @@ -3801,291 +3801,291 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::aarch64_neon_srhadd: - case Intrinsic::aarch64_neon_urhadd: - case Intrinsic::aarch64_neon_shadd: - case Intrinsic::aarch64_neon_uhadd: { - bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || - IntNo == Intrinsic::aarch64_neon_shadd); - bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || - IntNo == Intrinsic::aarch64_neon_urhadd); - unsigned Opcode = - IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); + case Intrinsic::aarch64_neon_urhadd: + case Intrinsic::aarch64_neon_shadd: + case Intrinsic::aarch64_neon_uhadd: { + bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || + IntNo == Intrinsic::aarch64_neon_shadd); + bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || + IntNo == Intrinsic::aarch64_neon_urhadd); + unsigned Opcode = + IsSignedAdd ? (IsRoundingAdd ? AArch64ISD::SRHADD : AArch64ISD::SHADD) + : (IsRoundingAdd ? AArch64ISD::URHADD : AArch64ISD::UHADD); return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } - - case Intrinsic::aarch64_neon_uabd: { - return DAG.getNode(AArch64ISD::UABD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - case Intrinsic::aarch64_neon_sabd: { - return DAG.getNode(AArch64ISD::SABD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - } -} - -bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { - if (VT.getVectorElementType() == MVT::i32 && - VT.getVectorElementCount().getKnownMinValue() >= 4) - return true; - - return false; -} - + + case Intrinsic::aarch64_neon_uabd: { + return DAG.getNode(AArch64ISD::UABD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::aarch64_neon_sabd: { + return DAG.getNode(AArch64ISD::SABD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + } +} + +bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { + if (VT.getVectorElementType() == MVT::i32 && + VT.getVectorElementCount().getKnownMinValue() >= 4) + return true; + + return false; +} + bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return ExtVal.getValueType().isScalableVector(); } -unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { - std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { - {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), - AArch64ISD::GLD1_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), - AArch64ISD::GLD1_UXTW_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), - AArch64ISD::GLD1_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), - AArch64ISD::GLD1_SXTW_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), - AArch64ISD::GLD1_SCALED_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), - 
AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), - AArch64ISD::GLD1_SCALED_MERGE_ZERO}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), - AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, - }; - auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); - return AddrModes.find(Key)->second; -} - -unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { - std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { - {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), - AArch64ISD::SST1_PRED}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), - AArch64ISD::SST1_UXTW_PRED}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), - AArch64ISD::SST1_PRED}, - {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), - AArch64ISD::SST1_SXTW_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), - AArch64ISD::SST1_SCALED_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), - AArch64ISD::SST1_UXTW_SCALED_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), - AArch64ISD::SST1_SCALED_PRED}, - {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), - AArch64ISD::SST1_SXTW_SCALED_PRED}, - }; - auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); - return AddrModes.find(Key)->second; -} - -unsigned getSignExtendedGatherOpcode(unsigned Opcode) { - switch (Opcode) { - default: - llvm_unreachable("unimplemented opcode"); - return Opcode; - case AArch64ISD::GLD1_MERGE_ZERO: - return AArch64ISD::GLD1S_MERGE_ZERO; - case AArch64ISD::GLD1_IMM_MERGE_ZERO: - return AArch64ISD::GLD1S_IMM_MERGE_ZERO; - case AArch64ISD::GLD1_UXTW_MERGE_ZERO: - return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; - case AArch64ISD::GLD1_SXTW_MERGE_ZERO: - return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; - case AArch64ISD::GLD1_SCALED_MERGE_ZERO: - return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; - case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: - return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; - case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: - return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; - } -} - -bool getGatherScatterIndexIsExtended(SDValue Index) { - unsigned Opcode = Index.getOpcode(); - if (Opcode == ISD::SIGN_EXTEND_INREG) - return true; - - if (Opcode == ISD::AND) { - SDValue Splat = Index.getOperand(1); - if (Splat.getOpcode() != ISD::SPLAT_VECTOR) - return false; - ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0)); - if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF) - return false; - return true; - } - - return false; -} - -// If the base pointer of a masked gather or scatter is null, we -// may be able to swap BasePtr & Index and use the vector + register -// or vector + immediate addressing mode, e.g. 
-// VECTOR + REGISTER: -// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices) -// -> getelementptr %offset, <vscale x N x T> %indices -// VECTOR + IMMEDIATE: -// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices) -// -> getelementptr #x, <vscale x N x T> %indices -void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, - unsigned &Opcode, bool IsGather, - SelectionDAG &DAG) { - if (!isNullConstant(BasePtr)) - return; - - ConstantSDNode *Offset = nullptr; - if (Index.getOpcode() == ISD::ADD) - if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) { - if (isa<ConstantSDNode>(SplatVal)) - Offset = cast<ConstantSDNode>(SplatVal); - else { - BasePtr = SplatVal; - Index = Index->getOperand(0); - return; - } - } - - unsigned NewOp = - IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED; - - if (!Offset) { - std::swap(BasePtr, Index); - Opcode = NewOp; - return; - } - - uint64_t OffsetVal = Offset->getZExtValue(); - unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8; - auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64); - - if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) { - // Index is out of range for the immediate addressing mode - BasePtr = ConstOffset; - Index = Index->getOperand(0); - return; - } - - // Immediate is in range - Opcode = NewOp; - BasePtr = Index->getOperand(0); - Index = ConstOffset; -} - -SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); - assert(MGT && "Can only custom lower gather load nodes"); - - SDValue Index = MGT->getIndex(); - SDValue Chain = MGT->getChain(); - SDValue PassThru = MGT->getPassThru(); - SDValue Mask = MGT->getMask(); - SDValue BasePtr = MGT->getBasePtr(); - ISD::LoadExtType ExtTy = MGT->getExtensionType(); - - ISD::MemIndexType IndexType = MGT->getIndexType(); - bool IsScaled = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; - bool IsSigned = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; - bool IdxNeedsExtend = - getGatherScatterIndexIsExtended(Index) || - Index.getSimpleValueType().getVectorElementType() == MVT::i32; - bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; - - EVT VT = PassThru.getSimpleValueType(); - EVT MemVT = MGT->getMemoryVT(); - SDValue InputVT = DAG.getValueType(MemVT); - - if (VT.getVectorElementType() == MVT::bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); - - // Handle FP data by using an integer gather and casting the result. 
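// Illustrative sketch, not from the patch: the scalar shape of the FP handling
// that follows. The data is gathered in the integer domain and the loaded bits
// are then reinterpreted back to the FP type, leaving the value unchanged
// (assumes C++20 for std::bit_cast).
#include <bit>
#include <cstddef>
#include <cstdint>

static double gatherOneLaneAsFP(const std::uint64_t *IntData, std::size_t Idx) {
  const std::uint64_t Raw = IntData[Idx]; // "integer gather" of a single lane
  return std::bit_cast<double>(Raw);      // cast the result back to the FP type
}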
- if (VT.isFloatingPoint()) { - EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount()); - PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG); - InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); - } - - SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other); - - if (getGatherScatterIndexIsExtended(Index)) - Index = Index.getOperand(0); - - unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend); - selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, - /*isGather=*/true, DAG); - - if (ResNeedsSignExtend) - Opcode = getSignExtendedGatherOpcode(Opcode); - - SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru}; - SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops); - - if (VT.isFloatingPoint()) { - SDValue Cast = getSVESafeBitCast(VT, Gather, DAG); - return DAG.getMergeValues({Cast, Gather}, DL); - } - - return Gather; -} - -SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); - assert(MSC && "Can only custom lower scatter store nodes"); - - SDValue Index = MSC->getIndex(); - SDValue Chain = MSC->getChain(); - SDValue StoreVal = MSC->getValue(); - SDValue Mask = MSC->getMask(); - SDValue BasePtr = MSC->getBasePtr(); - - ISD::MemIndexType IndexType = MSC->getIndexType(); - bool IsScaled = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; - bool IsSigned = - IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; - bool NeedsExtend = - getGatherScatterIndexIsExtended(Index) || - Index.getSimpleValueType().getVectorElementType() == MVT::i32; - - EVT VT = StoreVal.getSimpleValueType(); - SDVTList VTs = DAG.getVTList(MVT::Other); - EVT MemVT = MSC->getMemoryVT(); - SDValue InputVT = DAG.getValueType(MemVT); - - if (VT.getVectorElementType() == MVT::bf16 && - !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) - return SDValue(); - - // Handle FP data by casting the data so an integer scatter can be used. 
- if (VT.isFloatingPoint()) { - EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount()); - StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG); - InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); - } - - if (getGatherScatterIndexIsExtended(Index)) - Index = Index.getOperand(0); - - unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend); - selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, - /*isGather=*/false, DAG); - - SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; - return DAG.getNode(Opcode, DL, VTs, Ops); -} - +unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { + std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::GLD1_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::GLD1_UXTW_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::GLD1_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::GLD1_SXTW_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::GLD1_SCALED_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::GLD1_SCALED_MERGE_ZERO}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, + }; + auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); + return AddrModes.find(Key)->second; +} + +unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { + std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::SST1_PRED}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::SST1_UXTW_PRED}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::SST1_PRED}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::SST1_SXTW_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::SST1_SCALED_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::SST1_UXTW_SCALED_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::SST1_SCALED_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::SST1_SXTW_SCALED_PRED}, + }; + auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); + return AddrModes.find(Key)->second; +} + +unsigned getSignExtendedGatherOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("unimplemented opcode"); + return Opcode; + case AArch64ISD::GLD1_MERGE_ZERO: + return AArch64ISD::GLD1S_MERGE_ZERO; + case AArch64ISD::GLD1_IMM_MERGE_ZERO: + return AArch64ISD::GLD1S_IMM_MERGE_ZERO; + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + return 
AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; + } +} + +bool getGatherScatterIndexIsExtended(SDValue Index) { + unsigned Opcode = Index.getOpcode(); + if (Opcode == ISD::SIGN_EXTEND_INREG) + return true; + + if (Opcode == ISD::AND) { + SDValue Splat = Index.getOperand(1); + if (Splat.getOpcode() != ISD::SPLAT_VECTOR) + return false; + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0)); + if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF) + return false; + return true; + } + + return false; +} + +// If the base pointer of a masked gather or scatter is null, we +// may be able to swap BasePtr & Index and use the vector + register +// or vector + immediate addressing mode, e.g. +// VECTOR + REGISTER: +// getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices) +// -> getelementptr %offset, <vscale x N x T> %indices +// VECTOR + IMMEDIATE: +// getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices) +// -> getelementptr #x, <vscale x N x T> %indices +void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT, + unsigned &Opcode, bool IsGather, + SelectionDAG &DAG) { + if (!isNullConstant(BasePtr)) + return; + + ConstantSDNode *Offset = nullptr; + if (Index.getOpcode() == ISD::ADD) + if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) { + if (isa<ConstantSDNode>(SplatVal)) + Offset = cast<ConstantSDNode>(SplatVal); + else { + BasePtr = SplatVal; + Index = Index->getOperand(0); + return; + } + } + + unsigned NewOp = + IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED; + + if (!Offset) { + std::swap(BasePtr, Index); + Opcode = NewOp; + return; + } + + uint64_t OffsetVal = Offset->getZExtValue(); + unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8; + auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64); + + if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) { + // Index is out of range for the immediate addressing mode + BasePtr = ConstOffset; + Index = Index->getOperand(0); + return; + } + + // Immediate is in range + Opcode = NewOp; + BasePtr = Index->getOperand(0); + Index = ConstOffset; +} + +SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op); + assert(MGT && "Can only custom lower gather load nodes"); + + SDValue Index = MGT->getIndex(); + SDValue Chain = MGT->getChain(); + SDValue PassThru = MGT->getPassThru(); + SDValue Mask = MGT->getMask(); + SDValue BasePtr = MGT->getBasePtr(); + ISD::LoadExtType ExtTy = MGT->getExtensionType(); + + ISD::MemIndexType IndexType = MGT->getIndexType(); + bool IsScaled = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; + bool IsSigned = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; + bool IdxNeedsExtend = + getGatherScatterIndexIsExtended(Index) || + Index.getSimpleValueType().getVectorElementType() == MVT::i32; + bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; + + EVT VT = PassThru.getSimpleValueType(); + EVT MemVT = MGT->getMemoryVT(); + SDValue InputVT = DAG.getValueType(MemVT); + + if (VT.getVectorElementType() == MVT::bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + // Handle FP data by using an integer gather and casting the result. 
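// Illustrative sketch, not from the patch: the immediate-range test used by
// selectGatherScatterAddrMode above, restated as a stand-alone predicate. A
// splatted byte offset can use the vector-plus-immediate form only if it is a
// whole number of elements and the element index fits the 0..31 immediate.
#include <cstdint>

static constexpr bool fitsImmediateGatherOffset(std::uint64_t OffsetBytes,
                                                unsigned ScalarSizeInBytes) {
  return OffsetBytes % ScalarSizeInBytes == 0 &&
         OffsetBytes / ScalarSizeInBytes <= 31;
}

static_assert(fitsImmediateGatherOffset(124, 4));  // element index 31, in range
static_assert(!fitsImmediateGatherOffset(128, 4)); // element index 32, out of range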
+ if (VT.isFloatingPoint()) { + EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount()); + PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG); + InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); + } + + SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other); + + if (getGatherScatterIndexIsExtended(Index)) + Index = Index.getOperand(0); + + unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend); + selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, + /*isGather=*/true, DAG); + + if (ResNeedsSignExtend) + Opcode = getSignExtendedGatherOpcode(Opcode); + + SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru}; + SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops); + + if (VT.isFloatingPoint()) { + SDValue Cast = getSVESafeBitCast(VT, Gather, DAG); + return DAG.getMergeValues({Cast, Gather}, DL); + } + + return Gather; +} + +SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); + assert(MSC && "Can only custom lower scatter store nodes"); + + SDValue Index = MSC->getIndex(); + SDValue Chain = MSC->getChain(); + SDValue StoreVal = MSC->getValue(); + SDValue Mask = MSC->getMask(); + SDValue BasePtr = MSC->getBasePtr(); + + ISD::MemIndexType IndexType = MSC->getIndexType(); + bool IsScaled = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; + bool IsSigned = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; + bool NeedsExtend = + getGatherScatterIndexIsExtended(Index) || + Index.getSimpleValueType().getVectorElementType() == MVT::i32; + + EVT VT = StoreVal.getSimpleValueType(); + SDVTList VTs = DAG.getVTList(MVT::Other); + EVT MemVT = MSC->getMemoryVT(); + SDValue InputVT = DAG.getValueType(MemVT); + + if (VT.getVectorElementType() == MVT::bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + // Handle FP data by casting the data so an integer scatter can be used. + if (VT.isFloatingPoint()) { + EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount()); + StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG); + InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); + } + + if (getGatherScatterIndexIsExtended(Index)) + Index = Index.getOperand(0); + + unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend); + selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode, + /*isGather=*/false, DAG); + + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; + return DAG.getNode(Opcode, DL, VTs, Ops); +} + // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, @@ -4151,9 +4151,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, // 256 bit non-temporal stores can be lowered to STNP. Do this as part of // the custom lowering, as there are no un-paired non-temporal stores and // legalization will break up 256 bit inputs. 
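// Illustrative sketch, not from the patch: the eligibility test that follows,
// restated as a stand-alone predicate. A 256-bit non-temporal vector store is
// split into two halves (lowered to STNP) only when the element count is even
// and the element width is one of 8, 16, 32 or 64 bits.
static constexpr bool canSplitNonTemporalStore(unsigned TotalBits,
                                               unsigned ElemBits,
                                               unsigned NumElems) {
  const bool ElemOK =
      ElemBits == 8 || ElemBits == 16 || ElemBits == 32 || ElemBits == 64;
  return TotalBits == 256 && NumElems % 2 == 0 && ElemOK;
}

static_assert(canSplitNonTemporalStore(256, 64, 4)); // e.g. a 4 x i64 store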
- ElementCount EC = MemVT.getVectorElementCount(); + ElementCount EC = MemVT.getVectorElementCount(); if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && - EC.isKnownEven() && + EC.isKnownEven() && ((MemVT.getScalarSizeInBits() == 8u || MemVT.getScalarSizeInBits() == 16u || MemVT.getScalarSizeInBits() == 32u || @@ -4162,11 +4162,11 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); - SDValue Hi = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, - MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - StoreNode->getValue(), - DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), + DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, @@ -4191,25 +4191,25 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, return SDValue(); } -// Generate SUBS and CSEL for integer abs. -SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); - - if (VT.isVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); - - SDLoc DL(Op); - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - Op.getOperand(0)); - // Generate SUBS & CSEL. - SDValue Cmp = - DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - Op.getOperand(0), DAG.getConstant(0, DL, VT)); - return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, - DAG.getConstant(AArch64CC::PL, DL, MVT::i32), - Cmp.getValue(1)); -} - +// Generate SUBS and CSEL for integer abs. +SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + + if (VT.isVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU); + + SDLoc DL(Op); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Op.getOperand(0)); + // Generate SUBS & CSEL. 
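// Illustrative sketch, not from the patch: what the SUBS + CSEL pair computes,
// written as scalar C++. The value is negated, compared against zero, and the
// original is selected when it is non-negative (the PL condition), otherwise
// the negation. (INT64_MIN is left aside: signed overflow is UB in C++.)
#include <cstdint>

static constexpr std::int64_t absViaSelect(std::int64_t X) {
  const std::int64_t Neg = 0 - X; // the ISD::SUB (0 - x) computed above
  return X >= 0 ? X : Neg;        // CSEL on the PL (plus or zero) condition
}

static_assert(absViaSelect(-7) == 7 && absViaSelect(5) == 5);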
+ SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), + Op.getOperand(0), DAG.getConstant(0, DL, VT)); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg, + DAG.getConstant(AArch64CC::PL, DL, MVT::i32), + Cmp.getValue(1)); +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); @@ -4262,35 +4262,35 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); case ISD::FSUB: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); case ISD::FMUL: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); case ISD::FMA: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); - case ISD::FNEG: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); - case ISD::FCEIL: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); - case ISD::FFLOOR: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); - case ISD::FNEARBYINT: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); - case ISD::FRINT: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); - case ISD::FROUND: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); - case ISD::FROUNDEVEN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); - case ISD::FTRUNC: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); - case ISD::FSQRT: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); - case ISD::FABS: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); + case ISD::FNEG: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); + case ISD::FCEIL: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU); + case ISD::FFLOOR: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU); + case ISD::FNEARBYINT: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); + case ISD::FRINT: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU); + case ISD::FROUND: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU); + case ISD::FROUNDEVEN: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); + case ISD::FTRUNC: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU); + case ISD::FSQRT: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU); + case ISD::FABS: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); @@ -4304,8 +4304,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); - case ISD::CONCAT_VECTORS: - return LowerCONCAT_VECTORS(Op, DAG); + case ISD::CONCAT_VECTORS: + return LowerCONCAT_VECTORS(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: @@ -4322,19 
+4322,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerINSERT_SUBVECTOR(Op, DAG); case ISD::SDIV: case ISD::UDIV: - return LowerDIV(Op, DAG); + return LowerDIV(Op, DAG); case ISD::SMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, + /*OverrideNEON=*/true); case ISD::UMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED, + /*OverrideNEON=*/true); case ISD::SMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, + /*OverrideNEON=*/true); case ISD::UMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, - /*OverrideNEON=*/true); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, + /*OverrideNEON=*/true); case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -4374,21 +4374,21 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::MGATHER: - return LowerMGATHER(Op, DAG); - case ISD::MSCATTER: - return LowerMSCATTER(Op, DAG); - case ISD::VECREDUCE_SEQ_FADD: - return LowerVECREDUCE_SEQ_FADD(Op, DAG); + case ISD::MGATHER: + return LowerMGATHER(Op, DAG); + case ISD::MSCATTER: + return LowerMSCATTER(Op, DAG); + case ISD::VECREDUCE_SEQ_FADD: + return LowerVECREDUCE_SEQ_FADD(Op, DAG); case ISD::VECREDUCE_ADD: - case ISD::VECREDUCE_AND: - case ISD::VECREDUCE_OR: - case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: - case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: return LowerVECREDUCE(Op, DAG); @@ -4400,21 +4400,21 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::VSCALE: return LowerVSCALE(Op, DAG); - case ISD::ANY_EXTEND: - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); - case ISD::SIGN_EXTEND_INREG: { - // Only custom lower when ExtraVT has a legal byte based element type. - EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && - (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) - return SDValue(); - - return LowerToPredicatedOp(Op, DAG, - AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); - } + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); + case ISD::SIGN_EXTEND_INREG: { + // Only custom lower when ExtraVT has a legal byte based element type. 
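// Illustrative sketch, not from the patch: what SIGN_EXTEND_INREG with an i8
// ExtraVT means for a single lane. The low 8 bits already sit in a wider
// register and are sign-extended in place to the full width (assumes C++20
// two's-complement conversion rules).
#include <cstdint>

static constexpr std::int32_t signExtendInRegI8(std::int32_t X) {
  return static_cast<std::int32_t>(static_cast<std::int8_t>(X & 0xFF));
}

static_assert(signExtendInRegI8(0x000000F0) == -16);
static_assert(signExtendInRegI8(0x0000007F) == 127);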
+ EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); + EVT ExtraEltVT = ExtraVT.getVectorElementType(); + if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && + (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) + return SDValue(); + + return LowerToPredicatedOp(Op, DAG, + AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); + } case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::LOAD: @@ -4422,49 +4422,49 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFixedLengthVectorLoadToSVE(Op, DAG); llvm_unreachable("Unexpected request to lower ISD::LOAD"); case ISD::ADD: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); - case ISD::AND: - return LowerToScalableOp(Op, DAG); - case ISD::SUB: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); - case ISD::FMAXNUM: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); - case ISD::FMINNUM: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); - case ISD::VSELECT: - return LowerFixedLengthVectorSelectToSVE(Op, DAG); - case ISD::ABS: - return LowerABS(Op, DAG); - case ISD::BITREVERSE: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU, - /*OverrideNEON=*/true); - case ISD::BSWAP: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); - case ISD::CTLZ: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU, - /*OverrideNEON=*/true); - case ISD::CTTZ: - return LowerCTTZ(Op, DAG); - } -} - -bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { - return !Subtarget->useSVEForFixedLengthVectors(); -} - -bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( - EVT VT, bool OverrideNEON) const { - if (!Subtarget->useSVEForFixedLengthVectors()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); + case ISD::AND: + return LowerToScalableOp(Op, DAG); + case ISD::SUB: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); + case ISD::FMAXNUM: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED); + case ISD::FMINNUM: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED); + case ISD::VSELECT: + return LowerFixedLengthVectorSelectToSVE(Op, DAG); + case ISD::ABS: + return LowerABS(Op, DAG); + case ISD::BITREVERSE: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU, + /*OverrideNEON=*/true); + case ISD::BSWAP: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU); + case ISD::CTLZ: + return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU, + /*OverrideNEON=*/true); + case ISD::CTTZ: + return LowerCTTZ(Op, DAG); + } +} + +bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { + return !Subtarget->useSVEForFixedLengthVectors(); +} + +bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( + EVT VT, bool OverrideNEON) const { + if (!Subtarget->useSVEForFixedLengthVectors()) return false; if (!VT.isFixedLengthVector()) return false; - // Don't use SVE for vectors we cannot scalarize if required. - switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { + // Don't use SVE for vectors we cannot scalarize if required. + switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { // Fixed length predicates should be promoted to i8. // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. 
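// Illustrative sketch, not from the patch: the size-based part of the decision
// that follows, as a simplified stand-alone predicate. NEON-sized (64/128-bit)
// fixed vectors keep their NEON lowering unless OverrideNEON is set, and
// anything wider than the guaranteed minimum SVE register width is rejected.
static constexpr bool useSVEForFixedVector(unsigned VTBits, unsigned MinSVEBits,
                                           bool OverrideNEON) {
  if (OverrideNEON && (VTBits == 128 || VTBits == 64))
    return true;               // all SVE implementations cover NEON sizes
  if (VTBits <= 128)
    return false;              // keep NEON MVTs in a single register class
  return VTBits <= MinSVEBits; // don't use SVE for types that don't fit
}

static_assert(!useSVEForFixedVector(128, 512, false));
static_assert(useSVEForFixedVector(256, 512, false));
static_assert(!useSVEForFixedVector(1024, 512, false));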
- case MVT::i1: + case MVT::i1: default: return false; case MVT::i8: @@ -4477,16 +4477,16 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( break; } - // All SVE implementations support NEON sized vectors. - if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) - return true; - + // All SVE implementations support NEON sized vectors. + if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) + return true; + // Ensure NEON MVTs only belong to a single register class. - if (VT.getFixedSizeInBits() <= 128) + if (VT.getFixedSizeInBits() <= 128) return false; // Don't use SVE for types that don't fit. - if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) + if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) return false; // TODO: Perhaps an artificial restriction, but worth having whilst getting @@ -4586,9 +4586,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments( (void)Res; } SmallVector<SDValue, 16> ArgValues; - unsigned ExtraArgLocs = 0; - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; + unsigned ExtraArgLocs = 0; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a @@ -4716,44 +4716,44 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (VA.getLocInfo() == CCValAssign::Indirect) { assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - - uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize(); - unsigned NumParts = 1; - if (Ins[i].Flags.isInConsecutiveRegs()) { - assert(!Ins[i].Flags.isInConsecutiveRegsLast()); - while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) - ++NumParts; - } - - MVT PartLoad = VA.getValVT(); - SDValue Ptr = ArgValue; - - // Ensure we generate all loads for each tuple part, whilst updating the - // pointer after each load correctly using vscale. - while (NumParts > 0) { - ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); - InVals.push_back(ArgValue); - NumParts--; - if (NumParts > 0) { - SDValue BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - BytesIncrement, Flags); - ExtraArgLocs++; - i++; - } - } - } else { - if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) - ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), - ArgValue, DAG.getValueType(MVT::i32)); - InVals.push_back(ArgValue); + + uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize(); + unsigned NumParts = 1; + if (Ins[i].Flags.isInConsecutiveRegs()) { + assert(!Ins[i].Flags.isInConsecutiveRegsLast()); + while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) + ++NumParts; + } + + MVT PartLoad = VA.getValVT(); + SDValue Ptr = ArgValue; + + // Ensure we generate all loads for each tuple part, whilst updating the + // pointer after each load correctly using vscale. 
+ while (NumParts > 0) { + ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo()); + InVals.push_back(ArgValue); + NumParts--; + if (NumParts > 0) { + SDValue BytesIncrement = DAG.getVScale( + DL, Ptr.getValueType(), + APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + BytesIncrement, Flags); + ExtraArgLocs++; + i++; + } + } + } else { + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) + ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), + ArgValue, DAG.getValueType(MVT::i32)); + InVals.push_back(ArgValue); } } - assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); + assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -4928,7 +4928,7 @@ SDValue AArch64TargetLowering::LowerCallResult( const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const { - CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; DenseMap<unsigned, SDValue> CopiedRegs; @@ -5351,9 +5351,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } // Walk the register/memloc assignments, inserting copies/loads. - unsigned ExtraArgLocs = 0; - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; + unsigned ExtraArgLocs = 0; + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -5395,49 +5395,49 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, case CCValAssign::Indirect: assert(VA.getValVT().isScalableVector() && "Only scalable vectors can be passed indirectly"); - - uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize(); - uint64_t PartSize = StoreSize; - unsigned NumParts = 1; - if (Outs[i].Flags.isInConsecutiveRegs()) { - assert(!Outs[i].Flags.isInConsecutiveRegsLast()); - while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) - ++NumParts; - StoreSize *= NumParts; - } - + + uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize(); + uint64_t PartSize = StoreSize; + unsigned NumParts = 1; + if (Outs[i].Flags.isInConsecutiveRegs()) { + assert(!Outs[i].Flags.isInConsecutiveRegsLast()); + while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) + ++NumParts; + StoreSize *= NumParts; + } + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); - int FI = MFI.CreateStackObject(StoreSize, Alignment, false); - MFI.setStackID(FI, TargetStackID::ScalableVector); + int FI = MFI.CreateStackObject(StoreSize, Alignment, false); + MFI.setStackID(FI, TargetStackID::ScalableVector); - MachinePointerInfo MPI = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); - SDValue Ptr = DAG.getFrameIndex( + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + SDValue Ptr = DAG.getFrameIndex( FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - SDValue SpillSlot = Ptr; - - // Ensure we generate all stores for each tuple part, whilst updating the - 
// pointer after each store correctly using vscale. - while (NumParts) { - Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); - NumParts--; - if (NumParts > 0) { - SDValue BytesIncrement = DAG.getVScale( - DL, Ptr.getValueType(), - APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); - SDNodeFlags Flags; - Flags.setNoUnsignedWrap(true); - - MPI = MachinePointerInfo(MPI.getAddrSpace()); - Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - BytesIncrement, Flags); - ExtraArgLocs++; - i++; - } - } - + SDValue SpillSlot = Ptr; + + // Ensure we generate all stores for each tuple part, whilst updating the + // pointer after each store correctly using vscale. + while (NumParts) { + Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI); + NumParts--; + if (NumParts > 0) { + SDValue BytesIncrement = DAG.getVScale( + DL, Ptr.getValueType(), + APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize)); + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(true); + + MPI = MachinePointerInfo(MPI.getAddrSpace()); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + BytesIncrement, Flags); + ExtraArgLocs++; + i++; + } + } + Arg = SpillSlot; break; } @@ -5457,18 +5457,18 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // take care of putting the two halves in the right place but we have to // combine them. SDValue &Bits = - llvm::find_if(RegsToPass, - [=](const std::pair<unsigned, SDValue> &Elt) { - return Elt.first == VA.getLocReg(); - }) + llvm::find_if(RegsToPass, + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) ->second; Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); // Call site info is used for function's parameter entry value // tracking. For now we track only simple cases when parameter // is transferred through whole register. - llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { - return ArgReg.Reg == VA.getLocReg(); - }); + llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) { + return ArgReg.Reg == VA.getLocReg(); + }); } else { RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); @@ -5487,7 +5487,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, uint32_t BEAlign = 0; unsigned OpSize; if (VA.getLocInfo() == CCValAssign::Indirect) - OpSize = VA.getLocVT().getFixedSizeInBits(); + OpSize = VA.getLocVT().getFixedSizeInBits(); else OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); @@ -5647,17 +5647,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, return Ret; } - unsigned CallOpc = AArch64ISD::CALL; - // Calls marked with "rv_marker" are special. They should be expanded to the - // call, directly followed by a special marker sequence. Use the CALL_RVMARKER - // to do that. - if (CLI.CB && CLI.CB->hasRetAttr("rv_marker")) { - assert(!IsTailCall && "tail calls cannot be marked with rv_marker"); - CallOpc = AArch64ISD::CALL_RVMARKER; - } - + unsigned CallOpc = AArch64ISD::CALL; + // Calls marked with "rv_marker" are special. They should be expanded to the + // call, directly followed by a special marker sequence. Use the CALL_RVMARKER + // to do that. + if (CLI.CB && CLI.CB->hasRetAttr("rv_marker")) { + assert(!IsTailCall && "tail calls cannot be marked with rv_marker"); + CallOpc = AArch64ISD::CALL_RVMARKER; + } + // Returns a chain and a flag for retval copy to use. 
- Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); + Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -5681,7 +5681,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { - CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); @@ -5696,7 +5696,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, auto &MF = DAG.getMachineFunction(); auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); @@ -5741,9 +5741,9 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (RegsUsed.count(VA.getLocReg())) { SDValue &Bits = - llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) { - return Elt.first == VA.getLocReg(); - })->second; + llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + })->second; Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); } else { RetVals.emplace_back(VA.getLocReg(), Arg); @@ -5963,7 +5963,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDValue FuncTLVGet = DAG.getLoad( PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - Align(PtrMemVT.getSizeInBits() / 8), + Align(PtrMemVT.getSizeInBits() / 8), MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); @@ -6278,22 +6278,22 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, llvm_unreachable("Unexpected platform trying to use TLS"); } -// Looks through \param Val to determine the bit that can be used to -// check the sign of the value. It returns the unextended value and -// the sign bit position. -std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) { - if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) - return {Val.getOperand(0), - cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() - - 1}; - - if (Val.getOpcode() == ISD::SIGN_EXTEND) - return {Val.getOperand(0), - Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; - - return {Val, Val.getValueSizeInBits() - 1}; -} - +// Looks through \param Val to determine the bit that can be used to +// check the sign of the value. It returns the unextended value and +// the sign bit position. 
+std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) { + if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG) + return {Val.getOperand(0), + cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() - + 1}; + + if (Val.getOpcode() == ISD::SIGN_EXTEND) + return {Val.getOperand(0), + Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1}; + + return {Val, Val.getValueSizeInBits() - 1}; +} + SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); @@ -6388,10 +6388,10 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. - uint64_t SignBitPos; - std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, - DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); + DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && @@ -6399,10 +6399,10 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. - uint64_t SignBitPos; - std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, - DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); + DAG.getConstant(SignBitPos, dl, MVT::i64), Dest); } SDValue CCVal; @@ -6549,9 +6549,9 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV); } - if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); - + if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU); + assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && "Unexpected type for custom ctpop lowering"); @@ -6575,16 +6575,16 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { return Val; } -SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - assert(VT.isScalableVector() || - useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)); - - SDLoc DL(Op); - SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); - return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); -} - +SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isScalableVector() || + useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)); + + SDLoc DL(Op); + SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0)); + return DAG.getNode(ISD::CTLZ, DL, VT, RBIT); +} + SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) @@ -6742,8 +6742,8 @@ SDValue 
AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, // instead of a CSEL in that case. if (TrueVal == ~FalseVal) { Opcode = AArch64ISD::CSINV; - } else if (FalseVal > std::numeric_limits<int64_t>::min() && - TrueVal == -FalseVal) { + } else if (FalseVal > std::numeric_limits<int64_t>::min() && + TrueVal == -FalseVal) { Opcode = AArch64ISD::CSNEG; } else if (TVal.getValueType() == MVT::i32) { // If our operands are only 32-bit wide, make sure we use 32-bit @@ -6943,9 +6943,9 @@ SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, SDValue Entry = Op.getOperand(2); int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex(); - auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); - AFI->setJumpTableEntryInfo(JTI, 4, nullptr); - + auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + AFI->setJumpTableEntryInfo(JTI, 4, nullptr); + SDNode *Dest = DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT, Entry, DAG.getTargetJumpTable(JTI, MVT::i32)); @@ -7012,13 +7012,13 @@ SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; - auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); + unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; + auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); @@ -7028,64 +7028,64 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SmallVector<SDValue, 4> MemOps; // void *__stack at offset 0 - unsigned Offset = 0; + unsigned Offset = 0; SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); - Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); + Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), Align(PtrSize))); + MachinePointerInfo(SV), Align(PtrSize))); - // void *__gr_top at offset 8 (4 on ILP32) - Offset += PtrSize; + // void *__gr_top at offset 8 (4 on ILP32) + Offset += PtrSize; int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; - GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Offset, DL, PtrVT)); + GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Offset, DL, PtrVT)); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, DAG.getConstant(GPRSize, DL, PtrVT)); - GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); + GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, Offset), - Align(PtrSize))); + MachinePointerInfo(SV, Offset), + Align(PtrSize))); } - // void *__vr_top at offset 16 (8 on ILP32) - Offset += PtrSize; + // void *__vr_top at offset 16 (8 on ILP32) + Offset += PtrSize; int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Offset, DL, PtrVT)); + DAG.getConstant(Offset, DL, PtrVT)); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, 
DAG.getConstant(FPRSize, DL, PtrVT)); - VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); + VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, Offset), - Align(PtrSize))); - } - - // int __gr_offs at offset 24 (12 on ILP32) - Offset += PtrSize; - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Offset, DL, PtrVT)); - MemOps.push_back( - DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); - - // int __vr_offs at offset 28 (16 on ILP32) - Offset += 4; - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, - DAG.getConstant(Offset, DL, PtrVT)); - MemOps.push_back( - DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); + MachinePointerInfo(SV, Offset), + Align(PtrSize))); + } + + // int __gr_offs at offset 24 (12 on ILP32) + Offset += PtrSize; + SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Offset, DL, PtrVT)); + MemOps.push_back( + DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), + GROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); + + // int __vr_offs at offset 28 (16 on ILP32) + Offset += 4; + SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Offset, DL, PtrVT)); + MemOps.push_back( + DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), + VROffsAddr, MachinePointerInfo(SV, Offset), Align(4))); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -7108,10 +7108,10 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // pointer. SDLoc DL(Op); unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; - unsigned VaListSize = - (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) - ? PtrSize - : Subtarget->isTargetILP32() ? 20 : 32; + unsigned VaListSize = + (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) + ? PtrSize + : Subtarget->isTargetILP32() ? 20 : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); @@ -7264,34 +7264,34 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - SDValue ReturnAddress; + SDValue ReturnAddress; if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); - ReturnAddress = DAG.getLoad( - VT, DL, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); - } else { - // Return LR, which contains the return address. Mark it an implicit - // live-in. - unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); - ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); - } - - // The XPACLRI instruction assembles to a hint-space instruction before - // Armv8.3-A therefore this instruction can be safely used for any pre - // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use - // that instead. - SDNode *St; - if (Subtarget->hasPAuth()) { - St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress); - } else { - // XPACLRI operates on LR therefore we must move the operand accordingly. 
- SDValue Chain = - DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress); - St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain); - } - return SDValue(St, 0); + ReturnAddress = DAG.getLoad( + VT, DL, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); + } else { + // Return LR, which contains the return address. Mark it an implicit + // live-in. + unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); + ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); + } + + // The XPACLRI instruction assembles to a hint-space instruction before + // Armv8.3-A therefore this instruction can be safely used for any pre + // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so use + // that instead. + SDNode *St; + if (Subtarget->hasPAuth()) { + St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress); + } else { + // XPACLRI operates on LR therefore we must move the operand accordingly. + SDValue Chain = + DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress); + St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain); + } + return SDValue(St, 0); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two @@ -7472,22 +7472,22 @@ static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, return SDValue(); } -SDValue -AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, - const DenormalMode &Mode) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); -} - -SDValue -AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, - SelectionDAG &DAG) const { - return Op; -} - +SDValue +AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, + const DenormalMode &Mode) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); + return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); +} + +SDValue +AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, + SelectionDAG &DAG) const { + return Op; +} + SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, @@ -7511,7 +7511,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } - if (!Reciprocal) + if (!Reciprocal) Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); ExtraSteps = 0; @@ -7688,30 +7688,30 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - if (VT.isScalableVector()) - return std::make_pair(0U, nullptr); - if (VT.getFixedSizeInBits() == 64) + if (VT.isScalableVector()) + return std::make_pair(0U, nullptr); + if (VT.getFixedSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); - case 'w': { + case 'w': { if (!Subtarget->hasFPARMv8()) break; - if (VT.isScalableVector()) { - if (VT.getVectorElementType() != MVT::i1) - return std::make_pair(0U, &AArch64::ZPRRegClass); - return std::make_pair(0U, nullptr); - } - uint64_t VTSize = VT.getFixedSizeInBits(); - if (VTSize == 16) + if 
(VT.isScalableVector()) { + if (VT.getVectorElementType() != MVT::i1) + return std::make_pair(0U, &AArch64::ZPRRegClass); + return std::make_pair(0U, nullptr); + } + uint64_t VTSize = VT.getFixedSizeInBits(); + if (VTSize == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); - if (VTSize == 32) + if (VTSize == 32) return std::make_pair(0U, &AArch64::FPR32RegClass); - if (VTSize == 64) + if (VTSize == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); - if (VTSize == 128) + if (VTSize == 128) return std::make_pair(0U, &AArch64::FPR128RegClass); break; - } + } // The instructions that this constraint is designed for can // only take 128-bit registers so just use that regclass. case 'x': @@ -7732,11 +7732,11 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( } else { PredicateConstraint PC = parsePredicateConstraint(Constraint); if (PC != PredicateConstraint::Invalid) { - if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) - return std::make_pair(0U, nullptr); + if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) + return std::make_pair(0U, nullptr); bool restricted = (PC == PredicateConstraint::Upl); return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) - : std::make_pair(0U, &AArch64::PPRRegClass); + : std::make_pair(0U, &AArch64::PPRRegClass); } } if (StringRef("{cc}").equals_lower(Constraint)) @@ -7975,8 +7975,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); SDLoc dl(Op); EVT VT = Op.getValueType(); - assert(!VT.isScalableVector() && - "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); + assert(!VT.isScalableVector() && + "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { @@ -8047,9 +8047,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, } } unsigned ResMultiplier = - VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); - uint64_t VTSize = VT.getFixedSizeInBits(); - NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); + VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); + uint64_t VTSize = VT.getFixedSizeInBits(); + NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able @@ -8058,18 +8058,18 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); - uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); - if (SrcVTSize == VTSize) + uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); + if (SrcVTSize == VTSize) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); - unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); + unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - if (SrcVTSize < VTSize) { - assert(2 * SrcVTSize == VTSize); + if (SrcVTSize < VTSize) { + assert(2 * SrcVTSize == VTSize); // We can pad out the smaller vector for free, so if it's part of a // shuffle... 
Src.ShuffleVec = @@ -8078,11 +8078,11 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, continue; } - if (SrcVTSize != 2 * VTSize) { - LLVM_DEBUG( - dbgs() << "Reshuffle failed: result vector too small to extract\n"); - return SDValue(); - } + if (SrcVTSize != 2 * VTSize) { + LLVM_DEBUG( + dbgs() << "Reshuffle failed: result vector too small to extract\n"); + return SDValue(); + } if (Src.MaxElt - Src.MinElt >= NumSrcElts) { LLVM_DEBUG( @@ -8111,13 +8111,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); - if (!SrcVT.is64BitVector()) { - LLVM_DEBUG( - dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " - "for SVE vectors."); - return SDValue(); - } - + if (!SrcVT.is64BitVector()) { + LLVM_DEBUG( + dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " + "for SVE vectors."); + return SDValue(); + } + Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, dl, MVT::i32)); @@ -8134,8 +8134,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); - Src.WindowScale = - SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); + Src.WindowScale = + SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); Src.WindowBase *= Src.WindowScale; } @@ -8159,8 +8159,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); - int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), - VT.getScalarSizeInBits()); + int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), + VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, @@ -8224,81 +8224,81 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { return true; } -/// Check if a vector shuffle corresponds to a DUP instructions with a larger -/// element width than the vector lane type. If that is the case the function -/// returns true and writes the value of the DUP instruction lane operand into -/// DupLaneOp -static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize, - unsigned &DupLaneOp) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for wide DUP are: 16, 32, 64"); - - if (BlockSize <= VT.getScalarSizeInBits()) - return false; - if (BlockSize % VT.getScalarSizeInBits() != 0) - return false; - if (VT.getSizeInBits() % BlockSize != 0) - return false; - - size_t SingleVecNumElements = VT.getVectorNumElements(); - size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); - size_t NumBlocks = VT.getSizeInBits() / BlockSize; - - // We are looking for masks like - // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element - // might be replaced by 'undefined'. BlockIndices will eventually contain - // lane indices of the duplicated block (i.e. 
[0, 1], [2, 3] and [4, 5, 6, 7] - // for the above examples) - SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1); - for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) - for (size_t I = 0; I < NumEltsPerBlock; I++) { - int Elt = M[BlockIndex * NumEltsPerBlock + I]; - if (Elt < 0) - continue; - // For now we don't support shuffles that use the second operand - if ((unsigned)Elt >= SingleVecNumElements) - return false; - if (BlockElts[I] < 0) - BlockElts[I] = Elt; - else if (BlockElts[I] != Elt) - return false; - } - - // We found a candidate block (possibly with some undefs). It must be a - // sequence of consecutive integers starting with a value divisible by - // NumEltsPerBlock with some values possibly replaced by undef-s. - - // Find first non-undef element - auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); - assert(FirstRealEltIter != BlockElts.end() && - "Shuffle with all-undefs must have been caught by previous cases, " - "e.g. isSplat()"); - if (FirstRealEltIter == BlockElts.end()) { - DupLaneOp = 0; - return true; - } - - // Index of FirstRealElt in BlockElts - size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); - - if ((unsigned)*FirstRealEltIter < FirstRealIndex) - return false; - // BlockElts[0] must have the following value if it isn't undef: - size_t Elt0 = *FirstRealEltIter - FirstRealIndex; - - // Check the first element - if (Elt0 % NumEltsPerBlock != 0) - return false; - // Check that the sequence indeed consists of consecutive integers (modulo - // undefs) - for (size_t I = 0; I < NumEltsPerBlock; I++) - if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) - return false; - - DupLaneOp = Elt0 / NumEltsPerBlock; - return true; -} - +/// Check if a vector shuffle corresponds to a DUP instructions with a larger +/// element width than the vector lane type. If that is the case the function +/// returns true and writes the value of the DUP instruction lane operand into +/// DupLaneOp +static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize, + unsigned &DupLaneOp) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for wide DUP are: 16, 32, 64"); + + if (BlockSize <= VT.getScalarSizeInBits()) + return false; + if (BlockSize % VT.getScalarSizeInBits() != 0) + return false; + if (VT.getSizeInBits() % BlockSize != 0) + return false; + + size_t SingleVecNumElements = VT.getVectorNumElements(); + size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); + size_t NumBlocks = VT.getSizeInBits() / BlockSize; + + // We are looking for masks like + // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element + // might be replaced by 'undefined'. BlockIndices will eventually contain + // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7] + // for the above examples) + SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1); + for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) + for (size_t I = 0; I < NumEltsPerBlock; I++) { + int Elt = M[BlockIndex * NumEltsPerBlock + I]; + if (Elt < 0) + continue; + // For now we don't support shuffles that use the second operand + if ((unsigned)Elt >= SingleVecNumElements) + return false; + if (BlockElts[I] < 0) + BlockElts[I] = Elt; + else if (BlockElts[I] != Elt) + return false; + } + + // We found a candidate block (possibly with some undefs). 
It must be a + // sequence of consecutive integers starting with a value divisible by + // NumEltsPerBlock with some values possibly replaced by undef-s. + + // Find first non-undef element + auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; }); + assert(FirstRealEltIter != BlockElts.end() && + "Shuffle with all-undefs must have been caught by previous cases, " + "e.g. isSplat()"); + if (FirstRealEltIter == BlockElts.end()) { + DupLaneOp = 0; + return true; + } + + // Index of FirstRealElt in BlockElts + size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); + + if ((unsigned)*FirstRealEltIter < FirstRealIndex) + return false; + // BlockElts[0] must have the following value if it isn't undef: + size_t Elt0 = *FirstRealEltIter - FirstRealIndex; + + // Check the first element + if (Elt0 % NumEltsPerBlock != 0) + return false; + // Check that the sequence indeed consists of consecutive integers (modulo + // undefs) + for (size_t I = 0; I < NumEltsPerBlock; I++) + if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) + return false; + + DupLaneOp = Elt0 / NumEltsPerBlock; + return true; +} + // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, @@ -8732,60 +8732,60 @@ static unsigned getDUPLANEOp(EVT EltType) { llvm_unreachable("Invalid vector element type?"); } -static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, - unsigned Opcode, SelectionDAG &DAG) { - // Try to eliminate a bitcasted extract subvector before a DUPLANE. - auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { - // Match: dup (bitcast (extract_subv X, C)), LaneC - if (BitCast.getOpcode() != ISD::BITCAST || - BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) - return false; - - // The extract index must align in the destination type. That may not - // happen if the bitcast is from narrow to wide type. - SDValue Extract = BitCast.getOperand(0); - unsigned ExtIdx = Extract.getConstantOperandVal(1); - unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); - unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; - unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); - if (ExtIdxInBits % CastedEltBitWidth != 0) - return false; - - // Update the lane value by offsetting with the scaled extract index. - LaneC += ExtIdxInBits / CastedEltBitWidth; - - // Determine the casted vector type of the wide vector input. - // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' - // Examples: - // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 - // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 - unsigned SrcVecNumElts = - Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; - CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), - SrcVecNumElts); - return true; - }; - MVT CastVT; - if (getScaledOffsetDup(V, Lane, CastVT)) { - V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); - } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - // The lane is incremented by the index of the extract. - // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 - Lane += V.getConstantOperandVal(1); - V = V.getOperand(0); - } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { - // The lane is decremented if we are splatting from the 2nd operand. 
- // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 - unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; - Lane -= Idx * VT.getVectorNumElements() / 2; - V = WidenVector(V.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) { - // Widen the operand to 128-bit register with undef. - V = WidenVector(V, DAG); - } - return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); -} - +static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, + unsigned Opcode, SelectionDAG &DAG) { + // Try to eliminate a bitcasted extract subvector before a DUPLANE. + auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { + // Match: dup (bitcast (extract_subv X, C)), LaneC + if (BitCast.getOpcode() != ISD::BITCAST || + BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + // The extract index must align in the destination type. That may not + // happen if the bitcast is from narrow to wide type. + SDValue Extract = BitCast.getOperand(0); + unsigned ExtIdx = Extract.getConstantOperandVal(1); + unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); + unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; + unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); + if (ExtIdxInBits % CastedEltBitWidth != 0) + return false; + + // Update the lane value by offsetting with the scaled extract index. + LaneC += ExtIdxInBits / CastedEltBitWidth; + + // Determine the casted vector type of the wide vector input. + // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' + // Examples: + // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 + // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 + unsigned SrcVecNumElts = + Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; + CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), + SrcVecNumElts); + return true; + }; + MVT CastVT; + if (getScaledOffsetDup(V, Lane, CastVT)) { + V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0)); + } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + // The lane is incremented by the index of the extract. + // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 + Lane += V.getConstantOperandVal(1); + V = V.getOperand(0); + } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { + // The lane is decremented if we are splatting from the 2nd operand. + // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 + unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; + Lane -= Idx * VT.getVectorNumElements() / 2; + V = WidenVector(V.getOperand(Idx), DAG); + } else if (VT.getSizeInBits() == 64) { + // Widen the operand to 128-bit register with undef. + V = WidenVector(V, DAG); + } + return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64)); +} + SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -8819,25 +8819,25 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - return constructDup(V1, Lane, dl, VT, Opcode, DAG); - } - - // Check if the mask matches a DUP for a wider element - for (unsigned LaneSize : {64U, 32U, 16U}) { - unsigned Lane = 0; - if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { - unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 - : LaneSize == 32 ? 
AArch64ISD::DUPLANE32 - : AArch64ISD::DUPLANE16; - // Cast V1 to an integer vector with required lane size - MVT NewEltTy = MVT::getIntegerVT(LaneSize); - unsigned NewEltCount = VT.getSizeInBits() / LaneSize; - MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); - V1 = DAG.getBitcast(NewVecTy, V1); - // Constuct the DUP instruction - V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); - // Cast back to the original type - return DAG.getBitcast(VT, V1); + return constructDup(V1, Lane, dl, VT, Opcode, DAG); + } + + // Check if the mask matches a DUP for a wider element + for (unsigned LaneSize : {64U, 32U, 16U}) { + unsigned Lane = 0; + if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) { + unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 + : LaneSize == 32 ? AArch64ISD::DUPLANE32 + : AArch64ISD::DUPLANE16; + // Cast V1 to an integer vector with required lane size + MVT NewEltTy = MVT::getIntegerVT(LaneSize); + unsigned NewEltCount = VT.getSizeInBits() / LaneSize; + MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount); + V1 = DAG.getBitcast(NewVecTy, V1); + // Constuct the DUP instruction + V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG); + // Cast back to the original type + return DAG.getBitcast(VT, V1); } } @@ -8909,7 +8909,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, EVT ScalarVT = VT.getVectorElementType(); - if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) + if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( @@ -8950,9 +8950,9 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, EVT ElemVT = VT.getScalarType(); SDValue SplatVal = Op.getOperand(0); - if (useSVEForFixedLengthVectorVT(VT)) - return LowerToScalableOp(Op, DAG); - + if (useSVEForFixedLengthVectorVT(VT)) + return LowerToScalableOp(Op, DAG); + // Extend input splat value where needed to fit into a GPR (32b or 64b only) // FPRs don't have this restriction. 
switch (ElemVT.getSimpleVT().SimpleTy) { @@ -9382,9 +9382,9 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { - if (useSVEForFixedLengthVectorVT(Op.getValueType())) - return LowerToScalableOp(Op, DAG); - + if (useSVEForFixedLengthVectorVT(Op.getValueType())) + return LowerToScalableOp(Op, DAG); + // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; @@ -9543,18 +9543,18 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, bool isConstant = true; bool AllLanesExtractElt = true; unsigned NumConstantLanes = 0; - unsigned NumDifferentLanes = 0; - unsigned NumUndefLanes = 0; + unsigned NumDifferentLanes = 0; + unsigned NumUndefLanes = 0; SDValue Value; SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) AllLanesExtractElt = false; - if (V.isUndef()) { - ++NumUndefLanes; + if (V.isUndef()) { + ++NumUndefLanes; continue; - } + } if (i > 0) isOnlyLowElement = false; if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) @@ -9570,10 +9570,10 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!Value.getNode()) Value = V; - else if (V != Value) { + else if (V != Value) { usesOnlyOneValue = false; - ++NumDifferentLanes; - } + ++NumDifferentLanes; + } } if (!Value.getNode()) { @@ -9699,20 +9699,20 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, } } - // If we need to insert a small number of different non-constant elements and - // the vector width is sufficiently large, prefer using DUP with the common - // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, - // skip the constant lane handling below. - bool PreferDUPAndInsert = - !isConstant && NumDifferentLanes >= 1 && - NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && - NumDifferentLanes >= NumConstantLanes; - + // If we need to insert a small number of different non-constant elements and + // the vector width is sufficiently large, prefer using DUP with the common + // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, + // skip the constant lane handling below. + bool PreferDUPAndInsert = + !isConstant && NumDifferentLanes >= 1 && + NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && + NumDifferentLanes >= NumConstantLanes; + // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. - if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { + if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { // Firstly, try to materialize the splat constant. SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), Val = ConstantBuildVector(Vec, DAG); @@ -9748,22 +9748,22 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return shuffle; } - if (PreferDUPAndInsert) { - // First, build a constant vector with the common element. - SmallVector<SDValue, 8> Ops; - for (unsigned I = 0; I < NumElts; ++I) - Ops.push_back(Value); - SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); - // Next, insert the elements that do not match the common value. 
- for (unsigned I = 0; I < NumElts; ++I) - if (Op.getOperand(I) != Value) - NewVector = - DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, - Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); - - return NewVector; - } - + if (PreferDUPAndInsert) { + // First, build a constant vector with the common element. + SmallVector<SDValue, 8> Ops; + for (unsigned I = 0; I < NumElts; ++I) + Ops.push_back(Value); + SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG); + // Next, insert the elements that do not match the common value. + for (unsigned I = 0; I < NumElts; ++I) + if (Op.getOperand(I) != Value) + NewVector = + DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector, + Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64)); + + return NewVector; + } + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's @@ -9812,18 +9812,18 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getValueType().isScalableVector() && - isTypeLegal(Op.getValueType()) && - "Expected legal scalable vector type!"); - - if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) - return Op; - - return SDValue(); -} - +SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType().isScalableVector() && + isTypeLegal(Op.getValueType()) && + "Expected legal scalable vector type!"); + + if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) + return Op; + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); @@ -9919,8 +9919,8 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. - if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && - InVT.getSizeInBits() == 128) + if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 && + InVT.getSizeInBits() == 128) return Op; return SDValue(); @@ -9934,34 +9934,34 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, EVT InVT = Op.getOperand(1).getValueType(); unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - if (InVT.isScalableVector()) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - if (!isTypeLegal(VT) || !VT.isInteger()) - return SDValue(); - - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - - // Ensure the subvector is half the size of the main vector. - if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) - return SDValue(); - - // Extend elements of smaller vector... 
- EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); - SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); - - if (Idx == 0) { - SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); - } else if (Idx == InVT.getVectorMinNumElements()) { - SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); - return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); - } - + if (InVT.isScalableVector()) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (!isTypeLegal(VT) || !VT.isInteger()) + return SDValue(); + + SDValue Vec0 = Op.getOperand(0); + SDValue Vec1 = Op.getOperand(1); + + // Ensure the subvector is half the size of the main vector. + if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) + return SDValue(); + + // Extend elements of smaller vector... + EVT WideVT = InVT.widenIntegerVectorElementType(*(DAG.getContext())); + SDValue ExtVec = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1); + + if (Idx == 0) { + SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0); + return DAG.getNode(AArch64ISD::UZP1, DL, VT, ExtVec, HiVec0); + } else if (Idx == InVT.getVectorMinNumElements()) { + SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0); + return DAG.getNode(AArch64ISD::UZP1, DL, VT, LoVec0, ExtVec); + } + return SDValue(); - } + } // This will be matched by custom code during ISelDAGToDAG. if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) @@ -9970,42 +9970,42 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) - return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); - - assert(VT.isScalableVector() && "Expected a scalable vector."); - - bool Signed = Op.getOpcode() == ISD::SDIV; - unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; - - if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) - return LowerToPredicatedOp(Op, DAG, PredOpcode); - - // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit - // operations, and truncate the result. - EVT WidenedVT; - if (VT == MVT::nxv16i8) - WidenedVT = MVT::nxv8i16; - else if (VT == MVT::nxv8i16) - WidenedVT = MVT::nxv4i32; - else - llvm_unreachable("Unexpected Custom DIV operation"); - - SDLoc dl(Op); - unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; - unsigned UnpkHi = Signed ? 
AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; - SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); - SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1)); - SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0)); - SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); - SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); - SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); - return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); -} - +SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) + return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); + + assert(VT.isScalableVector() && "Expected a scalable vector."); + + bool Signed = Op.getOpcode() == ISD::SDIV; + unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; + + if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) + return LowerToPredicatedOp(Op, DAG, PredOpcode); + + // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit + // operations, and truncate the result. + EVT WidenedVT; + if (VT == MVT::nxv16i8) + WidenedVT = MVT::nxv8i16; + else if (VT == MVT::nxv8i16) + WidenedVT = MVT::nxv4i32; + else + llvm_unreachable("Unexpected Custom DIV operation"); + + SDLoc dl(Op); + unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; + unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; + SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0)); + SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1)); + SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0)); + SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1)); + SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo); + SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi); + return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); +} + bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { // Currently no fixed length shuffles that require SVE are legal. if (useSVEForFixedLengthVectorVT(VT)) @@ -10105,12 +10105,12 @@ SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, } if (!VT.isVector() || VT.isScalableVector()) - return SDValue(); + return SDValue(); if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) return LowerFixedLengthVectorTruncateToSVE(Op, DAG); - return SDValue(); + return SDValue(); } SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, @@ -10128,8 +10128,8 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, llvm_unreachable("unexpected shift opcode"); case ISD::SHL: - if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); + if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED); if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), @@ -10140,9 +10140,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: - if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) { - unsigned Opc = Op.getOpcode() == ISD::SRA ? 
AArch64ISD::SRA_PRED - : AArch64ISD::SRL_PRED; + if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) { + unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED + : AArch64ISD::SRL_PRED; return LowerToPredicatedOp(Op, DAG, Opc); } @@ -10194,7 +10194,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); else Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); - return DAG.getNOT(dl, Fcmeq, VT); + return DAG.getNOT(dl, Fcmeq, VT); } case AArch64CC::EQ: if (IsZero) @@ -10233,7 +10233,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); else Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); - return DAG.getNOT(dl, Cmeq, VT); + return DAG.getNOT(dl, Cmeq, VT); } case AArch64CC::EQ: if (IsZero) @@ -10274,9 +10274,9 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); } - if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) - return LowerFixedLengthVectorSetccToSVE(Op, DAG); - + if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) + return LowerFixedLengthVectorSetccToSVE(Op, DAG); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -10349,51 +10349,51 @@ static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - // Try to lower fixed length reductions to SVE. - EVT SrcVT = Src.getValueType(); - bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND || - Op.getOpcode() == ISD::VECREDUCE_OR || - Op.getOpcode() == ISD::VECREDUCE_XOR || - Op.getOpcode() == ISD::VECREDUCE_FADD || - (Op.getOpcode() != ISD::VECREDUCE_ADD && - SrcVT.getVectorElementType() == MVT::i64); - if (SrcVT.isScalableVector() || - useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) { - - if (SrcVT.getVectorElementType() == MVT::i1) - return LowerPredReductionToSVE(Op, DAG); - - switch (Op.getOpcode()) { - case ISD::VECREDUCE_ADD: - return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG); - case ISD::VECREDUCE_AND: - return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG); - case ISD::VECREDUCE_OR: - return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG); - case ISD::VECREDUCE_SMAX: - return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG); - case ISD::VECREDUCE_SMIN: - return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG); - case ISD::VECREDUCE_UMAX: - return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG); - case ISD::VECREDUCE_UMIN: - return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG); - case ISD::VECREDUCE_XOR: - return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG); - case ISD::VECREDUCE_FADD: - return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG); - case ISD::VECREDUCE_FMAX: - return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG); - case ISD::VECREDUCE_FMIN: - return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG); - default: - llvm_unreachable("Unhandled fixed length reduction"); - } - } - - // Lower NEON reductions. + SDValue Src = Op.getOperand(0); + + // Try to lower fixed length reductions to SVE. 
+ EVT SrcVT = Src.getValueType(); + bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND || + Op.getOpcode() == ISD::VECREDUCE_OR || + Op.getOpcode() == ISD::VECREDUCE_XOR || + Op.getOpcode() == ISD::VECREDUCE_FADD || + (Op.getOpcode() != ISD::VECREDUCE_ADD && + SrcVT.getVectorElementType() == MVT::i64); + if (SrcVT.isScalableVector() || + useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) { + + if (SrcVT.getVectorElementType() == MVT::i1) + return LowerPredReductionToSVE(Op, DAG); + + switch (Op.getOpcode()) { + case ISD::VECREDUCE_ADD: + return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG); + case ISD::VECREDUCE_AND: + return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG); + case ISD::VECREDUCE_OR: + return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG); + case ISD::VECREDUCE_SMAX: + return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG); + case ISD::VECREDUCE_SMIN: + return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG); + case ISD::VECREDUCE_UMAX: + return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG); + case ISD::VECREDUCE_UMIN: + return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG); + case ISD::VECREDUCE_XOR: + return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG); + case ISD::VECREDUCE_FADD: + return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG); + case ISD::VECREDUCE_FMAX: + return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG); + case ISD::VECREDUCE_FMIN: + return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG); + default: + llvm_unreachable("Unhandled fixed length reduction"); + } + } + + // Lower NEON reductions. SDLoc dl(Op); switch (Op.getOpcode()) { case ISD::VECREDUCE_ADD: @@ -10410,13 +10410,13 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), - Src); + Src); } case ISD::VECREDUCE_FMIN: { return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), - Src); + Src); } default: llvm_unreachable("Unhandled reduction"); @@ -10426,7 +10426,7 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); - if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) + if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); // LSE has an atomic load-add instruction, but not a load-sub. @@ -10443,7 +10443,7 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget()); - if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) + if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) return SDValue(); // LSE has an atomic load-clear instruction, but not a load-and. @@ -10544,17 +10544,17 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. 
template <unsigned NumVecs> -static bool -setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, - AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { +static bool +setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, + AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { Info.opc = ISD::INTRINSIC_VOID; // Retrieve EC from first vector argument. - const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); + const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); ElementCount EC = VT.getVectorElementCount(); #ifndef NDEBUG // Check the assumption that all input vectors are the same type. for (unsigned I = 0; I < NumVecs; ++I) - assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && + assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && "Invalid type."); #endif // memVT is `NumVecs * VT`. @@ -10577,11 +10577,11 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_sve_st2: - return setInfoSVEStN<2>(*this, DL, Info, I); + return setInfoSVEStN<2>(*this, DL, Info, I); case Intrinsic::aarch64_sve_st3: - return setInfoSVEStN<3>(*this, DL, Info, I); + return setInfoSVEStN<3>(*this, DL, Info, I); case Intrinsic::aarch64_sve_st4: - return setInfoSVEStN<4>(*this, DL, Info, I); + return setInfoSVEStN<4>(*this, DL, Info, I); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: @@ -10737,15 +10737,15 @@ bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; - uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize(); - uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize(); + uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize(); + uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize(); return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; - uint64_t NumBits1 = VT1.getFixedSizeInBits(); - uint64_t NumBits2 = VT2.getFixedSizeInBits(); + uint64_t NumBits1 = VT1.getFixedSizeInBits(); + uint64_t NumBits2 = VT2.getFixedSizeInBits(); return NumBits1 > NumBits2; } @@ -10987,43 +10987,43 @@ bool AArch64TargetLowering::shouldSinkOperands( return true; } - case Instruction::Mul: { - bool IsProfitable = false; - for (auto &Op : I->operands()) { - // Make sure we are not already sinking this operand - if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) - continue; - - ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); - if (!Shuffle || !Shuffle->isZeroEltSplat()) - continue; - - Value *ShuffleOperand = Shuffle->getOperand(0); - InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); - if (!Insert) - continue; - - Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); - if (!OperandInstr) - continue; - - ConstantInt *ElementConstant = - dyn_cast<ConstantInt>(Insert->getOperand(2)); - // Check that the insertelement is inserting into element 0 - if (!ElementConstant || ElementConstant->getZExtValue() != 0) - continue; - - unsigned Opcode = OperandInstr->getOpcode(); - if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt) - continue; - - 
Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&Op); - IsProfitable = true; - } - - return IsProfitable; - } + case Instruction::Mul: { + bool IsProfitable = false; + for (auto &Op : I->operands()) { + // Make sure we are not already sinking this operand + if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); + if (!Shuffle || !Shuffle->isZeroEltSplat()) + continue; + + Value *ShuffleOperand = Shuffle->getOperand(0); + InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); + if (!Insert) + continue; + + Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1)); + if (!OperandInstr) + continue; + + ConstantInt *ElementConstant = + dyn_cast<ConstantInt>(Insert->getOperand(2)); + // Check that the insertelement is inserting into element 0 + if (!ElementConstant || ElementConstant->getZExtValue() != 0) + continue; + + unsigned Opcode = OperandInstr->getOpcode(); + if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt) + continue; + + Ops.push_back(&Shuffle->getOperandUse(0)); + Ops.push_back(&Op); + IsProfitable = true; + } + + return IsProfitable; + } default: return false; } @@ -11359,12 +11359,12 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; - assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 && + assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 && "invalid tuple vector type!"); - EVT SplitVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorElementCount().divideCoefficientBy(N)); + EVT SplitVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorElementCount().divideCoefficientBy(N)); assert(isTypeLegal(SplitVT)); SmallVector<EVT, 5> VTs(N, SplitVT); @@ -11655,86 +11655,86 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); } -// VECREDUCE_ADD( EXTEND(v16i8_type) ) to -// VECREDUCE_ADD( DOTv16i8(v16i8_type) ) -static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *ST) { - SDValue Op0 = N->getOperand(0); - if (!ST->hasDotProd() || N->getValueType(0) != MVT::i32) - return SDValue(); - - if (Op0.getValueType().getVectorElementType() != MVT::i32) - return SDValue(); - - unsigned ExtOpcode = Op0.getOpcode(); - if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) - return SDValue(); - - EVT Op0VT = Op0.getOperand(0).getValueType(); - if (Op0VT != MVT::v16i8) - return SDValue(); - - SDLoc DL(Op0); - SDValue Ones = DAG.getConstant(1, DL, Op0VT); - SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32); - auto DotIntrisic = (ExtOpcode == ISD::ZERO_EXTEND) - ? Intrinsic::aarch64_neon_udot - : Intrinsic::aarch64_neon_sdot; - SDValue Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Zeros.getValueType(), - DAG.getConstant(DotIntrisic, DL, MVT::i32), Zeros, - Ones, Op0.getOperand(0)); - return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); -} - -// Given a ABS node, detect the following pattern: -// (ABS (SUB (EXTEND a), (EXTEND b))). -// Generates UABD/SABD instruction. 
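[Editorial note, not part of the diff above: a minimal standalone sketch of the scalar identity the performABSCombine hunk relies on. For zero-extended i8 operands, abs(zext(a) - zext(b)) is exactly the unsigned absolute difference that UABD computes per lane, which is why the wider ABS/SUB pair can be replaced by UABD followed by a zero extend. File name and loop bounds are illustrative only.]

// uabd_identity.cpp -- standalone illustration, not LLVM code.
#include <cassert>
#include <cstdint>
#include <cstdlib>

int main() {
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      int32_t WideAbs = std::abs(int32_t(a) - int32_t(b));    // ABS(SUB(zext(a), zext(b)))
      uint8_t Uabd = a > b ? uint8_t(a - b) : uint8_t(b - a);  // per-lane UABD result
      assert(WideAbs == int32_t(Uabd));                        // so ZERO_EXTEND(UABD) matches
    }
  }
  return 0;
}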
-static SDValue performABSCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - SDValue AbsOp1 = N->getOperand(0); - SDValue Op0, Op1; - - if (AbsOp1.getOpcode() != ISD::SUB) - return SDValue(); - - Op0 = AbsOp1.getOperand(0); - Op1 = AbsOp1.getOperand(1); - - unsigned Opc0 = Op0.getOpcode(); - // Check if the operands of the sub are (zero|sign)-extended. - if (Opc0 != Op1.getOpcode() || - (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) - return SDValue(); - - EVT VectorT1 = Op0.getOperand(0).getValueType(); - EVT VectorT2 = Op1.getOperand(0).getValueType(); - // Check if vectors are of same type and valid size. - uint64_t Size = VectorT1.getFixedSizeInBits(); - if (VectorT1 != VectorT2 || (Size != 64 && Size != 128)) - return SDValue(); - - // Check if vector element types are valid. - EVT VT1 = VectorT1.getVectorElementType(); - if (VT1 != MVT::i8 && VT1 != MVT::i16 && VT1 != MVT::i32) - return SDValue(); - - Op0 = Op0.getOperand(0); - Op1 = Op1.getOperand(0); - unsigned ABDOpcode = - (Opc0 == ISD::SIGN_EXTEND) ? AArch64ISD::SABD : AArch64ISD::UABD; - SDValue ABD = - DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD); -} - +// VECREDUCE_ADD( EXTEND(v16i8_type) ) to +// VECREDUCE_ADD( DOTv16i8(v16i8_type) ) +static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *ST) { + SDValue Op0 = N->getOperand(0); + if (!ST->hasDotProd() || N->getValueType(0) != MVT::i32) + return SDValue(); + + if (Op0.getValueType().getVectorElementType() != MVT::i32) + return SDValue(); + + unsigned ExtOpcode = Op0.getOpcode(); + if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) + return SDValue(); + + EVT Op0VT = Op0.getOperand(0).getValueType(); + if (Op0VT != MVT::v16i8) + return SDValue(); + + SDLoc DL(Op0); + SDValue Ones = DAG.getConstant(1, DL, Op0VT); + SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32); + auto DotIntrisic = (ExtOpcode == ISD::ZERO_EXTEND) + ? Intrinsic::aarch64_neon_udot + : Intrinsic::aarch64_neon_sdot; + SDValue Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Zeros.getValueType(), + DAG.getConstant(DotIntrisic, DL, MVT::i32), Zeros, + Ones, Op0.getOperand(0)); + return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot); +} + +// Given a ABS node, detect the following pattern: +// (ABS (SUB (EXTEND a), (EXTEND b))). +// Generates UABD/SABD instruction. +static SDValue performABSCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SDValue AbsOp1 = N->getOperand(0); + SDValue Op0, Op1; + + if (AbsOp1.getOpcode() != ISD::SUB) + return SDValue(); + + Op0 = AbsOp1.getOperand(0); + Op1 = AbsOp1.getOperand(1); + + unsigned Opc0 = Op0.getOpcode(); + // Check if the operands of the sub are (zero|sign)-extended. + if (Opc0 != Op1.getOpcode() || + (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) + return SDValue(); + + EVT VectorT1 = Op0.getOperand(0).getValueType(); + EVT VectorT2 = Op1.getOperand(0).getValueType(); + // Check if vectors are of same type and valid size. + uint64_t Size = VectorT1.getFixedSizeInBits(); + if (VectorT1 != VectorT2 || (Size != 64 && Size != 128)) + return SDValue(); + + // Check if vector element types are valid. 
+ EVT VT1 = VectorT1.getVectorElementType(); + if (VT1 != MVT::i8 && VT1 != MVT::i16 && VT1 != MVT::i32) + return SDValue(); + + Op0 = Op0.getOperand(0); + Op1 = Op1.getOperand(0); + unsigned ABDOpcode = + (Opc0 == ISD::SIGN_EXTEND) ? AArch64ISD::SABD : AArch64ISD::UABD; + SDValue ABD = + DAG.getNode(ABDOpcode, SDLoc(N), Op0->getValueType(0), Op0, Op1); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), ABD); +} + static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); - return foldVectorXorShiftIntoCmp(N, DAG, Subtarget); + return foldVectorXorShiftIntoCmp(N, DAG, Subtarget); } SDValue @@ -11793,157 +11793,157 @@ static bool IsSVECntIntrinsic(SDValue S) { return false; } -/// Calculates what the pre-extend type is, based on the extension -/// operation node provided by \p Extend. -/// -/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the -/// pre-extend type is pulled directly from the operand, while other extend -/// operations need a bit more inspection to get this information. -/// -/// \param Extend The SDNode from the DAG that represents the extend operation -/// \param DAG The SelectionDAG hosting the \p Extend node -/// -/// \returns The type representing the \p Extend source type, or \p MVT::Other -/// if no valid type can be determined -static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { - switch (Extend.getOpcode()) { - case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: - return Extend.getOperand(0).getValueType(); - case ISD::AssertSext: - case ISD::AssertZext: - case ISD::SIGN_EXTEND_INREG: { - VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1)); - if (!TypeNode) - return MVT::Other; - return TypeNode->getVT(); - } - case ISD::AND: { - ConstantSDNode *Constant = - dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode()); - if (!Constant) - return MVT::Other; - - uint32_t Mask = Constant->getZExtValue(); - - if (Mask == UCHAR_MAX) - return MVT::i8; - else if (Mask == USHRT_MAX) - return MVT::i16; - else if (Mask == UINT_MAX) - return MVT::i32; - - return MVT::Other; - } - default: - return MVT::Other; - } - - llvm_unreachable("Code path unhandled in calculatePreExtendType!"); -} - -/// Combines a dup(sext/zext) node pattern into sext/zext(dup) -/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, - SelectionDAG &DAG) { - - ShuffleVectorSDNode *ShuffleNode = - dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode()); - if (!ShuffleNode) - return SDValue(); - - // Ensuring the mask is zero before continuing - if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) - return SDValue(); - - SDValue InsertVectorElt = VectorShuffle.getOperand(0); - - if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - SDValue InsertLane = InsertVectorElt.getOperand(2); - ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode()); - // Ensures the insert is inserting into lane 0 - if (!Constant || Constant->getZExtValue() != 0) - return SDValue(); - - SDValue Extend = InsertVectorElt.getOperand(1); - unsigned ExtendOpcode = Extend.getOpcode(); - - bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || - ExtendOpcode == ISD::SIGN_EXTEND_INREG || - ExtendOpcode == ISD::AssertSext; - if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && - ExtendOpcode != ISD::AssertZext && 
ExtendOpcode != ISD::AND) - return SDValue(); - - EVT TargetType = VectorShuffle.getValueType(); - EVT PreExtendType = calculatePreExtendType(Extend, DAG); - - if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && - TargetType != MVT::v2i64) || - (PreExtendType == MVT::Other)) - return SDValue(); - - // Restrict valid pre-extend data type - if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && - PreExtendType != MVT::i32) - return SDValue(); - - EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); - - if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) - return SDValue(); - - if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) - return SDValue(); - - SDLoc DL(VectorShuffle); - - SDValue InsertVectorNode = DAG.getNode( - InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), - DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), - DAG.getConstant(0, DL, MVT::i64)); - - std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue()); - - SDValue VectorShuffleNode = - DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, - DAG.getUNDEF(PreExtendVT), ShuffleMask); - - SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, - DL, TargetType, VectorShuffleNode); - - return ExtendNode; -} - -/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) -/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt -static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { - // If the value type isn't a vector, none of the operands are going to be dups - if (!Mul->getValueType(0).isVector()) - return SDValue(); - - SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); - SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); - - // Neither operands have been changed, don't make any further changes - if (!Op0 && !Op1) - return SDValue(); - - SDLoc DL(Mul); - return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), - Op0 ? Op0 : Mul->getOperand(0), - Op1 ? Op1 : Mul->getOperand(1)); -} - +/// Calculates what the pre-extend type is, based on the extension +/// operation node provided by \p Extend. +/// +/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the +/// pre-extend type is pulled directly from the operand, while other extend +/// operations need a bit more inspection to get this information. 
+/// +/// \param Extend The SDNode from the DAG that represents the extend operation +/// \param DAG The SelectionDAG hosting the \p Extend node +/// +/// \returns The type representing the \p Extend source type, or \p MVT::Other +/// if no valid type can be determined +static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { + switch (Extend.getOpcode()) { + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return Extend.getOperand(0).getValueType(); + case ISD::AssertSext: + case ISD::AssertZext: + case ISD::SIGN_EXTEND_INREG: { + VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1)); + if (!TypeNode) + return MVT::Other; + return TypeNode->getVT(); + } + case ISD::AND: { + ConstantSDNode *Constant = + dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode()); + if (!Constant) + return MVT::Other; + + uint32_t Mask = Constant->getZExtValue(); + + if (Mask == UCHAR_MAX) + return MVT::i8; + else if (Mask == USHRT_MAX) + return MVT::i16; + else if (Mask == UINT_MAX) + return MVT::i32; + + return MVT::Other; + } + default: + return MVT::Other; + } + + llvm_unreachable("Code path unhandled in calculatePreExtendType!"); +} + +/// Combines a dup(sext/zext) node pattern into sext/zext(dup) +/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, + SelectionDAG &DAG) { + + ShuffleVectorSDNode *ShuffleNode = + dyn_cast<ShuffleVectorSDNode>(VectorShuffle.getNode()); + if (!ShuffleNode) + return SDValue(); + + // Ensuring the mask is zero before continuing + if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) + return SDValue(); + + SDValue InsertVectorElt = VectorShuffle.getOperand(0); + + if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + SDValue InsertLane = InsertVectorElt.getOperand(2); + ConstantSDNode *Constant = dyn_cast<ConstantSDNode>(InsertLane.getNode()); + // Ensures the insert is inserting into lane 0 + if (!Constant || Constant->getZExtValue() != 0) + return SDValue(); + + SDValue Extend = InsertVectorElt.getOperand(1); + unsigned ExtendOpcode = Extend.getOpcode(); + + bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || + ExtendOpcode == ISD::SIGN_EXTEND_INREG || + ExtendOpcode == ISD::AssertSext; + if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && + ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) + return SDValue(); + + EVT TargetType = VectorShuffle.getValueType(); + EVT PreExtendType = calculatePreExtendType(Extend, DAG); + + if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && + TargetType != MVT::v2i64) || + (PreExtendType == MVT::Other)) + return SDValue(); + + // Restrict valid pre-extend data type + if (PreExtendType != MVT::i8 && PreExtendType != MVT::i16 && + PreExtendType != MVT::i32) + return SDValue(); + + EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); + + if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) + return SDValue(); + + if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + return SDValue(); + + SDLoc DL(VectorShuffle); + + SDValue InsertVectorNode = DAG.getNode( + InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), + DAG.getAnyExtOrTrunc(Extend.getOperand(0), DL, PreExtendType), + DAG.getConstant(0, DL, MVT::i64)); + + std::vector<int> ShuffleMask(TargetType.getVectorElementCount().getValue()); + + SDValue VectorShuffleNode = + DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, + 
DAG.getUNDEF(PreExtendVT), ShuffleMask); + + SDValue ExtendNode = DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, + DL, TargetType, VectorShuffleNode); + + return ExtendNode; +} + +/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) +/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { + // If the value type isn't a vector, none of the operands are going to be dups + if (!Mul->getValueType(0).isVector()) + return SDValue(); + + SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + + // Neither operands have been changed, don't make any further changes + if (!Op0 && !Op1) + return SDValue(); + + SDLoc DL(Mul); + return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), + Op0 ? Op0 : Mul->getOperand(0), + Op1 ? Op1 : Mul->getOperand(1)); +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { - - if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) - return Ext; - + + if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) + return Ext; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -12478,9 +12478,9 @@ static SDValue performSVEAndCombine(SDNode *N, return DAG.getNode(Opc, DL, N->getValueType(0), And); } - if (!EnableCombineMGatherIntrinsics) - return SDValue(); - + if (!EnableCombineMGatherIntrinsics) + return SDValue(); + SDValue Mask = N->getOperand(1); if (!Src.hasOneUse()) @@ -12534,11 +12534,11 @@ static SDValue performANDCombine(SDNode *N, if (VT.isScalableVector()) return performSVEAndCombine(N, DCI); - // The combining code below works only for NEON vectors. In particular, it - // does not work for SVE when dealing with vectors wider than 128 bits. - if (!(VT.is64BitVector() || VT.is128BitVector())) - return SDValue(); - + // The combining code below works only for NEON vectors. In particular, it + // does not work for SVE when dealing with vectors wider than 128 bits. + if (!(VT.is64BitVector() || VT.is128BitVector())) + return SDValue(); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode()); if (!BVN) @@ -12599,143 +12599,143 @@ static SDValue performSRLCombine(SDNode *N, return SDValue(); } -// Attempt to form urhadd(OpA, OpB) from -// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) -// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). -// The original form of the first expression is -// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the -// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)). -// Before this function is called the srl will have been lowered to -// AArch64ISD::VLSHR. -// This pass can also recognize signed variants of the patterns that use sign -// extension instead of zero extension and form a srhadd(OpA, OpB) or a -// shadd(OpA, OpB) from them. 
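[Editorial note, not part of the diff above: a scalar sketch of the rewrite described in the comment block ending here, assuming 8-bit lanes zero-extended to 16 bits. In 16-bit two's-complement arithmetic, b - ~a equals a + b + 1, so the SUB/XOR-with-all-ones form, shifted right by one and truncated, is the rounding halving add that URHADD performs per lane.]

// urhadd_identity.cpp -- standalone illustration, not LLVM code.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      uint16_t Wa = uint16_t(a), Wb = uint16_t(b);             // zext i8 -> i16
      // sub(zext(b), xor(zext(a), all-ones)) == a + b + 1 (mod 2^16)
      uint16_t SubXor = uint16_t(Wb - uint16_t(Wa ^ 0xFFFFu));
      assert(SubXor == uint16_t(a + b + 1));
      // truncate(vlshr(..., 1)) is the rounding halving add URHADD computes.
      assert(uint8_t(SubXor >> 1) == uint8_t((a + b + 1) >> 1));
    }
  }
  return 0;
}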
-static SDValue -performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // Since we are looking for a right shift by a constant value of 1 and we are - // operating on types at least 16 bits in length (sign/zero extended OpA and - // OpB, which are at least 8 bits), it follows that the truncate will always - // discard the shifted-in bit and therefore the right shift will be logical - // regardless of the signedness of OpA and OpB. - SDValue Shift = N->getOperand(0); - if (Shift.getOpcode() != AArch64ISD::VLSHR) - return SDValue(); - - // Is the right shift using an immediate value of 1? - uint64_t ShiftAmount = Shift.getConstantOperandVal(1); - if (ShiftAmount != 1) - return SDValue(); - - SDValue ExtendOpA, ExtendOpB; - SDValue ShiftOp0 = Shift.getOperand(0); - unsigned ShiftOp0Opc = ShiftOp0.getOpcode(); - if (ShiftOp0Opc == ISD::SUB) { - - SDValue Xor = ShiftOp0.getOperand(1); - if (Xor.getOpcode() != ISD::XOR) - return SDValue(); - - // Is the XOR using a constant amount of all ones in the right hand side? - uint64_t C; - if (!isAllConstantBuildVector(Xor.getOperand(1), C)) - return SDValue(); - - unsigned ElemSizeInBits = VT.getScalarSizeInBits(); - APInt CAsAPInt(ElemSizeInBits, C); - if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) - return SDValue(); - - ExtendOpA = Xor.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(0); - } else if (ShiftOp0Opc == ISD::ADD) { - ExtendOpA = ShiftOp0.getOperand(0); - ExtendOpB = ShiftOp0.getOperand(1); - } else - return SDValue(); - - unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); - unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); - if (!(ExtendOpAOpc == ExtendOpBOpc && - (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) - return SDValue(); - - // Is the result of the right shift being truncated to the same value type as - // the original operands, OpA and OpB? - SDValue OpA = ExtendOpA.getOperand(0); - SDValue OpB = ExtendOpB.getOperand(0); - EVT OpAVT = OpA.getValueType(); - assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); - if (!(VT == OpAVT && OpAVT == OpB.getValueType())) - return SDValue(); - - SDLoc DL(N); - bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; - bool IsRHADD = ShiftOp0Opc == ISD::SUB; - unsigned HADDOpc = IsSignExtend - ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD) - : (IsRHADD ? 
AArch64ISD::URHADD : AArch64ISD::UHADD); - SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB); - - return ResultHADD; -} - -static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { - switch (Opcode) { - case ISD::FADD: - return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; - case ISD::ADD: - return VT == MVT::i64; - default: - return false; - } -} - -static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); - ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1); - - EVT VT = N->getValueType(0); - const bool FullFP16 = - static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); - - // Rewrite for pairwise fadd pattern - // (f32 (extract_vector_elt - // (fadd (vXf32 Other) - // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) - // -> - // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) - // (extract_vector_elt (vXf32 Other) 1)) - if (ConstantN1 && ConstantN1->getZExtValue() == 0 && - hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { - SDLoc DL(N0); - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); - - ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01); - SDValue Other = N00; - - // And handle the commutative case. - if (!Shuffle) { - Shuffle = dyn_cast<ShuffleVectorSDNode>(N00); - Other = N01; - } - - if (Shuffle && Shuffle->getMaskElt(0) == 1 && - Other == Shuffle->getOperand(0)) { - return DAG.getNode(N0->getOpcode(), DL, VT, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(0, DL, MVT::i64)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, - DAG.getConstant(1, DL, MVT::i64))); - } - } - - return SDValue(); -} - +// Attempt to form urhadd(OpA, OpB) from +// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)) +// or uhadd(OpA, OpB) from truncate(vlshr(add(zext(OpA), zext(OpB)), 1)). +// The original form of the first expression is +// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and the +// (OpA + OpB + 1) subexpression will have been changed to (OpB - (~OpA)). +// Before this function is called the srl will have been lowered to +// AArch64ISD::VLSHR. +// This pass can also recognize signed variants of the patterns that use sign +// extension instead of zero extension and form a srhadd(OpA, OpB) or a +// shadd(OpA, OpB) from them. +static SDValue +performVectorTruncateCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since we are looking for a right shift by a constant value of 1 and we are + // operating on types at least 16 bits in length (sign/zero extended OpA and + // OpB, which are at least 8 bits), it follows that the truncate will always + // discard the shifted-in bit and therefore the right shift will be logical + // regardless of the signedness of OpA and OpB. + SDValue Shift = N->getOperand(0); + if (Shift.getOpcode() != AArch64ISD::VLSHR) + return SDValue(); + + // Is the right shift using an immediate value of 1? + uint64_t ShiftAmount = Shift.getConstantOperandVal(1); + if (ShiftAmount != 1) + return SDValue(); + + SDValue ExtendOpA, ExtendOpB; + SDValue ShiftOp0 = Shift.getOperand(0); + unsigned ShiftOp0Opc = ShiftOp0.getOpcode(); + if (ShiftOp0Opc == ISD::SUB) { + + SDValue Xor = ShiftOp0.getOperand(1); + if (Xor.getOpcode() != ISD::XOR) + return SDValue(); + + // Is the XOR using a constant amount of all ones in the right hand side? 
+ uint64_t C; + if (!isAllConstantBuildVector(Xor.getOperand(1), C)) + return SDValue(); + + unsigned ElemSizeInBits = VT.getScalarSizeInBits(); + APInt CAsAPInt(ElemSizeInBits, C); + if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits)) + return SDValue(); + + ExtendOpA = Xor.getOperand(0); + ExtendOpB = ShiftOp0.getOperand(0); + } else if (ShiftOp0Opc == ISD::ADD) { + ExtendOpA = ShiftOp0.getOperand(0); + ExtendOpB = ShiftOp0.getOperand(1); + } else + return SDValue(); + + unsigned ExtendOpAOpc = ExtendOpA.getOpcode(); + unsigned ExtendOpBOpc = ExtendOpB.getOpcode(); + if (!(ExtendOpAOpc == ExtendOpBOpc && + (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND))) + return SDValue(); + + // Is the result of the right shift being truncated to the same value type as + // the original operands, OpA and OpB? + SDValue OpA = ExtendOpA.getOperand(0); + SDValue OpB = ExtendOpB.getOperand(0); + EVT OpAVT = OpA.getValueType(); + assert(ExtendOpA.getValueType() == ExtendOpB.getValueType()); + if (!(VT == OpAVT && OpAVT == OpB.getValueType())) + return SDValue(); + + SDLoc DL(N); + bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND; + bool IsRHADD = ShiftOp0Opc == ISD::SUB; + unsigned HADDOpc = IsSignExtend + ? (IsRHADD ? AArch64ISD::SRHADD : AArch64ISD::SHADD) + : (IsRHADD ? AArch64ISD::URHADD : AArch64ISD::UHADD); + SDValue ResultHADD = DAG.getNode(HADDOpc, DL, VT, OpA, OpB); + + return ResultHADD; +} + +static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { + switch (Opcode) { + case ISD::FADD: + return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; + case ISD::ADD: + return VT == MVT::i64; + default: + return false; + } +} + +static SDValue performExtractVectorEltCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1); + + EVT VT = N->getValueType(0); + const bool FullFP16 = + static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16(); + + // Rewrite for pairwise fadd pattern + // (f32 (extract_vector_elt + // (fadd (vXf32 Other) + // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) + // -> + // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) + // (extract_vector_elt (vXf32 Other) 1)) + if (ConstantN1 && ConstantN1->getZExtValue() == 0 && + hasPairwiseAdd(N0->getOpcode(), VT, FullFP16)) { + SDLoc DL(N0); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + + ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01); + SDValue Other = N00; + + // And handle the commutative case. + if (!Shuffle) { + Shuffle = dyn_cast<ShuffleVectorSDNode>(N00); + Other = N01; + } + + if (Shuffle && Shuffle->getMaskElt(0) == 1 && + Other == Shuffle->getOperand(0)) { + return DAG.getNode(N0->getOpcode(), DL, VT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(0, DL, MVT::i64)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other, + DAG.getConstant(1, DL, MVT::i64))); + } + } + + return SDValue(); +} + static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -12781,9 +12781,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); - // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted - // subvectors from the same original vectors. Combine these into a single - // [us]rhadd or [us]hadd that operates on the two original vectors. 
Example: + // Optimise concat_vectors of two [us]rhadds or [us]hadds that use extracted + // subvectors from the same original vectors. Combine these into a single + // [us]rhadd or [us]hadd that operates on the two original vectors. Example: // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>), // extract_subvector (v16i8 OpB, // <0>))), @@ -12793,8 +12793,8 @@ static SDValue performConcatVectorsCombine(SDNode *N, // -> // (v16i8(urhadd(v16i8 OpA, v16i8 OpB))) if (N->getNumOperands() == 2 && N0Opc == N1Opc && - (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD || - N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) { + (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD || + N0Opc == AArch64ISD::UHADD || N0Opc == AArch64ISD::SHADD)) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -13099,43 +13099,43 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } -// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) -static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - // Only scalar integer and vector types. - if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) - return SDValue(); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT) - return SDValue(); - - auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); - auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1)); - if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue()) - return SDValue(); - - SDValue Op1 = LHS->getOperand(0); - SDValue Op2 = RHS->getOperand(0); - EVT OpVT1 = Op1.getValueType(); - EVT OpVT2 = Op2.getValueType(); - if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 || - Op2.getOpcode() != AArch64ISD::UADDV || - OpVT1.getVectorElementType() != VT) - return SDValue(); - - SDValue Val1 = Op1.getOperand(0); - SDValue Val2 = Op2.getOperand(0); - EVT ValVT = Val1->getValueType(0); - SDLoc DL(N); - SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, - DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal), - DAG.getConstant(0, DL, MVT::i64)); -} - +// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) +static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + // Only scalar integer and vector types. 
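[Editorial note, not part of the diff above: performUADDVCombine, whose re-added body begins just above, relies on the fact that two independent lane-wise sums can be folded into a single reduction of the lane-wise ADD, because wraparound integer addition is associative and commutative. Array sizes and values below are arbitrary.]

// uaddv_fold.cpp -- standalone illustration, not LLVM code.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  std::array<uint32_t, 4> A = {1, 2, 3, 0xFFFFFFFFu};
  std::array<uint32_t, 4> B = {10, 20, 30, 40};
  uint32_t SumA = 0, SumB = 0, SumAdd = 0;
  for (std::size_t I = 0; I < A.size(); ++I) {
    SumA += A[I];            // UADDV(a)
    SumB += B[I];            // UADDV(b)
    SumAdd += A[I] + B[I];   // UADDV(ADD(a, b))
  }
  // ADD(UADDV a, UADDV b) == UADDV(ADD a, b), even with wraparound.
  assert(uint32_t(SumA + SumB) == SumAdd);
  return 0;
}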
+ if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT) + return SDValue(); + + auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1)); + auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1)); + if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isNullValue()) + return SDValue(); + + SDValue Op1 = LHS->getOperand(0); + SDValue Op2 = RHS->getOperand(0); + EVT OpVT1 = Op1.getValueType(); + EVT OpVT2 = Op2.getValueType(); + if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 || + Op2.getOpcode() != AArch64ISD::UADDV || + OpVT1.getVectorElementType() != VT) + return SDValue(); + + SDValue Val1 = Op1.getOperand(0); + SDValue Val2 = Op2.getOperand(0); + EVT ValVT = Val1->getValueType(0); + SDLoc DL(N); + SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal), + DAG.getConstant(0, DL, MVT::i64)); +} + // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: @@ -13189,16 +13189,16 @@ static SDValue performAddSubLongCombine(SDNode *N, return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } -static SDValue performAddSubCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Try to change sum of two reductions. - if (SDValue Val = performUADDVCombine(N, DAG)) - return Val; - - return performAddSubLongCombine(N, DCI, DAG); -} - +static SDValue performAddSubCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Try to change sum of two reductions. + if (SDValue Val = performUADDVCombine(N, DAG)) + return Val; + + return performAddSubLongCombine(N, DCI, DAG); +} + // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // @@ -13212,8 +13212,8 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1); - SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2); + SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1); + SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 
1 : 2); assert(LHS.getValueType().is64BitVector() && RHS.getValueType().is64BitVector() && "unexpected shape for long operation"); @@ -13231,9 +13231,9 @@ static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, return SDValue(); } - if (IID == Intrinsic::not_intrinsic) - return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS); - + if (IID == Intrinsic::not_intrinsic) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), N->getOperand(0), LHS, RHS); } @@ -13374,8 +13374,8 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8; - EVT ByteVT = - EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize)); + EVT ByteVT = + EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize)); // Convert everything to the domain of EXT (i.e bytes). SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1)); @@ -13475,25 +13475,25 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, return DAG.getZExtOrTrunc(Res, DL, VT); } -static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, - SelectionDAG &DAG) { - SDLoc DL(N); - - SDValue Pred = N->getOperand(1); - SDValue VecToReduce = N->getOperand(2); - - // NOTE: The integer reduction's result type is not always linked to the - // operand's element type so we construct it from the intrinsic's result type. - EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0)); - SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); - - // SVE reductions set the whole vector register with the first element - // containing the reduction result, which we'll now extract. - SDValue Zero = DAG.getConstant(0, DL, MVT::i64); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, - Zero); -} - +static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue VecToReduce = N->getOperand(2); + + // NOTE: The integer reduction's result type is not always linked to the + // operand's element type so we construct it from the intrinsic's result type. + EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0)); + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG) { SDLoc DL(N); @@ -13534,25 +13534,25 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, Zero); } -// If a merged operation has no inactive lanes we can relax it to a predicated -// or unpredicated operation, which potentially allows better isel (perhaps -// using immediate forms) or relaxing register reuse requirements. -static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc, - SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); - assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); - SDValue Pg = N->getOperand(1); - - // ISD way to specify an all active predicate. 
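[Editorial note, not part of the diff above: a lane-by-lane model of why convertMergedOpToPredOp is sound when the governing predicate is all-active. With no inactive lanes there is nothing to merge, so the merged form and the plain operation agree. The shift operation and vector length are arbitrary choices for illustration.]

// merge_vs_unpred.cpp -- standalone illustration, not LLVM code.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  std::array<uint32_t, 4> A = {1, 2, 0x80000000u, 7};
  std::array<uint32_t, 4> B = {0, 1, 3, 31};
  std::array<bool, 4> Pg;
  Pg.fill(true);                                        // ptrue: every lane active
  for (std::size_t I = 0; I < A.size(); ++I) {
    uint32_t Unpred = A[I] << B[I];                     // unpredicated shift
    uint32_t Merged = Pg[I] ? (A[I] << B[I]) : A[I];    // merged form keeps A on inactive lanes
    assert(Merged == Unpred);                           // identical when no lane is inactive
  }
  return 0;
}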
- if ((Pg.getOpcode() == AArch64ISD::PTRUE) && - (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all)) - return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg, - N->getOperand(2), N->getOperand(3)); - - // FUTURE: SplatVector(true) - return SDValue(); -} - +// If a merged operation has no inactive lanes we can relax it to a predicated +// or unpredicated operation, which potentially allows better isel (perhaps +// using immediate forms) or relaxing register reuse requirements. +static SDValue convertMergedOpToPredOp(SDNode *N, unsigned PredOpc, + SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!"); + assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!"); + SDValue Pg = N->getOperand(1); + + // ISD way to specify an all active predicate. + if ((Pg.getOpcode() == AArch64ISD::PTRUE) && + (Pg.getConstantOperandVal(0) == AArch64SVEPredPattern::all)) + return DAG.getNode(PredOpc, SDLoc(N), N->getValueType(0), Pg, + N->getOperand(2), N->getOperand(3)); + + // FUTURE: SplatVector(true) + return SDValue(); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -13607,28 +13607,28 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); - case Intrinsic::aarch64_sve_saddv: - // There is no i64 version of SADDV because the sign is irrelevant. - if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) - return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); - else - return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); - case Intrinsic::aarch64_sve_uaddv: - return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); + case Intrinsic::aarch64_sve_saddv: + // There is no i64 version of SADDV because the sign is irrelevant. 
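[Editorial note, not part of the diff above: the comment just above observes that for i64 elements a signed add reduction is interchangeable with the unsigned one, since two's-complement addition yields the same bit pattern either way. A quick sketch of that claim; the element values are arbitrary, and the signed accumulation is done through unsigned arithmetic to make the wraparound explicit.]

// saddv_vs_uaddv_i64.cpp -- standalone illustration, not LLVM code.
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int64_t, 3> V = {-5, 7, INT64_MIN};
  uint64_t UnsignedSum = 0;
  int64_t SignedSum = 0;
  for (int64_t X : V) {
    UnsignedSum += uint64_t(X);                               // UADDV-style accumulation
    SignedSum = int64_t(uint64_t(SignedSum) + uint64_t(X));   // SADDV-style, wraparound explicit
  }
  // Same 64-bit pattern, so the i64 case can always use the unsigned reduction.
  assert(uint64_t(SignedSum) == UnsignedSum);
  return 0;
}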
+ if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) + return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); + else + return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); + case Intrinsic::aarch64_sve_uaddv: + return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); case Intrinsic::aarch64_sve_smaxv: - return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG); case Intrinsic::aarch64_sve_umaxv: - return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG); case Intrinsic::aarch64_sve_sminv: - return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG); case Intrinsic::aarch64_sve_uminv: - return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG); case Intrinsic::aarch64_sve_orv: - return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG); case Intrinsic::aarch64_sve_eorv: - return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG); case Intrinsic::aarch64_sve_andv: - return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG); + return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG); case Intrinsic::aarch64_sve_index: return LowerSVEIntrinsicIndex(N, DAG); case Intrinsic::aarch64_sve_dup: @@ -13639,19 +13639,19 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); case Intrinsic::aarch64_sve_smin: - return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG); case Intrinsic::aarch64_sve_umin: - return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG); case Intrinsic::aarch64_sve_smax: - return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG); case Intrinsic::aarch64_sve_umax: - return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG); case Intrinsic::aarch64_sve_lsl: - return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG); case Intrinsic::aarch64_sve_lsr: - return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG); case Intrinsic::aarch64_sve_asr: - return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG); + return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG); case Intrinsic::aarch64_sve_cmphs: if (!N->getOperand(2).getValueType().isFloatingPoint()) return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N), @@ -13744,15 +13744,15 @@ static SDValue performExtendCombine(SDNode *N, // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. 
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && - (N->getOperand(0).getOpcode() == AArch64ISD::UABD || - N->getOperand(0).getOpcode() == AArch64ISD::SABD)) { + (N->getOperand(0).getOpcode() == AArch64ISD::UABD || + N->getOperand(0).getOpcode() == AArch64ISD::SABD)) { SDNode *ABDNode = N->getOperand(0).getNode(); - SDValue NewABD = - tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG); - if (!NewABD.getNode()) - return SDValue(); + SDValue NewABD = + tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG); + if (!NewABD.getNode()) + return SDValue(); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } // This is effectively a custom type legalization for AArch64. @@ -14235,31 +14235,31 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, S->getMemOperand()->getFlags()); } -static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { - SDLoc DL(N); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - EVT ResVT = N->getValueType(0); - - // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) - if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { - if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { - SDValue X = Op0.getOperand(0).getOperand(0); - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); - } - } - - // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) - if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { - if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { - SDValue Z = Op1.getOperand(0).getOperand(1); - return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); - } - } - - return SDValue(); -} - +static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + EVT ResVT = N->getValueType(0); + + // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) + if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { + if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue X = Op0.getOperand(0).getOperand(0); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1); + } + } + + // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) + if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { + if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) { + SDValue Z = Op1.getOperand(0).getOperand(1); + return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z); + } + } + + return SDValue(); +} + /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, @@ -14398,55 +14398,55 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } -static SDValue performMaskedGatherScatterCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N); - assert(MGS && "Can only combine gather load or scatter store nodes"); - - SDLoc DL(MGS); - SDValue Chain = MGS->getChain(); - SDValue Scale = MGS->getScale(); - SDValue Index = MGS->getIndex(); - SDValue Mask = MGS->getMask(); - SDValue BasePtr = MGS->getBasePtr(); - ISD::MemIndexType IndexType = MGS->getIndexType(); - - EVT IdxVT = Index.getValueType(); - - if (DCI.isBeforeLegalize()) { - // SVE gather/scatter requires indices of i32/i64. Promote anything smaller - // prior to legalisation so the result can be split if required. 
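[Editorial note, not part of the diff above: the index promotion described in the comment just above widens i8/i16 gather/scatter indices to i32, picking SIGN_EXTEND or ZERO_EXTEND from the index's signedness so the byte offset each lane addresses is unchanged. A scalar sketch of that invariant; the scale factor is made up for illustration.]

// index_promote.cpp -- standalone illustration, not LLVM code.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t Scale = 4;                                     // e.g. gathering 32-bit elements
  for (int I = -128; I < 128; ++I) {
    int8_t NarrowSigned = int8_t(I);
    int32_t WideSigned = int32_t(NarrowSigned);                // SIGN_EXTEND keeps negative offsets
    assert(int64_t(WideSigned) * Scale == int64_t(NarrowSigned) * Scale);

    uint8_t NarrowUnsigned = uint8_t(I);
    int32_t WideUnsigned = int32_t(uint32_t(NarrowUnsigned));  // ZERO_EXTEND keeps 0..255
    assert(int64_t(WideUnsigned) * Scale == int64_t(NarrowUnsigned) * Scale);
  }
  return 0;
}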
- if ((IdxVT.getVectorElementType() == MVT::i8) || - (IdxVT.getVectorElementType() == MVT::i16)) { - EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32); - if (MGS->isIndexSigned()) - Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index); - else - Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index); - - if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) { - SDValue PassThru = MGT->getPassThru(); - SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index, Scale }; - return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), - PassThru.getValueType(), DL, Ops, - MGT->getMemOperand(), - MGT->getIndexType(), MGT->getExtensionType()); - } else { - auto *MSC = cast<MaskedScatterSDNode>(MGS); - SDValue Data = MSC->getValue(); - SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale }; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), - MSC->getMemoryVT(), DL, Ops, - MSC->getMemOperand(), IndexType, - MSC->isTruncatingStore()); - } - } - } - - return SDValue(); -} - +static SDValue performMaskedGatherScatterCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N); + assert(MGS && "Can only combine gather load or scatter store nodes"); + + SDLoc DL(MGS); + SDValue Chain = MGS->getChain(); + SDValue Scale = MGS->getScale(); + SDValue Index = MGS->getIndex(); + SDValue Mask = MGS->getMask(); + SDValue BasePtr = MGS->getBasePtr(); + ISD::MemIndexType IndexType = MGS->getIndexType(); + + EVT IdxVT = Index.getValueType(); + + if (DCI.isBeforeLegalize()) { + // SVE gather/scatter requires indices of i32/i64. Promote anything smaller + // prior to legalisation so the result can be split if required. + if ((IdxVT.getVectorElementType() == MVT::i8) || + (IdxVT.getVectorElementType() == MVT::i16)) { + EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32); + if (MGS->isIndexSigned()) + Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index); + else + Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index); + + if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) { + SDValue PassThru = MGT->getPassThru(); + SDValue Ops[] = { Chain, PassThru, Mask, BasePtr, Index, Scale }; + return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), + PassThru.getValueType(), DL, Ops, + MGT->getMemOperand(), + MGT->getIndexType(), MGT->getExtensionType()); + } else { + auto *MSC = cast<MaskedScatterSDNode>(MGS); + SDValue Data = MSC->getValue(); + SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale }; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), + MSC->getMemoryVT(), DL, Ops, + MSC->getMemOperand(), IndexType, + MSC->isTruncatingStore()); + } + } + } + + return SDValue(); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -15443,7 +15443,7 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && "Sign extending from an invalid type"); - EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); + EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext()); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(), ExtOp, DAG.getValueType(ExtVT)); @@ -15451,12 +15451,12 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); } - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - if (!EnableCombineMGatherIntrinsics) - return SDValue(); - + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!EnableCombineMGatherIntrinsics) + return SDValue(); + // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. unsigned NewOpc; @@ -15596,11 +15596,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; - case ISD::ABS: - return performABSCombine(N, DAG, DCI, Subtarget); + case ISD::ABS: + return performABSCombine(N, DAG, DCI, Subtarget); case ISD::ADD: case ISD::SUB: - return performAddSubCombine(N, DCI, DAG); + return performAddSubCombine(N, DCI, DAG); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: @@ -15627,8 +15627,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performExtendCombine(N, DCI, DAG); case ISD::SIGN_EXTEND_INREG: return performSignExtendInRegCombine(N, DCI, DAG); - case ISD::TRUNCATE: - return performVectorTruncateCombine(N, DCI, DAG); + case ISD::TRUNCATE: + return performVectorTruncateCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: @@ -15641,9 +15641,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); - case ISD::MGATHER: - case ISD::MSCATTER: - return performMaskedGatherScatterCombine(N, DCI, DAG); + case ISD::MGATHER: + case ISD::MSCATTER: + return performMaskedGatherScatterCombine(N, DCI, DAG); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: @@ -15655,14 +15655,14 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performPostLD1Combine(N, DCI, false); case AArch64ISD::NVCAST: return performNVCASTCombine(N); - case AArch64ISD::UZP1: - return performUzpCombine(N, DAG); + case AArch64ISD::UZP1: + return performUzpCombine(N, DAG); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); - case ISD::EXTRACT_VECTOR_ELT: - return performExtractVectorEltCombine(N, DAG); - case ISD::VECREDUCE_ADD: - return performVecReduceAddCombine(N, DCI.DAG, Subtarget); + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DAG); + case ISD::VECREDUCE_ADD: + return performVecReduceAddCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -15811,10 +15811,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); EVT ResVT = N->getValueType(0); - uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue(); - 
SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL); + uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue(); + SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL); SDValue Val = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx); + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx); return DAG.getMergeValues({Val, Chain}, DL); } case Intrinsic::aarch64_sve_tuple_set: { @@ -15825,11 +15825,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, SDValue Vec = N->getOperand(4); EVT TupleVT = Tuple.getValueType(); - uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue(); + uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue(); uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue(); - uint64_t NumLanes = - Vec.getValueType().getVectorElementCount().getKnownMinValue(); + uint64_t NumLanes = + Vec.getValueType().getVectorElementCount().getKnownMinValue(); if ((TupleLanes % NumLanes) != 0) report_fatal_error("invalid tuple vector!"); @@ -15841,9 +15841,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, if (I == IdxConst) Opnds.push_back(Vec); else { - SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL); - Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, - Vec.getValueType(), Tuple, ExtIdx)); + SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL); + Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, + Vec.getValueType(), Tuple, ExtIdx)); } } SDValue Concat = @@ -16065,7 +16065,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults( ElementCount ResEC = VT.getVectorElementCount(); - if (InVT.getVectorElementCount() != (ResEC * 2)) + if (InVT.getVectorElementCount() != (ResEC * 2)) return; auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1)); @@ -16073,7 +16073,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults( return; unsigned Index = CIndex->getZExtValue(); - if ((Index != 0) && (Index != ResEC.getKnownMinValue())) + if ((Index != 0) && (Index != ResEC.getKnownMinValue())) return; unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; @@ -16108,7 +16108,7 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, assert(N->getValueType(0) == MVT::i128 && "AtomicCmpSwap on types less than 128 should be legal"); - if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { + if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. SDValue Ops[] = { @@ -16189,8 +16189,8 @@ void AArch64TargetLowering::ReplaceNodeResults( return; case ISD::CTPOP: - if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG)) - Results.push_back(Result); + if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG)) + Results.push_back(Result); return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); @@ -16335,44 +16335,44 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size > 128) return AtomicExpansionKind::None; - - // Nand is not supported in LSE. - // Leave 128 bits to LLSC or CmpXChg. - if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { - if (Subtarget->hasLSE()) - return AtomicExpansionKind::None; - if (Subtarget->outlineAtomics()) { - // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far. 
- // Don't outline them unless - // (1) high level <atomic> support approved: - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf - // (2) low level libgcc and compiler-rt support implemented by: - // min/max outline atomics helpers - if (AI->getOperation() != AtomicRMWInst::Min && - AI->getOperation() != AtomicRMWInst::Max && - AI->getOperation() != AtomicRMWInst::UMin && - AI->getOperation() != AtomicRMWInst::UMax) { - return AtomicExpansionKind::None; - } - } - } - - // At -O0, fast-regalloc cannot cope with the live vregs necessary to - // implement atomicrmw without spilling. If the target address is also on the - // stack and close enough to the spill slot, this can lead to a situation - // where the monitor always gets cleared and the atomic operation can never - // succeed. So at -O0 lower this operation to a CAS loop. - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) - return AtomicExpansionKind::CmpXChg; - - return AtomicExpansionKind::LLSC; + + // Nand is not supported in LSE. + // Leave 128 bits to LLSC or CmpXChg. + if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { + if (Subtarget->hasLSE()) + return AtomicExpansionKind::None; + if (Subtarget->outlineAtomics()) { + // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far. + // Don't outline them unless + // (1) high level <atomic> support approved: + // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf + // (2) low level libgcc and compiler-rt support implemented by: + // min/max outline atomics helpers + if (AI->getOperation() != AtomicRMWInst::Min && + AI->getOperation() != AtomicRMWInst::Max && + AI->getOperation() != AtomicRMWInst::UMin && + AI->getOperation() != AtomicRMWInst::UMax) { + return AtomicExpansionKind::None; + } + } + } + + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement atomicrmw without spilling. If the target address is also on the + // stack and close enough to the spill slot, this can lead to a situation + // where the monitor always gets cleared and the atomic operation can never + // succeed. So at -O0 lower this operation to a CAS loop. + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + return AtomicExpansionKind::CmpXChg; + + return AtomicExpansionKind::LLSC; } TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { // If subtarget has LSE, leave cmpxchg intact for codegen. - if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) + if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) return AtomicExpansionKind::None; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also @@ -16883,92 +16883,92 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( Store->isTruncatingStore()); } -SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( - SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); - - bool Signed = Op.getOpcode() == ISD::SDIV; - unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; - - // Scalable vector i32/i64 DIV is supported. - if (EltVT == MVT::i32 || EltVT == MVT::i64) - return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); - - // Scalable vector i8/i16 DIV is not supported. Promote it to i32. 
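The shouldExpandAtomicRMWInIR hunk above encodes a small decision table. A compact restatement of that policy as standalone C++ (a sketch only; the enum names mirror the LLVM ones but nothing here is the real API):

#include <cstdint>

// Sketch of the atomicrmw expansion policy in the hunk above.
enum class ExpansionKind { None, LLSC, CmpXChg };
enum class RMWOp { Nand, Min, Max, UMin, UMax, Other };

static ExpansionKind expandAtomicRMW(RMWOp Op, unsigned SizeInBits, bool HasLSE,
                                     bool OutlineAtomics, bool OptNone) {
  if (SizeInBits > 128)
    return ExpansionKind::None;                // too wide to expand here

  // Nand has no LSE encoding; 128-bit operations fall through to LL/SC or CAS.
  if (Op != RMWOp::Nand && SizeInBits < 128) {
    if (HasLSE)
      return ExpansionKind::None;              // native LSE instruction
    if (OutlineAtomics &&
        Op != RMWOp::Min && Op != RMWOp::Max &&
        Op != RMWOp::UMin && Op != RMWOp::UMax)
      return ExpansionKind::None;              // outlined runtime helper
  }

  // At -O0 a CAS loop avoids the live-register pressure that breaks LL/SC.
  return OptNone ? ExpansionKind::CmpXChg : ExpansionKind::LLSC;
}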
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); - EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); - EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT); - - // Convert the operands to scalable vectors. - SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); - SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); - - // Extend the scalable operands. - unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; - unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; - SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0); - SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1); - SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0); - SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1); - - // Convert back to fixed vectors so the DIV can be further lowered. - Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo); - Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo); - Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi); - Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi); - SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, - Op0Lo, Op1Lo); - SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, - Op0Hi, Op1Hi); - - // Convert again to scalable vectors to truncate. - ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo); - ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi); - SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT, - ResultLo, ResultHi); - - return convertFromScalableVector(DAG, VT, ScalableResult); -} - -SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( - SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); - - SDLoc DL(Op); - SDValue Val = Op.getOperand(0); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); - Val = convertToScalableVector(DAG, ContainerVT, Val); - - bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND; - unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; - - // Repeatedly unpack Val until the result is of the desired element type. - switch (ContainerVT.getSimpleVT().SimpleTy) { - default: - llvm_unreachable("unimplemented container type"); - case MVT::nxv16i8: - Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val); - if (VT.getVectorElementType() == MVT::i16) - break; - LLVM_FALLTHROUGH; - case MVT::nxv8i16: - Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val); - if (VT.getVectorElementType() == MVT::i32) - break; - LLVM_FALLTHROUGH; - case MVT::nxv4i32: - Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val); - assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!"); - break; - } - - return convertFromScalableVector(DAG, VT, Val); -} - +SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( + SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT EltVT = VT.getVectorElementType(); + + bool Signed = Op.getOpcode() == ISD::SDIV; + unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; + + // Scalable vector i32/i64 DIV is supported. 
+ if (EltVT == MVT::i32 || EltVT == MVT::i64) + return LowerToPredicatedOp(Op, DAG, PredOpcode, /*OverrideNEON=*/true); + + // Scalable vector i8/i16 DIV is not supported. Promote it to i32. + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext()); + EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT); + + // Convert the operands to scalable vectors. + SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); + SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); + + // Extend the scalable operands. + unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; + unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; + SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0); + SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1); + SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0); + SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1); + + // Convert back to fixed vectors so the DIV can be further lowered. + Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo); + Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo); + Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi); + Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi); + SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, + Op0Lo, Op1Lo); + SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT, + Op0Hi, Op1Hi); + + // Convert again to scalable vectors to truncate. + ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo); + ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi); + SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT, + ResultLo, ResultHi); + + return convertFromScalableVector(DAG, VT, ScalableResult); +} + +SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( + SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); + + SDLoc DL(Op); + SDValue Val = Op.getOperand(0); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType()); + Val = convertToScalableVector(DAG, ContainerVT, Val); + + bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND; + unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; + + // Repeatedly unpack Val until the result is of the desired element type. 
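The divide lowering above widens i8/i16 element divisions to i32 (UNPKLO/UNPKHI to extend, UZP1 to narrow the results back). Per lane, the arithmetic it implements is just this, shown as a scalar sketch in plain C++ (illustrative only):

#include <cstdint>

// One lane of the promoted i8 SDIV: extend both operands to 32 bits, divide,
// then truncate the quotient back to the original element width.
// Assumes B != 0.
static int8_t sdiv_i8_via_i32(int8_t A, int8_t B) {
  int32_t Wide = static_cast<int32_t>(A) / static_cast<int32_t>(B);
  return static_cast<int8_t>(Wide);
}

// The unsigned path (UUNPKLO/UUNPKHI) has the same shape with zero extension.
static uint8_t udiv_i8_via_i32(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>(static_cast<uint32_t>(A) / static_cast<uint32_t>(B));
}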
+ switch (ContainerVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("unimplemented container type"); + case MVT::nxv16i8: + Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val); + if (VT.getVectorElementType() == MVT::i16) + break; + LLVM_FALLTHROUGH; + case MVT::nxv8i16: + Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val); + if (VT.getVectorElementType() == MVT::i32) + break; + LLVM_FALLTHROUGH; + case MVT::nxv4i32: + Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val); + assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!"); + break; + } + + return convertFromScalableVector(DAG, VT, Val); +} + SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -17005,21 +17005,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( return convertFromScalableVector(DAG, VT, Val); } -// Convert vector operation 'Op' to an equivalent predicated operation whereby -// the original operation's type is used to construct a suitable predicate. -// NOTE: The results for inactive lanes are undefined. +// Convert vector operation 'Op' to an equivalent predicated operation whereby +// the original operation's type is used to construct a suitable predicate. +// NOTE: The results for inactive lanes are undefined. SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, - unsigned NewOp, - bool OverrideNEON) const { + unsigned NewOp, + bool OverrideNEON) const { EVT VT = Op.getValueType(); SDLoc DL(Op); auto Pg = getPredicateForVector(DAG, DL, VT); - if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) { + if (useSVEForFixedLengthVectorVT(VT, OverrideNEON)) { EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - // Create list of operands by converting existing ones to scalable types. + // Create list of operands by converting existing ones to scalable types. 
SmallVector<SDValue, 4> Operands = {Pg}; for (const SDValue &V : Op->op_values()) { if (isa<CondCodeSDNode>(V)) { @@ -17027,21 +17027,21 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, continue; } - if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) { - EVT VTArg = VTNode->getVT().getVectorElementType(); - EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg); - Operands.push_back(DAG.getValueType(NewVTArg)); - continue; - } - - assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) && + if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) { + EVT VTArg = VTNode->getVT().getVectorElementType(); + EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg); + Operands.push_back(DAG.getValueType(NewVTArg)); + continue; + } + + assert(useSVEForFixedLengthVectorVT(V.getValueType(), OverrideNEON) && "Only fixed length vectors are supported!"); Operands.push_back(convertToScalableVector(DAG, ContainerVT, V)); } - if (isMergePassthruOpcode(NewOp)) - Operands.push_back(DAG.getUNDEF(ContainerVT)); - + if (isMergePassthruOpcode(NewOp)) + Operands.push_back(DAG.getUNDEF(ContainerVT)); + auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands); return convertFromScalableVector(DAG, VT, ScalableRes); } @@ -17050,228 +17050,228 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, SmallVector<SDValue, 4> Operands = {Pg}; for (const SDValue &V : Op->op_values()) { - assert((!V.getValueType().isVector() || - V.getValueType().isScalableVector()) && + assert((!V.getValueType().isVector() || + V.getValueType().isScalableVector()) && "Only scalable vectors are supported!"); Operands.push_back(V); } - if (isMergePassthruOpcode(NewOp)) - Operands.push_back(DAG.getUNDEF(VT)); - + if (isMergePassthruOpcode(NewOp)) + Operands.push_back(DAG.getUNDEF(VT)); + return DAG.getNode(NewOp, DL, VT, Operands); } - -// If a fixed length vector operation has no side effects when applied to -// undefined elements, we can safely use scalable vectors to perform the same -// operation without needing to worry about predication. -SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - assert(useSVEForFixedLengthVectorVT(VT) && - "Only expected to lower fixed length vector operation!"); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - - // Create list of operands by converting existing ones to scalable types. - SmallVector<SDValue, 4> Ops; - for (const SDValue &V : Op->op_values()) { - assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!"); - - // Pass through non-vector operands. - if (!V.getValueType().isVector()) { - Ops.push_back(V); - continue; - } - - // "cast" fixed length vector to a scalable vector. 
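The comment above LowerToScalableOp captures the key idea: when an operation has no side effects on undefined elements, a fixed-length vector can simply be dropped into a wider scalable container, computed on all lanes, and read back without any predication. A conceptual model of that trick (plain C++, lane counts chosen arbitrarily for illustration):

#include <array>
#include <cstddef>

// A 4-lane fixed-length add executed on a wider "container" register. Lanes
// past the original width hold arbitrary data; because addition has no side
// effects, computing them is harmless and only the first FixedLanes lanes are
// read back.
template <size_t FixedLanes, size_t ContainerLanes>
std::array<int, FixedLanes> addViaContainer(const std::array<int, FixedLanes> &A,
                                            const std::array<int, FixedLanes> &B) {
  static_assert(ContainerLanes >= FixedLanes, "container must be at least as wide");
  int WideA[ContainerLanes] = {}, WideB[ContainerLanes] = {}, WideR[ContainerLanes];
  for (size_t I = 0; I < FixedLanes; ++I) { WideA[I] = A[I]; WideB[I] = B[I]; }
  for (size_t I = 0; I < ContainerLanes; ++I)
    WideR[I] = WideA[I] + WideB[I];   // all lanes computed, extras ignored
  std::array<int, FixedLanes> R;
  for (size_t I = 0; I < FixedLanes; ++I) R[I] = WideR[I];
  return R;
}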
- assert(useSVEForFixedLengthVectorVT(V.getValueType()) && - "Only fixed length vectors are supported!"); - Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); - } - - auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops); - return convertFromScalableVector(DAG, VT, ScalableRes); -} - -SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, - SelectionDAG &DAG) const { - SDLoc DL(ScalarOp); - SDValue AccOp = ScalarOp.getOperand(0); - SDValue VecOp = ScalarOp.getOperand(1); - EVT SrcVT = VecOp.getValueType(); - EVT ResVT = SrcVT.getVectorElementType(); - - EVT ContainerVT = SrcVT; - if (SrcVT.isFixedLengthVector()) { - ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); - VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); - } - - SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); - SDValue Zero = DAG.getConstant(0, DL, MVT::i64); - - // Convert operands to Scalable. - AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, - DAG.getUNDEF(ContainerVT), AccOp, Zero); - - // Perform reduction. - SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT, - Pg, AccOp, VecOp); - - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero); -} - -SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, - SelectionDAG &DAG) const { - SDLoc DL(ReduceOp); - SDValue Op = ReduceOp.getOperand(0); - EVT OpVT = Op.getValueType(); - EVT VT = ReduceOp.getValueType(); - - if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) - return SDValue(); - - SDValue Pg = getPredicateForVector(DAG, DL, OpVT); - - switch (ReduceOp.getOpcode()) { - default: - return SDValue(); - case ISD::VECREDUCE_OR: - return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE); - case ISD::VECREDUCE_AND: { - Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg); - return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE); - } - case ISD::VECREDUCE_XOR: { - SDValue ID = - DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64); - SDValue Cntp = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op); - return DAG.getAnyExtOrTrunc(Cntp, DL, VT); - } - } - - return SDValue(); -} - -SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, - SDValue ScalarOp, - SelectionDAG &DAG) const { - SDLoc DL(ScalarOp); - SDValue VecOp = ScalarOp.getOperand(0); - EVT SrcVT = VecOp.getValueType(); - - if (useSVEForFixedLengthVectorVT(SrcVT, true)) { - EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); - VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); - } - - // UADDV always returns an i64 result. - EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 : - SrcVT.getVectorElementType(); - EVT RdxVT = SrcVT; - if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED) - RdxVT = getPackedSVEVectorVT(ResVT); - - SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); - SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp); - SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, - Rdx, DAG.getConstant(0, DL, MVT::i64)); - - // The VEC_REDUCE nodes expect an element size result. 
- if (ResVT != ScalarOp.getValueType()) - Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType()); - - return Res; -} - -SDValue -AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - EVT InVT = Op.getOperand(1).getValueType(); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); - SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1)); - SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2)); - - // Convert the mask to a predicated (NOTE: We don't need to worry about - // inactive lanes since VSELECT is safe when given undefined elements). - EVT MaskVT = Op.getOperand(0).getValueType(); - EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT); - auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0)); - Mask = DAG.getNode(ISD::TRUNCATE, DL, - MaskContainerVT.changeVectorElementType(MVT::i1), Mask); - - auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT, - Mask, Op1, Op2); - - return convertFromScalableVector(DAG, VT, ScalableRes); -} - -SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE( - SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT InVT = Op.getOperand(0).getValueType(); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); - - assert(useSVEForFixedLengthVectorVT(InVT) && - "Only expected to lower fixed length vector operation!"); - assert(Op.getValueType() == InVT.changeTypeToInteger() && - "Expected integer result of the same bit length as the inputs!"); - - // Expand floating point vector comparisons. - if (InVT.isFloatingPoint()) - return SDValue(); - - auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); - auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); - auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); - - EVT CmpVT = Pg.getValueType(); - auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, - {Pg, Op1, Op2, Op.getOperand(2)}); - - EVT PromoteVT = ContainerVT.changeTypeToInteger(); - auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT); - return convertFromScalableVector(DAG, Op.getValueType(), Promote); -} - -SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT InVT = Op.getValueType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - (void)TLI; - - assert(VT.isScalableVector() && TLI.isTypeLegal(VT) && - InVT.isScalableVector() && TLI.isTypeLegal(InVT) && - "Only expect to cast between legal scalable vector types!"); - assert((VT.getVectorElementType() == MVT::i1) == - (InVT.getVectorElementType() == MVT::i1) && - "Cannot cast between data and predicate scalable vector types!"); - - if (InVT == VT) - return Op; - - if (VT.getVectorElementType() == MVT::i1) - return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); - - EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); - EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); - assert((VT == PackedVT || InVT == PackedInVT) && - "Cannot cast between unpacked scalable vector types!"); - - // Pack input if required. - if (InVT != PackedInVT) - Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); - - Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); - - // Unpack result if required. 
- if (VT != PackedVT) - Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); - - return Op; -} + +// If a fixed length vector operation has no side effects when applied to +// undefined elements, we can safely use scalable vectors to perform the same +// operation without needing to worry about predication. +SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + assert(useSVEForFixedLengthVectorVT(VT) && + "Only expected to lower fixed length vector operation!"); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + + // Create list of operands by converting existing ones to scalable types. + SmallVector<SDValue, 4> Ops; + for (const SDValue &V : Op->op_values()) { + assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!"); + + // Pass through non-vector operands. + if (!V.getValueType().isVector()) { + Ops.push_back(V); + continue; + } + + // "cast" fixed length vector to a scalable vector. + assert(useSVEForFixedLengthVectorVT(V.getValueType()) && + "Only fixed length vectors are supported!"); + Ops.push_back(convertToScalableVector(DAG, ContainerVT, V)); + } + + auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops); + return convertFromScalableVector(DAG, VT, ScalableRes); +} + +SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, + SelectionDAG &DAG) const { + SDLoc DL(ScalarOp); + SDValue AccOp = ScalarOp.getOperand(0); + SDValue VecOp = ScalarOp.getOperand(1); + EVT SrcVT = VecOp.getValueType(); + EVT ResVT = SrcVT.getVectorElementType(); + + EVT ContainerVT = SrcVT; + if (SrcVT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); + VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); + } + + SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + + // Convert operands to Scalable. + AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), AccOp, Zero); + + // Perform reduction. 
+ SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT, + Pg, AccOp, VecOp); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero); +} + +SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, + SelectionDAG &DAG) const { + SDLoc DL(ReduceOp); + SDValue Op = ReduceOp.getOperand(0); + EVT OpVT = Op.getValueType(); + EVT VT = ReduceOp.getValueType(); + + if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue Pg = getPredicateForVector(DAG, DL, OpVT); + + switch (ReduceOp.getOpcode()) { + default: + return SDValue(); + case ISD::VECREDUCE_OR: + return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE); + case ISD::VECREDUCE_AND: { + Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg); + return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE); + } + case ISD::VECREDUCE_XOR: { + SDValue ID = + DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64); + SDValue Cntp = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op); + return DAG.getAnyExtOrTrunc(Cntp, DL, VT); + } + } + + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, + SDValue ScalarOp, + SelectionDAG &DAG) const { + SDLoc DL(ScalarOp); + SDValue VecOp = ScalarOp.getOperand(0); + EVT SrcVT = VecOp.getValueType(); + + if (useSVEForFixedLengthVectorVT(SrcVT, true)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); + VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); + } + + // UADDV always returns an i64 result. + EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 : + SrcVT.getVectorElementType(); + EVT RdxVT = SrcVT; + if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED) + RdxVT = getPackedSVEVectorVT(ResVT); + + SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); + SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp); + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, + Rdx, DAG.getConstant(0, DL, MVT::i64)); + + // The VEC_REDUCE nodes expect an element size result. + if (ResVT != ScalarOp.getValueType()) + Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType()); + + return Res; +} + +SDValue +AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + EVT InVT = Op.getOperand(1).getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1)); + SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2)); + + // Convert the mask to a predicated (NOTE: We don't need to worry about + // inactive lanes since VSELECT is safe when given undefined elements). 
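For the predicate (i1 vector) reductions lowered in the hunk above, the scalar semantics being matched are simple. A plain-C++ reading, assuming an all-true governing predicate (illustrative only):

#include <cstddef>

// VECREDUCE_OR  -> PTEST ANY_ACTIVE:           true if any lane is set.
// VECREDUCE_AND -> XOR with Pg, PTEST NONE_ACTIVE: true if no lane is clear.
// VECREDUCE_XOR -> CNTP then truncate to i1:   parity of the set-lane count.
static bool reduceOr(const bool *P, size_t N) {
  for (size_t I = 0; I < N; ++I)
    if (P[I]) return true;
  return false;
}
static bool reduceAnd(const bool *P, size_t N) {
  for (size_t I = 0; I < N; ++I)
    if (!P[I]) return false;
  return true;
}
static bool reduceXor(const bool *P, size_t N) {
  size_t SetLanes = 0;
  for (size_t I = 0; I < N; ++I)
    SetLanes += P[I] ? 1 : 0;
  return (SetLanes & 1) != 0;
}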
+ EVT MaskVT = Op.getOperand(0).getValueType(); + EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT); + auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0)); + Mask = DAG.getNode(ISD::TRUNCATE, DL, + MaskContainerVT.changeVectorElementType(MVT::i1), Mask); + + auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT, + Mask, Op1, Op2); + + return convertFromScalableVector(DAG, VT, ScalableRes); +} + +SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE( + SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT InVT = Op.getOperand(0).getValueType(); + EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + + assert(useSVEForFixedLengthVectorVT(InVT) && + "Only expected to lower fixed length vector operation!"); + assert(Op.getValueType() == InVT.changeTypeToInteger() && + "Expected integer result of the same bit length as the inputs!"); + + // Expand floating point vector comparisons. + if (InVT.isFloatingPoint()) + return SDValue(); + + auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); + auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1)); + auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); + + EVT CmpVT = Pg.getValueType(); + auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, + {Pg, Op1, Op2, Op.getOperand(2)}); + + EVT PromoteVT = ContainerVT.changeTypeToInteger(); + auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT); + return convertFromScalableVector(DAG, Op.getValueType(), Promote); +} + +SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT InVT = Op.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + (void)TLI; + + assert(VT.isScalableVector() && TLI.isTypeLegal(VT) && + InVT.isScalableVector() && TLI.isTypeLegal(InVT) && + "Only expect to cast between legal scalable vector types!"); + assert((VT.getVectorElementType() == MVT::i1) == + (InVT.getVectorElementType() == MVT::i1) && + "Cannot cast between data and predicate scalable vector types!"); + + if (InVT == VT) + return Op; + + if (VT.getVectorElementType() == MVT::i1) + return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); + + EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType()); + EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType()); + assert((VT == PackedVT || InVT == PackedInVT) && + "Cannot cast between unpacked scalable vector types!"); + + // Pack input if required. + if (InVT != PackedInVT) + Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op); + + Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op); + + // Unpack result if required. + if (VT != PackedVT) + Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op); + + return Op; +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.h index 9550197159..535aa519f7 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64ISelLowering.h @@ -72,51 +72,51 @@ enum NodeType : unsigned { ADC, SBC, // adc, sbc instructions - // Predicated instructions where inactive lanes produce undefined results. + // Predicated instructions where inactive lanes produce undefined results. 
ADD_PRED, FADD_PRED, - FDIV_PRED, - FMA_PRED, - FMAXNM_PRED, - FMINNM_PRED, - FMUL_PRED, - FSUB_PRED, - MUL_PRED, + FDIV_PRED, + FMA_PRED, + FMAXNM_PRED, + FMINNM_PRED, + FMUL_PRED, + FSUB_PRED, + MUL_PRED, SDIV_PRED, - SHL_PRED, - SMAX_PRED, - SMIN_PRED, - SRA_PRED, - SRL_PRED, - SUB_PRED, + SHL_PRED, + SMAX_PRED, + SMIN_PRED, + SRA_PRED, + SRL_PRED, + SUB_PRED, UDIV_PRED, - UMAX_PRED, - UMIN_PRED, - - // Predicated instructions with the result of inactive lanes provided by the - // last operand. - FABS_MERGE_PASSTHRU, - FCEIL_MERGE_PASSTHRU, - FFLOOR_MERGE_PASSTHRU, - FNEARBYINT_MERGE_PASSTHRU, - FNEG_MERGE_PASSTHRU, - FRECPX_MERGE_PASSTHRU, - FRINT_MERGE_PASSTHRU, - FROUND_MERGE_PASSTHRU, - FROUNDEVEN_MERGE_PASSTHRU, - FSQRT_MERGE_PASSTHRU, - FTRUNC_MERGE_PASSTHRU, - FP_ROUND_MERGE_PASSTHRU, - FP_EXTEND_MERGE_PASSTHRU, - UINT_TO_FP_MERGE_PASSTHRU, - SINT_TO_FP_MERGE_PASSTHRU, - FCVTZU_MERGE_PASSTHRU, - FCVTZS_MERGE_PASSTHRU, - SIGN_EXTEND_INREG_MERGE_PASSTHRU, - ZERO_EXTEND_INREG_MERGE_PASSTHRU, - ABS_MERGE_PASSTHRU, - NEG_MERGE_PASSTHRU, - + UMAX_PRED, + UMIN_PRED, + + // Predicated instructions with the result of inactive lanes provided by the + // last operand. + FABS_MERGE_PASSTHRU, + FCEIL_MERGE_PASSTHRU, + FFLOOR_MERGE_PASSTHRU, + FNEARBYINT_MERGE_PASSTHRU, + FNEG_MERGE_PASSTHRU, + FRECPX_MERGE_PASSTHRU, + FRINT_MERGE_PASSTHRU, + FROUND_MERGE_PASSTHRU, + FROUNDEVEN_MERGE_PASSTHRU, + FSQRT_MERGE_PASSTHRU, + FTRUNC_MERGE_PASSTHRU, + FP_ROUND_MERGE_PASSTHRU, + FP_EXTEND_MERGE_PASSTHRU, + UINT_TO_FP_MERGE_PASSTHRU, + SINT_TO_FP_MERGE_PASSTHRU, + FCVTZU_MERGE_PASSTHRU, + FCVTZS_MERGE_PASSTHRU, + SIGN_EXTEND_INREG_MERGE_PASSTHRU, + ZERO_EXTEND_INREG_MERGE_PASSTHRU, + ABS_MERGE_PASSTHRU, + NEG_MERGE_PASSTHRU, + SETCC_MERGE_ZERO, // Arithmetic instructions which write flags. @@ -219,18 +219,18 @@ enum NodeType : unsigned { SADDV, UADDV, - // Vector halving addition - SHADD, - UHADD, - + // Vector halving addition + SHADD, + UHADD, + // Vector rounding halving addition SRHADD, URHADD, - // Absolute difference - UABD, - SABD, - + // Absolute difference + UABD, + SABD, + // Vector across-lanes min/max // Only the lower result lane is defined. SMINV, @@ -238,8 +238,8 @@ enum NodeType : unsigned { SMAXV, UMAXV, - SADDV_PRED, - UADDV_PRED, + SADDV_PRED, + UADDV_PRED, SMAXV_PRED, UMAXV_PRED, SMINV_PRED, @@ -307,14 +307,14 @@ enum NodeType : unsigned { PTEST, PTRUE, - BITREVERSE_MERGE_PASSTHRU, - BSWAP_MERGE_PASSTHRU, - CTLZ_MERGE_PASSTHRU, - CTPOP_MERGE_PASSTHRU, + BITREVERSE_MERGE_PASSTHRU, + BSWAP_MERGE_PASSTHRU, + CTLZ_MERGE_PASSTHRU, + CTPOP_MERGE_PASSTHRU, DUP_MERGE_PASSTHRU, INDEX_VECTOR, - // Cast between vectors of the same element type but differ in length. + // Cast between vectors of the same element type but differ in length. REINTERPRET_CAST, LD1_MERGE_ZERO, @@ -424,11 +424,11 @@ enum NodeType : unsigned { LDP, STP, - STNP, - - // Pseudo for a OBJC call that gets emitted together with a special `mov - // x29, x29` marker instruction. - CALL_RVMARKER + STNP, + + // Pseudo for a OBJC call that gets emitted together with a special `mov + // x29, x29` marker instruction. + CALL_RVMARKER }; } // end namespace AArch64ISD @@ -438,14 +438,14 @@ namespace { // Any instruction that defines a 32-bit result zeros out the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may // be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. 
AssertSext/AssertZext aren't saying anything about the upper -// 32 bits, they're probably just qualifying a CopyFromReg. +// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper +// 32 bits, they're probably just qualifying a CopyFromReg. // FIXME: X86 also checks for CMOV here. Do we need something similar? static inline bool isDef32(const SDNode &N) { unsigned Opc = N.getOpcode(); return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && - Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && - Opc != ISD::AssertZext; + Opc != ISD::CopyFromReg && Opc != ISD::AssertSext && + Opc != ISD::AssertZext; } } // end anonymous namespace @@ -784,7 +784,7 @@ public: /// illegal as the original, thus leading to an infinite legalisation loop. /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal /// vector types this override can be removed. - bool mergeStoresAfterLegalization(EVT VT) const override; + bool mergeStoresAfterLegalization(EVT VT) const override; private: /// Keep a pointer to the AArch64Subtarget around so that we can @@ -815,11 +815,11 @@ private: SDValue ThisVal) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( @@ -903,28 +903,28 @@ private: SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp, - bool OverrideNEON = false) const; - SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG, unsigned NewOp, + bool OverrideNEON = false) const; + SDValue LowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG 
&DAG) const; SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; @@ -939,17 +939,17 @@ private: SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps, EVT VT, SelectionDAG &DAG, const SDLoc &DL) const; - SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, - SelectionDAG &DAG) const; - SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op, - SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, + SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op, + SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const; - SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const; - SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp, - SelectionDAG &DAG) const; - SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const; + SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const; + SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp, + SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op, SelectionDAG &DAG) const; @@ -961,10 +961,10 @@ private: bool Reciprocal) const override; SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps) const override; - SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, - const DenormalMode &Mode) const override; - SDValue getSqrtResultForDenormInput(SDValue Operand, - SelectionDAG &DAG) const override; + SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, + const DenormalMode &Mode) const override; + SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; @@ -996,7 +996,7 @@ private: return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } - bool shouldRemoveExtendFromGSIndex(EVT VT) const override; + bool shouldRemoveExtendFromGSIndex(EVT VT) const override; bool isVectorLoadExtDesirable(SDValue ExtVal) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; @@ -1023,21 +1023,21 @@ private: bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const override; - // Normally SVE is only used for byte size vectors that do not fit within a - // NEON vector. This changes when OverrideNEON is true, allowing SVE to be - // used for 64bit and 128bit vectors as well. 
- bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const; - - // With the exception of data-predicate transitions, no instructions are - // required to cast between legal scalable vector types. However: - // 1. Packed and unpacked types have different bit lengths, meaning BITCAST - // is not universally useable. - // 2. Most unpacked integer types are not legal and thus integer extends - // cannot be used to convert between unpacked and packed types. - // These can make "bitcasting" a multiphase process. REINTERPRET_CAST is used - // to transition between unpacked and packed types of the same element type, - // with BITCAST used otherwise. - SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; + // Normally SVE is only used for byte size vectors that do not fit within a + // NEON vector. This changes when OverrideNEON is true, allowing SVE to be + // used for 64bit and 128bit vectors as well. + bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const; + + // With the exception of data-predicate transitions, no instructions are + // required to cast between legal scalable vector types. However: + // 1. Packed and unpacked types have different bit lengths, meaning BITCAST + // is not universally useable. + // 2. Most unpacked integer types are not legal and thus integer extends + // cannot be used to convert between unpacked and packed types. + // These can make "bitcasting" a multiphase process. REINTERPRET_CAST is used + // to transition between unpacked and packed types of the same element type, + // with BITCAST used otherwise. + SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; }; namespace AArch64 { diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrFormats.td index cf08f56e5b..eb03fce945 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrFormats.td @@ -60,14 +60,14 @@ class AArch64Inst<Format f, string cstr> : Instruction { bits<2> Form = F.Value; // Defaults - bit isWhile = 0; - bit isPTestLike = 0; + bit isWhile = 0; + bit isPTestLike = 0; FalseLanesEnum FalseLanes = FalseLanesNone; DestructiveInstTypeEnum DestructiveInstType = NotDestructive; ElementSizeEnum ElementSize = ElementSizeNone; - let TSFlags{10} = isPTestLike; - let TSFlags{9} = isWhile; + let TSFlags{10} = isPTestLike; + let TSFlags{9} = isWhile; let TSFlags{8-7} = FalseLanes.Value; let TSFlags{6-3} = DestructiveInstType.Value; let TSFlags{2-0} = ElementSize.Value; @@ -267,7 +267,7 @@ def adrplabel : Operand<i64> { let EncoderMethod = "getAdrLabelOpValue"; let PrintMethod = "printAdrpLabel"; let ParserMatchClass = AdrpOperand; - let OperandType = "OPERAND_PCREL"; + let OperandType = "OPERAND_PCREL"; } def AdrOperand : AsmOperandClass { @@ -330,7 +330,7 @@ def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> { } def SImm8Operand : SImmOperand<8>; -def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 128; }]> { +def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 128; }]> { let ParserMatchClass = SImm8Operand; let DecoderMethod = "DecodeSImm<8>"; } @@ -919,13 +919,13 @@ def imm0_1 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_1Operand; } -// timm0_1 - as above, but use TargetConstant (TImmLeaf) -def timm0_1 : Operand<i64>, TImmLeaf<i64, [{ - return ((uint64_t)Imm) < 2; -}]> { - let ParserMatchClass = Imm0_1Operand; 
-} - +// timm0_1 - as above, but use TargetConstant (TImmLeaf) +def timm0_1 : Operand<i64>, TImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) < 16; @@ -1301,9 +1301,9 @@ class SimpleSystemI<bit L, dag iops, string asm, string operands, } // System instructions which have an Rt register. -class RtSystemI<bit L, dag oops, dag iops, string asm, string operands, - list<dag> pattern = []> - : BaseSystemI<L, oops, iops, asm, operands, pattern>, +class RtSystemI<bit L, dag oops, dag iops, string asm, string operands, + list<dag> pattern = []> + : BaseSystemI<L, oops, iops, asm, operands, pattern>, Sched<[WriteSys]> { bits<5> Rt; let Inst{4-0} = Rt; @@ -1331,16 +1331,16 @@ class TMSystemI<bits<4> CRm, string asm, list<dag> pattern> let Inst{4-0} = Rt; } -// System instructions that pass a register argument -// This class assumes the register is for input rather than output. -class RegInputSystemI<bits<4> CRm, bits<3> Op2, string asm, - list<dag> pattern = []> - : RtSystemI<0, (outs), (ins GPR64:$Rt), asm, "\t$Rt", pattern> { - let Inst{20-12} = 0b000110001; - let Inst{11-8} = CRm; - let Inst{7-5} = Op2; -} - +// System instructions that pass a register argument +// This class assumes the register is for input rather than output. +class RegInputSystemI<bits<4> CRm, bits<3> Op2, string asm, + list<dag> pattern = []> + : RtSystemI<0, (outs), (ins GPR64:$Rt), asm, "\t$Rt", pattern> { + let Inst{20-12} = 0b000110001; + let Inst{11-8} = CRm; + let Inst{7-5} = Op2; +} + // System instructions for transactional memory - no operand class TMSystemINoOperand<bits<4> CRm, string asm, list<dag> pattern> : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> { @@ -1381,14 +1381,14 @@ def barrier_op : Operand<i32> { let PrintMethod = "printBarrierOption"; let ParserMatchClass = BarrierAsmOperand; } -def BarriernXSAsmOperand : AsmOperandClass { - let Name = "BarriernXS"; - let ParserMethod = "tryParseBarriernXSOperand"; -} -def barrier_nxs_op : Operand<i32> { - let PrintMethod = "printBarriernXSOption"; - let ParserMatchClass = BarriernXSAsmOperand; -} +def BarriernXSAsmOperand : AsmOperandClass { + let Name = "BarriernXS"; + let ParserMethod = "tryParseBarriernXSOperand"; +} +def barrier_nxs_op : Operand<i32> { + let PrintMethod = "printBarriernXSOption"; + let ParserMatchClass = BarriernXSAsmOperand; +} class CRmSystemI<Operand crmtype, bits<3> opc, string asm, list<dag> pattern = []> : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>, @@ -1470,7 +1470,7 @@ class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; let Inst{20-5} = systemreg; - let DecoderNamespace = "Fallback"; + let DecoderNamespace = "Fallback"; } // FIXME: Some of these def NZCV, others don't. Best way to model that? 
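The TSFlags assignments near the top of this AArch64InstrFormats.td hunk pack several per-instruction properties into one bitfield. A hypothetical C++ decoder for that layout (the struct and field names are illustrative; the bit positions follow the `let TSFlags{...}` lines above):

#include <cstdint>

// Bit layout from the tablegen above:
//   {2-0} ElementSize, {6-3} DestructiveInstType, {8-7} FalseLanes,
//   {9} isWhile, {10} isPTestLike.
struct DecodedTSFlags {
  unsigned ElementSize;         // bits 2-0
  unsigned DestructiveInstType; // bits 6-3
  unsigned FalseLanes;          // bits 8-7
  bool IsWhile;                 // bit 9
  bool IsPTestLike;             // bit 10
};

static DecodedTSFlags decodeTSFlags(uint64_t Flags) {
  DecodedTSFlags D;
  D.ElementSize = Flags & 0x7;
  D.DestructiveInstType = (Flags >> 3) & 0xF;
  D.FalseLanes = (Flags >> 7) & 0x3;
  D.IsWhile = ((Flags >> 9) & 0x1) != 0;
  D.IsPTestLike = ((Flags >> 10) & 0x1) != 0;
  return D;
}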
@@ -1480,7 +1480,7 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), "msr", "\t$systemreg, $Rt"> { bits<16> systemreg; let Inst{20-5} = systemreg; - let DecoderNamespace = "Fallback"; + let DecoderNamespace = "Fallback"; } def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { @@ -1970,21 +1970,21 @@ class SignAuthTwoOperand<bits<4> opc, string asm, let Inst{4-0} = Rd; } -class ClearAuth<bits<1> data, string asm> - : I<(outs GPR64:$Rd), (ins GPR64:$Rn), asm, "\t$Rd", "$Rd = $Rn", []>, Sched<[]> { - bits<5> Rd; - let Inst{31-11} = 0b110110101100000101000; - let Inst{10} = data; - let Inst{9-5} = 0b11111; - let Inst{4-0} = Rd; -} - +class ClearAuth<bits<1> data, string asm> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn), asm, "\t$Rd", "$Rd = $Rn", []>, Sched<[]> { + bits<5> Rd; + let Inst{31-11} = 0b110110101100000101000; + let Inst{10} = data; + let Inst{9-5} = 0b11111; + let Inst{4-0} = Rd; +} + // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops> : I<(outs), iops, asm, ops, "", []>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; - let Defs = [NZCV]; + let Defs = [NZCV]; bits<5> Rn; let Inst{31} = sf; let Inst{30-15} = 0b0111010000000000; @@ -3972,7 +3972,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, "$Rn = $wback,@earlyclobber $wback", []>, - Sched<[WriteAdr, WriteLD]>; + Sched<[WriteAdr, WriteLD]>; let mayStore = 1, mayLoad = 0 in class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype, @@ -4018,7 +4018,7 @@ class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, "$Rn = $wback,@earlyclobber $wback", []>, - Sched<[WriteAdr, WriteLD]>; + Sched<[WriteAdr, WriteLD]>; let mayStore = 1, mayLoad = 0 in class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterOperand regtype, @@ -4115,7 +4115,7 @@ class LoadPairPreIdx<bits<2> opc, bit V, RegisterOperand regtype, : BaseLoadStorePairPreIdx<opc, V, 1, (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2), (ins GPR64sp:$Rn, indextype:$offset), asm>, - Sched<[WriteAdr, WriteLD, WriteLDHi]>; + Sched<[WriteAdr, WriteLD, WriteLDHi]>; let mayStore = 1, mayLoad = 0 in class StorePairPreIdx<bits<2> opc, bit V, RegisterOperand regtype, @@ -4156,7 +4156,7 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterOperand regtype, : BaseLoadStorePairPostIdx<opc, V, 1, (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2), (ins GPR64sp:$Rn, idxtype:$offset), asm>, - Sched<[WriteAdr, WriteLD, WriteLDHi]>; + Sched<[WriteAdr, WriteLD, WriteLDHi]>; let mayStore = 1, mayLoad = 0 in class StorePairPostIdx<bits<2> opc, bit V, RegisterOperand regtype, @@ -7874,9 +7874,9 @@ class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1, multiclass SIMDThreeSameVectorBFDot<bit U, string asm> { def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, - v2f32, v4bf16>; + v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, - v4f32, v8bf16>; + v4f32, v8bf16>; } class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm, @@ -7894,7 +7894,7 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm, (InputType RegType:$Rn), (InputType (bitconvert (AccumType (AArch64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx)))))))]> { + VectorIndexS:$idx)))))))]> { bits<2> idx; let 
Inst{21} = idx{0}; // L @@ -7904,16 +7904,16 @@ class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm, multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> { def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", - ".2h", V64, v2f32, v4bf16>; + ".2h", V64, v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", - ".2h", V128, v4f32, v8bf16>; + ".2h", V128, v4f32, v8bf16>; } class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), - (v8bf16 V128:$Rn), - (v8bf16 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } @@ -7923,10 +7923,10 @@ class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode> "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", [(set (v4f32 V128:$dst), (v4f32 (OpNode (v4f32 V128:$Rd), - (v8bf16 V128:$Rn), - (v8bf16 + (v8bf16 V128:$Rn), + (v8bf16 (AArch64duplane16 (v8bf16 V128_lo:$Rm), - VectorIndexH:$idx)))))]>, + VectorIndexH:$idx)))))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; @@ -7950,8 +7950,8 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm> V128, asm, ".4s", [(set (v4f32 V128:$dst), (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), - (v8bf16 V128:$Rn), - (v8bf16 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", ", $Rm", ".8h", "}"); } @@ -10629,14 +10629,14 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype, [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; } let Predicates = [HasComplxNum, HasNEON] in { @@ -10645,21 +10645,21 @@ multiclass SIMDThreeSameVectorComplexHSD<bit U, bits<3> opcode, Operand rottype, [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; } } @@ -10701,14 +10701,14 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode, [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128, rottype, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; } let Predicates = [HasComplxNum, HasNEON] in { @@ -10717,21 +10717,21 @@ multiclass SIMDThreeSameVectorTiedComplexHSD<bit U, bits<3> opcode, [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, 
V128, rottype, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128, rottype, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm), - (i32 rottype:$rot)))]>; + (i32 rottype:$rot)))]>; } } @@ -11259,35 +11259,35 @@ multiclass STOPregister<string asm, string instr> { !cast<Instruction>(instr # "X")>; } -class LoadStore64B_base<bits<3> opc, string asm_inst, string asm_ops, - dag iops, dag oops, list<dag> pat> - : I<oops, iops, asm_inst, asm_ops, "", pat>, - Sched<[]> /* FIXME: fill in scheduling details once known */ { - bits<5> Rt; - bits<5> Rn; - let Inst{31-21} = 0b11111000001; - let Inst{15} = 1; - let Inst{14-12} = opc; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let Predicates = [HasV8_7a]; -} - -class LoadStore64B<bits<3> opc, string asm_inst, dag iops, dag oops, - list<dag> pat = []> - : LoadStore64B_base<opc, asm_inst, "\t$Rt, [$Rn]", iops, oops, pat> { - let Inst{20-16} = 0b11111; -} - -class Store64BV<bits<3> opc, string asm_inst, list<dag> pat = []> - : LoadStore64B_base<opc, asm_inst, "\t$Rs, $Rt, [$Rn]", - (ins GPR64x8:$Rt, GPR64sp:$Rn), (outs GPR64:$Rs), pat> { - bits<5> Rs; - let Inst{20-16} = Rs; -} - +class LoadStore64B_base<bits<3> opc, string asm_inst, string asm_ops, + dag iops, dag oops, list<dag> pat> + : I<oops, iops, asm_inst, asm_ops, "", pat>, + Sched<[]> /* FIXME: fill in scheduling details once known */ { + bits<5> Rt; + bits<5> Rn; + let Inst{31-21} = 0b11111000001; + let Inst{15} = 1; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Predicates = [HasV8_7a]; +} + +class LoadStore64B<bits<3> opc, string asm_inst, dag iops, dag oops, + list<dag> pat = []> + : LoadStore64B_base<opc, asm_inst, "\t$Rt, [$Rn]", iops, oops, pat> { + let Inst{20-16} = 0b11111; +} + +class Store64BV<bits<3> opc, string asm_inst, list<dag> pat = []> + : LoadStore64B_base<opc, asm_inst, "\t$Rs, $Rt, [$Rn]", + (ins GPR64x8:$Rt, GPR64sp:$Rn), (outs GPR64:$Rs), pat> { + bits<5> Rs; + let Inst{20-16} = Rs; +} + //---------------------------------------------------------------------------- // Allow the size specifier tokens to be upper case, not just lower. def : TokenAlias<".4B", ".4b">; // Add dot product diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrGISel.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrGISel.td index 25656fac1d..b7d5014166 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrGISel.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrGISel.td @@ -88,29 +88,29 @@ def G_DUP: AArch64GenericInstruction { let InOperandList = (ins type1:$lane); let hasSideEffects = 0; } - -// Represents a lane duplicate operation. 
-def G_DUPLANE8 : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src, type1:$lane); - let hasSideEffects = 0; -} -def G_DUPLANE16 : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src, type1:$lane); - let hasSideEffects = 0; -} -def G_DUPLANE32 : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src, type1:$lane); - let hasSideEffects = 0; -} -def G_DUPLANE64 : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src, type1:$lane); - let hasSideEffects = 0; -} - + +// Represents a lane duplicate operation. +def G_DUPLANE8 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} +def G_DUPLANE16 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} +def G_DUPLANE32 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} +def G_DUPLANE64 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} + // Represents a trn1 instruction. Produced post-legalization from // G_SHUFFLE_VECTORs with appropriate masks. def G_TRN1 : AArch64GenericInstruction { @@ -134,28 +134,28 @@ def G_EXT: AArch64GenericInstruction { let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm); } -// Represents a vector G_ASHR with an immediate. -def G_VASHR : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src1, untyped_imm_0:$imm); -} - -// Represents a vector G_LSHR with an immediate. -def G_VLSHR : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src1, untyped_imm_0:$imm); -} - -// Represents an integer to FP conversion on the FPR bank. -def G_SITOF : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); -} -def G_UITOF : AArch64GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); -} - +// Represents a vector G_ASHR with an immediate. +def G_VASHR : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, untyped_imm_0:$imm); +} + +// Represents a vector G_LSHR with an immediate. +def G_VLSHR : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, untyped_imm_0:$imm); +} + +// Represents an integer to FP conversion on the FPR bank. 
+def G_SITOF : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); +} +def G_UITOF : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); +} + def : GINodeEquiv<G_REV16, AArch64rev16>; def : GINodeEquiv<G_REV32, AArch64rev32>; def : GINodeEquiv<G_REV64, AArch64rev64>; @@ -164,21 +164,21 @@ def : GINodeEquiv<G_UZP2, AArch64uzp2>; def : GINodeEquiv<G_ZIP1, AArch64zip1>; def : GINodeEquiv<G_ZIP2, AArch64zip2>; def : GINodeEquiv<G_DUP, AArch64dup>; -def : GINodeEquiv<G_DUPLANE8, AArch64duplane8>; -def : GINodeEquiv<G_DUPLANE16, AArch64duplane16>; -def : GINodeEquiv<G_DUPLANE32, AArch64duplane32>; -def : GINodeEquiv<G_DUPLANE64, AArch64duplane64>; +def : GINodeEquiv<G_DUPLANE8, AArch64duplane8>; +def : GINodeEquiv<G_DUPLANE16, AArch64duplane16>; +def : GINodeEquiv<G_DUPLANE32, AArch64duplane32>; +def : GINodeEquiv<G_DUPLANE64, AArch64duplane64>; def : GINodeEquiv<G_TRN1, AArch64trn1>; def : GINodeEquiv<G_TRN2, AArch64trn2>; def : GINodeEquiv<G_EXT, AArch64ext>; -def : GINodeEquiv<G_VASHR, AArch64vashr>; -def : GINodeEquiv<G_VLSHR, AArch64vlshr>; -def : GINodeEquiv<G_SITOF, AArch64sitof>; -def : GINodeEquiv<G_UITOF, AArch64uitof>; - -def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>; - -// These are patterns that we only use for GlobalISel via the importer. -def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)), - (vector_extract (v2f32 FPR64:$Rn), (i64 1)))), - (f32 (FADDPv2i32p (v2f32 FPR64:$Rn)))>; +def : GINodeEquiv<G_VASHR, AArch64vashr>; +def : GINodeEquiv<G_VLSHR, AArch64vlshr>; +def : GINodeEquiv<G_SITOF, AArch64sitof>; +def : GINodeEquiv<G_UITOF, AArch64uitof>; + +def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>; + +// These are patterns that we only use for GlobalISel via the importer. +def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)), + (vector_extract (v2f32 FPR64:$Rn), (i64 1)))), + (f32 (FADDPv2i32p (v2f32 FPR64:$Rn)))>; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.cpp index 6b38e216a8..fc3e238182 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -107,13 +107,13 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); break; - case TargetOpcode::STATEPOINT: - NumBytes = StatepointOpers(&MI).getNumPatchBytes(); - assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); - // No patch bytes means a normal call inst is emitted - if (NumBytes == 0) - NumBytes = 4; - break; + case TargetOpcode::STATEPOINT: + NumBytes = StatepointOpers(&MI).getNumPatchBytes(); + assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + // No patch bytes means a normal call inst is emitted + if (NumBytes == 0) + NumBytes = 4; + break; case AArch64::TLSDESC_CALLSEQ: // This gets lowered to an instruction sequence which takes 16 bytes NumBytes = 16; @@ -294,31 +294,31 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, } } - // If we're allowed to modify and the block ends in a unconditional branch - // which could simply fallthrough, remove the branch. (Note: This case only - // matters when we can't understand the whole sequence, otherwise it's also - // handled by BranchFolding.cpp.) 
- if (AllowModify && isUncondBranchOpcode(LastOpc) && - MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) { - LastInst->eraseFromParent(); - LastInst = SecondLastInst; - LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { - assert(!isUncondBranchOpcode(LastOpc) && - "unreachable unconditional branches removed above"); - - if (isCondBranchOpcode(LastOpc)) { - // Block ends with fall-through condbranch. - parseCondBranch(LastInst, TBB, Cond); - return false; - } - return true; // Can't handle indirect branch. - } else { - SecondLastInst = &*I; - SecondLastOpc = SecondLastInst->getOpcode(); - } - } - + // If we're allowed to modify and the block ends in a unconditional branch + // which could simply fallthrough, remove the branch. (Note: This case only + // matters when we can't understand the whole sequence, otherwise it's also + // handled by BranchFolding.cpp.) + if (AllowModify && isUncondBranchOpcode(LastOpc) && + MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { + assert(!isUncondBranchOpcode(LastOpc) && + "unreachable unconditional branches removed above"); + + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } else { + SecondLastInst = &*I; + SecondLastOpc = SecondLastInst->getOpcode(); + } + } + // If there are three terminators, we don't know what sort of block this is. if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; @@ -353,56 +353,56 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, return true; } -bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, - MachineBranchPredicate &MBP, - bool AllowModify) const { - // For the moment, handle only a block which ends with a cb(n)zx followed by - // a fallthrough. Why this? Because it is a common form. - // TODO: Should we handle b.cc? - - MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); - if (I == MBB.end()) - return true; - - // Skip over SpeculationBarrierEndBB terminators - if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || - I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { - --I; - } - - if (!isUnpredicatedTerminator(*I)) - return true; - - // Get the last instruction in the block. - MachineInstr *LastInst = &*I; - unsigned LastOpc = LastInst->getOpcode(); - if (!isCondBranchOpcode(LastOpc)) - return true; - - switch (LastOpc) { - default: - return true; - case AArch64::CBZW: - case AArch64::CBZX: - case AArch64::CBNZW: - case AArch64::CBNZX: - break; - }; - - MBP.TrueDest = LastInst->getOperand(1).getMBB(); - assert(MBP.TrueDest && "expected!"); - MBP.FalseDest = MBB.getNextNode(); - - MBP.ConditionDef = nullptr; - MBP.SingleUseCondition = false; - - MBP.LHS = LastInst->getOperand(0); - MBP.RHS = MachineOperand::CreateImm(0); - MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE - : MachineBranchPredicate::PRED_EQ; - return false; -} - +bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const { + // For the moment, handle only a block which ends with a cb(n)zx followed by + // a fallthrough. Why this? Because it is a common form. + // TODO: Should we handle b.cc? 
+ + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return true; + + // Skip over SpeculationBarrierEndBB terminators + if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || + I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { + --I; + } + + if (!isUnpredicatedTerminator(*I)) + return true; + + // Get the last instruction in the block. + MachineInstr *LastInst = &*I; + unsigned LastOpc = LastInst->getOpcode(); + if (!isCondBranchOpcode(LastOpc)) + return true; + + switch (LastOpc) { + default: + return true; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + break; + }; + + MBP.TrueDest = LastInst->getOperand(1).getMBB(); + assert(MBP.TrueDest && "expected!"); + MBP.FalseDest = MBB.getNextNode(); + + MBP.ConditionDef = nullptr; + MBP.SingleUseCondition = false; + + MBP.LHS = LastInst->getOperand(0); + MBP.RHS = MachineOperand::CreateImm(0); + MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; + return false; +} + bool AArch64InstrInfo::reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const { if (Cond[0].getImm() != -1) { @@ -1119,13 +1119,13 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, switch (MI.getOpcode()) { default: break; - case AArch64::PTEST_PP: - SrcReg = MI.getOperand(0).getReg(); - SrcReg2 = MI.getOperand(1).getReg(); - // Not sure about the mask and value for now... - CmpMask = ~0; - CmpValue = 0; - return true; + case AArch64::PTEST_PP: + SrcReg = MI.getOperand(0).getReg(); + SrcReg2 = MI.getOperand(1).getReg(); + // Not sure about the mask and value for now... + CmpMask = ~0; + CmpValue = 0; + return true; case AArch64::SUBSWrr: case AArch64::SUBSWrs: case AArch64::SUBSWrx: @@ -1281,9 +1281,9 @@ static bool areCFlagsAccessedBetweenInstrs( return true; // From must be above To. - assert(std::any_of( - ++To.getReverse(), To->getParent()->rend(), - [From](MachineInstr &MI) { return MI.getIterator() == From; })); + assert(std::any_of( + ++To.getReverse(), To->getParent()->rend(), + [From](MachineInstr &MI) { return MI.getIterator() == From; })); // We iterate backward starting at \p To until we hit \p From. for (const MachineInstr &Instr : @@ -1296,127 +1296,127 @@ static bool areCFlagsAccessedBetweenInstrs( return false; } -/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating -/// operation which could set the flags in an identical manner -bool AArch64InstrInfo::optimizePTestInstr( - MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, - const MachineRegisterInfo *MRI) const { - auto *Mask = MRI->getUniqueVRegDef(MaskReg); - auto *Pred = MRI->getUniqueVRegDef(PredReg); - auto NewOp = Pred->getOpcode(); - bool OpChanged = false; - - unsigned MaskOpcode = Mask->getOpcode(); - unsigned PredOpcode = Pred->getOpcode(); - bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); - bool PredIsWhileLike = isWhileOpcode(PredOpcode); - - if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) { - // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't - // deactivate any lanes OTHER_INST might set. - uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode); - uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); - - // Must be an all active predicate of matching element size. 
- if ((PredElementSize != MaskElementSize) || - (Mask->getOperand(1).getImm() != 31)) - return false; - - // Fallthough to simply remove the PTEST. - } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) { - // For PTEST(PG, PG), PTEST is redundant when PG is the result of an - // instruction that sets the flags as PTEST would. - - // Fallthough to simply remove the PTEST. - } else if (PredIsPTestLike) { - // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both - // instructions use the same predicate. - auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PTestLikeMask) - return false; - - // Fallthough to simply remove the PTEST. - } else { - switch (Pred->getOpcode()) { - case AArch64::BRKB_PPzP: - case AArch64::BRKPB_PPzPP: { - // Op 0 is chain, 1 is the mask, 2 the previous predicate to - // propagate, 3 the new predicate. - - // Check to see if our mask is the same as the brkpb's. If - // not the resulting flag bits may be different and we - // can't remove the ptest. - auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PredMask) - return false; - - // Switch to the new opcode - NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP - : AArch64::BRKPBS_PPzPP; - OpChanged = true; - break; - } - case AArch64::BRKN_PPzP: { - auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); - if (Mask != PredMask) - return false; - - NewOp = AArch64::BRKNS_PPzP; - OpChanged = true; - break; - } - default: - // Bail out if we don't recognize the input - return false; - } - } - - const TargetRegisterInfo *TRI = &getRegisterInfo(); - - // If the predicate is in a different block (possibly because its been - // hoisted out), then assume the flags are set in between statements. - if (Pred->getParent() != PTest->getParent()) - return false; - - // If another instruction between the propagation and test sets the - // flags, don't remove the ptest. - MachineBasicBlock::iterator I = Pred, E = PTest; - ++I; // Skip past the predicate op itself. - for (; I != E; ++I) { - const MachineInstr &Inst = *I; - - // TODO: If the ptest flags are unused, we could still remove it. - if (Inst.modifiesRegister(AArch64::NZCV, TRI)) - return false; - } - - // If we pass all the checks, it's safe to remove the PTEST and use the flags - // as they are prior to PTEST. Sometimes this requires the tested PTEST - // operand to be replaced with an equivalent instruction that also sets the - // flags. - Pred->setDesc(get(NewOp)); - PTest->eraseFromParent(); - if (OpChanged) { - bool succeeded = UpdateOperandRegClass(*Pred); - (void)succeeded; - assert(succeeded && "Operands have incompatible register classes!"); - Pred->addRegisterDefined(AArch64::NZCV, TRI); - } - - // Ensure that the flags def is live. 
- if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) { - unsigned i = 0, e = Pred->getNumOperands(); - for (; i != e; ++i) { - MachineOperand &MO = Pred->getOperand(i); - if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) { - MO.setIsDead(false); - break; - } - } - } - return true; -} - +/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating +/// operation which could set the flags in an identical manner +bool AArch64InstrInfo::optimizePTestInstr( + MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, + const MachineRegisterInfo *MRI) const { + auto *Mask = MRI->getUniqueVRegDef(MaskReg); + auto *Pred = MRI->getUniqueVRegDef(PredReg); + auto NewOp = Pred->getOpcode(); + bool OpChanged = false; + + unsigned MaskOpcode = Mask->getOpcode(); + unsigned PredOpcode = Pred->getOpcode(); + bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); + bool PredIsWhileLike = isWhileOpcode(PredOpcode); + + if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) { + // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't + // deactivate any lanes OTHER_INST might set. + uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode); + uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); + + // Must be an all active predicate of matching element size. + if ((PredElementSize != MaskElementSize) || + (Mask->getOperand(1).getImm() != 31)) + return false; + + // Fallthough to simply remove the PTEST. + } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) { + // For PTEST(PG, PG), PTEST is redundant when PG is the result of an + // instruction that sets the flags as PTEST would. + + // Fallthough to simply remove the PTEST. + } else if (PredIsPTestLike) { + // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both + // instructions use the same predicate. + auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); + if (Mask != PTestLikeMask) + return false; + + // Fallthough to simply remove the PTEST. + } else { + switch (Pred->getOpcode()) { + case AArch64::BRKB_PPzP: + case AArch64::BRKPB_PPzPP: { + // Op 0 is chain, 1 is the mask, 2 the previous predicate to + // propagate, 3 the new predicate. + + // Check to see if our mask is the same as the brkpb's. If + // not the resulting flag bits may be different and we + // can't remove the ptest. + auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); + if (Mask != PredMask) + return false; + + // Switch to the new opcode + NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP + : AArch64::BRKPBS_PPzPP; + OpChanged = true; + break; + } + case AArch64::BRKN_PPzP: { + auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); + if (Mask != PredMask) + return false; + + NewOp = AArch64::BRKNS_PPzP; + OpChanged = true; + break; + } + default: + // Bail out if we don't recognize the input + return false; + } + } + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + // If the predicate is in a different block (possibly because its been + // hoisted out), then assume the flags are set in between statements. + if (Pred->getParent() != PTest->getParent()) + return false; + + // If another instruction between the propagation and test sets the + // flags, don't remove the ptest. + MachineBasicBlock::iterator I = Pred, E = PTest; + ++I; // Skip past the predicate op itself. + for (; I != E; ++I) { + const MachineInstr &Inst = *I; + + // TODO: If the ptest flags are unused, we could still remove it. 
+ if (Inst.modifiesRegister(AArch64::NZCV, TRI)) + return false; + } + + // If we pass all the checks, it's safe to remove the PTEST and use the flags + // as they are prior to PTEST. Sometimes this requires the tested PTEST + // operand to be replaced with an equivalent instruction that also sets the + // flags. + Pred->setDesc(get(NewOp)); + PTest->eraseFromParent(); + if (OpChanged) { + bool succeeded = UpdateOperandRegClass(*Pred); + (void)succeeded; + assert(succeeded && "Operands have incompatible register classes!"); + Pred->addRegisterDefined(AArch64::NZCV, TRI); + } + + // Ensure that the flags def is live. + if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) { + unsigned i = 0, e = Pred->getNumOperands(); + for (; i != e; ++i) { + MachineOperand &MO = Pred->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) { + MO.setIsDead(false); + break; + } + } + } + return true; +} + /// Try to optimize a compare instruction. A compare instruction is an /// instruction which produces AArch64::NZCV. It can be truly compare /// instruction @@ -1455,9 +1455,9 @@ bool AArch64InstrInfo::optimizeCompareInstr( return true; } - if (CmpInstr.getOpcode() == AArch64::PTEST_PP) - return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); - + if (CmpInstr.getOpcode() == AArch64::PTEST_PP) + return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); + // Continue only if we have a "ri" where immediate is zero. // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare // function. @@ -2274,24 +2274,24 @@ bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( return true; } -Optional<ExtAddrMode> -AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, - const TargetRegisterInfo *TRI) const { - const MachineOperand *Base; // Filled with the base operand of MI. - int64_t Offset; // Filled with the offset of MI. - bool OffsetIsScalable; - if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) - return None; - - if (!Base->isReg()) - return None; - ExtAddrMode AM; - AM.BaseReg = Base->getReg(); - AM.Displacement = Offset; - AM.ScaledReg = 0; - return AM; -} - +Optional<ExtAddrMode> +AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, + const TargetRegisterInfo *TRI) const { + const MachineOperand *Base; // Filled with the base operand of MI. + int64_t Offset; // Filled with the offset of MI. 
+ bool OffsetIsScalable; + if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) + return None; + + if (!Base->isReg()) + return None; + ExtAddrMode AM; + AM.BaseReg = Base->getReg(); + AM.Displacement = Offset; + AM.ScaledReg = 0; + return AM; +} + bool AArch64InstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, @@ -3290,7 +3290,7 @@ void AArch64InstrInfo::storeRegToStackSlot( else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 4: @@ -3334,7 +3334,7 @@ void AArch64InstrInfo::storeRegToStackSlot( } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_ZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 24: @@ -3356,7 +3356,7 @@ void AArch64InstrInfo::storeRegToStackSlot( } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_ZZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 48: @@ -3367,7 +3367,7 @@ void AArch64InstrInfo::storeRegToStackSlot( } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_ZZZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 64: @@ -3378,7 +3378,7 @@ void AArch64InstrInfo::storeRegToStackSlot( } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_ZZZZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; } @@ -3444,7 +3444,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_PXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 4: @@ -3488,7 +3488,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_ZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 24: @@ -3510,7 +3510,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_ZZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 48: @@ -3521,7 +3521,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_ZZZXI; - StackID = TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; case 64: @@ -3532,7 +3532,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_ZZZZXI; - StackID = 
TargetStackID::ScalableVector; + StackID = TargetStackID::ScalableVector; } break; } @@ -3559,47 +3559,47 @@ bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, }); } -void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( - const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { - // The smallest scalable element supported by scaled SVE addressing - // modes are predicates, which are 2 scalable bytes in size. So the scalable - // byte offset must always be a multiple of 2. - assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); - - // VGSized offsets are divided by '2', because the VG register is the - // the number of 64bit granules as opposed to 128bit vector chunks, - // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. - // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. - // VG = n * 2 and the dwarf offset must be VG * 8 bytes. - ByteSized = Offset.getFixed(); - VGSized = Offset.getScalable() / 2; -} - -/// Returns the offset in parts to which this frame offset can be -/// decomposed for the purpose of describing a frame offset. -/// For non-scalable offsets this is simply its byte size. -void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( - const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, - int64_t &NumDataVectors) { - // The smallest scalable element supported by scaled SVE addressing - // modes are predicates, which are 2 scalable bytes in size. So the scalable - // byte offset must always be a multiple of 2. - assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); - - NumBytes = Offset.getFixed(); - NumDataVectors = 0; - NumPredicateVectors = Offset.getScalable() / 2; - // This method is used to get the offsets to adjust the frame offset. - // If the function requires ADDPL to be used and needs more than two ADDPL - // instructions, part of the offset is folded into NumDataVectors so that it - // uses ADDVL for part of it, reducing the number of ADDPL instructions. - if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || - NumPredicateVectors > 62) { - NumDataVectors = NumPredicateVectors / 8; - NumPredicateVectors -= NumDataVectors * 8; - } -} - +void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( + const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); + + // VGSized offsets are divided by '2', because the VG register is the + // the number of 64bit granules as opposed to 128bit vector chunks, + // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. + // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. + // VG = n * 2 and the dwarf offset must be VG * 8 bytes. + ByteSized = Offset.getFixed(); + VGSized = Offset.getScalable() / 2; +} + +/// Returns the offset in parts to which this frame offset can be +/// decomposed for the purpose of describing a frame offset. +/// For non-scalable offsets this is simply its byte size. +void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( + const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. 
So the scalable + // byte offset must always be a multiple of 2. + assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); + + NumBytes = Offset.getFixed(); + NumDataVectors = 0; + NumPredicateVectors = Offset.getScalable() / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. + if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } +} + // Helper function to emit a frame offset adjustment from a given // pointer (SrcReg), stored into DestReg. This function is explicit // in that it requires the opcode. @@ -3709,13 +3709,13 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, MachineInstr::MIFlag Flag, bool SetNZCV, bool NeedsWinCFI, bool *HasWinCFI) { int64_t Bytes, NumPredicateVectors, NumDataVectors; - AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( - Offset, Bytes, NumPredicateVectors, NumDataVectors); + AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( + Offset, Bytes, NumPredicateVectors, NumDataVectors); // First emit non-scalable frame offsets, or a simple 'mov'. if (Bytes || (!Offset && SrcReg != DestReg)) { - assert((DestReg != AArch64::SP || Bytes % 8 == 0) && - "SP increment/decrement not 8-byte aligned"); + assert((DestReg != AArch64::SP || Bytes % 8 == 0) && + "SP increment/decrement not 8-byte aligned"); unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; if (Bytes < 0) { Bytes = -Bytes; @@ -3970,7 +3970,7 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, // Construct the complete offset. bool IsMulVL = ScaleValue.isScalable(); unsigned Scale = ScaleValue.getKnownMinSize(); - int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); + int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); const MachineOperand &ImmOpnd = MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); @@ -4012,9 +4012,9 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, *OutUnscaledOp = *UnscaledOp; if (IsMulVL) - SOffset = StackOffset::get(SOffset.getFixed(), Offset); + SOffset = StackOffset::get(SOffset.getFixed(), Offset); else - SOffset = StackOffset::get(Offset, SOffset.getScalable()); + SOffset = StackOffset::get(Offset, SOffset.getScalable()); return AArch64FrameOffsetCanUpdate | (SOffset ? 0 : AArch64FrameOffsetIsLegal); } @@ -4026,7 +4026,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned ImmIdx = FrameRegIdx + 1; if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { - Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); + Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), MI.getOperand(0).getReg(), FrameReg, Offset, TII, MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); @@ -4131,7 +4131,7 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } -// FP Opcodes that can be combined with a FMUL. +// FP Opcodes that can be combined with a FMUL. 
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { switch (Inst.getOpcode()) { default: @@ -4153,12 +4153,12 @@ static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { case AArch64::FSUBv2f64: case AArch64::FSUBv4f32: TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; - // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by - // the target options or if FADD/FSUB has the contract fast-math flag. - return Options.UnsafeFPMath || - Options.AllowFPOpFusion == FPOpFusion::Fast || - Inst.getFlag(MachineInstr::FmContract); - return true; + // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by + // the target options or if FADD/FSUB has the contract fast-math flag. + return Options.UnsafeFPMath || + Options.AllowFPOpFusion == FPOpFusion::Fast || + Inst.getFlag(MachineInstr::FmContract); + return true; } return false; } @@ -4638,8 +4638,8 @@ bool AArch64InstrInfo::isThroughputPattern( /// pattern evaluator stops checking as soon as it finds a faster sequence. bool AArch64InstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, - bool DoRegPressureReduce) const { + MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, + bool DoRegPressureReduce) const { // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; @@ -4647,8 +4647,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getFMAPatterns(Root, Patterns)) return true; - return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, - DoRegPressureReduce); + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, + DoRegPressureReduce); } enum class FMAInstKind { Default, Indexed, Accumulator }; @@ -4871,7 +4871,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - MachineInstr *MUL = nullptr; + MachineInstr *MUL = nullptr; const TargetRegisterClass *RC; unsigned Opc; switch (Pattern) { @@ -5692,9 +5692,9 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion - // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and - // CodeGen/AArch64/urem-seteq-nonzero.ll. - // assert(MUL && "MUL was never set"); + // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and + // CodeGen/AArch64/urem-seteq-nonzero.ll. 
+ // assert(MUL && "MUL was never set"); DelInstrs.push_back(MUL); DelInstrs.push_back(&Root); } @@ -6034,20 +6034,20 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b) { - const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); - const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); + const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); + const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); - return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && - MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); + return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && + MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); } static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b) { - const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); - const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); + const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); + const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); - return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); + return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); } static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, @@ -6104,9 +6104,9 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // necessary. However, at this point we don't know if the outlined function // will have a RET instruction so we assume the worst. const TargetRegisterInfo &TRI = getRegisterInfo(); - if (FirstCand.getMF() - ->getInfo<AArch64FunctionInfo>() - ->shouldSignReturnAddress(true)) { + if (FirstCand.getMF() + ->getInfo<AArch64FunctionInfo>() + ->shouldSignReturnAddress(true)) { // One PAC and one AUT instructions NumBytesToCreateFrame += 8; @@ -6163,7 +6163,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( return false; }; // Remove candidates with illegal stack modifying instructions - llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); + llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); // If the sequence doesn't have enough candidates left, then we're done. if (RepeatedSequenceLocs.size() < 2) @@ -6206,7 +6206,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( // Erase every candidate that violates the restrictions above. (It could be // true that we have viable candidates, so it's not worth bailing out in // the case that, say, 1 out of 20 candidates violate the restructions.) - llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); + llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); // If the sequence doesn't have enough candidates left, then we're done. 
if (RepeatedSequenceLocs.size() < 2) @@ -6229,7 +6229,7 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( NumBytesToCreateFrame += 4; bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { - return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); + return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); }); // We check to see if CFI Instructions are present, and if they are @@ -6398,60 +6398,60 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( FrameID = MachineOutlinerNoLRSave; } else { SetCandidateCallInfo(MachineOutlinerDefault, 12); - - // Bugzilla ID: 46767 - // TODO: Check if fixing up the stack more than once is safe so we can - // outline these. - // - // An outline resulting in a caller that requires stack fixups at the - // callsite to a callee that also requires stack fixups can happen when - // there are no available registers at the candidate callsite for a - // candidate that itself also has calls. - // - // In other words if function_containing_sequence in the following pseudo - // assembly requires that we save LR at the point of the call, but there - // are no available registers: in this case we save using SP and as a - // result the SP offsets requires stack fixups by multiples of 16. - // - // function_containing_sequence: - // ... - // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N - // call OUTLINED_FUNCTION_N - // restore LR from SP - // ... - // - // OUTLINED_FUNCTION_N: - // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N - // ... - // bl foo - // restore LR from SP - // ret - // - // Because the code to handle more than one stack fixup does not - // currently have the proper checks for legality, these cases will assert - // in the AArch64 MachineOutliner. This is because the code to do this - // needs more hardening, testing, better checks that generated code is - // legal, etc and because it is only verified to handle a single pass of - // stack fixup. - // - // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch - // these cases until they are known to be handled. Bugzilla 46767 is - // referenced in comments at the assert site. - // - // To avoid asserting (or generating non-legal code on noassert builds) - // we remove all candidates which would need more than one stack fixup by - // pruning the cases where the candidate has calls while also having no - // available LR and having no available general purpose registers to copy - // LR to (ie one extra stack save/restore). - // - if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { - erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { - return (std::any_of( - C.front(), std::next(C.back()), - [](const MachineInstr &MI) { return MI.isCall(); })) && - (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); - }); - } + + // Bugzilla ID: 46767 + // TODO: Check if fixing up the stack more than once is safe so we can + // outline these. + // + // An outline resulting in a caller that requires stack fixups at the + // callsite to a callee that also requires stack fixups can happen when + // there are no available registers at the candidate callsite for a + // candidate that itself also has calls. 
+ // + // In other words if function_containing_sequence in the following pseudo + // assembly requires that we save LR at the point of the call, but there + // are no available registers: in this case we save using SP and as a + // result the SP offsets requires stack fixups by multiples of 16. + // + // function_containing_sequence: + // ... + // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N + // call OUTLINED_FUNCTION_N + // restore LR from SP + // ... + // + // OUTLINED_FUNCTION_N: + // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N + // ... + // bl foo + // restore LR from SP + // ret + // + // Because the code to handle more than one stack fixup does not + // currently have the proper checks for legality, these cases will assert + // in the AArch64 MachineOutliner. This is because the code to do this + // needs more hardening, testing, better checks that generated code is + // legal, etc and because it is only verified to handle a single pass of + // stack fixup. + // + // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch + // these cases until they are known to be handled. Bugzilla 46767 is + // referenced in comments at the assert site. + // + // To avoid asserting (or generating non-legal code on noassert builds) + // we remove all candidates which would need more than one stack fixup by + // pruning the cases where the candidate has calls while also having no + // available LR and having no available general purpose registers to copy + // LR to (ie one extra stack save/restore). + // + if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { + erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { + return (std::any_of( + C.front(), std::next(C.back()), + [](const MachineInstr &MI) { return MI.isCall(); })) && + (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); + }); + } } // If we dropped all of the candidates, bail out here. @@ -6820,7 +6820,7 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, // If v8.3a features are available we can replace a RET instruction by // RETAA or RETAB and omit the AUT instructions - if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && + if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && MBBAUT->getOpcode() == AArch64::RET) { BuildMI(MBB, MBBAUT, DL, TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA @@ -6872,12 +6872,12 @@ void AArch64InstrInfo::buildOutlinedFrame( return MI.isCall() && !MI.isReturn(); }; - if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { + if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { // Fix up the instructions in the range, since we're going to modify the // stack. - - // Bugzilla ID: 46767 - // TODO: Check if fixing up twice is safe so we can outline these. + + // Bugzilla ID: 46767 + // TODO: Check if fixing up twice is safe so we can outline these. assert(OF.FrameConstructionID != MachineOutlinerDefault && "Can only fix up stack references once"); fixupPostOutline(MBB); @@ -6934,11 +6934,11 @@ void AArch64InstrInfo::buildOutlinedFrame( // If a bunch of candidates reach this point they must agree on their return // address signing. 
It is therefore enough to just consider the signing // behaviour of one of them - const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>(); - bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction); + const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>(); + bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction); // a_key is the default - bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey(); + bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey(); // If this is a tail call outlined function, then there's already a return. if (OF.FrameConstructionID == MachineOutlinerTailCall || @@ -7099,7 +7099,7 @@ Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, return None; int Shift = MI.getOperand(3).getImm(); assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); - Offset = Sign * (MI.getOperand(2).getImm() << Shift); + Offset = Sign * (MI.getOperand(2).getImm() << Shift); } } return RegImmPair{MI.getOperand(1).getReg(), Offset}; @@ -7175,14 +7175,14 @@ uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { return get(Opc).TSFlags & AArch64::ElementSizeMask; } -bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { - return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; -} - -bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { - return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; -} - +bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { + return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; +} + +bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { + return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; +} + unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr()) return AArch64::BLRNoIP; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.h index 7434987e06..9b924a8440 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.h @@ -112,10 +112,10 @@ public: /// Hint that pairing the given load or store is unprofitable. static void suppressLdStPair(MachineInstr &MI); - Optional<ExtAddrMode> - getAddrModeFromMemoryOp(const MachineInstr &MemI, - const TargetRegisterInfo *TRI) const override; - + Optional<ExtAddrMode> + getAddrModeFromMemoryOp(const MachineInstr &MemI, + const TargetRegisterInfo *TRI) const override; + bool getMemOperandsWithOffsetWidth( const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, @@ -191,9 +191,9 @@ public: MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify = false) const override; - bool analyzeBranchPredicate(MachineBasicBlock &MBB, - MachineBranchPredicate &MBP, - bool AllowModify) const override; + bool analyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const override; unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, @@ -235,10 +235,10 @@ public: /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in ``Root``. All potential patterns are /// listed in the ``Patterns`` array. 
- bool - getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns, - bool DoRegPressureReduce) const override; + bool + getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns, + bool DoRegPressureReduce) const override; /// Return true when Inst is associative and commutative so that it can be /// reassociated. bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; @@ -280,12 +280,12 @@ public: bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override; /// Returns the vector element size (B, H, S or D) of an SVE opcode. uint64_t getElementSizeForOpcode(unsigned Opc) const; - /// Returns true if the opcode is for an SVE instruction that sets the - /// condition codes as if it's results had been fed to a PTEST instruction - /// along with the same general predicate. - bool isPTestLikeOpcode(unsigned Opc) const; - /// Returns true if the opcode is for an SVE WHILE## instruction. - bool isWhileOpcode(unsigned Opc) const; + /// Returns true if the opcode is for an SVE instruction that sets the + /// condition codes as if it's results had been fed to a PTEST instruction + /// along with the same general predicate. + bool isPTestLikeOpcode(unsigned Opc) const; + /// Returns true if the opcode is for an SVE WHILE## instruction. + bool isWhileOpcode(unsigned Opc) const; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. static bool isFalkorShiftExtFast(const MachineInstr &MI); @@ -299,13 +299,13 @@ public: Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI, Register Reg) const override; - static void decomposeStackOffsetForFrameOffsets(const StackOffset &Offset, - int64_t &NumBytes, - int64_t &NumPredicateVectors, - int64_t &NumDataVectors); - static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, - int64_t &ByteSized, - int64_t &VGSized); + static void decomposeStackOffsetForFrameOffsets(const StackOffset &Offset, + int64_t &NumBytes, + int64_t &NumPredicateVectors, + int64_t &NumDataVectors); + static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, + int64_t &ByteSized, + int64_t &VGSized); #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" @@ -334,12 +334,12 @@ private: /// Returns an unused general-purpose register which can be used for /// constructing an outlined call if one exists. Returns 0 otherwise. 
unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; - - /// Remove a ptest of a predicate-generating operation that already sets, or - /// can be made to set, the condition codes in an identical manner - bool optimizePTestInstr(MachineInstr *PTest, unsigned MaskReg, - unsigned PredReg, - const MachineRegisterInfo *MRI) const; + + /// Remove a ptest of a predicate-generating operation that already sets, or + /// can be made to set, the condition codes in an identical manner + bool optimizePTestInstr(MachineInstr *PTest, unsigned MaskReg, + unsigned PredReg, + const MachineRegisterInfo *MRI) const; }; /// Return true if there is an instruction /after/ \p DefMI and before \p UseMI @@ -423,18 +423,18 @@ static inline bool isIndirectBranchOpcode(int Opc) { return false; } -static inline bool isPTrueOpcode(unsigned Opc) { - switch (Opc) { - case AArch64::PTRUE_B: - case AArch64::PTRUE_H: - case AArch64::PTRUE_S: - case AArch64::PTRUE_D: - return true; - default: - return false; - } -} - +static inline bool isPTrueOpcode(unsigned Opc) { + switch (Opc) { + case AArch64::PTRUE_B: + case AArch64::PTRUE_H: + case AArch64::PTRUE_S: + case AArch64::PTRUE_D: + return true; + default: + return false; + } +} + /// Return opcode to be used for indirect calls. unsigned getBLRCallOpcode(const MachineFunction &MF); @@ -442,7 +442,7 @@ unsigned getBLRCallOpcode(const MachineFunction &MF); #define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits #define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit #define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits -#define TSFLAG_INSTR_FLAGS(X) ((X) << 9) // 2-bits +#define TSFLAG_INSTR_FLAGS(X) ((X) << 9) // 2-bits // } namespace AArch64 { @@ -475,14 +475,14 @@ enum FalseLaneType { FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2), }; -// NOTE: This is a bit field. -static const uint64_t InstrFlagIsWhile = TSFLAG_INSTR_FLAGS(0x1); -static const uint64_t InstrFlagIsPTestLike = TSFLAG_INSTR_FLAGS(0x2); - +// NOTE: This is a bit field. 
+static const uint64_t InstrFlagIsWhile = TSFLAG_INSTR_FLAGS(0x1); +static const uint64_t InstrFlagIsPTestLike = TSFLAG_INSTR_FLAGS(0x2); + #undef TSFLAG_ELEMENT_SIZE_TYPE #undef TSFLAG_DESTRUCTIVE_INST_TYPE #undef TSFLAG_FALSE_LANE_TYPE -#undef TSFLAG_INSTR_FLAGS +#undef TSFLAG_INSTR_FLAGS int getSVEPseudoMap(uint16_t Opcode); int getSVERevInstr(uint16_t Opcode); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.td index 171d3dbaa8..8051a6a937 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64InstrInfo.td @@ -25,16 +25,16 @@ def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; -def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, - AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; +def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, + AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; def HasVH : Predicate<"Subtarget->hasVH()">, AssemblerPredicate<(all_of FeatureVH), "vh">; def HasLOR : Predicate<"Subtarget->hasLOR()">, AssemblerPredicate<(all_of FeatureLOR), "lor">; -def HasPAuth : Predicate<"Subtarget->hasPAuth()">, - AssemblerPredicate<(all_of FeaturePAuth), "pauth">; +def HasPAuth : Predicate<"Subtarget->hasPAuth()">, + AssemblerPredicate<(all_of FeaturePAuth), "pauth">; def HasJS : Predicate<"Subtarget->hasJS()">, AssemblerPredicate<(all_of FeatureJS), "jsconv">; @@ -69,8 +69,8 @@ def HasPMU : Predicate<"Subtarget->hasPMU()">, def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">; -def HasFlagM : Predicate<"Subtarget->hasFlagM()">, - AssemblerPredicate<(all_of FeatureFlagM), "flagm">; +def HasFlagM : Predicate<"Subtarget->hasFlagM()">, + AssemblerPredicate<(all_of FeatureFlagM), "flagm">; def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">; @@ -151,16 +151,16 @@ def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; -def HasXS : Predicate<"Subtarget->hasXS()">, - AssemblerPredicate<(all_of FeatureXS), "xs">; -def HasWFxT : Predicate<"Subtarget->hasWFxT()">, - AssemblerPredicate<(all_of FeatureWFxT), "wfxt">; -def HasLS64 : Predicate<"Subtarget->hasLS64()">, - AssemblerPredicate<(all_of FeatureLS64), "ls64">; -def HasBRBE : Predicate<"Subtarget->hasBRBE()">, - AssemblerPredicate<(all_of FeatureBRBE), "brbe">; -def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, - AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">; +def HasXS : Predicate<"Subtarget->hasXS()">, + AssemblerPredicate<(all_of FeatureXS), "xs">; +def HasWFxT : Predicate<"Subtarget->hasWFxT()">, + AssemblerPredicate<(all_of FeatureWFxT), "wfxt">; +def HasLS64 : Predicate<"Subtarget->hasLS64()">, + AssemblerPredicate<(all_of FeatureLS64), "ls64">; +def HasBRBE : Predicate<"Subtarget->hasBRBE()">, + AssemblerPredicate<(all_of FeatureBRBE), "brbe">; +def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, + AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : 
Predicate<"Subtarget->isTargetWindows()">; @@ -411,12 +411,12 @@ def AArch64call : SDNode<"AArch64ISD::CALL", SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; - -def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER", - SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; - + +def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, [SDNPHasChain]>; def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, @@ -518,7 +518,7 @@ def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>; def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; @@ -570,19 +570,19 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; -def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>; -def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>; - -def AArch64uabd_n : SDNode<"AArch64ISD::UABD", SDT_AArch64binvec>; -def AArch64sabd_n : SDNode<"AArch64ISD::SABD", SDT_AArch64binvec>; - -def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), - [(AArch64uabd_n node:$lhs, node:$rhs), - (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; -def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), - [(AArch64sabd_n node:$lhs, node:$rhs), - (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; - +def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>; +def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>; + +def AArch64uabd_n : SDNode<"AArch64ISD::UABD", SDT_AArch64binvec>; +def AArch64sabd_n : SDNode<"AArch64ISD::SABD", SDT_AArch64binvec>; + +def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), + [(AArch64uabd_n node:$lhs, node:$rhs), + (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; +def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), + [(AArch64sabd_n node:$lhs, node:$rhs), + (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; + def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -617,8 +617,8 @@ let RecomputePerFunction = 1 in { // Avoid generating STRQro if it is slow, unless we're optimizing for code size. 
def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">; - def UseBTI : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>; - def NotUseBTI : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>; + def UseBTI : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>; + def NotUseBTI : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>; def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>; def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>; @@ -716,8 +716,8 @@ def : Pat<(AArch64LOADgot tconstpool:$addr), // 32-bit jump table destination is actually only 2 instructions since we can // use the table itself as a PC-relative base. But optimization occurs after // branch relaxation so be pessimistic. -let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch", - isNotDuplicable = 1 in { +let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch", + isNotDuplicable = 1 in { def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch), (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>, Sched<[]>; @@ -801,34 +801,34 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> { let Inst{12} = 0; let Predicates = [HasTRACEV8_4]; } - -def DSBnXS : CRmSystemI<barrier_nxs_op, 0b001, "dsb"> { - let CRm{1-0} = 0b11; - let Inst{9-8} = 0b10; - let Predicates = [HasXS]; -} - -let Predicates = [HasWFxT] in { -def WFET : RegInputSystemI<0b0000, 0b000, "wfet">; -def WFIT : RegInputSystemI<0b0000, 0b001, "wfit">; -} - -// Branch Record Buffer two-word mnemonic instructions -class BRBEI<bits<3> op2, string keyword> - : SimpleSystemI<0, (ins), "brb", keyword>, Sched<[WriteSys]> { - let Inst{31-8} = 0b110101010000100101110010; - let Inst{7-5} = op2; - let Predicates = [HasBRBE]; -} -def BRB_IALL: BRBEI<0b100, "\tiall">; -def BRB_INJ: BRBEI<0b101, "\tinj">; - -} - -// Allow uppercase and lowercase keyword arguments for BRB IALL and BRB INJ -def : TokenAlias<"INJ", "inj">; -def : TokenAlias<"IALL", "iall">; - + +def DSBnXS : CRmSystemI<barrier_nxs_op, 0b001, "dsb"> { + let CRm{1-0} = 0b11; + let Inst{9-8} = 0b10; + let Predicates = [HasXS]; +} + +let Predicates = [HasWFxT] in { +def WFET : RegInputSystemI<0b0000, 0b000, "wfet">; +def WFIT : RegInputSystemI<0b0000, 0b001, "wfit">; +} + +// Branch Record Buffer two-word mnemonic instructions +class BRBEI<bits<3> op2, string keyword> + : SimpleSystemI<0, (ins), "brb", keyword>, Sched<[WriteSys]> { + let Inst{31-8} = 0b110101010000100101110010; + let Inst{7-5} = op2; + let Predicates = [HasBRBE]; +} +def BRB_IALL: BRBEI<0b100, "\tiall">; +def BRB_INJ: BRBEI<0b101, "\tinj">; + +} + +// Allow uppercase and lowercase keyword arguments for BRB IALL and BRB INJ +def : TokenAlias<"INJ", "inj">; +def : TokenAlias<"IALL", "iall">; + // ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>; @@ -849,23 +849,23 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; def BFCVT : BF16ToSinglePrecision<"bfcvt">; - -// Vector-scalar BFDOT: -// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit -// register (the instruction uses a single 32-bit lane from it), so the pattern -// is a bit tricky. 
-def : Pat<(v2f32 (int_aarch64_neon_bfdot - (v2f32 V64:$Rd), (v4bf16 V64:$Rn), - (v4bf16 (bitconvert - (v2i32 (AArch64duplane32 - (v4i32 (bitconvert - (v8bf16 (insert_subvector undef, - (v4bf16 V64:$Rm), - (i64 0))))), - VectorIndexS:$idx)))))), - (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn), - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; + +// Vector-scalar BFDOT: +// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit +// register (the instruction uses a single 32-bit lane from it), so the pattern +// is a bit tricky. +def : Pat<(v2f32 (int_aarch64_neon_bfdot + (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (v4bf16 (bitconvert + (v2i32 (AArch64duplane32 + (v4i32 (bitconvert + (v8bf16 (insert_subvector undef, + (v4bf16 V64:$Rm), + (i64 0))))), + VectorIndexS:$idx)))))), + (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; } // ARMv8.6A AArch64 matrix multiplication @@ -965,7 +965,7 @@ let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>; } - + let Predicates = [HasComplxNum, HasNEON] in { def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>; @@ -979,47 +979,47 @@ let Predicates = [HasComplxNum, HasNEON] in { } } -multiclass FCMLA_PATS<ValueType ty, RegisterClass Reg> { - def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), - (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 0)>; - def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), - (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 1)>; - def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), - (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 2)>; - def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), - (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 3)>; -} - -multiclass FCMLA_LANE_PATS<ValueType ty, RegisterClass Reg, dag RHSDup> { - def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), - (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 0)>; - def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), - (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 1)>; - def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), - (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 2)>; - def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), - (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 3)>; -} - - -let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { - defm : FCMLA_PATS<v4f16, V64>; - defm : FCMLA_PATS<v8f16, V128>; - - defm : FCMLA_LANE_PATS<v4f16, V64, - (v4f16 (bitconvert (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexD:$idx))))>; - defm : FCMLA_LANE_PATS<v8f16, V128, - (v8f16 (bitconvert (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))>; -} -let Predicates = [HasComplxNum, HasNEON] in { - defm : FCMLA_PATS<v2f32, V64>; - defm : FCMLA_PATS<v4f32, V128>; - defm : FCMLA_PATS<v2f64, V128>; - - defm : FCMLA_LANE_PATS<v4f32, V128, - (v4f32 (bitconvert (v2i64 (AArch64duplane64 (v2i64 V128:$Rm), VectorIndexD:$idx))))>; 
-} - +multiclass FCMLA_PATS<ValueType ty, RegisterClass Reg> { + def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 0)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 1)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 2)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast<Instruction>("FCMLA" # ty) $Rd, $Rn, $Rm, 3)>; +} + +multiclass FCMLA_LANE_PATS<ValueType ty, RegisterClass Reg, dag RHSDup> { + def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 0)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 1)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 2)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast<Instruction>("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 3)>; +} + + +let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { + defm : FCMLA_PATS<v4f16, V64>; + defm : FCMLA_PATS<v8f16, V128>; + + defm : FCMLA_LANE_PATS<v4f16, V64, + (v4f16 (bitconvert (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexD:$idx))))>; + defm : FCMLA_LANE_PATS<v8f16, V128, + (v8f16 (bitconvert (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))>; +} +let Predicates = [HasComplxNum, HasNEON] in { + defm : FCMLA_PATS<v2f32, V64>; + defm : FCMLA_PATS<v4f32, V128>; + defm : FCMLA_PATS<v2f64, V128>; + + defm : FCMLA_LANE_PATS<v4f32, V128, + (v4f32 (bitconvert (v2i64 (AArch64duplane64 (v2i64 V128:$Rm), VectorIndexD:$idx))))>; +} + // v8.3a Pointer Authentication // These instructions inhabit part of the hint space and so can be used for // armv8 targets. Keeping the old HINT mnemonic when compiling without PA is @@ -1073,7 +1073,7 @@ def : InstAlias<"autib1716", (AUTIB1716), 0>; def : InstAlias<"xpaclri", (XPACLRI), 0>; // These pointer authentication instructions require armv8.3a -let Predicates = [HasPAuth] in { +let Predicates = [HasPAuth] in { // When PA is enabled, a better mnemonic should be emitted. 
def : InstAlias<"paciaz", (PACIAZ), 1>; @@ -1104,8 +1104,8 @@ let Predicates = [HasPAuth] in { defm PAC : SignAuth<0b000, 0b010, "pac">; defm AUT : SignAuth<0b001, 0b011, "aut">; - def XPACI : ClearAuth<0, "xpaci">; - def XPACD : ClearAuth<1, "xpacd">; + def XPACI : ClearAuth<0, "xpaci">; + def XPACD : ClearAuth<1, "xpacd">; def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>; // Combined Instructions @@ -1140,7 +1140,7 @@ let Predicates = [HasPAuth] in { } // v8.3a floating point conversion for javascript -let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in +let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, "fjcvtzs", [(set GPR32:$Rd, @@ -1149,7 +1149,7 @@ def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, } // HasJS, HasFPARMv8 // v8.4 Flag manipulation instructions -let Predicates = [HasFlagM], Defs = [NZCV], Uses = [NZCV] in { +let Predicates = [HasFlagM], Defs = [NZCV], Uses = [NZCV] in { def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> { let Inst{20-5} = 0b0000001000000000; } @@ -1157,7 +1157,7 @@ def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">; def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">; def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif", "{\t$Rn, $imm, $mask}">; -} // HasFlagM +} // HasFlagM // v8.5 flag manipulation instructions let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in { @@ -1206,12 +1206,12 @@ def HWASAN_CHECK_MEMACCESS : Pseudo< (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, Sched<[]>; -} - -let Uses = [ X20 ], Defs = [ X16, X17, LR, NZCV ] in { +} + +let Uses = [ X20 ], Defs = [ X16, X17, LR, NZCV ] in { def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo< (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), - [(int_hwasan_check_memaccess_shortgranules X20, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, + [(int_hwasan_check_memaccess_shortgranules X20, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, Sched<[]>; } @@ -1558,16 +1558,16 @@ def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; -def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext_inreg GPR64:$Rm, i32))), - (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>; -def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext GPR32:$Rm))), - (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext_inreg GPR64:$Rm, i32))), + (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext GPR32:$Rm))), + (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>; def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (and GPR64:$Rm, 0xFFFFFFFF))), - (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>; -def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (zext GPR32:$Rm))), - (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>; +def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (and GPR64:$Rm, 0xFFFFFFFF))), + (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>; +def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (zext GPR32:$Rm))), + 
(UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>; def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; @@ -2154,8 +2154,8 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>, Sched<[WriteBrReg]>, PseudoInstExpansion<(BLR GPR64:$Rn)>; - def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>, - Sched<[WriteBrReg]>; + def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>, + Sched<[WriteBrReg]>; } // isCall def : Pat<(AArch64call GPR64:$Rn), @@ -2165,10 +2165,10 @@ def : Pat<(AArch64call GPR64noip:$Rn), (BLRNoIP GPR64noip:$Rn)>, Requires<[SLSBLRMitigation]>; -def : Pat<(AArch64call_rvmarker GPR64:$Rn), - (BLR_RVMARKER GPR64:$Rn)>, - Requires<[NoSLSBLRMitigation]>; - +def : Pat<(AArch64call_rvmarker GPR64:$Rn), + (BLR_RVMARKER GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch @@ -3900,7 +3900,7 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, // Floating point immediate move. //===----------------------------------------------------------------------===// -let isReMaterializable = 1, isAsCheapAsAMove = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm FMOV : FPMoveImmediate<"fmov">; } @@ -3909,7 +3909,7 @@ defm FMOV : FPMoveImmediate<"fmov">; //===----------------------------------------------------------------------===// defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - AArch64uabd>; + AArch64uabd>; // Match UABDL in log2-shuffle patterns. def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), @@ -4041,7 +4041,7 @@ def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; @@ -4160,9 +4160,9 @@ defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>; + TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; @@ -4179,9 +4179,9 @@ defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; -defm 
UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>; + TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; @@ -4579,10 +4579,10 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), (FCVTPSv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), (FCVTPUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_aarch64_neon_fcvtzs (v1f64 FPR64:$Rn))), - (FCVTZSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_aarch64_neon_fcvtzu (v1f64 FPR64:$Rn))), - (FCVTZUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtzs (v1f64 FPR64:$Rn))), + (FCVTZSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtzu (v1f64 FPR64:$Rn))), + (FCVTZUv1i64 FPR64:$Rn)>; def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))), (FRECPEv1f16 FPR16:$Rn)>; @@ -4754,9 +4754,9 @@ defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>; defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - AArch64sabd>; + AArch64sabd>; defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - AArch64sabd>; + AArch64sabd>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", @@ -4777,59 +4777,59 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - AArch64uabd>; + AArch64uabd>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", - BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; + BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", - BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; + BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", - BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; + BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", - BinOpFrag<(sub node:$LHS, (zanyext node:$RHS))>>; - -// Additional patterns for [SU]ML[AS]L -multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperator vecopnode, - Instruction INST8B, Instruction INST4H, Instruction INST2S> { - def : Pat<(v4i16 (opnode - V64:$Ra, - (v4i16 (extract_subvector - (vecopnode (v8i8 V64:$Rn),(v8i8 V64:$Rm)), - (i64 0))))), - (EXTRACT_SUBREG (v8i16 (INST8B - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Ra, dsub), - V64:$Rn, V64:$Rm)), dsub)>; - def : 
Pat<(v2i32 (opnode - V64:$Ra, - (v2i32 (extract_subvector - (vecopnode (v4i16 V64:$Rn),(v4i16 V64:$Rm)), - (i64 0))))), - (EXTRACT_SUBREG (v4i32 (INST4H - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Ra, dsub), - V64:$Rn, V64:$Rm)), dsub)>; - def : Pat<(v1i64 (opnode - V64:$Ra, - (v1i64 (extract_subvector - (vecopnode (v2i32 V64:$Rn),(v2i32 V64:$Rm)), - (i64 0))))), - (EXTRACT_SUBREG (v2i64 (INST2S - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Ra, dsub), - V64:$Rn, V64:$Rm)), dsub)>; -} - -defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull, - UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; -defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull, - SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; -defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull, - UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; -defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull, - SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; - + BinOpFrag<(sub node:$LHS, (zanyext node:$RHS))>>; + +// Additional patterns for [SU]ML[AS]L +multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperator vecopnode, + Instruction INST8B, Instruction INST4H, Instruction INST2S> { + def : Pat<(v4i16 (opnode + V64:$Ra, + (v4i16 (extract_subvector + (vecopnode (v8i8 V64:$Rn),(v8i8 V64:$Rm)), + (i64 0))))), + (EXTRACT_SUBREG (v8i16 (INST8B + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Ra, dsub), + V64:$Rn, V64:$Rm)), dsub)>; + def : Pat<(v2i32 (opnode + V64:$Ra, + (v2i32 (extract_subvector + (vecopnode (v4i16 V64:$Rn),(v4i16 V64:$Rm)), + (i64 0))))), + (EXTRACT_SUBREG (v4i32 (INST4H + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Ra, dsub), + V64:$Rn, V64:$Rm)), dsub)>; + def : Pat<(v1i64 (opnode + V64:$Ra, + (v1i64 (extract_subvector + (vecopnode (v2i32 V64:$Rn),(v2i32 V64:$Rm)), + (i64 0))))), + (EXTRACT_SUBREG (v2i64 (INST2S + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Ra, dsub), + V64:$Rn, V64:$Rm)), dsub)>; +} + +defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull, + UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; +defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull, + SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; +defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull, + UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; +defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull, + SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; + // Additional patterns for SMULL and UMULL multiclass Neon_mul_widen_patterns<SDPatternOperator opnode, Instruction INST8B, Instruction INST4H, Instruction INST2S> { @@ -5041,26 +5041,26 @@ defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; - -let Predicates = [HasFullFP16] in { -def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))), - (FADDPv2i16p - (EXTRACT_SUBREG - (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))), - dsub))>; -def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))), - (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>; -} -def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))), - (FADDPv2i32p - (EXTRACT_SUBREG - (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))), - dsub))>; -def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))), - (FADDPv2i32p V64:$Rn)>; -def : Pat<(f64 (vecreduce_fadd (v2f64 V128:$Rn))), - (FADDPv2i64p V128:$Rn)>; - + +let Predicates = 
[HasFullFP16] in { +def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))), + (FADDPv2i16p + (EXTRACT_SUBREG + (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))), + dsub))>; +def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))), + (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>; +} +def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))), + (FADDPv2i32p + (EXTRACT_SUBREG + (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))), + dsub))>; +def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f64 (vecreduce_fadd (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; + def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -5312,16 +5312,16 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; -def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0), - (i64 VectorIndexH:$imm)), - (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; -def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0), - (i64 VectorIndexS:$imm)), - (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>; -def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), - (i64 VectorIndexD:$imm)), - (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; - +def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0), + (i64 VectorIndexH:$imm)), + (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; +def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0), + (i64 VectorIndexS:$imm)), + (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>; +def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), + (i64 VectorIndexD:$imm)), + (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; + def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), (INSvi16lane @@ -6833,18 +6833,18 @@ def : Pat<(i32 (trunc GPR64sp:$src)), // __builtin_trap() uses the BRK instruction on AArch64. def : Pat<(trap), (BRK 1)>; -def : Pat<(debugtrap), (BRK 0xF000)>; - -def ubsan_trap_xform : SDNodeXForm<timm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() | ('U' << 8), SDLoc(N), MVT::i32); -}]>; - -def ubsan_trap_imm : TImmLeaf<i32, [{ - return isUInt<8>(Imm); -}], ubsan_trap_xform>; - -def : Pat<(ubsantrap ubsan_trap_imm:$kind), (BRK ubsan_trap_imm:$kind)>; - +def : Pat<(debugtrap), (BRK 0xF000)>; + +def ubsan_trap_xform : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() | ('U' << 8), SDLoc(N), MVT::i32); +}]>; + +def ubsan_trap_imm : TImmLeaf<i32, [{ + return isUInt<8>(Imm); +}], ubsan_trap_xform>; + +def : Pat<(ubsantrap ubsan_trap_imm:$kind), (BRK ubsan_trap_imm:$kind)>; + // Multiply high patterns which multiply the lower subvector using smull/umull // and the upper subvector with smull2/umull2. Then shuffle the high the high // part of both results together. @@ -7639,9 +7639,9 @@ def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; -def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), - (vector_extract (v8f16 FPR128:$Rn), (i64 1))), - (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; +def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), + (vector_extract (v8f16 FPR128:$Rn), (i64 1))), + (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. 
def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), @@ -7844,23 +7844,23 @@ let AddedComplexity = 10 in { // FIXME: add SVE dot-product patterns. } -let Predicates = [HasLS64] in { - def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn), - (outs GPR64x8:$Rt)>; - def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn), - (outs)>; - def ST64BV: Store64BV<0b011, "st64bv">; - def ST64BV0: Store64BV<0b010, "st64bv0">; - - class ST64BPattern<Intrinsic intrinsic, Instruction instruction> - : Pat<(intrinsic GPR64sp:$addr, GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7), - (instruction (REG_SEQUENCE GPR64x8Class, $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7), $addr)>; - - def : ST64BPattern<int_aarch64_st64b, ST64B>; - def : ST64BPattern<int_aarch64_st64bv, ST64BV>; - def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>; -} - +let Predicates = [HasLS64] in { + def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn), + (outs GPR64x8:$Rt)>; + def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn), + (outs)>; + def ST64BV: Store64BV<0b011, "st64bv">; + def ST64BV0: Store64BV<0b010, "st64bv0">; + + class ST64BPattern<Intrinsic intrinsic, Instruction instruction> + : Pat<(intrinsic GPR64sp:$addr, GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7), + (instruction (REG_SEQUENCE GPR64x8Class, $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7), $addr)>; + + def : ST64BPattern<int_aarch64_st64b, ST64B>; + def : ST64BPattern<int_aarch64_st64bv, ST64BV>; + def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>; +} + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index ad180cb293..f87385ccd4 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1186,10 +1186,10 @@ bool AArch64LoadStoreOpt::findMatchingStore( // store instruction writes and the stored value is not modified, we can // promote the load. Since we do not handle stores with pre-/post-index, // it's unnecessary to check if BaseReg is modified by the store itself. - // Also we can't handle stores without an immediate offset operand, - // while the operand might be the address for a global variable. + // Also we can't handle stores without an immediate offset operand, + // while the operand might be the address for a global variable. if (MI.mayStore() && isMatchingStore(LoadMI, MI) && - BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() && + BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() && isLdOffsetInRangeOfSt(LoadMI, MI, TII) && ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) { StoreI = MBBI; @@ -1552,27 +1552,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, continue; } } - // If the destination register of one load is the same register or a - // sub/super register of the other load, bail and keep looking. A - // load-pair instruction with both destination registers the same is - // UNPREDICTABLE and will result in an exception. 
- if (MayLoad && - TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) { + // If the destination register of one load is the same register or a + // sub/super register of the other load, bail and keep looking. A + // load-pair instruction with both destination registers the same is + // UNPREDICTABLE and will result in an exception. + if (MayLoad && + TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) { LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); MemInsns.push_back(&MI); continue; } - // If the BaseReg has been modified, then we cannot do the optimization. - // For example, in the following pattern - // ldr x1 [x2] - // ldr x2 [x3] - // ldr x4 [x2, #8], - // the first and third ldr cannot be converted to ldp x1, x4, [x2] - if (!ModifiedRegUnits.available(BaseReg)) - return E; - + // If the BaseReg has been modified, then we cannot do the optimization. + // For example, in the following pattern + // ldr x1 [x2] + // ldr x2 [x3] + // ldr x4 [x2, #8], + // the first and third ldr cannot be converted to ldp x1, x4, [x2] + if (!ModifiedRegUnits.available(BaseReg)) + return E; + // If the Rt of the second instruction was not modified or used between // the two instructions and none of the instructions between the second // and first alias with the second, we can combine the second into the @@ -1763,11 +1763,11 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, return false; } -static bool needsWinCFI(const MachineFunction *MF) { - return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() && - MF->getFunction().needsUnwindTableEntry(); -} - +static bool needsWinCFI(const MachineFunction *MF) { + return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() && + MF->getFunction().needsUnwindTableEntry(); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -1808,11 +1808,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // the memory access (I) and the increment (MBBI) can access the memory // region defined by [SP, MBBI]. const bool BaseRegSP = BaseReg == AArch64::SP; - if (BaseRegSP && needsWinCFI(I->getMF())) { + if (BaseRegSP && needsWinCFI(I->getMF())) { // FIXME: For now, we always block the optimization over SP in windows // targets as it requires to adjust the unwind/debug info, messing up // the unwind info can actually cause a miscompile. - return E; + return E; } for (unsigned Count = 0; MBBI != E && Count < Limit; @@ -1868,14 +1868,14 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( } } - const bool BaseRegSP = BaseReg == AArch64::SP; - if (BaseRegSP && needsWinCFI(I->getMF())) { - // FIXME: For now, we always block the optimization over SP in windows - // targets as it requires to adjust the unwind/debug info, messing up - // the unwind info can actually cause a miscompile. - return E; - } - + const bool BaseRegSP = BaseReg == AArch64::SP; + if (BaseRegSP && needsWinCFI(I->getMF())) { + // FIXME: For now, we always block the optimization over SP in windows + // targets as it requires to adjust the unwind/debug info, messing up + // the unwind info can actually cause a miscompile. + return E; + } + // Track which register units have been modified and used between the first // insn (inclusive) and the second insn. 
ModifiedRegUnits.clear(); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MCInstLower.cpp index 10e191ff44..c923f53281 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -203,12 +203,12 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO, RefFlags |= AArch64MCExpr::VK_SABS; } else { RefFlags |= AArch64MCExpr::VK_ABS; - - if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) - RefFlags |= AArch64MCExpr::VK_PAGE; - else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == - AArch64II::MO_PAGEOFF) - RefFlags |= AArch64MCExpr::VK_PAGEOFF | AArch64MCExpr::VK_NC; + + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefFlags |= AArch64MCExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefFlags |= AArch64MCExpr::VK_PAGEOFF | AArch64MCExpr::VK_NC; } if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index 41343ba970..ebb501b779 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -14,9 +14,9 @@ //===----------------------------------------------------------------------===// #include "AArch64MachineFunctionInfo.h" -#include "AArch64InstrInfo.h" -#include <llvm/IR/Metadata.h> -#include <llvm/IR/Module.h> +#include "AArch64InstrInfo.h" +#include <llvm/IR/Metadata.h> +#include <llvm/IR/Module.h> using namespace llvm; @@ -33,82 +33,82 @@ void AArch64FunctionInfo::initializeBaseYamlFields( if (YamlMFI.HasRedZone.hasValue()) HasRedZone = YamlMFI.HasRedZone; } - -static std::pair<bool, bool> GetSignReturnAddress(const Function &F) { - // The function should be signed in the following situations: - // - sign-return-address=all - // - sign-return-address=non-leaf and the functions spills the LR - if (!F.hasFnAttribute("sign-return-address")) { - const Module &M = *F.getParent(); - if (const auto *Sign = mdconst::extract_or_null<ConstantInt>( - M.getModuleFlag("sign-return-address"))) { - if (Sign->getZExtValue()) { - if (const auto *All = mdconst::extract_or_null<ConstantInt>( - M.getModuleFlag("sign-return-address-all"))) - return {true, All->getZExtValue()}; - return {true, false}; - } - } - return {false, false}; - } - - StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString(); - if (Scope.equals("none")) - return {false, false}; - - if (Scope.equals("all")) - return {true, true}; - - assert(Scope.equals("non-leaf")); - return {true, false}; -} - -static bool ShouldSignWithBKey(const Function &F) { - if (!F.hasFnAttribute("sign-return-address-key")) { - if (const auto *BKey = mdconst::extract_or_null<ConstantInt>( - F.getParent()->getModuleFlag("sign-return-address-with-bkey"))) - return BKey->getZExtValue(); - return false; - } - - const StringRef Key = - F.getFnAttribute("sign-return-address-key").getValueAsString(); - assert(Key.equals_lower("a_key") || Key.equals_lower("b_key")); - return Key.equals_lower("b_key"); -} - -AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) { - // If we already know that the function doesn't have a redzone, set - // HasRedZone here. 
- if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) - HasRedZone = false; - - const Function &F = MF.getFunction(); - std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F); - SignWithBKey = ShouldSignWithBKey(F); - - if (!F.hasFnAttribute("branch-target-enforcement")) { - if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( - F.getParent()->getModuleFlag("branch-target-enforcement"))) - BranchTargetEnforcement = BTE->getZExtValue(); - return; - } - - const StringRef BTIEnable = F.getFnAttribute("branch-target-enforcement").getValueAsString(); - assert(BTIEnable.equals_lower("true") || BTIEnable.equals_lower("false")); - BranchTargetEnforcement = BTIEnable.equals_lower("true"); -} - -bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const { - if (!SignReturnAddress) - return false; - if (SignReturnAddressAll) - return true; - return SpillsLR; -} - -bool AArch64FunctionInfo::shouldSignReturnAddress() const { - return shouldSignReturnAddress(llvm::any_of( - MF.getFrameInfo().getCalleeSavedInfo(), - [](const auto &Info) { return Info.getReg() == AArch64::LR; })); -} + +static std::pair<bool, bool> GetSignReturnAddress(const Function &F) { + // The function should be signed in the following situations: + // - sign-return-address=all + // - sign-return-address=non-leaf and the functions spills the LR + if (!F.hasFnAttribute("sign-return-address")) { + const Module &M = *F.getParent(); + if (const auto *Sign = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address"))) { + if (Sign->getZExtValue()) { + if (const auto *All = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sign-return-address-all"))) + return {true, All->getZExtValue()}; + return {true, false}; + } + } + return {false, false}; + } + + StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString(); + if (Scope.equals("none")) + return {false, false}; + + if (Scope.equals("all")) + return {true, true}; + + assert(Scope.equals("non-leaf")); + return {true, false}; +} + +static bool ShouldSignWithBKey(const Function &F) { + if (!F.hasFnAttribute("sign-return-address-key")) { + if (const auto *BKey = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag("sign-return-address-with-bkey"))) + return BKey->getZExtValue(); + return false; + } + + const StringRef Key = + F.getFnAttribute("sign-return-address-key").getValueAsString(); + assert(Key.equals_lower("a_key") || Key.equals_lower("b_key")); + return Key.equals_lower("b_key"); +} + +AArch64FunctionInfo::AArch64FunctionInfo(MachineFunction &MF) : MF(MF) { + // If we already know that the function doesn't have a redzone, set + // HasRedZone here. 
+ if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) + HasRedZone = false; + + const Function &F = MF.getFunction(); + std::tie(SignReturnAddress, SignReturnAddressAll) = GetSignReturnAddress(F); + SignWithBKey = ShouldSignWithBKey(F); + + if (!F.hasFnAttribute("branch-target-enforcement")) { + if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag("branch-target-enforcement"))) + BranchTargetEnforcement = BTE->getZExtValue(); + return; + } + + const StringRef BTIEnable = F.getFnAttribute("branch-target-enforcement").getValueAsString(); + assert(BTIEnable.equals_lower("true") || BTIEnable.equals_lower("false")); + BranchTargetEnforcement = BTIEnable.equals_lower("true"); +} + +bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const { + if (!SignReturnAddress) + return false; + if (SignReturnAddressAll) + return true; + return SpillsLR; +} + +bool AArch64FunctionInfo::shouldSignReturnAddress() const { + return shouldSignReturnAddress(llvm::any_of( + MF.getFrameInfo().getCalleeSavedInfo(), + [](const auto &Info) { return Info.getReg() == AArch64::LR; })); +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.h index f60e2b6c31..b3f35a46c7 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -35,9 +35,9 @@ class MachineInstr; /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { - /// Backreference to the machine function. - MachineFunction &MF; - + /// Backreference to the machine function. + MachineFunction &MF; + /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, /// all usable during a tail call. @@ -128,39 +128,39 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// that must be forwarded to every musttail call. SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; - /// FrameIndex for the tagged base pointer. - Optional<int> TaggedBasePointerIndex; - - /// Offset from SP-at-entry to the tagged base pointer. - /// Tagged base pointer is set up to point to the first (lowest address) - /// tagged stack slot. - unsigned TaggedBasePointerOffset; + /// FrameIndex for the tagged base pointer. + Optional<int> TaggedBasePointerIndex; + /// Offset from SP-at-entry to the tagged base pointer. + /// Tagged base pointer is set up to point to the first (lowest address) + /// tagged stack slot. + unsigned TaggedBasePointerOffset; + /// OutliningStyle denotes, if a function was outined, how it was outlined, /// e.g. Tail Call, Thunk, or Function if none apply. Optional<std::string> OutliningStyle; - // Offset from SP-after-callee-saved-spills (i.e. SP-at-entry minus - // CalleeSavedStackSize) to the address of the frame record. - int CalleeSaveBaseToFrameRecordOffset = 0; - - /// SignReturnAddress is true if PAC-RET is enabled for the function with - /// defaults being sign non-leaf functions only, with the B key. - bool SignReturnAddress = false; - - /// SignReturnAddressAll modifies the default PAC-RET mode to signing leaf - /// functions as well. 
- bool SignReturnAddressAll = false; - - /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. - bool SignWithBKey = false; - - /// BranchTargetEnforcement enables placing BTI instructions at potential - /// indirect branch destinations. - bool BranchTargetEnforcement = false; - + // Offset from SP-after-callee-saved-spills (i.e. SP-at-entry minus + // CalleeSavedStackSize) to the address of the frame record. + int CalleeSaveBaseToFrameRecordOffset = 0; + + /// SignReturnAddress is true if PAC-RET is enabled for the function with + /// defaults being sign non-leaf functions only, with the B key. + bool SignReturnAddress = false; + + /// SignReturnAddressAll modifies the default PAC-RET mode to signing leaf + /// functions as well. + bool SignReturnAddressAll = false; + + /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. + bool SignWithBKey = false; + + /// BranchTargetEnforcement enables placing BTI instructions at potential + /// indirect branch destinations. + bool BranchTargetEnforcement = false; + public: - explicit AArch64FunctionInfo(MachineFunction &MF); + explicit AArch64FunctionInfo(MachineFunction &MF); void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); @@ -297,14 +297,14 @@ public: void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } unsigned getJumpTableEntrySize(int Idx) const { - return JumpTableEntryInfo[Idx].first; + return JumpTableEntryInfo[Idx].first; } MCSymbol *getJumpTableEntryPCRelSymbol(int Idx) const { - return JumpTableEntryInfo[Idx].second; + return JumpTableEntryInfo[Idx].second; } void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym) { - if ((unsigned)Idx >= JumpTableEntryInfo.size()) - JumpTableEntryInfo.resize(Idx+1); + if ((unsigned)Idx >= JumpTableEntryInfo.size()) + JumpTableEntryInfo.resize(Idx+1); JumpTableEntryInfo[Idx] = std::make_pair(Size, PCRelSym); } @@ -346,11 +346,11 @@ public: return ForwardedMustTailRegParms; } - Optional<int> getTaggedBasePointerIndex() const { - return TaggedBasePointerIndex; - } - void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; } - + Optional<int> getTaggedBasePointerIndex() const { + return TaggedBasePointerIndex; + } + void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; } + unsigned getTaggedBasePointerOffset() const { return TaggedBasePointerOffset; } @@ -358,26 +358,26 @@ public: TaggedBasePointerOffset = Offset; } - int getCalleeSaveBaseToFrameRecordOffset() const { - return CalleeSaveBaseToFrameRecordOffset; - } - void setCalleeSaveBaseToFrameRecordOffset(int Offset) { - CalleeSaveBaseToFrameRecordOffset = Offset; - } - - bool shouldSignReturnAddress() const; - bool shouldSignReturnAddress(bool SpillsLR) const; - - bool shouldSignWithBKey() const { return SignWithBKey; } - - bool branchTargetEnforcement() const { return BranchTargetEnforcement; } - + int getCalleeSaveBaseToFrameRecordOffset() const { + return CalleeSaveBaseToFrameRecordOffset; + } + void setCalleeSaveBaseToFrameRecordOffset(int Offset) { + CalleeSaveBaseToFrameRecordOffset = Offset; + } + + bool shouldSignReturnAddress() const; + bool shouldSignReturnAddress(bool SpillsLR) const; + + bool shouldSignWithBKey() const { return SignWithBKey; } + + bool branchTargetEnforcement() const { return BranchTargetEnforcement; } + private: // Hold the lists of LOHs. 
MILOHContainer LOHContainerSet; SetOfInstructions LOHRelated; - SmallVector<std::pair<unsigned, MCSymbol *>, 2> JumpTableEntryInfo; + SmallVector<std::pair<unsigned, MCSymbol *>, 2> JumpTableEntryInfo; }; namespace yaml { diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MacroFusion.cpp index f3b8ef16d6..0e9cb143f2 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -21,7 +21,7 @@ namespace { /// CMN, CMP, TST followed by Bcc static bool isArithmeticBccPair(const MachineInstr *FirstMI, - const MachineInstr &SecondMI, bool CmpOnly) { + const MachineInstr &SecondMI, bool CmpOnly) { if (SecondMI.getOpcode() != AArch64::Bcc) return false; @@ -29,13 +29,13 @@ static bool isArithmeticBccPair(const MachineInstr *FirstMI, if (FirstMI == nullptr) return true; - // If we're in CmpOnly mode, we only fuse arithmetic instructions that - // discard their result. - if (CmpOnly && !(FirstMI->getOperand(0).getReg() == AArch64::XZR || - FirstMI->getOperand(0).getReg() == AArch64::WZR)) { - return false; - } - + // If we're in CmpOnly mode, we only fuse arithmetic instructions that + // discard their result. + if (CmpOnly && !(FirstMI->getOperand(0).getReg() == AArch64::XZR || + FirstMI->getOperand(0).getReg() == AArch64::WZR)) { + return false; + } + switch (FirstMI->getOpcode()) { case AArch64::ADDSWri: case AArch64::ADDSWrr: @@ -387,11 +387,11 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, // All checking functions assume that the 1st instr is a wildcard if it is // unspecified. - if (ST.hasCmpBccFusion() || ST.hasArithmeticBccFusion()) { - bool CmpOnly = !ST.hasArithmeticBccFusion(); - if (isArithmeticBccPair(FirstMI, SecondMI, CmpOnly)) - return true; - } + if (ST.hasCmpBccFusion() || ST.hasArithmeticBccFusion()) { + bool CmpOnly = !ST.hasArithmeticBccFusion(); + if (isArithmeticBccPair(FirstMI, SecondMI, CmpOnly)) + return true; + } if (ST.hasArithmeticCbzFusion() && isArithmeticCbzPair(FirstMI, SecondMI)) return true; if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI)) diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index 019220e3a5..82b610f995 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -408,11 +408,11 @@ bool AArch64RedundantCopyElimination::optimizeBlock(MachineBasicBlock *MBB) { O.getReg() != CmpReg; })) continue; - - // Don't remove a move immediate that implicitly defines the upper - // bits as different. - if (TRI->isSuperRegister(DefReg, KnownReg.Reg) && KnownReg.Imm < 0) - continue; + + // Don't remove a move immediate that implicitly defines the upper + // bits as different. 
+ if (TRI->isSuperRegister(DefReg, KnownReg.Reg) && KnownReg.Imm < 0) + continue; } if (IsCopy) diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.cpp index f90856d14b..2aeea84ae2 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -24,7 +24,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" @@ -240,14 +240,14 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask; } -const uint32_t *AArch64RegisterInfo::getCustomEHPadPreservedMask( - const MachineFunction &MF) const { - if (MF.getSubtarget<AArch64Subtarget>().isTargetLinux()) - return CSR_AArch64_AAPCS_RegMask; - - return nullptr; -} - +const uint32_t *AArch64RegisterInfo::getCustomEHPadPreservedMask( + const MachineFunction &MF) const { + if (MF.getSubtarget<AArch64Subtarget>().isTargetLinux()) + return CSR_AArch64_AAPCS_RegMask; + + return nullptr; +} + const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { if (TT.isOSDarwin()) return CSR_Darwin_AArch64_TLS_RegMask; @@ -334,16 +334,16 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, } bool AArch64RegisterInfo::isAnyArgRegReserved(const MachineFunction &MF) const { - return llvm::any_of(*AArch64::GPR64argRegClass.MC, [this, &MF](MCPhysReg r) { - return isReservedReg(MF, r); - }); + return llvm::any_of(*AArch64::GPR64argRegClass.MC, [this, &MF](MCPhysReg r) { + return isReservedReg(MF, r); + }); } void AArch64RegisterInfo::emitReservedArgRegCallError( const MachineFunction &MF) const { const Function &F = MF.getFunction(); - F.getContext().diagnose(DiagnosticInfoUnsupported{F, ("AArch64 doesn't support" - " function calls if any of the argument registers is reserved.")}); + F.getContext().diagnose(DiagnosticInfoUnsupported{F, ("AArch64 doesn't support" + " function calls if any of the argument registers is reserved.")}); } bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, @@ -525,16 +525,16 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const { assert(MI && "Unable to get the legal offset for nil instruction."); - StackOffset SaveOffset = StackOffset::getFixed(Offset); + StackOffset SaveOffset = StackOffset::getFixed(Offset); return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; } /// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx /// at the beginning of the basic block. 
-Register -AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - int FrameIdx, - int64_t Offset) const { +Register +AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + int FrameIdx, + int64_t Offset) const { MachineBasicBlock::iterator Ins = MBB->begin(); DebugLoc DL; // Defaults to "unknown" if (Ins != MBB->end()) @@ -544,7 +544,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); + Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); @@ -552,21 +552,21 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addFrameIndex(FrameIdx) .addImm(Offset) .addImm(Shifter); - - return BaseReg; + + return BaseReg; } void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { // ARM doesn't need the general 64-bit offsets - StackOffset Off = StackOffset::getFixed(Offset); + StackOffset Off = StackOffset::getFixed(Offset); unsigned i = 0; while (!MI.getOperand(i).isFI()) { ++i; assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); } - + const MachineFunction *MF = MI.getParent()->getParent(); const AArch64InstrInfo *TII = MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); @@ -596,33 +596,33 @@ createScratchRegisterForInstruction(MachineInstr &MI, } } -void AArch64RegisterInfo::getOffsetOpcodes( - const StackOffset &Offset, SmallVectorImpl<uint64_t> &Ops) const { - // The smallest scalable element supported by scaled SVE addressing - // modes are predicates, which are 2 scalable bytes in size. So the scalable - // byte offset must always be a multiple of 2. - assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); - - // Add fixed-sized offset using existing DIExpression interface. - DIExpression::appendOffset(Ops, Offset.getFixed()); - - unsigned VG = getDwarfRegNum(AArch64::VG, true); - int64_t VGSized = Offset.getScalable() / 2; - if (VGSized > 0) { - Ops.push_back(dwarf::DW_OP_constu); - Ops.push_back(VGSized); - Ops.append({dwarf::DW_OP_bregx, VG, 0ULL}); - Ops.push_back(dwarf::DW_OP_mul); - Ops.push_back(dwarf::DW_OP_plus); - } else if (VGSized < 0) { - Ops.push_back(dwarf::DW_OP_constu); - Ops.push_back(-VGSized); - Ops.append({dwarf::DW_OP_bregx, VG, 0ULL}); - Ops.push_back(dwarf::DW_OP_mul); - Ops.push_back(dwarf::DW_OP_minus); - } -} - +void AArch64RegisterInfo::getOffsetOpcodes( + const StackOffset &Offset, SmallVectorImpl<uint64_t> &Ops) const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); + + // Add fixed-sized offset using existing DIExpression interface. 
+ DIExpression::appendOffset(Ops, Offset.getFixed()); + + unsigned VG = getDwarfRegNum(AArch64::VG, true); + int64_t VGSized = Offset.getScalable() / 2; + if (VGSized > 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(VGSized); + Ops.append({dwarf::DW_OP_bregx, VG, 0ULL}); + Ops.push_back(dwarf::DW_OP_mul); + Ops.push_back(dwarf::DW_OP_plus); + } else if (VGSized < 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(-VGSized); + Ops.append({dwarf::DW_OP_bregx, VG, 0ULL}); + Ops.push_back(dwarf::DW_OP_mul); + Ops.push_back(dwarf::DW_OP_minus); + } +} + void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -640,26 +640,26 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED; Register FrameReg; - // Special handling of dbg_value, stackmap patchpoint statepoint instructions. - if (MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT || - MI.getOpcode() == TargetOpcode::STATEPOINT) { + // Special handling of dbg_value, stackmap patchpoint statepoint instructions. + if (MI.getOpcode() == TargetOpcode::STACKMAP || + MI.getOpcode() == TargetOpcode::PATCHPOINT || + MI.getOpcode() == TargetOpcode::STATEPOINT) { StackOffset Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, /*PreferFP=*/true, /*ForSimm=*/false); - Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); + Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); return; } if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - StackOffset Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); - assert(!Offset.getScalable() && - "Frame offsets with a scalable component are not supported"); - FI.ChangeToImmediate(Offset.getFixed()); + StackOffset Offset = TFI->getNonLocalFrameIndexReference(MF, FrameIndex); + assert(!Offset.getScalable() && + "Frame offsets with a scalable component are not supported"); + FI.ChangeToImmediate(Offset.getFixed()); return; } @@ -668,11 +668,11 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // TAGPstack must use the virtual frame register in its 3rd operand. 
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); FrameReg = MI.getOperand(3).getReg(); - Offset = StackOffset::getFixed(MFI.getObjectOffset(FrameIndex) + - AFI->getTaggedBasePointerOffset()); + Offset = StackOffset::getFixed(MFI.getObjectOffset(FrameIndex) + + AFI->getTaggedBasePointerOffset()); } else if (Tagged) { - StackOffset SPOffset = StackOffset::getFixed( - MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize()); + StackOffset SPOffset = StackOffset::getFixed( + MFI.getObjectOffset(FrameIndex) + (int64_t)MFI.getStackSize()); if (MFI.hasVarSizedObjects() || isAArch64FrameOffsetLegal(MI, SPOffset, nullptr, nullptr, nullptr) != (AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal)) { @@ -693,8 +693,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return; } FrameReg = AArch64::SP; - Offset = StackOffset::getFixed(MFI.getObjectOffset(FrameIndex) + - (int64_t)MFI.getStackSize()); + Offset = StackOffset::getFixed(MFI.getObjectOffset(FrameIndex) + + (int64_t)MFI.getStackSize()); } else { Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); @@ -765,19 +765,19 @@ unsigned AArch64RegisterInfo::getLocalAddressRegister( return getBaseRegister(); return getFrameRegister(MF); } - -/// SrcRC and DstRC will be morphed into NewRC if this returns true -bool AArch64RegisterInfo::shouldCoalesce( - MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, - const TargetRegisterClass *DstRC, unsigned DstSubReg, - const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { - if (MI->isCopy() && - ((DstRC->getID() == AArch64::GPR64RegClassID) || - (DstRC->getID() == AArch64::GPR64commonRegClassID)) && - MI->getOperand(0).getSubReg() && MI->getOperand(1).getSubReg()) - // Do not coalesce in the case of a 32-bit subregister copy - // which implements a 32 to 64 bit zero extension - // which relies on the upper 32 bits being zeroed. - return false; - return true; -} + +/// SrcRC and DstRC will be morphed into NewRC if this returns true +bool AArch64RegisterInfo::shouldCoalesce( + MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, + const TargetRegisterClass *DstRC, unsigned DstSubReg, + const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { + if (MI->isCopy() && + ((DstRC->getID() == AArch64::GPR64RegClassID) || + (DstRC->getID() == AArch64::GPR64commonRegClassID)) && + MI->getOperand(0).getSubReg() && MI->getOperand(1).getSubReg()) + // Do not coalesce in the case of a 32-bit subregister copy + // which implements a 32 to 64 bit zero extension + // which relies on the upper 32 bits being zeroed. + return false; + return true; +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.h index 0c871ac089..b9a4e6ac16 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.h @@ -72,10 +72,10 @@ public: // Funclets on ARM64 Windows don't preserve any registers. const uint32_t *getNoPreservedMask() const override; - // Unwinders may not preserve all Neon and SVE registers. - const uint32_t * - getCustomEHPadPreservedMask(const MachineFunction &MF) const override; - + // Unwinders may not preserve all Neon and SVE registers. 
+ const uint32_t * + getCustomEHPadPreservedMask(const MachineFunction &MF) const override; + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i64 first argument if the calling convention /// is one that can (partially) model this attribute with a preserved mask @@ -107,8 +107,8 @@ public: bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override; - Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, - int64_t Offset) const override; + Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, + int64_t Offset) const override; void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, @@ -128,15 +128,15 @@ public: unsigned getLocalAddressRegister(const MachineFunction &MF) const; bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const; - - /// SrcRC and DstRC will be morphed into NewRC if this returns true - bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC, - unsigned SubReg, const TargetRegisterClass *DstRC, - unsigned DstSubReg, const TargetRegisterClass *NewRC, - LiveIntervals &LIS) const override; - - void getOffsetOpcodes(const StackOffset &Offset, - SmallVectorImpl<uint64_t> &Ops) const override; + + /// SrcRC and DstRC will be morphed into NewRC if this returns true + bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC, + unsigned SubReg, const TargetRegisterClass *DstRC, + unsigned DstSubReg, const TargetRegisterClass *NewRC, + LiveIntervals &LIS) const override; + + void getOffsetOpcodes(const StackOffset &Offset, + SmallVectorImpl<uint64_t> &Ops) const override; }; } // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.td index 28d1988b8a..17ad5b997c 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64RegisterInfo.td @@ -711,32 +711,32 @@ def XSeqPairClassOperand : //===----- END: v8.1a atomic CASP register operands -----------------------===// -//===----------------------------------------------------------------------===// -// Armv8.7a accelerator extension register operands: 8 consecutive GPRs -// starting with an even one - -let Namespace = "AArch64" in { - foreach i = 0-7 in - def "x8sub_"#i : SubRegIndex<64, !mul(64, i)>; -} - -def Tuples8X : RegisterTuples< - !foreach(i, [0,1,2,3,4,5,6,7], !cast<SubRegIndex>("x8sub_"#i)), - !foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>; - -def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>; -def GPR64x8AsmOp : AsmOperandClass { - let Name = "GPR64x8"; - let ParserMethod = "tryParseGPR64x8"; - let RenderMethod = "addRegOperands"; -} -def GPR64x8 : RegisterOperand<GPR64x8Class, "printGPR64x8"> { - let ParserMatchClass = GPR64x8AsmOp; - let PrintMethod = "printGPR64x8"; -} - -//===----- END: v8.7a accelerator extension register operands -------------===// - +//===----------------------------------------------------------------------===// +// Armv8.7a accelerator extension register operands: 8 consecutive GPRs +// starting with an even one + +let Namespace = "AArch64" in { + foreach i = 0-7 in + def "x8sub_"#i : SubRegIndex<64, !mul(64, i)>; +} + +def 
Tuples8X : RegisterTuples< + !foreach(i, [0,1,2,3,4,5,6,7], !cast<SubRegIndex>("x8sub_"#i)), + !foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>; + +def GPR64x8Class : RegisterClass<"AArch64", [i64], 64, (trunc Tuples8X, 12)>; +def GPR64x8AsmOp : AsmOperandClass { + let Name = "GPR64x8"; + let ParserMethod = "tryParseGPR64x8"; + let RenderMethod = "addRegOperands"; +} +def GPR64x8 : RegisterOperand<GPR64x8Class, "printGPR64x8"> { + let ParserMatchClass = GPR64x8AsmOp; + let PrintMethod = "printGPR64x8"; +} + +//===----- END: v8.7a accelerator extension register operands -------------===// + // SVE predicate registers def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>; def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 03b32967a2..84e6327550 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -221,9 +221,9 @@ shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, // if so, return it. std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU()); auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); - auto It = SIMDInstrTable.find(InstID); - if (It != SIMDInstrTable.end()) - return It->second; + auto It = SIMDInstrTable.find(InstID); + if (It != SIMDInstrTable.end()) + return It->second; unsigned SCIdx = InstDesc->getSchedClass(); const MCSchedClassDesc *SCDesc = @@ -291,9 +291,9 @@ bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { case Interleave: std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU()); - auto It = InterlEarlyExit.find(Subtarget); - if (It != InterlEarlyExit.end()) - return It->second; + auto It = InterlEarlyExit.find(Subtarget); + if (It != InterlEarlyExit.end()) + return It->second; for (auto &I : IRT) { OriginalMCID = &TII->get(I.OrigOpc); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SVEInstrInfo.td index e09b8401c0..19a71f606b 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -152,8 +152,8 @@ def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>; def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>; def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>; def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>; -def AArch64saddv_p : SDNode<"AArch64ISD::SADDV_PRED", SDT_AArch64Reduce>; -def AArch64uaddv_p : SDNode<"AArch64ISD::UADDV_PRED", SDT_AArch64Reduce>; +def AArch64saddv_p : SDNode<"AArch64ISD::SADDV_PRED", SDT_AArch64Reduce>; +def AArch64uaddv_p : SDNode<"AArch64ISD::UADDV_PRED", SDT_AArch64Reduce>; def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; @@ -166,84 +166,84 @@ def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; def SDT_AArch64Arith : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, - SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3> + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3> ]>; def SDT_AArch64FMA : SDTypeProfile<1, 4, [ 
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>, - SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4> + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4> ]>; // Predicated operations with the result of inactive lanes being unspecified. def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; -def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; +def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; -def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; +def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>; -def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>; -def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>; -def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>; -def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>; -def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>; -def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>; -def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>; +def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>; +def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>; +def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>; +def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>; +def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>; +def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>; +def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>; def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; -def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; -def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; -def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; +def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; +def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; +def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; -def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; -def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; - -def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>, - SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4> -]>; - -// Predicated operations with the result of inactive lanes provided by the last operand. 
-def AArch64clz_mt : SDNode<"AArch64ISD::CTLZ_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64cnt_mt : SDNode<"AArch64ISD::CTPOP_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64fneg_mt : SDNode<"AArch64ISD::FNEG_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64fabs_mt : SDNode<"AArch64ISD::FABS_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64abs_mt : SDNode<"AArch64ISD::ABS_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64neg_mt : SDNode<"AArch64ISD::NEG_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; -def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; -def AArch64frintp_mt : SDNode<"AArch64ISD::FCEIL_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frintm_mt : SDNode<"AArch64ISD::FFLOOR_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frinti_mt : SDNode<"AArch64ISD::FNEARBYINT_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frintx_mt : SDNode<"AArch64ISD::FRINT_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frinta_mt : SDNode<"AArch64ISD::FROUND_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frintn_mt : SDNode<"AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frintz_mt : SDNode<"AArch64ISD::FTRUNC_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>; -def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; - -// These are like the above but we don't yet have need for ISD nodes. They allow -// a single pattern to match intrinsic and ISD operand layouts. -def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cls node:$pt, node:$pg, node:$op)]>; -def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>; -def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>; - -def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, - SDTCVecEltisVT<1,i1> -]>; - -def SDT_AArch64FCVTR : SDTypeProfile<1, 4, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVec<4>, - SDTCVecEltisVT<1,i1> -]>; - -def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64FCVTR>; -def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>; -def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; -def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; -def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; -def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>; - +def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; +def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; + +def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4> +]>; + +// Predicated operations with the result of inactive lanes provided by the last operand. 
+def AArch64clz_mt : SDNode<"AArch64ISD::CTLZ_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64cnt_mt : SDNode<"AArch64ISD::CTPOP_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fneg_mt : SDNode<"AArch64ISD::FNEG_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fabs_mt : SDNode<"AArch64ISD::FABS_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64abs_mt : SDNode<"AArch64ISD::ABS_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64neg_mt : SDNode<"AArch64ISD::NEG_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64frintp_mt : SDNode<"AArch64ISD::FCEIL_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintm_mt : SDNode<"AArch64ISD::FFLOOR_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frinti_mt : SDNode<"AArch64ISD::FNEARBYINT_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintx_mt : SDNode<"AArch64ISD::FRINT_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frinta_mt : SDNode<"AArch64ISD::FROUND_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintn_mt : SDNode<"AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintz_mt : SDNode<"AArch64ISD::FTRUNC_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; + +// These are like the above but we don't yet have need for ISD nodes. They allow +// a single pattern to match intrinsic and ISD operand layouts. 
+def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cls node:$pt, node:$pg, node:$op)]>; +def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>; +def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>; + +def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<1,i1> +]>; + +def SDT_AArch64FCVTR : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1> +]>; + +def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64FCVTR>; +def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>; + def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; @@ -263,24 +263,24 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>; def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; -def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs), - [(setoge node:$lhs, node:$rhs), - (setge node:$lhs, node:$rhs)]>; -def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs), - [(setogt node:$lhs, node:$rhs), - (setgt node:$lhs, node:$rhs)]>; -def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs), - [(setoeq node:$lhs, node:$rhs), - (seteq node:$lhs, node:$rhs)]>; -def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs), - [(setone node:$lhs, node:$rhs), - (setne node:$lhs, node:$rhs)]>; -def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), - (AArch64mul_p node:$pred, node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - - +def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs), + [(setoge node:$lhs, node:$rhs), + (setge node:$lhs, node:$rhs)]>; +def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs), + [(setogt node:$lhs, node:$rhs), + (setgt node:$lhs, node:$rhs)]>; +def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs), + [(setoeq node:$lhs, node:$rhs), + (seteq node:$lhs, node:$rhs)]>; +def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs), + [(setone node:$lhs, node:$rhs), + (setne node:$lhs, node:$rhs)]>; +def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), + (AArch64mul_p node:$pred, node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + + let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; @@ -305,7 +305,7 @@ let Predicates = [HasSVE] in { defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>; - defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>; + defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>; let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { defm 
ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>; @@ -328,12 +328,12 @@ let Predicates = [HasSVE] in { defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; - defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>; - defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>; + defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>; + defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>; // SVE predicated integer reductions. - defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", AArch64saddv_p>; - defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", AArch64uaddv_p>; + defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", AArch64saddv_p>; + defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", AArch64uaddv_p>; defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>; defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>; defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>; @@ -346,17 +346,17 @@ let Predicates = [HasSVE] in { defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>; defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>; - defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>; - defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>; - defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_p>; - defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_p>; + defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>; + defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>; + defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_p>; + defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_p>; - defm MUL_ZI : sve_int_arith_imm2<"mul", AArch64mul_p>; - defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", "MUL_ZPZZ", int_aarch64_sve_mul, DestructiveBinaryComm>; - defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", "SMULH_ZPZZ", int_aarch64_sve_smulh, DestructiveBinaryComm>; - defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", "UMULH_ZPZZ", int_aarch64_sve_umulh, DestructiveBinaryComm>; + defm MUL_ZI : sve_int_arith_imm2<"mul", AArch64mul_p>; + defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", "MUL_ZPZZ", int_aarch64_sve_mul, DestructiveBinaryComm>; + defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", "SMULH_ZPZZ", int_aarch64_sve_smulh, DestructiveBinaryComm>; + defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", "UMULH_ZPZZ", int_aarch64_sve_umulh, DestructiveBinaryComm>; - defm MUL_ZPZZ : sve_int_bin_pred_bhsd<AArch64mul_p>; + defm MUL_ZPZZ : sve_int_bin_pred_bhsd<AArch64mul_p>; defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">; defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">; @@ -372,34 +372,34 @@ let Predicates = [HasSVE] in { defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; - defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", AArch64sxt_mt>; - defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", AArch64uxt_mt>; - defm SXTH_ZPmZ : 
sve_int_un_pred_arit_0_w<0b010, "sxth", AArch64sxt_mt>; - defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", AArch64uxt_mt>; - defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", AArch64sxt_mt>; - defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", AArch64uxt_mt>; - defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", AArch64abs_mt>; - defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", AArch64neg_mt>; - - defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", AArch64cls_mt>; - defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", AArch64clz_mt>; - defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", AArch64cnt_mt>; - defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", AArch64cnot_mt>; - defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", AArch64not_mt>; - defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>; - defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>; - - defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>; - defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>; - defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>; - defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", "UMIN_ZPZZ", int_aarch64_sve_umin, DestructiveBinaryComm>; - defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", "SABD_ZPZZ", int_aarch64_sve_sabd, DestructiveBinaryComm>; - defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", "UABD_ZPZZ", int_aarch64_sve_uabd, DestructiveBinaryComm>; - - defm SMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64smax_p>; - defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>; - defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>; - defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>; + defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", AArch64sxt_mt>; + defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", AArch64uxt_mt>; + defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", AArch64sxt_mt>; + defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", AArch64uxt_mt>; + defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", AArch64sxt_mt>; + defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", AArch64uxt_mt>; + defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", AArch64abs_mt>; + defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", AArch64neg_mt>; + + defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", AArch64cls_mt>; + defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", AArch64clz_mt>; + defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", AArch64cnt_mt>; + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", AArch64cnot_mt>; + defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", AArch64not_mt>; + defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>; + defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>; + + defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>; + defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>; + defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>; + defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", "UMIN_ZPZZ", int_aarch64_sve_umin, DestructiveBinaryComm>; + defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", "SABD_ZPZZ", int_aarch64_sve_sabd, 
DestructiveBinaryComm>; + defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", "UABD_ZPZZ", int_aarch64_sve_uabd, DestructiveBinaryComm>; + + defm SMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64smax_p>; + defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>; + defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>; + defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>; defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>; defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>; @@ -428,11 +428,11 @@ let Predicates = [HasSVE] in { defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">; defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>; - defm FSUB_ZPZZ : sve_fp_bin_pred_hfd<AArch64fsub_p>; - defm FMUL_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmul_p>; - defm FMAXNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmaxnm_p>; - defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>; - defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>; + defm FSUB_ZPZZ : sve_fp_bin_pred_hfd<AArch64fsub_p>; + defm FMUL_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmul_p>; + defm FMAXNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmaxnm_p>; + defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>; + defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>; let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>; @@ -449,10 +449,10 @@ let Predicates = [HasSVE] in { defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>; } - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>; + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>; defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>; defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>; @@ -476,14 +476,14 @@ let Predicates = [HasSVE] in { // regalloc. 
def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)), (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)), - (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)), - (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv4f16 (AArch64fma_p nxv4i1:$P, nxv4f16:$Op1, nxv4f16:$Op2, nxv4f16:$Op3)), + (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv2f16 (AArch64fma_p nxv2i1:$P, nxv2f16:$Op1, nxv2f16:$Op2, nxv2f16:$Op3)), + (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>; def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)), (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; - def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)), - (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; + def : Pat<(nxv2f32 (AArch64fma_p nxv2i1:$P, nxv2f32:$Op1, nxv2f32:$Op2, nxv2f32:$Op3)), + (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>; def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)), (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>; @@ -534,8 +534,8 @@ let Predicates = [HasSVE] in { (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))), - (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; // Duplicate +0.0 into all vector elements def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; @@ -544,7 +544,7 @@ let Predicates = [HasSVE] in { def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>; - def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; // Duplicate Int immediate into all vector elements def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), @@ -573,7 +573,7 @@ let Predicates = [HasSVE] in { } // Select elements from either vector (predicated) - defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; + defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>; @@ -582,8 +582,8 @@ let Predicates = [HasSVE] in { defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; - defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; - defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; + defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; + defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>; @@ -1035,7 +1035,7 @@ let Predicates = [HasSVE] in { def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>; - multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> { + multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction 
RegRegInst, int scale, ComplexPattern AddrCP> { // reg + imm let AddedComplexity = 2 in { def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)), @@ -1145,29 +1145,29 @@ let Predicates = [HasSVE] in { def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), (ZIP2_PPP_B PPR:$Ps, (PFALSE))>; - // Extract subvectors from FP SVE vectors - def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), - (UUNPKLO_ZZ_D ZPR:$Zs)>; - def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))), - (UUNPKHI_ZZ_D ZPR:$Zs)>; - def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), - (UUNPKLO_ZZ_S ZPR:$Zs)>; - def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), - (UUNPKHI_ZZ_S ZPR:$Zs)>; - def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 0))), - (UUNPKLO_ZZ_D ZPR:$Zs)>; - def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 2))), - (UUNPKHI_ZZ_D ZPR:$Zs)>; - - def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))), - (UUNPKLO_ZZ_D ZPR:$Zs)>; - def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))), - (UUNPKHI_ZZ_D ZPR:$Zs)>; - def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))), - (UUNPKLO_ZZ_S ZPR:$Zs)>; - def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))), - (UUNPKHI_ZZ_S ZPR:$Zs)>; - + // Extract subvectors from FP SVE vectors + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_S ZPR:$Zs)>; + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), + (UUNPKHI_ZZ_S ZPR:$Zs)>; + def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + + def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_S ZPR:$Zs)>; + def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))), + (UUNPKHI_ZZ_S ZPR:$Zs)>; + // Concatenate two predicates. def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), (UZP1_PPP_S $p1, $p2)>; @@ -1176,18 +1176,18 @@ let Predicates = [HasSVE] in { def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)), (UZP1_PPP_B $p1, $p2)>; - // Concatenate two floating point vectors. - def : Pat<(nxv4f16 (concat_vectors nxv2f16:$v1, nxv2f16:$v2)), - (UZP1_ZZZ_S $v1, $v2)>; - def : Pat<(nxv8f16 (concat_vectors nxv4f16:$v1, nxv4f16:$v2)), - (UZP1_ZZZ_H $v1, $v2)>; - def : Pat<(nxv4f32 (concat_vectors nxv2f32:$v1, nxv2f32:$v2)), - (UZP1_ZZZ_S $v1, $v2)>; - def : Pat<(nxv4bf16 (concat_vectors nxv2bf16:$v1, nxv2bf16:$v2)), - (UZP1_ZZZ_S $v1, $v2)>; - def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)), - (UZP1_ZZZ_H $v1, $v2)>; - + // Concatenate two floating point vectors. 
+ def : Pat<(nxv4f16 (concat_vectors nxv2f16:$v1, nxv2f16:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv8f16 (concat_vectors nxv4f16:$v1, nxv4f16:$v2)), + (UZP1_ZZZ_H $v1, $v2)>; + def : Pat<(nxv4f32 (concat_vectors nxv2f32:$v1, nxv2f32:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv4bf16 (concat_vectors nxv2bf16:$v1, nxv2bf16:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)), + (UZP1_ZZZ_H $v1, $v2)>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; @@ -1217,10 +1217,10 @@ let Predicates = [HasSVE] in { defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; - defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>; + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>; defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>; defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; @@ -1345,146 +1345,146 @@ let Predicates = [HasSVE] in { defm INDEX_II : sve_int_index_ii<"index", index_vector>; // Unpredicated shifts - defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>; - defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>; - defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>; + defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>; + defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>; + defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>; defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr", "LSR_ZPZI", int_aarch64_sve_lsr>; - defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl", "LSL_ZPZI", int_aarch64_sve_lsl>; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; - - defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; - defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; - defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>; - + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", 
int_aarch64_sve_asr>; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr", "LSR_ZPZI", int_aarch64_sve_lsr>; + defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl", "LSL_ZPZI", int_aarch64_sve_lsl>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; + + defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; + defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; + defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>; + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { - defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>; - defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>; - defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>; + defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>; + defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>; + defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>; defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>; } - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>; defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>; defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>; - defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>; - defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>; - defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>; - + defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>; + defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>; + defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>; + defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; - defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zdr<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, AArch64fcvtr_mt, nxv4f16, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, AArch64fcvte_mt, nxv4f32, nxv4i1, nxv4f16, ElementSizeS>; - defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, 
nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zdr<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, AArch64fcvtr_mt, nxv2f16, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; - defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 
0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; - defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - - def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), - (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - // FP_ROUND has an additional 'precise' flag which indicates the type of rounding. - // This is ignored by the pattern below where it is matched by (i64 timm0_1) - def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), - (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - // Floating-point -> signed integer - def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), - (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))), - (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 PPR:$Pg), - (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))), - (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), - (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))), - (SCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), - (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))), - (SCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), - (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))), - (SCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - // Floating-point -> unsigned integer - def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), - (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), - (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), - (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), - (UCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 PPR:$Pg), - (and (nxv4i32 ZPR:$Zs), - (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), - (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), - (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), - (UCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), - (and (nxv2i64 ZPR:$Zs), - (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), - (UCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; - - defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>; - defm 
FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>; - defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", AArch64frintm_mt>; - defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", AArch64frintz_mt>; - defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", AArch64frinta_mt>; - defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", AArch64frintx_mt>; - defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>; - defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>; - defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>; - + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zdr<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, AArch64fcvtr_mt, nxv4f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, AArch64fcvte_mt, nxv4f32, nxv4i1, nxv4f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zdr<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, AArch64fcvtr_mt, nxv2f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, 
nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + + def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), + (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // FP_ROUND has an additional 'precise' flag which indicates the type of rounding. 
+ // This is ignored by the pattern below where it is matched by (i64 timm0_1) + def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), + (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // Floating-point -> signed integer + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))), + (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 PPR:$Pg), + (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))), + (SCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))), + (SCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))), + (SCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))), + (SCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // Floating-point -> unsigned integer + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 PPR:$Pg), + (and (nxv4i32 ZPR:$Zs), + (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), + (UCVTF_ZPmZ_StoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 PPR:$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), + (UCVTF_ZPmZ_StoD ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>; + defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>; + defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", AArch64frintm_mt>; + defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", AArch64frintz_mt>; + defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", AArch64frinta_mt>; + defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", AArch64frintx_mt>; + defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>; + defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>; + defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>; + let Predicates = [HasBF16, HasSVE] in { defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>; @@ -1648,9 +1648,9 @@ let Predicates = [HasSVE] in { def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>; } - def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))), - (ADDVL_XXI GPR64:$op, $imm)>; - + def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))), + (ADDVL_XXI GPR64:$op, $imm)>; + // FIXME: BigEndian requires an additional REV instruction to satisfy the // constraint that none of the bits change when stored to memory as one // type, and and reloaded as 
another type. @@ -1721,7 +1721,7 @@ let Predicates = [HasSVE] in { def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; } - // These allow casting from/to unpacked predicate types. + // These allow casting from/to unpacked predicate types. def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; @@ -1736,18 +1736,18 @@ let Predicates = [HasSVE] in { def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; - // These allow casting from/to unpacked floating-point types. - def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8f16 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv4f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8f16 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv2f32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv4f32 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv2bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8bf16 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv4bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - def : Pat<(nxv8bf16 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; - + // These allow casting from/to unpacked floating-point types. 
+ def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8f16 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8f16 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv2f32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4f32 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv2bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8bf16 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8bf16 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)), (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>; def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)), @@ -1800,7 +1800,7 @@ let Predicates = [HasSVE] in { defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>; defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; - defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; + defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>; // 16-element contiguous loads defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>; @@ -1838,10 +1838,10 @@ let Predicates = [HasSVE] in { defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>; // 8-element contiguous stores - defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>; - defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; - defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; - defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>; + defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; + defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>; // 16-element contiguous stores defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>; @@ -2003,7 +2003,7 @@ let Predicates = [HasSVE] in { defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>; defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>; - defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>; + defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>; // 16-element contiguous loads defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; @@ -2043,7 +2043,7 @@ let Predicates = [HasSVE] in { defm : 
ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>; defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>; defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>; - defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>; + defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>; // 16-element contiguous non-faulting loads defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>; @@ -2084,7 +2084,7 @@ let Predicates = [HasSVE] in { defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>; defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>; defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>; - defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>; + defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>; // 16-element contiguous first faulting loads defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>; @@ -2135,19 +2135,19 @@ let Predicates = [HasSVE] in { def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)), (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; - def : Pat<(nxv8f16 (vector_insert (nxv8f16 (undef)), (f16 FPR16:$src), 0)), - (INSERT_SUBREG (nxv8f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; - def : Pat<(nxv4f16 (vector_insert (nxv4f16 (undef)), (f16 FPR16:$src), 0)), - (INSERT_SUBREG (nxv4f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; - def : Pat<(nxv2f16 (vector_insert (nxv2f16 (undef)), (f16 FPR16:$src), 0)), - (INSERT_SUBREG (nxv2f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; - def : Pat<(nxv4f32 (vector_insert (nxv4f32 (undef)), (f32 FPR32:$src), 0)), - (INSERT_SUBREG (nxv4f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; - def : Pat<(nxv2f32 (vector_insert (nxv2f32 (undef)), (f32 FPR32:$src), 0)), - (INSERT_SUBREG (nxv2f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; - def : Pat<(nxv2f64 (vector_insert (nxv2f64 (undef)), (f64 FPR64:$src), 0)), - (INSERT_SUBREG (nxv2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; - + def : Pat<(nxv8f16 (vector_insert (nxv8f16 (undef)), (f16 FPR16:$src), 0)), + (INSERT_SUBREG (nxv8f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; + def : Pat<(nxv4f16 (vector_insert (nxv4f16 (undef)), (f16 FPR16:$src), 0)), + (INSERT_SUBREG (nxv4f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; + def : Pat<(nxv2f16 (vector_insert (nxv2f16 (undef)), (f16 FPR16:$src), 0)), + (INSERT_SUBREG (nxv2f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 (undef)), (f32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv4f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2f32 (vector_insert (nxv2f32 (undef)), (f32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv2f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 (undef)), (f64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + // Insert scalar into vector[0] def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)), (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>; @@ -2211,28 +2211,28 @@ let Predicates = [HasSVE] in { (DUP_ZR_D $index)), $src)>; - // Extract element from vector with scalar index - def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)), - (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)), - (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(i32 (vector_extract 
(nxv4i32 ZPR:$vec), GPR64:$index)), - (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)), - (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)), - (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)), - (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), GPR64:$index)), - (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)), - (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)), - (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; - def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)), - (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; - + // Extract element from vector with scalar index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + // Extract element from vector with immediate index def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)), (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>; @@ -2244,54 +2244,54 @@ let Predicates = [HasSVE] in { (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; - def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), - (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; - def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)), - (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>; - def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)), - (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>; + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; + def : Pat<(f16 (vector_extract 
(nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>; + def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>; def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)), (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; - def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)), - (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>; + def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>; def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; - // Extract element from vector with immediate index that's within the bottom 128-bits. - let AddedComplexity = 1 in { - def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), - (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>; - def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), - (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>; - def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)), - (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>; - def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)), - (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>; - } - - // Extract first element from vector. - let AddedComplexity = 2 in { - def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)), - (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; - def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)), - (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; - def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)), - (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; - def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)), - (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; - def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)), - (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; - def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)), - (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; - def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)), - (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; - def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)), - (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; - def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)), - (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; - def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), - (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; - } + // Extract element from vector with immediate index that's within the bottom 128-bits. + let AddedComplexity = 1 in { + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), + (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), + (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)), + (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)), + (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>; + } + + // Extract first element from vector. 
+ let AddedComplexity = 2 in { + def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)), + (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)), + (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)), + (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)), + (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; + def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; + def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; + def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; + def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)), + (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)), + (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), + (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; + } } let Predicates = [HasSVE, HasMatMulInt8] in { @@ -2350,10 +2350,10 @@ let Predicates = [HasSVE2] in { defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>; // SVE2 integer multiply vectors (unpredicated) - defm MUL_ZZZ : sve2_int_mul<0b000, "mul", null_frag, AArch64mul_p>; + defm MUL_ZZZ : sve2_int_mul<0b000, "mul", null_frag, AArch64mul_p>; defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>; defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>; - defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>; + defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>; // Add patterns for unpredicated version of smulh and umulh. def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)), @@ -2372,7 +2372,7 @@ let Predicates = [HasSVE2] in { (UMULH_ZZZ_S $Op1, $Op2)>; def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)), (UMULH_ZZZ_D $Op1, $Op2)>; - + // SVE2 complex integer dot product (indexed) defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>; @@ -2494,11 +2494,11 @@ let Predicates = [HasSVE2] in { } // SVE2 predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; - defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>; + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; + defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>; // SVE2 integer add/subtract long defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA55.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA55.td index 50911fd22b..0b45a3ba09 100644 --- 
a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA55.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA55.td @@ -1,339 +1,339 @@ -//==- AArch64SchedCortexA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the ARM Cortex-A55 processors. -// -//===----------------------------------------------------------------------===// - -// ===---------------------------------------------------------------------===// -// The following definitions describe the per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. - -// Cortex-A55 machine model for scheduling and other instruction cost heuristics. -def CortexA55Model : SchedMachineModel { - let MicroOpBufferSize = 0; // The Cortex-A55 is an in-order processor - let IssueWidth = 2; // It dual-issues under most circumstances - let LoadLatency = 4; // Cycles for loads to access the cache. The - // optimisation guide shows that most loads have - // a latency of 3, but some have a latency of 4 - // or 5. Setting it 4 looked to be good trade-off. - let MispredictPenalty = 8; // A branch direction mispredict. - let PostRAScheduler = 1; // Enable PostRA scheduler pass. - let CompleteModel = 0; // Covers instructions applicable to Cortex-A55. - - list<Predicate> UnsupportedFeatures = [HasSVE]; - - // FIXME: Remove when all errors have been fixed. - let FullInstRWOverlapCheck = 0; -} - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available. - -// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the -// Cortex-A55 is in-order. - -def CortexA55UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU -def CortexA55UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC, 64-bi wide -def CortexA55UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division, not pipelined -def CortexA55UnitLd : ProcResource<1> { let BufferSize = 0; } // Load pipe -def CortexA55UnitSt : ProcResource<1> { let BufferSize = 0; } // Store pipe -def CortexA55UnitB : ProcResource<1> { let BufferSize = 0; } // Branch - -// The FP DIV/SQRT instructions execute totally differently from the FP ALU -// instructions, which can mostly be dual-issued; that's why for now we model -// them with 2 resources. -def CortexA55UnitFPALU : ProcResource<2> { let BufferSize = 0; } // FP ALU -def CortexA55UnitFPMAC : ProcResource<2> { let BufferSize = 0; } // FP MAC -def CortexA55UnitFPDIV : ProcResource<1> { let BufferSize = 0; } // FP Div/SQRT, 64/128 - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedWrite types - -let SchedModel = CortexA55Model in { - -// These latencies are modeled without taking into account forwarding paths -// (the software optimisation guide lists latencies taking into account -// typical forwarding paths). 
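A minimal sketch of the convention these WriteRes entries follow (the ExampleUnit and ExampleWrite names are invented for illustration and would have to sit inside the model's let SchedModel block): Latency is the cycle count from issue until the result is available, while ResourceCycles is how long the unit stays occupied, which is what limits throughput on an in-order core whose resources have BufferSize = 0.

// Hypothetical sketch only; ExampleUnit and ExampleWrite are not Cortex-A55 definitions.
def ExampleUnit  : ProcResource<1> { let BufferSize = 0; }   // a single in-order pipe
def ExampleWrite : SchedWriteRes<[ExampleUnit]> {
  let Latency = 6;             // result is ready 6 cycles after issue
  let ResourceCycles = [3];    // pipe stays busy 3 cycles, so at most one such op per 3 cycles
}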
-def : WriteRes<WriteImm, [CortexA55UnitALU]> { let Latency = 3; } // MOVN, MOVZ -def : WriteRes<WriteI, [CortexA55UnitALU]> { let Latency = 3; } // ALU -def : WriteRes<WriteISReg, [CortexA55UnitALU]> { let Latency = 3; } // ALU of Shifted-Reg -def : WriteRes<WriteIEReg, [CortexA55UnitALU]> { let Latency = 3; } // ALU of Extended-Reg -def : WriteRes<WriteExtr, [CortexA55UnitALU]> { let Latency = 3; } // EXTR from a reg pair -def : WriteRes<WriteIS, [CortexA55UnitALU]> { let Latency = 3; } // Shift/Scale - -// MAC -def : WriteRes<WriteIM32, [CortexA55UnitMAC]> { let Latency = 4; } // 32-bit Multiply -def : WriteRes<WriteIM64, [CortexA55UnitMAC]> { let Latency = 4; } // 64-bit Multiply - -// Div -def : WriteRes<WriteID32, [CortexA55UnitDiv]> { - let Latency = 8; let ResourceCycles = [8]; -} -def : WriteRes<WriteID64, [CortexA55UnitDiv]> { - let Latency = 8; let ResourceCycles = [8]; -} - -// Load -def : WriteRes<WriteLD, [CortexA55UnitLd]> { let Latency = 3; } -def : WriteRes<WriteLDIdx, [CortexA55UnitLd]> { let Latency = 4; } -def : WriteRes<WriteLDHi, [CortexA55UnitLd]> { let Latency = 5; } - -// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd -// below, choosing the median of 3 which makes the latency 6. -// An extra cycle is needed to get the swizzling right. -def : WriteRes<WriteVLD, [CortexA55UnitLd]> { let Latency = 6; - let ResourceCycles = [3]; } -def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; } -def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5; - let ResourceCycles = [2]; } -def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6; - let ResourceCycles = [3]; } -def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 7; - let ResourceCycles = [4]; } -def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 8; - let ResourceCycles = [5]; } -def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 9; - let ResourceCycles = [6]; } -def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 10; - let ResourceCycles = [7]; } -def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11; - let ResourceCycles = [8]; } - -// Pre/Post Indexing - Performed as part of address generation -def : WriteRes<WriteAdr, []> { let Latency = 0; } - -// Store -def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 4; } -def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 4; } -def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 4; } -def : WriteRes<WriteSTX, [CortexA55UnitSt]> { let Latency = 4; } - -// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. 
-def : WriteRes<WriteVST, [CortexA55UnitSt]> { let Latency = 5; - let ResourceCycles = [2];} -def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 4; } -def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5; - let ResourceCycles = [2]; } -def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 6; - let ResourceCycles = [3]; } -def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5; - let ResourceCycles = [4]; } - -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } - -// Branch -def : WriteRes<WriteBr, [CortexA55UnitB]>; -def : WriteRes<WriteBrReg, [CortexA55UnitB]>; -def : WriteRes<WriteSys, [CortexA55UnitB]>; -def : WriteRes<WriteBarrier, [CortexA55UnitB]>; -def : WriteRes<WriteHint, [CortexA55UnitB]>; - -// FP ALU -// As WriteF result is produced in F5 and it can be mostly forwarded -// to consumer at F1, the effectively latency is set as 4. -def : WriteRes<WriteF, [CortexA55UnitFPALU]> { let Latency = 4; } -def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; } -def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; } -def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; } -def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; } -def : WriteRes<WriteV, [CortexA55UnitFPALU]> { let Latency = 4; } - -// FP ALU specific new schedwrite definitions -def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;} -def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;} -def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;} - -// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined -def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; } -def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22; - let ResourceCycles = [29]; } -def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; } -def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8; - let ResourceCycles = [5]; } -def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 13; - let ResourceCycles = [10]; } -def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22; - let ResourceCycles = [19]; } -def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8; - let ResourceCycles = [5]; } -def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 12; - let ResourceCycles = [9]; } -def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22; - let ResourceCycles = [19]; } - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedRead types. - -def : ReadAdvance<ReadVLD, 0>; -def : ReadAdvance<ReadExtrHi, 1>; -def : ReadAdvance<ReadAdrBase, 1>; - -// ALU - ALU input operands are generally needed in EX1. An operand produced in -// in say EX2 can be forwarded for consumption to ALU in EX1, thereby -// allowing back-to-back ALU operations such as add. If an operand requires -// a shift, it will, however, be required in ISS stage. 
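To put numbers on the forwarding comment above (a hypothetical sketch; ExWriteALU and ExReadALU are invented names, while CortexA55UnitALU is the unit defined earlier in this model): a SchedReadAdvance of N cycles is subtracted from the producer's latency as seen by that operand, so a 3-cycle ALU result read through an advance of 2 behaves like a 1-cycle result, which is what allows back-to-back dependent ALU operations, whereas a shifted operand with an advance of 1 still waits one extra cycle.

// Hypothetical sketch only, assuming the enclosing let SchedModel = CortexA55Model block.
def ExWriteALU : SchedWriteRes<[CortexA55UnitALU]> { let Latency = 3; }
def ExReadALU  : SchedReadAdvance<2, [ExWriteALU]>;
// Effective latency seen through ExReadALU: 3 - 2 = 1 cycle.
// With an advance of 1 (shifted operand) it would be 3 - 1 = 2 cycles.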
-def : ReadAdvance<ReadI, 2, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -// Shifted operand -def CortexA55ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def CortexA55ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def CortexA55ReadISReg : SchedReadVariant<[ - SchedVar<RegShiftedPred, [CortexA55ReadShifted]>, - SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>; -def : SchedAlias<ReadISReg, CortexA55ReadISReg>; - -def CortexA55ReadIEReg : SchedReadVariant<[ - SchedVar<RegExtendedPred, [CortexA55ReadShifted]>, - SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>; -def : SchedAlias<ReadIEReg, CortexA55ReadIEReg>; - -// MUL -def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; - -// Div -def : ReadAdvance<ReadID, 1, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; - -//===----------------------------------------------------------------------===// -// Subtarget-specific InstRWs. - -//--- -// Miscellaneous -//--- -def : InstRW<[CortexA55WriteVLD2,CortexA55WriteVLD1], (instregex "LDP.*")>; -def : InstRW<[WriteI], (instrs COPY)>; -//--- -// Vector Loads - 64-bit per cycle -//--- -// 1-element structures -def : InstRW<[CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; // single element -def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate -def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>; -def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures -def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; - -def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; - -// 2-element structures -def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>; -def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVLD2], (instregex 
"LD2Twov(8b|4h|2s)$")>; -def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; - -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; -def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; - -// 3-element structures -def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; -def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>; - -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; - -// 4-element structures -def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; // load single 4-el structure to one lane of 4 regs. -def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs. -def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>; // load multiple 4-el structures to 4 regs. -def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; - -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; - -//--- -// Vector Stores -//--- -def : InstRW<[CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)$")>; -def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>; -def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>; -def : 
InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>; - -def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>; -def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; -def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -//--- -// Floating Point Conversions, MAC, DIV, SQRT -//--- -def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>; -def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>; - -def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>; -def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>; -def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTFv")>; - -def : InstRW<[CortexA55WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; -def : InstRW<[CortexA55WriteFMAC], (instregex "^FML(A|S).*")>; -def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>; -def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>; -def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>; -def : InstRW<[CortexA55WriteFDivHP], (instregex "^FDIVv.*16$")>; -def : InstRW<[CortexA55WriteFDivSP], (instregex "^FDIVv.*32$")>; -def : InstRW<[CortexA55WriteFDivDP], (instregex "^FDIVv.*64$")>; -def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; -def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; -def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; -} +//==- AArch64SchedCortexA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the ARM Cortex-A55 processors. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// Cortex-A55 machine model for scheduling and other instruction cost heuristics. +def CortexA55Model : SchedMachineModel { + let MicroOpBufferSize = 0; // The Cortex-A55 is an in-order processor + let IssueWidth = 2; // It dual-issues under most circumstances + let LoadLatency = 4; // Cycles for loads to access the cache. The + // optimisation guide shows that most loads have + // a latency of 3, but some have a latency of 4 + // or 5. Setting it 4 looked to be good trade-off. + let MispredictPenalty = 8; // A branch direction mispredict. + let PostRAScheduler = 1; // Enable PostRA scheduler pass. + let CompleteModel = 0; // Covers instructions applicable to Cortex-A55. + + list<Predicate> UnsupportedFeatures = [HasSVE]; + + // FIXME: Remove when all errors have been fixed. + let FullInstRWOverlapCheck = 0; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. 
+ +// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the +// Cortex-A55 is in-order. + +def CortexA55UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def CortexA55UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC, 64-bi wide +def CortexA55UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division, not pipelined +def CortexA55UnitLd : ProcResource<1> { let BufferSize = 0; } // Load pipe +def CortexA55UnitSt : ProcResource<1> { let BufferSize = 0; } // Store pipe +def CortexA55UnitB : ProcResource<1> { let BufferSize = 0; } // Branch + +// The FP DIV/SQRT instructions execute totally differently from the FP ALU +// instructions, which can mostly be dual-issued; that's why for now we model +// them with 2 resources. +def CortexA55UnitFPALU : ProcResource<2> { let BufferSize = 0; } // FP ALU +def CortexA55UnitFPMAC : ProcResource<2> { let BufferSize = 0; } // FP MAC +def CortexA55UnitFPDIV : ProcResource<1> { let BufferSize = 0; } // FP Div/SQRT, 64/128 + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types + +let SchedModel = CortexA55Model in { + +// These latencies are modeled without taking into account forwarding paths +// (the software optimisation guide lists latencies taking into account +// typical forwarding paths). +def : WriteRes<WriteImm, [CortexA55UnitALU]> { let Latency = 3; } // MOVN, MOVZ +def : WriteRes<WriteI, [CortexA55UnitALU]> { let Latency = 3; } // ALU +def : WriteRes<WriteISReg, [CortexA55UnitALU]> { let Latency = 3; } // ALU of Shifted-Reg +def : WriteRes<WriteIEReg, [CortexA55UnitALU]> { let Latency = 3; } // ALU of Extended-Reg +def : WriteRes<WriteExtr, [CortexA55UnitALU]> { let Latency = 3; } // EXTR from a reg pair +def : WriteRes<WriteIS, [CortexA55UnitALU]> { let Latency = 3; } // Shift/Scale + +// MAC +def : WriteRes<WriteIM32, [CortexA55UnitMAC]> { let Latency = 4; } // 32-bit Multiply +def : WriteRes<WriteIM64, [CortexA55UnitMAC]> { let Latency = 4; } // 64-bit Multiply + +// Div +def : WriteRes<WriteID32, [CortexA55UnitDiv]> { + let Latency = 8; let ResourceCycles = [8]; +} +def : WriteRes<WriteID64, [CortexA55UnitDiv]> { + let Latency = 8; let ResourceCycles = [8]; +} + +// Load +def : WriteRes<WriteLD, [CortexA55UnitLd]> { let Latency = 3; } +def : WriteRes<WriteLDIdx, [CortexA55UnitLd]> { let Latency = 4; } +def : WriteRes<WriteLDHi, [CortexA55UnitLd]> { let Latency = 5; } + +// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd +// below, choosing the median of 3 which makes the latency 6. +// An extra cycle is needed to get the swizzling right. 
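One way to read the CortexA55WriteVLD1..VLD8 classes defined just below (an interpretation of the numbers in this file, not additional definitions): each extra 64-bit register transferred adds one cycle of latency and one busy cycle on the single load pipe, matching the "64-bit per cycle" note in the vector-load InstRW section further down.

// Reading of existing definitions; nothing new is declared here.
// CortexA55WriteVLD2: Latency = 5,  ResourceCycles = [2]  -> two 64-bit transfers
// CortexA55WriteVLD8: Latency = 11, ResourceCycles = [8]  -> eight 64-bit transfers,
//                     e.g. a four-register 128-bit structure load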
+def : WriteRes<WriteVLD, [CortexA55UnitLd]> { let Latency = 6; + let ResourceCycles = [3]; } +def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; } +def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5; + let ResourceCycles = [2]; } +def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6; + let ResourceCycles = [3]; } +def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 7; + let ResourceCycles = [4]; } +def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 8; + let ResourceCycles = [5]; } +def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 9; + let ResourceCycles = [6]; } +def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 10; + let ResourceCycles = [7]; } +def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11; + let ResourceCycles = [8]; } + +// Pre/Post Indexing - Performed as part of address generation +def : WriteRes<WriteAdr, []> { let Latency = 0; } + +// Store +def : WriteRes<WriteST, [CortexA55UnitSt]> { let Latency = 4; } +def : WriteRes<WriteSTP, [CortexA55UnitSt]> { let Latency = 4; } +def : WriteRes<WriteSTIdx, [CortexA55UnitSt]> { let Latency = 4; } +def : WriteRes<WriteSTX, [CortexA55UnitSt]> { let Latency = 4; } + +// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. +def : WriteRes<WriteVST, [CortexA55UnitSt]> { let Latency = 5; + let ResourceCycles = [2];} +def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 4; } +def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 6; + let ResourceCycles = [3]; } +def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5; + let ResourceCycles = [4]; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Branch +def : WriteRes<WriteBr, [CortexA55UnitB]>; +def : WriteRes<WriteBrReg, [CortexA55UnitB]>; +def : WriteRes<WriteSys, [CortexA55UnitB]>; +def : WriteRes<WriteBarrier, [CortexA55UnitB]>; +def : WriteRes<WriteHint, [CortexA55UnitB]>; + +// FP ALU +// As WriteF result is produced in F5 and it can be mostly forwarded +// to consumer at F1, the effectively latency is set as 4. +def : WriteRes<WriteF, [CortexA55UnitFPALU]> { let Latency = 4; } +def : WriteRes<WriteFCmp, [CortexA55UnitFPALU]> { let Latency = 3; } +def : WriteRes<WriteFCvt, [CortexA55UnitFPALU]> { let Latency = 4; } +def : WriteRes<WriteFCopy, [CortexA55UnitFPALU]> { let Latency = 3; } +def : WriteRes<WriteFImm, [CortexA55UnitFPALU]> { let Latency = 3; } +def : WriteRes<WriteV, [CortexA55UnitFPALU]> { let Latency = 4; } + +// FP ALU specific new schedwrite definitions +def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;} +def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;} +def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;} + +// FP Mul, Div, Sqrt. 
Div/Sqrt are not pipelined +def : WriteRes<WriteFMul, [CortexA55UnitFPMAC]> { let Latency = 4; } +def : WriteRes<WriteFDiv, [CortexA55UnitFPDIV]> { let Latency = 22; + let ResourceCycles = [29]; } +def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; } +def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8; + let ResourceCycles = [5]; } +def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 13; + let ResourceCycles = [10]; } +def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22; + let ResourceCycles = [19]; } +def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8; + let ResourceCycles = [5]; } +def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 12; + let ResourceCycles = [9]; } +def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22; + let ResourceCycles = [19]; } + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +def : ReadAdvance<ReadVLD, 0>; +def : ReadAdvance<ReadExtrHi, 1>; +def : ReadAdvance<ReadAdrBase, 1>; + +// ALU - ALU input operands are generally needed in EX1. An operand produced in +// in say EX2 can be forwarded for consumption to ALU in EX1, thereby +// allowing back-to-back ALU operations such as add. If an operand requires +// a shift, it will, however, be required in ISS stage. +def : ReadAdvance<ReadI, 2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +// Shifted operand +def CortexA55ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def CortexA55ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def CortexA55ReadISReg : SchedReadVariant<[ + SchedVar<RegShiftedPred, [CortexA55ReadShifted]>, + SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>; +def : SchedAlias<ReadISReg, CortexA55ReadISReg>; + +def CortexA55ReadIEReg : SchedReadVariant<[ + SchedVar<RegExtendedPred, [CortexA55ReadShifted]>, + SchedVar<NoSchedPred, [CortexA55ReadNotShifted]>]>; +def : SchedAlias<ReadIEReg, CortexA55ReadIEReg>; + +// MUL +def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; + +// Div +def : ReadAdvance<ReadID, 1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. 
+ +//--- +// Miscellaneous +//--- +def : InstRW<[CortexA55WriteVLD2,CortexA55WriteVLD1], (instregex "LDP.*")>; +def : InstRW<[WriteI], (instrs COPY)>; +//--- +// Vector Loads - 64-bit per cycle +//--- +// 1-element structures +def : InstRW<[CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; // single element +def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate +def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures +def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// 2-element structures +def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +// 3-element structures +def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; + +// 4-element structures +def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; // load single 4-el structure to one lane of 4 regs. 
+def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs. +def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>; // load multiple 4-el structures to 4 regs. +def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>; + +def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//--- +// Floating Point Conversions, MAC, DIV, SQRT +//--- +def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>; +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>; + +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>; +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>; +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTFv")>; + +def : InstRW<[CortexA55WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[CortexA55WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>; +def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>; 
+def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[CortexA55WriteFDivHP], (instregex "^FDIVv.*16$")>; +def : InstRW<[CortexA55WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[CortexA55WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; +def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57.td index aa5bec8088..0ee50541c0 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57.td @@ -93,7 +93,7 @@ def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>; def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>; def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>; def : SchedAlias<WriteFImm, A57Write_3cyc_1V>; -def : WriteRes<WriteFMul, [A57UnitV]> { let Latency = 5;} +def : WriteRes<WriteFMul, [A57UnitV]> { let Latency = 5;} def : SchedAlias<WriteFDiv, A57Write_17cyc_1W>; def : SchedAlias<WriteV, A57Write_3cyc_1V>; def : SchedAlias<WriteVLD, A57Write_5cyc_1L>; @@ -350,16 +350,16 @@ def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$") // D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 // Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 -// Cortex A57 Software Optimization Guide Sec 3.14 -// Advance for absolute diff accum, pairwise add and accumulate, shift accumulate -def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_4cyc_1X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>; - +// Cortex A57 Software Optimization Guide Sec 3.14 +// Advance for absolute diff accum, pairwise add and accumulate, shift accumulate +def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_4cyc_1X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>; + // ASIMD absolute diff accum, D-form -def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; // ASIMD absolute diff accum, Q-form -def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; // ASIMD absolute diff accum long -def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>; // ASIMD arith, reduce, 4H/4S def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; @@ -376,41 +376,41 @@ def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")> def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; // ASIMD multiply, D-form -// MUL -def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^MUL(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; -// PMUL, SQDMULH, SQRDMULH -def : InstRW<[A57Write_5cyc_1W], (instregex "^(PMUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; - +// MUL +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^MUL(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; +// PMUL, SQDMULH, SQRDMULH +def : InstRW<[A57Write_5cyc_1W], (instregex "^(PMUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; + // ASIMD multiply, Q-form -// MUL -def : 
InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^MUL(v16i8|v8i16|v4i32)(_indexed)?$")>; -// PMUL, SQDMULH, SQRDMULH -def : InstRW<[A57Write_6cyc_2W], (instregex "^(PMUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; - -// Cortex A57 Software Optimization Guide Sec 3.14 -def A57ReadIVMA4 : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; -def A57ReadIVMA3 : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; - +// MUL +def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^MUL(v16i8|v8i16|v4i32)(_indexed)?$")>; +// PMUL, SQDMULH, SQRDMULH +def : InstRW<[A57Write_6cyc_2W], (instregex "^(PMUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// Cortex A57 Software Optimization Guide Sec 3.14 +def A57ReadIVMA4 : SchedReadAdvance<4 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; +def A57ReadIVMA3 : SchedReadAdvance<3 , [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>; + // ASIMD multiply accumulate, D-form -def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; // ASIMD multiply accumulate, Q-form -def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; // ASIMD multiply accumulate long // ASIMD multiply accumulate saturating long -def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>; -def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>; // ASIMD multiply long -def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U)MULL")>; -def : InstRW<[A57Write_5cyc_1W], (instregex "^SQDMULL")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U)MULL")>; +def : InstRW<[A57Write_5cyc_1W], (instregex "^SQDMULL")>; def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>; def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>; // ASIMD pairwise add and accumulate // ASIMD shift accumulate -def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>; -def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; // ASIMD shift by immed, complex def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>; @@ -487,22 +487,22 @@ def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i6 def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>; // ASIMD FP multiply, D-form, FZ -def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; // ASIMD FP multiply, Q-form, FZ -def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP multiply 
accumulate, D-form, FZ // ASIMD FP multiply accumulate, Q-form, FZ def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } - -// Cortex A57 Software Optimization Guide Sec 3.15 -// Advances from FP mul and mul-accum to mul-accum -def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; -def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; - + +// Cortex A57 Software Optimization Guide Sec 3.15 +// Advances from FP mul and mul-accum to mul-accum +def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; +def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; + def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; -def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP round, D-form def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>; @@ -565,9 +565,9 @@ def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>; -// Cortex A57 Software Optimization Guide Sec 3.10 +// Cortex A57 Software Optimization Guide Sec 3.10 def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } -def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>; +def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>; def A57ReadFPM : SchedReadAdvance<0>; def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57WriteRes.td index a4c090d439..2ec3233887 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57WriteRes.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -13,11 +13,11 @@ // Prefix: A57Write // Latency: #cyc // MicroOp Count/Types: #(B|I|M|L|S|X|W|V) -// Postfix (optional): (XYZ)_Forward -// -// The postfix is added to differentiate SchedWriteRes that are used in -// subsequent SchedReadAdvances. +// Postfix (optional): (XYZ)_Forward // +// The postfix is added to differentiate SchedWriteRes that are used in +// subsequent SchedReadAdvances. +// // e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are // 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes. 
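Following the naming scheme spelled out in that header comment, a record such as A57Write_6cyc_1I_6S_4V would be declared roughly as below. This is a hedged sketch of what the convention implies, not a line taken from the patch, so the exact upstream definition may differ in formatting:

def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
                                            A57UnitS, A57UnitS, A57UnitS,
                                            A57UnitS, A57UnitS, A57UnitS,
                                            A57UnitV, A57UnitV, A57UnitV,
                                            A57UnitV]> {
  let Latency     = 6;   // total latency of 6 cycles
  let NumMicroOps = 11;  // 1 I + 6 S + 4 V micro-ops
}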
// @@ -29,9 +29,9 @@ def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } -def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; } def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } -def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; } def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17; let ResourceCycles = [17]; } @@ -51,7 +51,7 @@ def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } -def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 4; } def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } @@ -100,10 +100,10 @@ def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { let Latency = 6; let NumMicroOps = 2; } -def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> { - let Latency = 6; - let NumMicroOps = 2; -} +def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 5; @@ -113,18 +113,18 @@ def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 5; let NumMicroOps = 2; } -def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> { - let Latency = 5; - let NumMicroOps = 2; -} +def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 5; let NumMicroOps = 2; } -def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> { - let Latency = 5; - let NumMicroOps = 2; -} +def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 10; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA64FX.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA64FX.td index b6741d418e..3c5a8d033d 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedA64FX.td @@ -1,3890 +1,3890 @@ -//=- AArch64SchedA64FX.td - Fujitsu A64FX Scheduling Defs -*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the scheduling model for the Fujitsu A64FX processors. 
-// -//===----------------------------------------------------------------------===// - -def A64FXModel : SchedMachineModel { - let IssueWidth = 6; // 6 micro-ops dispatched at a time. - let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. - let LoadLatency = 5; // Optimistic load latency. - let MispredictPenalty = 12; // Extra cycles for mispredicted branch. - // Determined via a mix of micro-arch details and experimentation. - let LoopMicroOpBufferSize = 128; - let PostRAScheduler = 1; // Using PostRA sched. - let CompleteModel = 1; - - list<Predicate> UnsupportedFeatures = - [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth]; - - let FullInstRWOverlapCheck = 0; -} - -let SchedModel = A64FXModel in { - -// Define the issue ports. - -// A64FXIP* - -// Port 0 -def A64FXIPFLA : ProcResource<1>; - -// Port 1 -def A64FXIPPR : ProcResource<1>; - -// Port 2 -def A64FXIPEXA : ProcResource<1>; - -// Port 3 -def A64FXIPFLB : ProcResource<1>; - -// Port 4 -def A64FXIPEXB : ProcResource<1>; - -// Port 5 -def A64FXIPEAGA : ProcResource<1>; - -// Port 6 -def A64FXIPEAGB : ProcResource<1>; - -// Port 7 -def A64FXIPBR : ProcResource<1>; - -// Define groups for the functional units on each issue port. Each group -// created will be used by a WriteRes later on. - -def A64FXGI7 : ProcResGroup<[A64FXIPBR]>; - -def A64FXGI0 : ProcResGroup<[A64FXIPFLA]>; - -def A64FXGI1 : ProcResGroup<[A64FXIPPR]>; - -def A64FXGI2 : ProcResGroup<[A64FXIPEXA]>; - -def A64FXGI3 : ProcResGroup<[A64FXIPFLB]>; - -def A64FXGI4 : ProcResGroup<[A64FXIPEXB]>; - -def A64FXGI5 : ProcResGroup<[A64FXIPEAGA]>; - -def A64FXGI6 : ProcResGroup<[A64FXIPEAGB]>; - -def A64FXGI03 : ProcResGroup<[A64FXIPFLA, A64FXIPFLB]>; - -def A64FXGI01 : ProcResGroup<[A64FXIPFLA, A64FXIPPR]>; - -def A64FXGI02 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA]>; - -def A64FXGI12 : ProcResGroup<[A64FXIPEXA, A64FXIPPR]>; - -def A64FXGI15 : ProcResGroup<[A64FXIPEAGA, A64FXIPPR]>; - -def A64FXGI05 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA]>; - -def A64FXGI24 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB]>; - -def A64FXGI124 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPPR]>; - -def A64FXGI056 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA, A64FXIPEAGB]>; - -def A64FXGI0256 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA, A64FXIPEAGA, A64FXIPEAGB]>; - -def A64FXGI56 : ProcResGroup<[A64FXIPEAGA, A64FXIPEAGB]>; - -def A64FXGI2456 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB]>; - -def A64FXAny : ProcResGroup<[A64FXIPFLA, A64FXIPPR, A64FXIPEXA, A64FXIPFLB, - A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB, A64FXIPBR]> { - let BufferSize = 60; -} - -def A64FXWrite_6Cyc : SchedWriteRes<[]> { - let Latency = 6; -} - -def A64FXWrite_1Cyc_GI7 : SchedWriteRes<[A64FXGI7]> { - let Latency = 1; -} - -def A64FXWrite_2Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 2; -} - -def A64FXWrite_4Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 4; -} - -def A64FXWrite_5Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 5; -} - -def A64FXWrite_6Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 6; -} - -def A64FXWrite_8Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 8; -} - -def A64FXWrite_9Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 9; -} - -def A64FXWrite_13Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 13; -} - -def A64FXWrite_37Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 37; -} - -def A64FXWrite_98Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 98; -} - -def A64FXWrite_134Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let 
Latency = 134; -} - -def A64FXWrite_154Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { - let Latency = 154; -} - -def A64FXWrite_4Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { - let Latency = 4; -} - -def A64FXWrite_6Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { - let Latency = 6; -} - -def A64FXWrite_8Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { - let Latency = 8; -} - -def A64FXWrite_12Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { - let Latency = 12; -} - -def A64FXWrite_10Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { - let Latency = 10; -} - -def A64FXWrite_17Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { - let Latency = 17; -} - -def A64FXWrite_21Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { - let Latency = 21; -} - -def A64FXWrite_3Cyc_GI1 : SchedWriteRes<[A64FXGI1]> { - let Latency = 3; -} - -def A64FXWrite_6Cyc_NGI1 : SchedWriteRes<[A64FXGI1]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def A64FXWrite_4Cyc_GI12 : SchedWriteRes<[A64FXGI12]> { - let Latency = 4; -} - -def A64FXWrite_3Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { - let Latency = 3; -} - -def A64FXWrite_5Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { - let Latency = 5; -} - -def A64FXWrite_6Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { - let Latency = 6; -} - -def A64FXWrite_4Cyc_GI3 : SchedWriteRes<[A64FXGI3]> { - let Latency = 4; -} - -def A64FXWrite_6Cyc_GI3 : SchedWriteRes<[A64FXGI3]> { - let Latency = 6; -} - -def A64FXWrite_6Cyc_GI15 : SchedWriteRes<[A64FXGI15]> { - let Latency = 6; -} - -def A64FXWrite_3Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 3; -} - -def A64FXWrite_4Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 4; -} - -def A64FXWrite_6Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 6; -} - -def A64FXWrite_8Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 8; -} - -def A64FXWrite_9Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 9; -} - -def A64FXWrite_10Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; -} - -def A64FXWrite_12Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 12; -} - -def A64FXWrite_14Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; -} - -def A64FXWrite_15Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 15; -} - -def A64FXWrite_15Cyc_NGI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 15; - let NumMicroOps = 2; -} - -def A64FXWrite_18Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 18; -} - -def A64FXWrite_45Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 45; -} - -def A64FXWrite_60Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 60; -} - -def A64FXWrite_75Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { - let Latency = 75; -} - -def A64FXWrite_6Cyc_GI05 : SchedWriteRes<[A64FXGI05]> { - let Latency = 6; -} - -def A64FXWrite_10Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { - let Latency = 10; -} - -def A64FXWrite_12Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { - let Latency = 12; -} - -def A64FXWrite_20Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { - let Latency = 20; -} - -def A64FXWrite_5Cyc_GI5 : SchedWriteRes<[A64FXGI5]> { - let Latency = 5; -} - -def A64FXWrite_11Cyc_GI5 : SchedWriteRes<[A64FXGI5]> { - let Latency = 11; -} - -def A64FXWrite_5Cyc_GI6 : SchedWriteRes<[A64FXGI6]> { - let Latency = 5; -} - -def A64FXWrite_1Cyc_GI24 : SchedWriteRes<[A64FXGI24]> { - let Latency = 1; -} - -def A64FXWrite_2Cyc_GI24 : SchedWriteRes<[A64FXGI24]> { - let Latency = 2; -} - -def A64FXWrite_4Cyc_NGI24 : SchedWriteRes<[A64FXGI24]> { - let Latency = 4; - let NumMicroOps = 4; -} - -def A64FXWrite_6Cyc_GI124: SchedWriteRes<[A64FXGI124]> { - let Latency = 6; -} - -def A64FXWrite_8Cyc_GI124 : 
SchedWriteRes<[A64FXGI124]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def A64FXWrite_6Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_1Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { - let Latency = 1; -} - -def A64FXWrite_5Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { - let Latency = 5; -} - -def A64FXWrite_8Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { - let Latency = 8; -} - -def A64FXWrite_11Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { - let Latency = 11; -} - -def A64FXWrite_44Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { - let Latency = 44; -} - -def A64FXWrite_10Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { - let Latency = 10; -} - -def A64FXWrite_15Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { - let Latency = 15; -} - -def A64FXWrite_19Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { - let Latency = 19; -} - -def A64FXWrite_25Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { - let Latency = 25; -} - -def A64FXWrite_14Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { - let Latency = 14; -} - -def A64FXWrite_19Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { - let Latency = 19; -} - -def A64FXWrite_29Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { - let Latency = 29; -} - -def A64FXWrite_LDNP: SchedWriteRes<[A64FXGI56]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def A64FXWrite_LDP01: SchedWriteRes<[A64FXGI2456]> { - let Latency = 5; - let NumMicroOps = 3; -} - -def A64FXWrite_LDR01: SchedWriteRes<[A64FXGI2456]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def A64FXWrite_LD102: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def A64FXWrite_LD103: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 2; - -} - -def A64FXWrite_LD104: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 3; -} - -def A64FXWrite_LD105: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 3; -} - -def A64FXWrite_LD106: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 4; -} - -def A64FXWrite_LD107: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 4; -} - -def A64FXWrite_LD108: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def A64FXWrite_LD109: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 2; -} - -def A64FXWrite_LD110: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 3; -} - -def A64FXWrite_LD111: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 3; -} - -def A64FXWrite_LD112: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 4; -} - -def A64FXWrite_LD113: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 4; -} - -def A64FXWrite_LD114: SchedWriteRes<[A64FXGI56]> { - let Latency = 8; - let NumMicroOps = 5; -} - -def A64FXWrite_LD115: SchedWriteRes<[A64FXGI56]> { - let Latency = 11; - let NumMicroOps = 5; -} - -def A64FXWrite_LD1I0: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def A64FXWrite_LD1I1: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 3; -} - -def A64FXWrite_LD2I0: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 4; -} - -def A64FXWrite_LD2I1: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 5; -} - -def A64FXWrite_LD3I0: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 6; -} - -def A64FXWrite_LD3I1: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 7; -} - -def A64FXWrite_LD4I0: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 
8; -} - -def A64FXWrite_LD4I1: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; - let NumMicroOps = 9; -} - -def A64FXWrite_1Cyc_GI2456 : SchedWriteRes<[A64FXGI2456]> { - let Latency = 1; -} - -def A64FXWrite_FMOV_GV : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; -} - -def A64FXWrite_FMOV_VG14 : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; -} - -def A64FXWrite_FMOV_VG : SchedWriteRes<[A64FXGI03]> { - let Latency = 25; -} - -def A64FXWrite_ADDLV : SchedWriteRes<[A64FXGI03]> { - let Latency = 12; -} - -def A64FXWrite_MULLE : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; -} - -def A64FXWrite_MULLV : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; -} - -def A64FXWrite_MADDL : SchedWriteRes<[A64FXGI03]> { - let Latency = 6; -} - -def A64FXWrite_ABA : SchedWriteRes<[A64FXGI03]> { - let Latency = 8; -} - -def A64FXWrite_ABAL : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; -} - -def A64FXWrite_ADDLV1 : SchedWriteRes<[A64FXGI03]> { - let Latency = 12; - let NumMicroOps = 6; -} - -def A64FXWrite_MINMAXV : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; - let NumMicroOps = 6; -} - -def A64FXWrite_SQRDMULH : SchedWriteRes<[A64FXGI03]> { - let Latency = 9; -} - -def A64FXWrite_PMUL : SchedWriteRes<[A64FXGI03]> { - let Latency = 8; -} - - -def A64FXWrite_SRSRAV : SchedWriteRes<[A64FXGI03]> { - let Latency = 8; - let NumMicroOps = 3; -} - -def A64FXWrite_SSRAV : SchedWriteRes<[A64FXGI03]> { - let Latency = 8; - let NumMicroOps = 2; -} - -def A64FXWrite_RSHRN : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 3; -} - -def A64FXWrite_SHRN : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 2; -} - - -def A64FXWrite_ADDP : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 3; -} - -def A64FXWrite_FMULXE : SchedWriteRes<[A64FXGI03]> { - let Latency = 15; - let NumMicroOps = 2; -} - -def A64FXWrite_FADDPV : SchedWriteRes<[A64FXGI03]> { - let Latency = 15; - let NumMicroOps = 3; -} - -def A64FXWrite_SADALP : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 3; -} - -def A64FXWrite_SADDLP : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 2; -} - -def A64FXWrite_FCVTXNV : SchedWriteRes<[A64FXGI03]> { - let Latency = 15; - let NumMicroOps = 2; -} - -def A64FXWrite_FMAXVVH : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; - let NumMicroOps = 7; -} - -def A64FXWrite_FMAXVVS : SchedWriteRes<[A64FXGI03]> { - let Latency = 14; -} - -def A64FXWrite_BIF : SchedWriteRes<[A64FXGI03]> { - let Latency = 5; -} - -def A64FXWrite_DUPGENERAL : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; -} - -def A64FXWrite_SHA00 : SchedWriteRes<[A64FXGI0]> { - let Latency = 9; -} - -def A64FXWrite_SHA01 : SchedWriteRes<[A64FXGI0]> { - let Latency = 12; -} - -def A64FXWrite_SMOV : SchedWriteRes<[A64FXGI03]> { - let Latency = 25; -} - -def A64FXWrite_TBX1 : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 3; -} - -def A64FXWrite_TBX2 : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 5; -} - -def A64FXWrite_TBX3 : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 7; -} - -def A64FXWrite_TBX4 : SchedWriteRes<[A64FXGI03]> { - let Latency = 10; - let NumMicroOps = 9; -} - -def A64FXWrite_PREF0: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_PREF1: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_SWP: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_STUR: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def 
A64FXWrite_STNP: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_STP01: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_ST10: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_ST11: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_ST12: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_ST13: SchedWriteRes<[A64FXGI56]> { - let Latency = 0; -} - -def A64FXWrite_ST14: SchedWriteRes<[A64FXGI56]> { - let Latency = 1; -} - -def A64FXWrite_ST15: SchedWriteRes<[A64FXGI56]> { - let Latency = 1; -} - -def A64FXWrite_ST16: SchedWriteRes<[A64FXGI56]> { - let Latency = 1; -} - -def A64FXWrite_ST17: SchedWriteRes<[A64FXGI56]> { - let Latency = 1; -} - -def A64FXWrite_ST1W_6: SchedWriteRes<[A64FXGI056]> { - let Latency = 6; -} - -def A64FXWrite_ST2W_7: SchedWriteRes<[A64FXGI056]> { - let Latency = 7; -} - -def A64FXWrite_ST3W_8: SchedWriteRes<[A64FXGI056]> { - let Latency = 8; -} - -def A64FXWrite_ST4W_9: SchedWriteRes<[A64FXGI056]> { - let Latency = 9; -} - -def A64FXWrite_ST1W_15: SchedWriteRes<[A64FXGI056]> { - let Latency = 15; -} - -def A64FXWrite_ST1W_19: SchedWriteRes<[A64FXGI056]> { - let Latency = 19; -} - -def A64FXWrite_CAS: SchedWriteRes<[A64FXGI56]> { - let Latency = 7; -} - -// Define commonly used read types. - -// No forwarding is provided for these types. -def : ReadAdvance<ReadI, 0>; -def : ReadAdvance<ReadISReg, 0>; -def : ReadAdvance<ReadIEReg, 0>; -def : ReadAdvance<ReadIM, 0>; -def : ReadAdvance<ReadIMA, 0>; -def : ReadAdvance<ReadID, 0>; -def : ReadAdvance<ReadExtrHi, 0>; -def : ReadAdvance<ReadAdrBase, 0>; -def : ReadAdvance<ReadVLD, 0>; - -//===----------------------------------------------------------------------===// -// 3. Instruction Tables. 
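The instruction tables that follow bind concrete opcodes to the SchedWriteRes records defined earlier in the file, either by exact name (instrs) or by regular expression over opcode names (instregex). A minimal sketch of the pattern, reusing writes defined above; the regex and the particular opcode-to-write pairing are chosen for illustration and are not entries from the A64FX model:

// Illustrative only: regex-matched opcodes mapped to a write defined above.
def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^EXAMPLE_")>;   // placeholder regex
// Illustrative only: a single opcode mapped by exact name.
def : InstRW<[A64FXWrite_4Cyc_GI3], (instrs FADDDrr)>;          // not the model's actual entry for FADDDrr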
- -//--- -// 3.1 Branch Instructions -//--- - -// Branch, immed -// Branch and link, immed -// Compare and branch -def : WriteRes<WriteBr, [A64FXGI7]> { - let Latency = 1; -} - -// Branch, register -// Branch and link, register != LR -// Branch and link, register = LR -def : WriteRes<WriteBrReg, [A64FXGI7]> { - let Latency = 1; -} - -def : WriteRes<WriteSys, []> { let Latency = 1; } -def : WriteRes<WriteBarrier, []> { let Latency = 1; } -def : WriteRes<WriteHint, []> { let Latency = 1; } - -def : WriteRes<WriteAtomic, []> { - let Latency = 4; -} - -//--- -// Branch -//--- -def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs B, BL, BR, BLR)>; -def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs RET)>; -def : InstRW<[A64FXWrite_1Cyc_GI7], (instregex "^B..$")>; -def : InstRW<[A64FXWrite_1Cyc_GI7], - (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>; - -//--- -// 3.2 Arithmetic and Logical Instructions -// 3.3 Move and Shift Instructions -//--- - -// ALU, basic -// Conditional compare -// Conditional select -// Address generation -def : WriteRes<WriteI, [A64FXGI2456]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -def : InstRW<[WriteI], - (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", - "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", - "ADC(W|X)r", - "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", - "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", - "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", - "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", - "SBCS(W|X)r", "CCMN(W|X)(i|r)", - "CCMP(W|X)(i|r)", "CSEL(W|X)r", - "CSINC(W|X)r", "CSINV(W|X)r", - "CSNEG(W|X)r")>; - -def : InstRW<[WriteI], (instrs COPY)>; - -// ALU, extend and/or shift -def : WriteRes<WriteISReg, [A64FXGI2456]> { - let Latency = 2; - let ResourceCycles = [1]; -} - -def : InstRW<[WriteISReg], - (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", - "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", - "ADC(W|X)r", - "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", - "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", - "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", - "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", - "SBCS(W|X)r", "CCMN(W|X)(i|r)", - "CCMP(W|X)(i|r)", "CSEL(W|X)r", - "CSINC(W|X)r", "CSINV(W|X)r", - "CSNEG(W|X)r")>; - -def : WriteRes<WriteIEReg, [A64FXGI2456]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -def : InstRW<[WriteIEReg], - (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", - "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", - "ADC(W|X)r", - "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", - "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", - "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", - "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", - "SBCS(W|X)r", "CCMN(W|X)(i|r)", - "CCMP(W|X)(i|r)", "CSEL(W|X)r", - "CSINC(W|X)r", "CSINV(W|X)r", - "CSNEG(W|X)r")>; - -// Move immed -def : WriteRes<WriteImm, [A64FXGI2456]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -def : InstRW<[A64FXWrite_1Cyc_GI2456], - (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; - -def : InstRW<[A64FXWrite_2Cyc_GI24], - (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; - -// Variable shift -def : WriteRes<WriteIS, [A64FXGI2456]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -//--- -// 3.4 Divide and Multiply Instructions -//--- - -// Divide, W-form -def : WriteRes<WriteID32, [A64FXGI4]> { - let Latency = 39; - let ResourceCycles = [39]; -} - -// Divide, X-form -def : WriteRes<WriteID64, [A64FXGI4]> { - let Latency = 23; - let ResourceCycles = [23]; -} - -// Multiply accumulate, W-form -def : WriteRes<WriteIM32, [A64FXGI2456]> { - let 
Latency = 5; - let ResourceCycles = [1]; -} - -// Multiply accumulate, X-form -def : WriteRes<WriteIM64, [A64FXGI2456]> { - let Latency = 5; - let ResourceCycles = [1]; -} - -def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; -def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; -def : InstRW<[A64FXWrite_MADDL], - (instregex "(S|U)(MADDL|MSUBL)rrr")>; - -def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; -def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; - -// Bitfield extract, two reg -def : WriteRes<WriteExtr, [A64FXGI2456]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -// Multiply high -def : InstRW<[A64FXWrite_5Cyc_GI2], (instrs SMULHrr, UMULHrr)>; - -// Miscellaneous Data-Processing Instructions -// Bitfield extract -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs EXTRWrri, EXTRXrri)>; - -// Bitifield move - basic -def : InstRW<[A64FXWrite_1Cyc_GI24], - (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; - -// Bitfield move, insert -def : InstRW<[A64FXWrite_4Cyc_NGI24], (instregex "^BFM")>; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instregex "(S|U)?BFM.*")>; - -// Count leading -def : InstRW<[A64FXWrite_2Cyc_GI0], (instregex "^CLS(W|X)r$", - "^CLZ(W|X)r$")>; - -// Reverse bits -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBITWr, RBITXr)>; - -// Cryptography Extensions -def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AES[DE]")>; -def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AESI?MC")>; -def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^PMULL")>; -def : InstRW<[A64FXWrite_SHA00], (instregex "^SHA1SU0")>; -def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA1(H|SU1)")>; -def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA1[CMP]")>; -def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU0")>; -def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU1")>; -def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA256(H|H2)")>; - -// CRC Instructions -def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32Brr, CRC32Hrr)>; -def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32Wrr)>; -def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32Xrr)>; - -def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32CBrr, CRC32CHrr)>; -def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32CWrr)>; -def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32CXrr)>; - -// Reverse bits/bytes -// NOTE: Handled by WriteI. - -//--- -// 3.6 Load Instructions -// 3.10 FP Load Instructions -//--- - -// Load register, literal -// Load register, unscaled immed -// Load register, immed unprivileged -// Load register, unsigned immed -def : WriteRes<WriteLD, [A64FXGI56]> { - let Latency = 4; - let ResourceCycles = [3]; -} - -// Load register, immed post-index -// NOTE: Handled by WriteLD, WriteI. -// Load register, immed pre-index -// NOTE: Handled by WriteLD, WriteAdr. -def : WriteRes<WriteAdr, [A64FXGI2456]> { - let Latency = 1; - let ResourceCycles = [1]; -} - -// Load pair, immed offset, normal -// Load pair, immed offset, signed words, base != SP -// Load pair, immed offset signed words, base = SP -// LDP only breaks into *one* LS micro-op. Thus -// the resources are handled by WriteLD. 
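Per the comment just above, a load-pair issues as a single load/store micro-op on A64FX, so only the first destination's write occupies the load/store resources; the second destination is covered by a write with an empty resource list that contributes latency only (the actual WriteLDHi definition follows in the diff). A hedged sketch of that pattern, reusing A64FXWrite_LDNP from earlier in this file:

// Illustrative only: latency-only write for the second destination of a pair.
def ExampleLDHi : SchedWriteRes<[]> { let Latency = 5; }
// The pair costs one L/S micro-op sequence (A64FXWrite_LDNP) plus the latency-only write.
def : InstRW<[A64FXWrite_LDNP, ExampleLDHi], (instrs LDPXi)>;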
-def : WriteRes<WriteLDHi, []> { - let Latency = 5; -} - -// Load register offset, basic -// Load register, register offset, scale by 4/8 -// Load register, register offset, scale by 2 -// Load register offset, extend -// Load register, register offset, extend, scale by 4/8 -// Load register, register offset, extend, scale by 2 -def A64FXWriteLDIdx : SchedWriteVariant<[ - SchedVar<ScaledIdxPred, [A64FXWrite_1Cyc_GI56]>, - SchedVar<NoSchedPred, [A64FXWrite_1Cyc_GI56]>]>; -def : SchedAlias<WriteLDIdx, A64FXWriteLDIdx>; - -def A64FXReadAdrBase : SchedReadVariant<[ - SchedVar<ScaledIdxPred, [ReadDefault]>, - SchedVar<NoSchedPred, [ReadDefault]>]>; -def : SchedAlias<ReadAdrBase, A64FXReadAdrBase>; - -// Load pair, immed pre-index, normal -// Load pair, immed pre-index, signed words -// Load pair, immed post-index, normal -// Load pair, immed post-index, signed words -// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. - -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPDi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPQi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPSi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPWi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPXi)>; - -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPDi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPQi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSWi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPWi)>; -def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPXi)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRBui)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRDui)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRHui)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRQui)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRSui)>; - -def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRDl)>; -def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRQl)>; -def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRWl)>; -def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRXl)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRBi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRHi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRWi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRXi)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBWi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBXi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHWi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHXi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSWi)>; - -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPDpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPQpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPSpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPWpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPWpre)>; - -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>; - -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpre)>; -def : InstRW<[A64FXWrite_LDR01, 
WriteAdr], (instrs LDRSBXpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpost)>; - -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpost)>; - -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpost)>; - -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpost)>; - -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPDpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPQpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPSpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPWpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPXpost)>; - -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>; - -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPDpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPQpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPSpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPWpre)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPXpre)>; - -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>; -def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>; - -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPDpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPQpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPSpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPWpost)>; -def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], - (instrs LDPXpost)>; - -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>; -def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, 
ReadAdrBase], (instrs LDRSroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroW)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRBroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRBroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRDroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRHroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRHHroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRQroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRSroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRSHWroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRSHXroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRWroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRXroW)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRBroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRDroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRHroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRHHroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRQroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRSroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRSHWroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRSHXroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRWroX)>; -def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], - (instrs LDRXroX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBBi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURDi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHHi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURQi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURXi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBWi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBXi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHWi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHXi)>; -def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSWi)>; - -//--- -// Prefetch -//--- -def : InstRW<[A64FXWrite_PREF0], (instrs PRFMl)>; -def : InstRW<[A64FXWrite_PREF1], (instrs PRFUMi)>; -def : InstRW<[A64FXWrite_PREF1], (instrs PRFMui)>; -def : InstRW<[A64FXWrite_PREF1], (instrs PRFMroW)>; -def : 
InstRW<[A64FXWrite_PREF1], (instrs PRFMroX)>; - -//-- -// 3.7 Store Instructions -// 3.11 FP Store Instructions -//-- - -// Store register, unscaled immed -// Store register, immed unprivileged -// Store register, unsigned immed -def : WriteRes<WriteST, [A64FXGI56]> { - let Latency = 1; -} - -// Store register, immed post-index -// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase - -// Store register, immed pre-index -// NOTE: Handled by WriteAdr, WriteST - -// Store register, register offset, basic -// Store register, register offset, scaled by 4/8 -// Store register, register offset, scaled by 2 -// Store register, register offset, extend -// Store register, register offset, extend, scale by 4/8 -// Store register, register offset, extend, scale by 1 -def : WriteRes<WriteSTIdx, [A64FXGI56, A64FXGI2456]> { - let Latency = 1; -} - -// Store pair, immed offset, W-form -// Store pair, immed offset, X-form -def : WriteRes<WriteSTP, [A64FXGI56]> { - let Latency = 1; -} - -// Store pair, immed post-index, W-form -// Store pair, immed post-index, X-form -// Store pair, immed pre-index, W-form -// Store pair, immed pre-index, X-form -// NOTE: Handled by WriteAdr, WriteSTP. - -def : InstRW<[A64FXWrite_STUR], (instrs STURBi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURBBi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURDi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURHi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURHHi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURQi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURSi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURWi)>; -def : InstRW<[A64FXWrite_STUR], (instrs STURXi)>; - -def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRBi)>; -def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRHi)>; -def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRWi)>; -def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRXi)>; - -def : InstRW<[A64FXWrite_STNP], (instrs STNPDi)>; -def : InstRW<[A64FXWrite_STNP], (instrs STNPQi)>; -def : InstRW<[A64FXWrite_STNP], (instrs STNPXi)>; -def : InstRW<[A64FXWrite_STNP], (instrs STNPWi)>; - -def : InstRW<[A64FXWrite_STNP], (instrs STPDi)>; -def : InstRW<[A64FXWrite_STNP], (instrs STPQi)>; -def : InstRW<[A64FXWrite_STNP], (instrs STPXi)>; -def : InstRW<[A64FXWrite_STNP], (instrs STPWi)>; - -def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>; -def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>; - -def : InstRW<[A64FXWrite_STP01], - (instrs STPDpre, STPDpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPDpre, STPDpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPDpre, STPDpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPDpre, STPDpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPQpre, STPQpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPQpre, STPQpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPQpre, STPQpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPQpre, STPQpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPSpre, STPSpost)>; 
-def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPSpre, STPSpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPSpre, STPSpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPSpre, STPSpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPWpre, STPWpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPWpre, STPWpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPWpre, STPWpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPWpre, STPWpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPXpre, STPXpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPXpre, STPXpost)>; -def : InstRW<[A64FXWrite_STP01], - (instrs STPXpre, STPXpost)>; -def : InstRW<[A64FXWrite_STP01, ReadAdrBase], - (instrs STPXpre, STPXpost)>; - -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRBpre, STRBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRBpre, STRBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRBpre, STRBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRBpre, STRBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRBBpre, STRBBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRBBpre, STRBBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRBBpre, STRBBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRBBpre, STRBBpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRDpre, STRDpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRDpre, STRDpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRDpre, STRDpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRDpre, STRDpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRHpre, STRHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRHpre, STRHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRHpre, STRHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRHpre, STRHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRHHpre, STRHHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRHHpre, STRHHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRHHpre, STRHHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRHHpre, STRHHpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRQpre, STRQpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRQpre, STRQpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRQpre, STRQpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRQpre, STRQpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRSpre, STRSpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRSpre, STRSpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRSpre, STRSpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRSpre, STRSpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRWpre, STRWpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRWpre, STRWpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRWpre, STRWpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRWpre, STRWpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRXpre, STRXpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - 
(instrs STRXpre, STRXpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01], - (instrs STRXpre, STRXpost)>; -def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], - (instrs STRXpre, STRXpost)>; - -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRBroW, STRBroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRBroW, STRBroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRBBroW, STRBBroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRBBroW, STRBBroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRDroW, STRDroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRDroW, STRDroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRHroW, STRHroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRHroW, STRHroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRHHroW, STRHHroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRHHroW, STRHHroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRQroW, STRQroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRQroW, STRQroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRSroW, STRSroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRSroW, STRSroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRWroW, STRWroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRWroW, STRWroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRXroW, STRXroX)>; -def : InstRW<[A64FXWrite_STUR, ReadAdrBase], - (instrs STRXroW, STRXroX)>; - -//--- -// 3.8 FP Data Processing Instructions -//--- - -// FP absolute value -// FP min/max -// FP negate -def : WriteRes<WriteF, [A64FXGI03]> { - let Latency = 4; - let ResourceCycles = [2]; -} - -// FP arithmetic - -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FADDDrr, FADDHrr)>; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FSUBDrr, FSUBHrr)>; - -// FP compare -def : WriteRes<WriteFCmp, [A64FXGI03]> { - let Latency = 4; - let ResourceCycles = [2]; -} - -// FP Div, Sqrt -def : WriteRes<WriteFDiv, [A64FXGI0]> { - let Latency = 43; -} - -def A64FXXWriteFDiv : SchedWriteRes<[A64FXGI0]> { - let Latency = 38; -} - -def A64FXXWriteFDivSP : SchedWriteRes<[A64FXGI0]> { - let Latency = 29; -} - -def A64FXXWriteFDivDP : SchedWriteRes<[A64FXGI0]> { - let Latency = 43; -} - -def A64FXXWriteFSqrtSP : SchedWriteRes<[A64FXGI0]> { - let Latency = 29; -} - -def A64FXXWriteFSqrtDP : SchedWriteRes<[A64FXGI0]> { - let Latency = 43; -} - -// FP divide, S-form -// FP square root, S-form -def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVSrr)>; -def : InstRW<[A64FXXWriteFSqrtSP], (instrs FSQRTSr)>; -def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVv.*32$")>; -def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; -def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVSrr")>; -def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^FSQRTSr")>; - -// FP divide, D-form -// FP square root, D-form -def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVDrr)>; -def : InstRW<[A64FXXWriteFSqrtDP], (instrs FSQRTDr)>; -def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVv.*64$")>; -def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; -def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVDrr")>; -def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^FSQRTDr")>; - -// FP multiply -// FP multiply accumulate -def : WriteRes<WriteFMul, [A64FXGI03]> { - let Latency = 9; - let ResourceCycles = [2]; -} - -def A64FXXWriteFMul : SchedWriteRes<[A64FXGI03]> { - let Latency = 9; - let ResourceCycles = 
[2]; -} - -def A64FXXWriteFMulAcc : SchedWriteRes<[A64FXGI03]> { - let Latency = 9; - let ResourceCycles = [2]; -} - -def : InstRW<[A64FXXWriteFMul], (instregex "^FMUL", "^FNMUL")>; -def : InstRW<[A64FXXWriteFMulAcc], - (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; - -// FP round to integral -def : InstRW<[A64FXWrite_9Cyc_GI03], - (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; - -// FP select -def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCSEL")>; - -//--- -// 3.9 FP Miscellaneous Instructions -//--- - -// FP convert, from vec to vec reg -// FP convert, from gen to vec reg -// FP convert, from vec to gen reg -def : WriteRes<WriteFCvt, [A64FXGI03]> { - let Latency = 9; - let ResourceCycles = [2]; -} - -// FP move, immed -// FP move, register -def : WriteRes<WriteFImm, [A64FXGI0]> { - let Latency = 4; - let ResourceCycles = [2]; -} - -// FP transfer, from gen to vec reg -// FP transfer, from vec to gen reg -def : WriteRes<WriteFCopy, [A64FXGI0]> { - let Latency = 4; - let ResourceCycles = [2]; -} - -def : InstRW<[A64FXWrite_FMOV_GV], (instrs FMOVXDHighr)>; -def : InstRW<[A64FXWrite_FMOV_VG14], (instrs FMOVDXHighr)>; - -//--- -// 3.12 ASIMD Integer Instructions -//--- - -// ASIMD absolute diff, D-form -// ASIMD absolute diff, Q-form -// ASIMD absolute diff accum, D-form -// ASIMD absolute diff accum, Q-form -// ASIMD absolute diff accum long -// ASIMD absolute diff long -// ASIMD arith, basic -// ASIMD arith, complex -// ASIMD compare -// ASIMD logical (AND, BIC, EOR) -// ASIMD max/min, basic -// ASIMD max/min, reduce, 4H/4S -// ASIMD max/min, reduce, 8B/8H -// ASIMD max/min, reduce, 16B -// ASIMD multiply, D-form -// ASIMD multiply, Q-form -// ASIMD multiply accumulate long -// ASIMD multiply accumulate saturating long -// ASIMD multiply long -// ASIMD pairwise add and accumulate -// ASIMD shift accumulate -// ASIMD shift by immed, basic -// ASIMD shift by immed and insert, basic, D-form -// ASIMD shift by immed and insert, basic, Q-form -// ASIMD shift by immed, complex -// ASIMD shift by register, basic, D-form -// ASIMD shift by register, basic, Q-form -// ASIMD shift by register, complex, D-form -// ASIMD shift by register, complex, Q-form -def : WriteRes<WriteV, [A64FXGI03]> { - let Latency = 4; - let ResourceCycles = [1]; -} - -// ASIMD arith, reduce, 4H/4S -// ASIMD arith, reduce, 8B/8H -// ASIMD arith, reduce, 16B - -// ASIMD logical (MVN (alias for NOT), ORN, ORR) -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; - -// ASIMD arith, reduce -def : InstRW<[A64FXWrite_ADDLV], - (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; - -// ASIMD polynomial (8x8) multiply long -def : InstRW<[A64FXWrite_MULLE], (instregex "^(S|U|SQD)MULL")>; -def : InstRW<[A64FXWrite_MULLV], - (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; -def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v8i8|v16i8)")>; -def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v1i64|v2i64)")>; - -// ASIMD absolute diff accum, D-form -def : InstRW<[A64FXWrite_ABA], - (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; -// ASIMD absolute diff accum, Q-form -def : InstRW<[A64FXWrite_ABA], - (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; -// ASIMD absolute diff accum long -def : InstRW<[A64FXWrite_ABAL], - (instregex "^[SU]ABAL")>; -// ASIMD arith, reduce, 4H/4S -def : InstRW<[A64FXWrite_ADDLV1], - (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; -// ASIMD arith, reduce, 8B -def : InstRW<[A64FXWrite_ADDLV1], - (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; -// ASIMD arith, 
reduce, 16B/16H -def : InstRW<[A64FXWrite_ADDLV1], - (instregex "^[SU]?ADDL?Vv16i8v$")>; -// ASIMD max/min, reduce, 4H/4S -def : InstRW<[A64FXWrite_MINMAXV], - (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; -// ASIMD max/min, reduce, 8B/8H -def : InstRW<[A64FXWrite_MINMAXV], - (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; -// ASIMD max/min, reduce, 16B/16H -def : InstRW<[A64FXWrite_MINMAXV], - (instregex "^[SU](MIN|MAX)Vv16i8v$")>; -// ASIMD multiply, D-form -def : InstRW<[A64FXWrite_PMUL], - (instregex "^(P?MUL|SQR?DMUL)" # - "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # - "(_indexed)?$")>; - -// ASIMD multiply, Q-form -def : InstRW<[A64FXWrite_PMUL], - (instregex "^(P?MUL)(v16i8|v8i16|v4i32)(_indexed)?$")>; - -// ASIMD multiply, Q-form -def : InstRW<[A64FXWrite_SQRDMULH], - (instregex "^(SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; - -// ASIMD multiply accumulate, D-form -def : InstRW<[A64FXWrite_9Cyc_GI03], - (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; -// ASIMD multiply accumulate, Q-form -def : InstRW<[A64FXWrite_9Cyc_GI03], - (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; -// ASIMD shift accumulate -def : InstRW<[A64FXWrite_SRSRAV], - (instregex "SRSRAv", "URSRAv")>; -def : InstRW<[A64FXWrite_SSRAV], - (instregex "SSRAv", "USRAv")>; - -// ASIMD shift by immed, basic -def : InstRW<[A64FXWrite_RSHRN], - (instregex "RSHRNv", "SQRSHRNv", "SQRSHRUNv", "UQRSHRNv")>; -def : InstRW<[A64FXWrite_SHRN], - (instregex "SHRNv", "SQSHRNv", "SQSHRUNv", "UQSHRNv")>; - -def : InstRW<[A64FXWrite_6Cyc_GI3], - (instregex "SQXTNv", "SQXTUNv", "UQXTNv")>; - -// ASIMD shift by immed, complex -def : InstRW<[A64FXWrite_ABA], (instregex "^[SU]?(Q|R){1,2}SHR")>; -def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^SQSHLU")>; -// ASIMD shift by register, basic, Q-form -def : InstRW<[A64FXWrite_6Cyc_GI3], - (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; -// ASIMD shift by register, complex, D-form -def : InstRW<[A64FXWrite_6Cyc_GI3], - (instregex "^[SU][QR]{1,2}SHL" # - "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; -// ASIMD shift by register, complex, Q-form -def : InstRW<[A64FXWrite_6Cyc_GI3], - (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; - -// ASIMD Arithmetic -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; -def : InstRW<[A64FXWrite_SHRN], (instregex "(ADD|SUB)HNv.*")>; -def : InstRW<[A64FXWrite_RSHRN], (instregex "(RADD|RSUB)HNv.*")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", - "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; -def : InstRW<[A64FXWrite_ADDP], - (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # - "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; -def : InstRW<[A64FXWrite_4Cyc_GI0], - (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; -def : InstRW<[A64FXWrite_SADALP], (instregex "^SADALP", "^UADALP")>; -def : InstRW<[A64FXWrite_SADDLP], (instregex "^SADDLPv", "^UADDLPv")>; -def : InstRW<[A64FXWrite_ADDLV1], (instregex "^SADDLV", "^UADDLV")>; -def : InstRW<[A64FXWrite_MINMAXV], - (instregex "^ADDVv", "^SMAXVv", "^UMAXVv", "^SMINVv", "^UMINVv")>; -def : InstRW<[A64FXWrite_ABA], - (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^SQADDv", "^SQSUBv", "^UQADDv", "^UQSUBv")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex 
"^SUQADDv", "^USQADDv")>; -def : InstRW<[A64FXWrite_SHRN], - (instregex "^ADDHNv", "^SUBHNv")>; -def : InstRW<[A64FXWrite_RSHRN], - (instregex "^RADDHNv", "^RSUBHNv")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", - "^SRHADD", "^SUQADD", "^UQADD", "^UQSUB", - "^URHADD", "^USQADD")>; - -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^CMEQv", "^CMGEv", "^CMGTv", - "^CMLEv", "^CMLTv", "^CMHIv", "^CMHSv")>; -def : InstRW<[A64FXWrite_MINMAXV], - (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; -def : InstRW<[A64FXWrite_ADDP], - (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^SABDv", "^UABDv")>; -def : InstRW<[A64FXWrite_TBX1], - (instregex "^SABDLv", "^UABDLv")>; - -//--- -// 3.13 ASIMD Floating-point Instructions -//--- - -// ASIMD FP absolute value -def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FABSv")>; - -// ASIMD FP arith, normal, D-form -// ASIMD FP arith, normal, Q-form -def : InstRW<[A64FXWrite_9Cyc_GI03], - (instregex "^FABDv", "^FADDv", "^FSUBv")>; - -// ASIMD FP arith, pairwise, D-form -// ASIMD FP arith, pairwise, Q-form -def : InstRW<[A64FXWrite_FADDPV], (instregex "^FADDPv")>; - -// ASIMD FP compare, D-form -// ASIMD FP compare, Q-form -def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FACGEv", "^FACGTv")>; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCMEQv", "^FCMGEv", - "^FCMGTv", "^FCMLEv", - "^FCMLTv")>; -// ASIMD FP round, D-form -def : InstRW<[A64FXWrite_9Cyc_GI03], - (instregex "^FRINT[AIMNPXZ](v2f32)")>; -// ASIMD FP round, Q-form -def : InstRW<[A64FXWrite_9Cyc_GI03], - (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; - -// ASIMD FP convert, long -// ASIMD FP convert, narrow -// ASIMD FP convert, other, D-form -// ASIMD FP convert, other, Q-form - -// ASIMD FP convert, long and narrow -def : InstRW<[A64FXWrite_FCVTXNV], (instregex "^FCVT(L|N|XN)v")>; -// ASIMD FP convert, other, D-form -def : InstRW<[A64FXWrite_FCVTXNV], - (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; -// ASIMD FP convert, other, Q-form -def : InstRW<[A64FXWrite_FCVTXNV], - (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; - -// ASIMD FP divide, D-form, F32 -def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVv2f32)>; -def : InstRW<[A64FXXWriteFDivSP], (instregex "FDIVv2f32")>; - -// ASIMD FP divide, Q-form, F32 -def : InstRW<[A64FXXWriteFDiv], (instrs FDIVv4f32)>; -def : InstRW<[A64FXXWriteFDiv], (instregex "FDIVv4f32")>; - -// ASIMD FP divide, Q-form, F64 -def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVv2f64)>; -def : InstRW<[A64FXXWriteFDivDP], (instregex "FDIVv2f64")>; - -// ASIMD FP max/min, normal, D-form -// ASIMD FP max/min, normal, Q-form -def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMAXv", "^FMAXNMv", - "^FMINv", "^FMINNMv")>; - -// ASIMD FP max/min, pairwise, D-form -// ASIMD FP max/min, pairwise, Q-form -def : InstRW<[A64FXWrite_ADDP], (instregex "^FMAXPv", "^FMAXNMPv", - "^FMINPv", "^FMINNMPv")>; - -// ASIMD FP max/min, reduce -def : InstRW<[A64FXWrite_FMAXVVH], (instregex "^FMAXVv", "^FMAXNMVv", - "^FMINVv", "^FMINNMVv")>; - -// ASIMD FP multiply, D-form, FZ -// ASIMD FP multiply, D-form, no FZ -// ASIMD FP multiply, Q-form, FZ -// ASIMD FP multiply, Q-form, no FZ -def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMULv", "^FMULXv")>; -def : InstRW<[A64FXWrite_FMULXE], - (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; -def : InstRW<[A64FXWrite_FMULXE], - (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; - -// ASIMD FP 
multiply accumulate, Dform, FZ -// ASIMD FP multiply accumulate, Dform, no FZ -// ASIMD FP multiply accumulate, Qform, FZ -// ASIMD FP multiply accumulate, Qform, no FZ -def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMLAv", "^FMLSv")>; -def : InstRW<[A64FXWrite_FMULXE], - (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; -def : InstRW<[A64FXWrite_FMULXE], - (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; - -// ASIMD FP negate -def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FNEGv")>; - -//-- -// 3.14 ASIMD Miscellaneous Instructions -//-- - -// ASIMD bit reverse -def : InstRW<[A64FXWrite_1Cyc_GI2456], (instregex "^RBITv")>; - -// ASIMD bitwise insert, D-form -// ASIMD bitwise insert, Q-form -def : InstRW<[A64FXWrite_BIF], - (instregex "^BIFv", "^BITv", "^BSLv")>; - -// ASIMD count, D-form -// ASIMD count, Q-form -def : InstRW<[A64FXWrite_4Cyc_GI0], - (instregex "^CLSv", "^CLZv", "^CNTv")>; - -// ASIMD duplicate, gen reg -// ASIMD duplicate, element -def : InstRW<[A64FXWrite_DUPGENERAL], (instregex "^DUPv")>; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^CPY")>; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUPv.+gpr")>; - -// ASIMD extract -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^EXTv")>; - -// ASIMD extract narrow -def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^XTNv")>; - -// ASIMD extract narrow, saturating -def : InstRW<[A64FXWrite_6Cyc_GI3], - (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; - -// ASIMD insert, element to element -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>; - -// ASIMD transfer, element to gen reg -def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>; - -// ASIMD move, integer immed -def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^MOVIv")>; - -// ASIMD move, FP immed -def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMOVv")>; - -// ASIMD table lookup, D-form -def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv8i8One")>; -def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv8i8Two")>; -def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv8i8Three")>; -def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv8i8Four")>; -def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv8i8One")>; -def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv8i8Two")>; -def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv8i8Three")>; -def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv8i8Four")>; - -// ASIMD table lookup, Q-form -def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv16i8One")>; -def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv16i8Two")>; -def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv16i8Three")>; -def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv16i8Four")>; -def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv16i8One")>; -def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv16i8Two")>; -def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv16i8Three")>; -def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv16i8Four")>; - -// ASIMD transpose -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1", "^TRN2")>; - -// ASIMD unzip/zip -def : InstRW<[A64FXWrite_6Cyc_GI0], - (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; - -// ASIMD reciprocal estimate, D-form -// ASIMD reciprocal estimate, Q-form -def : InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", - "^FRSQRTEv", "^URSQRTEv")>; - -// ASIMD reciprocal step, D-form, FZ -// ASIMD reciprocal step, D-form, no FZ -// ASIMD reciprocal step, Q-form, FZ -// ASIMD reciprocal step, Q-form, no FZ -def : InstRW<[A64FXWrite_9Cyc_GI0], (instregex "^FRECPSv", "^FRSQRTSv")>; - -// ASIMD reverse -def : 
InstRW<[A64FXWrite_4Cyc_GI03], - (instregex "^REV16v", "^REV32v", "^REV64v")>; - -// ASIMD table lookup, D-form -// ASIMD table lookup, Q-form -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TBLv", "^TBXv")>; - -// ASIMD transfer, element to word or word -def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>; - -// ASIMD transfer, element to gen reg -def : InstRW<[A64FXWrite_SMOV], (instregex "(S|U)MOVv.*")>; - -// ASIMD transfer gen reg to element -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>; - -// ASIMD transpose -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1v", "^TRN2v", - "^UZP1v", "^UZP2v")>; - -// ASIMD unzip/zip -def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^ZIP1v", "^ZIP2v")>; - -//-- -// 3.15 ASIMD Load Instructions -//-- - -// ASIMD load, 1 element, multiple, 1 reg, D-form -// ASIMD load, 1 element, multiple, 1 reg, Q-form -def : InstRW<[A64FXWrite_8Cyc_GI56], - (instregex "^LD1Onev(8b|4h|2s|1d|2d)$")>; -def : InstRW<[A64FXWrite_11Cyc_GI56], - (instregex "^LD1Onev(16b|8h|4s)$")>; -def : InstRW<[A64FXWrite_LD108, WriteAdr], - (instregex "^LD1Onev(8b|4h|2s|1d|2d)_POST$")>; -def : InstRW<[A64FXWrite_LD109, WriteAdr], - (instregex "^LD1Onev(16b|8h|4s)_POST$")>; - -// ASIMD load, 1 element, multiple, 2 reg, D-form -// ASIMD load, 1 element, multiple, 2 reg, Q-form -def : InstRW<[A64FXWrite_LD102], - (instregex "^LD1Twov(8b|4h|2s|1d|2d)$")>; -def : InstRW<[A64FXWrite_LD103], - (instregex "^LD1Twov(16b|8h|4s)$")>; -def : InstRW<[A64FXWrite_LD110, WriteAdr], - (instregex "^LD1Twov(8b|4h|2s|1d|2d)_POST$")>; -def : InstRW<[A64FXWrite_LD111, WriteAdr], - (instregex "^LD1Twov(16b|8h|4s)_POST$")>; - -// ASIMD load, 1 element, multiple, 3 reg, D-form -// ASIMD load, 1 element, multiple, 3 reg, Q-form -def : InstRW<[A64FXWrite_LD104], - (instregex "^LD1Threev(8b|4h|2s|1d|2d)$")>; -def : InstRW<[A64FXWrite_LD105], - (instregex "^LD1Threev(16b|8h|4s)$")>; -def : InstRW<[A64FXWrite_LD112, WriteAdr], - (instregex "^LD1Threev(8b|4h|2s|1d|2d)_POST$")>; -def : InstRW<[A64FXWrite_LD113, WriteAdr], - (instregex "^LD1Threev(16b|8h|4s)_POST$")>; - -// ASIMD load, 1 element, multiple, 4 reg, D-form -// ASIMD load, 1 element, multiple, 4 reg, Q-form -def : InstRW<[A64FXWrite_LD106], - (instregex "^LD1Fourv(8b|4h|2s|1d|2d)$")>; -def : InstRW<[A64FXWrite_LD107], - (instregex "^LD1Fourv(16b|8h|4s)$")>; -def : InstRW<[A64FXWrite_LD114, WriteAdr], - (instregex "^LD1Fourv(8b|4h|2s|1d|2d)_POST$")>; -def : InstRW<[A64FXWrite_LD115, WriteAdr], - (instregex "^LD1Fourv(16b|8h|4s)_POST$")>; - -// ASIMD load, 1 element, one lane, B/H/S -// ASIMD load, 1 element, one lane, D -def : InstRW<[A64FXWrite_LD1I0], (instregex "^LD1i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_LD1I1, WriteAdr], - (instregex "^LD1i(8|16|32|64)_POST$")>; - -// ASIMD load, 1 element, all lanes, D-form, B/H/S -// ASIMD load, 1 element, all lanes, D-form, D -// ASIMD load, 1 element, all lanes, Q-form -def : InstRW<[A64FXWrite_8Cyc_GI03], - (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD108, WriteAdr], - (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 2 element, multiple, D-form, B/H/S -// ASIMD load, 2 element, multiple, Q-form, D -def : InstRW<[A64FXWrite_LD103], - (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD111, WriteAdr], - (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 2 element, one lane, B/H -// ASIMD load, 2 element, one lane, S -// ASIMD load, 2 element, one lane, D -def : InstRW<[A64FXWrite_LD2I0], 
(instregex "^LD2i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_LD2I1, WriteAdr], - (instregex "^LD2i(8|16|32|64)_POST$")>; - -// ASIMD load, 2 element, all lanes, D-form, B/H/S -// ASIMD load, 2 element, all lanes, D-form, D -// ASIMD load, 2 element, all lanes, Q-form -def : InstRW<[A64FXWrite_LD102], - (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD110, WriteAdr], - (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 3 element, multiple, D-form, B/H/S -// ASIMD load, 3 element, multiple, Q-form, B/H/S -// ASIMD load, 3 element, multiple, Q-form, D -def : InstRW<[A64FXWrite_LD105], - (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD113, WriteAdr], - (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 3 element, one lone, B/H -// ASIMD load, 3 element, one lane, S -// ASIMD load, 3 element, one lane, D -def : InstRW<[A64FXWrite_LD3I0], (instregex "^LD3i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_LD3I1, WriteAdr], - (instregex "^LD3i(8|16|32|64)_POST$")>; - -// ASIMD load, 3 element, all lanes, D-form, B/H/S -// ASIMD load, 3 element, all lanes, D-form, D -// ASIMD load, 3 element, all lanes, Q-form, B/H/S -// ASIMD load, 3 element, all lanes, Q-form, D -def : InstRW<[A64FXWrite_LD104], - (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD112, WriteAdr], - (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 4 element, multiple, D-form, B/H/S -// ASIMD load, 4 element, multiple, Q-form, B/H/S -// ASIMD load, 4 element, multiple, Q-form, D -def : InstRW<[A64FXWrite_LD107], - (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD115, WriteAdr], - (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 4 element, one lane, B/H -// ASIMD load, 4 element, one lane, S -// ASIMD load, 4 element, one lane, D -def : InstRW<[A64FXWrite_LD4I0], (instregex "^LD4i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_LD4I1, WriteAdr], - (instregex "^LD4i(8|16|32|64)_POST$")>; - -// ASIMD load, 4 element, all lanes, D-form, B/H/S -// ASIMD load, 4 element, all lanes, D-form, D -// ASIMD load, 4 element, all lanes, Q-form, B/H/S -// ASIMD load, 4 element, all lanes, Q-form, D -def : InstRW<[A64FXWrite_LD106], - (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_LD114, WriteAdr], - (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -//-- -// 3.16 ASIMD Store Instructions -//-- - -// ASIMD store, 1 element, multiple, 1 reg, D-form -// ASIMD store, 1 element, multiple, 1 reg, Q-form -def : InstRW<[A64FXWrite_ST10], - (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST14, WriteAdr], - (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, multiple, 2 reg, D-form -// ASIMD store, 1 element, multiple, 2 reg, Q-form -def : InstRW<[A64FXWrite_ST11], - (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST15, WriteAdr], - (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, multiple, 3 reg, D-form -// ASIMD store, 1 element, multiple, 3 reg, Q-form -def : InstRW<[A64FXWrite_ST12], - (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST16, WriteAdr], - (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, multiple, 4 reg, D-form -// ASIMD store, 1 element, multiple, 4 reg, Q-form -def : InstRW<[A64FXWrite_ST13], - (instregex 
"^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST17, WriteAdr], - (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, one lane, B/H/S -// ASIMD store, 1 element, one lane, D -def : InstRW<[A64FXWrite_ST10], - (instregex "^ST1i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_ST14, WriteAdr], - (instregex "^ST1i(8|16|32|64)_POST$")>; - -// ASIMD store, 2 element, multiple, D-form, B/H/S -// ASIMD store, 2 element, multiple, Q-form, B/H/S -// ASIMD store, 2 element, multiple, Q-form, D -def : InstRW<[A64FXWrite_ST11], - (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST15, WriteAdr], - (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 2 element, one lane, B/H/S -// ASIMD store, 2 element, one lane, D -def : InstRW<[A64FXWrite_ST11], - (instregex "^ST2i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_ST15, WriteAdr], - (instregex "^ST2i(8|16|32|64)_POST$")>; - -// ASIMD store, 3 element, multiple, D-form, B/H/S -// ASIMD store, 3 element, multiple, Q-form, B/H/S -// ASIMD store, 3 element, multiple, Q-form, D -def : InstRW<[A64FXWrite_ST12], - (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST16, WriteAdr], - (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 3 element, one lane, B/H -// ASIMD store, 3 element, one lane, S -// ASIMD store, 3 element, one lane, D -def : InstRW<[A64FXWrite_ST12], (instregex "^ST3i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_ST16, WriteAdr], - (instregex "^ST3i(8|16|32|64)_POST$")>; - -// ASIMD store, 4 element, multiple, D-form, B/H/S -// ASIMD store, 4 element, multiple, Q-form, B/H/S -// ASIMD store, 4 element, multiple, Q-form, D -def : InstRW<[A64FXWrite_ST13], - (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[A64FXWrite_ST17, WriteAdr], - (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 4 element, one lane, B/H -// ASIMD store, 4 element, one lane, S -// ASIMD store, 4 element, one lane, D -def : InstRW<[A64FXWrite_ST13], (instregex "^ST4i(8|16|32|64)$")>; -def : InstRW<[A64FXWrite_ST17, WriteAdr], - (instregex "^ST4i(8|16|32|64)_POST$")>; - -// V8.1a Atomics (LSE) -def : InstRW<[A64FXWrite_CAS, WriteAtomic], - (instrs CASB, CASH, CASW, CASX)>; - -def : InstRW<[A64FXWrite_CAS, WriteAtomic], - (instrs CASAB, CASAH, CASAW, CASAX)>; - -def : InstRW<[A64FXWrite_CAS, WriteAtomic], - (instrs CASLB, CASLH, CASLW, CASLX)>; - -def : InstRW<[A64FXWrite_CAS, WriteAtomic], - (instrs CASALB, CASALH, CASALW, CASALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDEORB, LDEORH, LDEORW, 
LDEORX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDSETB, LDSETH, LDSETW, LDSETX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX, - LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX, - LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX, - LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX, - LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX, - LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX, - LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX, - LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX, - LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX, - LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>; - -def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], - (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX, - LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX, - LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX, - LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>; - -def : InstRW<[A64FXWrite_SWP, WriteAtomic], - (instrs SWPB, SWPH, SWPW, SWPX)>; - -def : InstRW<[A64FXWrite_SWP, WriteAtomic], - (instrs SWPAB, SWPAH, SWPAW, SWPAX)>; - -def : InstRW<[A64FXWrite_SWP, WriteAtomic], - (instrs SWPLB, SWPLH, SWPLW, SWPLX)>; - -def : InstRW<[A64FXWrite_SWP, WriteAtomic], - (instrs SWPALB, SWPALH, SWPALW, SWPALX)>; - -def : InstRW<[A64FXWrite_STUR, WriteAtomic], - (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; - -// [ 1] "abs $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ABS_ZPmZ_B, ABS_ZPmZ_D, ABS_ZPmZ_H, ABS_ZPmZ_S)>; - -// [ 2] "add $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZZZ_B, ADD_ZZZ_D, ADD_ZZZ_H, ADD_ZZZ_S)>; - -// [ 3] "add $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZPmZ_B, ADD_ZPmZ_D, ADD_ZPmZ_H, ADD_ZPmZ_S)>; - -// [ 4] "add $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZI_B, ADD_ZI_D, ADD_ZI_H, ADD_ZI_S)>; - -// [ 5] "addpl $Rd, $Rn, $imm6"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDPL_XXI)>; - -// [ 6] "addvl $Rd, $Rn, $imm6"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDVL_XXI)>; - -// [ 7] "adr $Zd, [$Zn, $Zm]"; -def : InstRW<[A64FXWrite_5Cyc_GI0], (instrs ADR_LSL_ZZZ_D_0, ADR_LSL_ZZZ_D_1, ADR_LSL_ZZZ_D_2, ADR_LSL_ZZZ_D_3, ADR_LSL_ZZZ_S_0, ADR_LSL_ZZZ_S_1, ADR_LSL_ZZZ_S_2, ADR_LSL_ZZZ_S_3, ADR_SXTW_ZZZ_D_0, ADR_SXTW_ZZZ_D_1, ADR_SXTW_ZZZ_D_2, ADR_SXTW_ZZZ_D_3, ADR_UXTW_ZZZ_D_0, ADR_UXTW_ZZZ_D_1, ADR_UXTW_ZZZ_D_2, ADR_UXTW_ZZZ_D_3)>; - -// [ 8] "and $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs AND_PPzPP)>; - -// [ 9] "and $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZZZ)>; - -// [10] "and $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZPmZ_B, AND_ZPmZ_D, AND_ZPmZ_H, AND_ZPmZ_S)>; - -// [11] "and $Zdn, $_Zdn, $imms13"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs 
AND_ZI)>; - -// [12] "ands $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ANDS_PPzPP)>; - -// [13] "andv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ANDV_VPZ_B, ANDV_VPZ_D, ANDV_VPZ_H, ANDV_VPZ_S)>; - -// [14] "asr $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZZZ_B, ASR_WIDE_ZZZ_H, ASR_WIDE_ZZZ_S)>; - -// [15] "asr $Zd, $Zn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZZI_B, ASR_ZZI_D, ASR_ZZI_H, ASR_ZZI_S)>; - -// [16] "asr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZPmZ_B, ASR_WIDE_ZPmZ_H, ASR_WIDE_ZPmZ_S, ASR_ZPmZ_B, ASR_ZPmZ_D, ASR_ZPmZ_H, ASR_ZPmZ_S)>; - -// [17] "asr $Zdn, $Pg/m, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZPmI_B, ASR_ZPmI_D, ASR_ZPmI_H, ASR_ZPmI_S)>; - -// [18] "asrd $Zdn, $Pg/m, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRD_ZPmI_B, ASRD_ZPmI_D, ASRD_ZPmI_H, ASRD_ZPmI_S)>; - -// [19] "asrr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRR_ZPmZ_B, ASRR_ZPmZ_D, ASRR_ZPmZ_H, ASRR_ZPmZ_S)>; - -// [20] "bic $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BIC_PPzPP)>; - -// [21] "bic $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZZZ)>; - -// [22] "bic $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZPmZ_B, BIC_ZPmZ_D, BIC_ZPmZ_H, BIC_ZPmZ_S)>; - -// [23] "bics $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BICS_PPzPP)>; - -// [24] "brka $Pd, $Pg/m, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPmP)>; - -// [25] "brka $Pd, $Pg/z, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPzP)>; - -// [26] "brkas $Pd, $Pg/z, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKAS_PPzP)>; - -// [27] "brkb $Pd, $Pg/m, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPmP)>; - -// [28] "brkb $Pd, $Pg/z, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPzP)>; - -// [29] "brkbs $Pd, $Pg/z, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKBS_PPzP)>; - -// [30] "brkn $Pdm, $Pg/z, $Pn, $_Pdm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKN_PPzP)>; - -// [31] "brkns $Pdm, $Pg/z, $Pn, $_Pdm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKNS_PPzP)>; - -// [32] "brkpa $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPA_PPzPP)>; - -// [33] "brkpas $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPAS_PPzPP)>; - -// [34] "brkpb $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPB_PPzPP)>; - -// [35] "brkpbs $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPBS_PPzPP)>; - -// [36] "clasta $Rdn, $Pg, $_Rdn, $Zm"; -def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTA_RPZ_B, CLASTA_RPZ_D, CLASTA_RPZ_H, CLASTA_RPZ_S)>; - -// [37] "clasta $Vdn, $Pg, $_Vdn, $Zm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_VPZ_B, CLASTA_VPZ_D, CLASTA_VPZ_H, CLASTA_VPZ_S)>; - -// [38] "clasta $Zdn, $Pg, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_ZPZ_B, CLASTA_ZPZ_D, CLASTA_ZPZ_H, CLASTA_ZPZ_S)>; - -// [39] "clastb $Rdn, $Pg, $_Rdn, $Zm"; -def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTB_RPZ_B, CLASTB_RPZ_D, CLASTB_RPZ_H, CLASTB_RPZ_S)>; - -// [40] "clastb $Vdn, $Pg, $_Vdn, $Zm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_VPZ_B, CLASTB_VPZ_D, CLASTB_VPZ_H, CLASTB_VPZ_S)>; - -// [41] "clastb $Zdn, $Pg, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_ZPZ_B, 
CLASTB_ZPZ_D, CLASTB_ZPZ_H, CLASTB_ZPZ_S)>; - -// [42] "cls $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLS_ZPmZ_B, CLS_ZPmZ_D, CLS_ZPmZ_H, CLS_ZPmZ_S)>; - -// [43] "clz $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLZ_ZPmZ_B, CLZ_ZPmZ_D, CLZ_ZPmZ_H, CLZ_ZPmZ_S)>; - -// [44] "cmpeq $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZZ_B, CMPEQ_PPzZZ_D, CMPEQ_PPzZZ_H, CMPEQ_PPzZZ_S, CMPEQ_WIDE_PPzZZ_B, CMPEQ_WIDE_PPzZZ_H, CMPEQ_WIDE_PPzZZ_S)>; - -// [45] "cmpeq $Pd, $Pg/z, $Zn, $imm5"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZI_B, CMPEQ_PPzZI_D, CMPEQ_PPzZI_H, CMPEQ_PPzZI_S)>; - -// [46] "cmpge $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZZ_B, CMPGE_PPzZZ_D, CMPGE_PPzZZ_H, CMPGE_PPzZZ_S, CMPGE_WIDE_PPzZZ_B, CMPGE_WIDE_PPzZZ_H, CMPGE_WIDE_PPzZZ_S)>; - -// [47] "cmpge $Pd, $Pg/z, $Zn, $imm5"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZI_B, CMPGE_PPzZI_D, CMPGE_PPzZI_H, CMPGE_PPzZI_S)>; - -// [48] "cmpgt $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZZ_B, CMPGT_PPzZZ_D, CMPGT_PPzZZ_H, CMPGT_PPzZZ_S, CMPGT_WIDE_PPzZZ_B, CMPGT_WIDE_PPzZZ_H, CMPGT_WIDE_PPzZZ_S)>; - -// [49] "cmpgt $Pd, $Pg/z, $Zn, $imm5"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZI_B, CMPGT_PPzZI_D, CMPGT_PPzZI_H, CMPGT_PPzZI_S)>; - -// [50] "cmphi $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZZ_B, CMPHI_PPzZZ_D, CMPHI_PPzZZ_H, CMPHI_PPzZZ_S, CMPHI_WIDE_PPzZZ_B, CMPHI_WIDE_PPzZZ_H, CMPHI_WIDE_PPzZZ_S)>; - -// [51] "cmphi $Pd, $Pg/z, $Zn, $imm7"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZI_B, CMPHI_PPzZI_D, CMPHI_PPzZI_H, CMPHI_PPzZI_S)>; - -// [52] "cmphs $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZZ_B, CMPHS_PPzZZ_D, CMPHS_PPzZZ_H, CMPHS_PPzZZ_S, CMPHS_WIDE_PPzZZ_B, CMPHS_WIDE_PPzZZ_H, CMPHS_WIDE_PPzZZ_S)>; - -// [53] "cmphs $Pd, $Pg/z, $Zn, $imm7"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZI_B, CMPHS_PPzZI_D, CMPHS_PPzZI_H, CMPHS_PPzZI_S)>; - -// [54] "cmple $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_WIDE_PPzZZ_B, CMPLE_WIDE_PPzZZ_H, CMPLE_WIDE_PPzZZ_S)>; - -// [55] "cmple $Pd, $Pg/z, $Zn, $imm5"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_PPzZI_B, CMPLE_PPzZI_D, CMPLE_PPzZI_H, CMPLE_PPzZI_S)>; - -// [56] "cmplo $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_WIDE_PPzZZ_B, CMPLO_WIDE_PPzZZ_H, CMPLO_WIDE_PPzZZ_S)>; - -// [57] "cmplo $Pd, $Pg/z, $Zn, $imm7"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_PPzZI_B, CMPLO_PPzZI_D, CMPLO_PPzZI_H, CMPLO_PPzZI_S)>; - -// [58] "cmpls $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_WIDE_PPzZZ_B, CMPLS_WIDE_PPzZZ_H, CMPLS_WIDE_PPzZZ_S)>; - -// [59] "cmpls $Pd, $Pg/z, $Zn, $imm7"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_PPzZI_B, CMPLS_PPzZI_D, CMPLS_PPzZI_H, CMPLS_PPzZI_S)>; - -// [60] "cmplt $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_WIDE_PPzZZ_B, CMPLT_WIDE_PPzZZ_H, CMPLT_WIDE_PPzZZ_S)>; - -// [61] "cmplt $Pd, $Pg/z, $Zn, $imm5"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_PPzZI_B, CMPLT_PPzZI_D, CMPLT_PPzZI_H, CMPLT_PPzZI_S)>; - -// [62] "cmpne $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZZ_B, CMPNE_PPzZZ_D, CMPNE_PPzZZ_H, CMPNE_PPzZZ_S, CMPNE_WIDE_PPzZZ_B, CMPNE_WIDE_PPzZZ_H, CMPNE_WIDE_PPzZZ_S)>; - -// [63] "cmpne $Pd, $Pg/z, $Zn, $imm5"; 
-def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZI_B, CMPNE_PPzZI_D, CMPNE_PPzZI_H, CMPNE_PPzZI_S)>; - -// [64] "cnot $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs CNOT_ZPmZ_B, CNOT_ZPmZ_D, CNOT_ZPmZ_H, CNOT_ZPmZ_S)>; - -// [65] "cnt $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI3], (instrs CNT_ZPmZ_B, CNT_ZPmZ_D, CNT_ZPmZ_H, CNT_ZPmZ_S)>; - -// [66] "cntb $Rd, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTB_XPiI)>; - -// [67] "cntd $Rd, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTD_XPiI)>; - -// [68] "cnth $Rd, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTH_XPiI)>; - -// [69] "cntp $Rd, $Pg, $Pn"; -def : InstRW<[A64FXWrite_6Cyc_GI01], (instrs CNTP_XPP_B, CNTP_XPP_D, CNTP_XPP_H, CNTP_XPP_S)>; - -// [70] "cntw $Rd, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTW_XPiI)>; - -// [71] "compact $Zd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs COMPACT_ZPZ_D, COMPACT_ZPZ_S)>; - -// [72] "cpy $Zd, $Pg/m, $Rn"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>; - -// [73] "cpy $Zd, $Pg/m, $Vn"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>; - -// [74] "cpy $Zd, $Pg/m, $imm"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>; - -// [75] "cpy $Zd, $Pg/z, $imm"; -//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>; - -// [76] "ctermeq $Rn, $Rm"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMEQ_WW, CTERMEQ_XX)>; - -// [77] "ctermne $Rn, $Rm"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMNE_WW, CTERMNE_XX)>; - -// [78] "decb $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECB_XPiI)>; - -// [79] "decd $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECD_XPiI)>; - -// [80] "decd $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECD_ZPiI)>; - -// [81] "dech $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECH_XPiI)>; - -// [82] "dech $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECH_ZPiI)>; - -// [83] "decp $Rdn, $Pg"; -def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs DECP_XP_B, DECP_XP_D, DECP_XP_H, DECP_XP_S)>; - -// [84] "decp $Zdn, $Pg"; -def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs DECP_ZP_D, DECP_ZP_H, DECP_ZP_S)>; - -// [85] "decw $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECW_XPiI)>; - -// [86] "decw $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECW_ZPiI)>; - -// [87] "dup $Zd, $Rn"; -def : InstRW<[A64FXWrite_8Cyc_GI01], (instrs DUP_ZR_B, DUP_ZR_D, DUP_ZR_H, DUP_ZR_S)>; - -// [88] "dup $Zd, $Zn$idx"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs DUP_ZZI_B, DUP_ZZI_D, DUP_ZZI_H, DUP_ZZI_Q, DUP_ZZI_S)>; - -// [89] "dup $Zd, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUP_ZI_B, DUP_ZI_D, DUP_ZI_H, DUP_ZI_S)>; - -// [90] "dupm $Zd, $imms"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUPM_ZI)>; - -// [91] "eor $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EOR_PPzPP)>; - -// [92] "eor $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZZZ)>; - -// [93] "eor $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZPmZ_B, EOR_ZPmZ_D, EOR_ZPmZ_H, EOR_ZPmZ_S)>; - -// [94] "eor $Zdn, $_Zdn, $imms13"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs EOR_ZI)>; - 
-// [95] "eors $Pd, $Pg/z, $Pn, $Pm";
-def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EORS_PPzPP)>;
-
-// [96] "eorv $Vd, $Pg, $Zn";
-def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs EORV_VPZ_B, EORV_VPZ_D, EORV_VPZ_H, EORV_VPZ_S)>;
-
-// [97] "ext $Zdn, $_Zdn, $Zm, $imm8";
-def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs EXT_ZZI)>;
-
-// [99] "fabd $Zdn, $Pg/m, $_Zdn, $Zm";
-def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FABD_ZPmZ_D, FABD_ZPmZ_H, FABD_ZPmZ_S)>;
-
-// [100] "fabs $Zd, $Pg/m, $Zn";
-def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FABS_ZPmZ_D, FABS_ZPmZ_H, FABS_ZPmZ_S)>;
-
-// [101] "facge $Pd, $Pg/z, $Zn, $Zm";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGE_PPzZZ_D, FACGE_PPzZZ_H, FACGE_PPzZZ_S)>;
-
-// [102] "facgt $Pd, $Pg/z, $Zn, $Zm";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGT_PPzZZ_D, FACGT_PPzZZ_H, FACGT_PPzZZ_S)>;
-
-// [103] "fadd $Zd, $Zn, $Zm"; def is line 1638
-def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZZZ_D, FADD_ZZZ_H, FADD_ZZZ_S)>;
-
-// [104] "fadd $Zdn, $Pg/m, $_Zdn, $Zm"; def is line 1638
-def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmZ_D, FADD_ZPmZ_H, FADD_ZPmZ_S)>;
-
-// [105] "fadd $Zdn, $Pg/m, $_Zdn, $i1"; def is line 1638
-def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmI_D, FADD_ZPmI_H, FADD_ZPmI_S)>;
-
-// [106] "fadda $Vdn, $Pg, $_Vdn, $Zm";
-def : InstRW<[A64FXWrite_18Cyc_GI03], (instrs FADDA_VPZ_D, FADDA_VPZ_H, FADDA_VPZ_S)>;
-
-// [107] "faddv $Vd, $Pg, $Zn";
-// H : 4 / 6 / ([1,2]9 / [1]6) x 4 / [1,2]9 = 75 cycle
-// S : 4 / 6 / ([1,2]9 / [1]6) x 3 / [1,2]9 = 60 cycle
-// D : 4 / 6 / ([1,2]9 / [1]6) x 2 / [1,2]9 = 45 cycle
-def : InstRW<[A64FXWrite_75Cyc_GI03], (instrs FADDV_VPZ_H)>;
-def : InstRW<[A64FXWrite_60Cyc_GI03], (instrs FADDV_VPZ_S)>;
-def : InstRW<[A64FXWrite_45Cyc_GI03], (instrs FADDV_VPZ_D)>;
-
-// [108] "fcadd $Zdn, $Pg/m, $_Zdn, $Zm, $imm";
-def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCADD_ZPmZ_D, FCADD_ZPmZ_H, FCADD_ZPmZ_S)>;
-
-// [109] "fcmeq $Pd, $Pg/z, $Zn, #0.0";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZ0_D, FCMEQ_PPzZ0_H, FCMEQ_PPzZ0_S)>;
-
-// [110] "fcmeq $Pd, $Pg/z, $Zn, $Zm";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZZ_D, FCMEQ_PPzZZ_H, FCMEQ_PPzZZ_S)>;
-
-// [111] "fcmge $Pd, $Pg/z, $Zn, #0.0";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZ0_D, FCMGE_PPzZ0_H, FCMGE_PPzZ0_S)>;
-
-// [112] "fcmge $Pd, $Pg/z, $Zn, $Zm";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZZ_D, FCMGE_PPzZZ_H, FCMGE_PPzZZ_S)>;
-
-// [113] "fcmgt $Pd, $Pg/z, $Zn, #0.0";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZ0_D, FCMGT_PPzZ0_H, FCMGT_PPzZ0_S)>;
-
-// [114] "fcmgt $Pd, $Pg/z, $Zn, $Zm";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZZ_D, FCMGT_PPzZZ_H, FCMGT_PPzZZ_S)>;
-
-// [115] "fcmla $Zda, $Pg/m, $Zn, $Zm, $imm";
-def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZPmZZ_D, FCMLA_ZPmZZ_H, FCMLA_ZPmZZ_S)>;
-
-// [116] "fcmla $Zda, $Zn, $Zm$iop, $imm";
-def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZZZI_H, FCMLA_ZZZI_S)>;
-
-// [117] "fcmle $Pd, $Pg/z, $Zn, #0.0";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLE_PPzZ0_D, FCMLE_PPzZ0_H, FCMLE_PPzZ0_S)>;
-
-// [118] "fcmlt $Pd, $Pg/z, $Zn, #0.0";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLT_PPzZ0_D, FCMLT_PPzZ0_H, FCMLT_PPzZ0_S)>;
-
-// [119] "fcmne $Pd, $Pg/z, $Zn, #0.0";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZ0_D, FCMNE_PPzZ0_H, FCMNE_PPzZ0_S)>;
-
-// [120] "fcmne $Pd, $Pg/z, $Zn, $Zm";
-def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZZ_D, FCMNE_PPzZZ_H, FCMNE_PPzZZ_S)>;
-
-// [121] "fcmuo $Pd, $Pg/z, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMUO_PPzZZ_D, FCMUO_PPzZZ_H, FCMUO_PPzZZ_S)>; - -// [122] "fcpy $Zd, $Pg/m, $imm8"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCPY_ZPmI_D, FCPY_ZPmI_H, FCPY_ZPmI_S)>; - -// [123] "fcvt $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVT_ZPmZ_DtoH, FCVT_ZPmZ_DtoS, FCVT_ZPmZ_HtoD, FCVT_ZPmZ_HtoS, FCVT_ZPmZ_StoD, FCVT_ZPmZ_StoH)>; - -// [124] "fcvtzs $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZS_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoS, FCVTZS_ZPmZ_HtoD, FCVTZS_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoS, FCVTZS_ZPmZ_StoD, FCVTZS_ZPmZ_StoS)>; - -// [125] "fcvtzu $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZU_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoS, FCVTZU_ZPmZ_HtoD, FCVTZU_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoS, FCVTZU_ZPmZ_StoD, FCVTZU_ZPmZ_StoS)>; - -// [126] "fdiv $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIV_ZPmZ_D)>; -def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIV_ZPmZ_H)>; -def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIV_ZPmZ_S)>; - -// [127] "fdivr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIVR_ZPmZ_D)>; -def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIVR_ZPmZ_H)>; -def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIVR_ZPmZ_S)>; - -// [128] "fdup $Zd, $imm8"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FDUP_ZI_D, FDUP_ZI_H, FDUP_ZI_S)>; - -// [129] "fexpa $Zd, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FEXPA_ZZ_D, FEXPA_ZZ_H, FEXPA_ZZ_S)>; - -// [130] "fmad $Zdn, $Pg/m, $Zm, $Za"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMAD_ZPmZZ_D, FMAD_ZPmZZ_H, FMAD_ZPmZZ_S)>; - -// [131] "fmax $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAX_ZPmZ_D, FMAX_ZPmZ_H, FMAX_ZPmZ_S)>; - -// [132] "fmax $Zdn, $Pg/m, $_Zdn, $i1"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAX_ZPmI_D, FMAX_ZPmI_H, FMAX_ZPmI_S)>; - -// [133] "fmaxnm $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAXNM_ZPmZ_D, FMAXNM_ZPmZ_H, FMAXNM_ZPmZ_S)>; - -// [134] "fmaxnm $Zdn, $Pg/m, $_Zdn, $i1"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAXNM_ZPmI_D, FMAXNM_ZPmI_H, FMAXNM_ZPmI_S)>; - -// [135] "fmaxnmv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXNMV_VPZ_D, FMAXNMV_VPZ_H, FMAXNMV_VPZ_S)>; - -// [136] "fmaxv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXV_VPZ_D, FMAXV_VPZ_H, FMAXV_VPZ_S)>; - -// [137] "fmin $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMIN_ZPmZ_D, FMIN_ZPmZ_H, FMIN_ZPmZ_S)>; - -// [138] "fmin $Zdn, $Pg/m, $_Zdn, $i1"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMIN_ZPmI_D, FMIN_ZPmI_H, FMIN_ZPmI_S)>; - -// [139] "fminnm $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMINNM_ZPmZ_D, FMINNM_ZPmZ_H, FMINNM_ZPmZ_S)>; - -// [140] "fminnm $Zdn, $Pg/m, $_Zdn, $i1"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMINNM_ZPmI_D, FMINNM_ZPmI_H, FMINNM_ZPmI_S)>; - -// [141] "fminnmv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINNMV_VPZ_D, FMINNMV_VPZ_H, FMINNMV_VPZ_S)>; - -// [142] "fminv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINV_VPZ_D, FMINV_VPZ_H, FMINV_VPZ_S)>; - -// [143] "fmla $Zda, $Pg/m, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZPmZZ_D, FMLA_ZPmZZ_H, FMLA_ZPmZZ_S)>; - -// [144] "fmla $Zda, $Zn, $Zm$iop"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZZZI_D, FMLA_ZZZI_H, FMLA_ZZZI_S)>; - -// [145] "fmls $Zda, $Pg/m, 
$Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZPmZZ_D, FMLS_ZPmZZ_H, FMLS_ZPmZZ_S)>; - -// [146] "fmls $Zda, $Zn, $Zm$iop"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZZZI_D, FMLS_ZZZI_H, FMLS_ZZZI_S)>; - -// [147] "fmsb $Zdn, $Pg/m, $Zm, $Za"; - -// [148] "fmul $Zd, $Zn, $Zm"; - -// [149] "fmul $Zd, $Zn, $Zm$iop"; - -// [150] "fmul $Zdn, $Pg/m, $_Zdn, $Zm"; - -// [151] "fmul $Zdn, $Pg/m, $_Zdn, $i1"; - -// [152] "fmulx $Zdn, $Pg/m, $_Zdn, $Zm"; - -// [153] "fneg $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FNEG_ZPmZ_D, FNEG_ZPmZ_H, FNEG_ZPmZ_S)>; - -// [154] "fnmad $Zdn, $Pg/m, $Zm, $Za"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMAD_ZPmZZ_D, FNMAD_ZPmZZ_H, FNMAD_ZPmZZ_S)>; - -// [155] "fnmla $Zda, $Pg/m, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLA_ZPmZZ_D, FNMLA_ZPmZZ_H, FNMLA_ZPmZZ_S)>; - -// [156] "fnmls $Zda, $Pg/m, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLS_ZPmZZ_D, FNMLS_ZPmZZ_H, FNMLS_ZPmZZ_S)>; - -// [157] "fnmsb $Zdn, $Pg/m, $Zm, $Za"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMSB_ZPmZZ_D, FNMSB_ZPmZZ_H, FNMSB_ZPmZZ_S)>; - -// [158] "frecpe $Zd, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPE_ZZ_D, FRECPE_ZZ_H, FRECPE_ZZ_S)>; - -// [159] "frecps $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRECPS_ZZZ_D, FRECPS_ZZZ_H, FRECPS_ZZZ_S)>; - -// [160] "frecpx $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPX_ZPmZ_D, FRECPX_ZPmZ_H, FRECPX_ZPmZ_S)>; - -// [161] "frinta $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTA_ZPmZ_D, FRINTA_ZPmZ_H, FRINTA_ZPmZ_S)>; - -// [162] "frinti $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTI_ZPmZ_D, FRINTI_ZPmZ_H, FRINTI_ZPmZ_S)>; - -// [163] "frintm $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTM_ZPmZ_D, FRINTM_ZPmZ_H, FRINTM_ZPmZ_S)>; - -// [164] "frintn $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTN_ZPmZ_D, FRINTN_ZPmZ_H, FRINTN_ZPmZ_S)>; - -// [165] "frintp $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTP_ZPmZ_D, FRINTP_ZPmZ_H, FRINTP_ZPmZ_S)>; - -// [166] "frintx $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTX_ZPmZ_D, FRINTX_ZPmZ_H, FRINTX_ZPmZ_S)>; - -// [167] "frintz $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTZ_ZPmZ_D, FRINTZ_ZPmZ_H, FRINTZ_ZPmZ_S)>; - -// [168] "frsqrte $Zd, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRSQRTE_ZZ_D, FRSQRTE_ZZ_H, FRSQRTE_ZZ_S)>; - -// [169] "frsqrts $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRSQRTS_ZZZ_D, FRSQRTS_ZZZ_H, FRSQRTS_ZZZ_S)>; - -// [170] "fscale $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSCALE_ZPmZ_D, FSCALE_ZPmZ_H, FSCALE_ZPmZ_S)>; - -// [171] "fsqrt $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FSQRT_ZPmZ_D)>; -def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FSQRT_ZPmZ_H)>; -def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FSQRT_ZPmZ_S)>; - -// [172] "fsub $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZZZ_D, FSUB_ZZZ_H, FSUB_ZZZ_S)>; - -// [173] "fsub $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZPmZ_D, FSUB_ZPmZ_H, FSUB_ZPmZ_S)>; - -// [174] "fsub $Zdn, $Pg/m, $_Zdn, $i1"; -def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUB_ZPmI_D, FSUB_ZPmI_H, FSUB_ZPmI_S)>; - -// [175] "fsubr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUBR_ZPmZ_D, FSUBR_ZPmZ_H, 
FSUBR_ZPmZ_S)>; - -// [176] "fsubr $Zdn, $Pg/m, $_Zdn, $i1"; -def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUBR_ZPmI_D, FSUBR_ZPmI_H, FSUBR_ZPmI_S)>; - -// [177] "ftmad $Zdn, $_Zdn, $Zm, $imm3"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTMAD_ZZI_D, FTMAD_ZZI_H, FTMAD_ZZI_S)>; - -// [178] "ftsmul $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTSMUL_ZZZ_D, FTSMUL_ZZZ_H, FTSMUL_ZZZ_S)>; - -// [180] "incb $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCB_XPiI)>; - -// [181] "incd $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCD_XPiI)>; - -// [182] "incd $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCD_ZPiI)>; - -// [183] "inch $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCH_XPiI)>; - -// [184] "inch $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCH_ZPiI)>; - -// [185] "incp $Rdn, $Pg"; -def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs INCP_XP_B, INCP_XP_D, INCP_XP_H, INCP_XP_S)>; - -// [186] "incp $Zdn, $Pg"; -def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs INCP_ZP_D, INCP_ZP_H, INCP_ZP_S)>; - -// [187] "incw $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCW_XPiI)>; - -// [188] "incw $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCW_ZPiI)>; - -// [189] "index $Zd, $Rn, $Rm"; -def : InstRW<[A64FXWrite_17Cyc_GI02], (instrs INDEX_RR_B, INDEX_RR_D, INDEX_RR_H, INDEX_RR_S)>; - -// [190] "index $Zd, $Rn, $imm5"; -def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_RI_B, INDEX_RI_D, INDEX_RI_H, INDEX_RI_S)>; - -// [191] "index $Zd, $imm5, $Rm"; -def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_IR_B, INDEX_IR_D, INDEX_IR_H, INDEX_IR_S)>; - -// [192] "index $Zd, $imm5, $imm5b"; -def : InstRW<[A64FXWrite_13Cyc_GI0], (instrs INDEX_II_B, INDEX_II_D, INDEX_II_H, INDEX_II_S)>; - -// [193] "insr $Zdn, $Rm"; -def : InstRW<[A64FXWrite_10Cyc_GI02], (instrs INSR_ZR_B, INSR_ZR_D, INSR_ZR_H, INSR_ZR_S)>; - -// [194] "insr $Zdn, $Vm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs INSR_ZV_B, INSR_ZV_D, INSR_ZV_H, INSR_ZV_S)>; - -// [195] "lasta $Rd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTA_RPZ_B, LASTA_RPZ_D, LASTA_RPZ_H, LASTA_RPZ_S)>; - -// [196] "lasta $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTA_VPZ_B, LASTA_VPZ_D, LASTA_VPZ_H, LASTA_VPZ_S)>; - -// [197] "lastb $Rd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTB_RPZ_B, LASTB_RPZ_D, LASTB_RPZ_H, LASTB_RPZ_S)>; - -// [198] "lastb $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTB_VPZ_B, LASTB_VPZ_D, LASTB_VPZ_H, LASTB_VPZ_S)>; - -// [199] "ld1b $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B, LD1B_D, LD1B_H, LD1B_S)>; - -// [200] "ld1b $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1B_D_REAL, GLD1B_D_SXTW_REAL, GLD1B_D_UXTW_REAL, GLD1B_S_SXTW_REAL, GLD1B_S_UXTW_REAL)>; - -// [201] "ld1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B_D_IMM_REAL, LD1B_H_IMM_REAL, LD1B_IMM_REAL, LD1B_S_IMM_REAL)>; - -// [202] "ld1b $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1B_D_IMM_REAL, GLD1B_S_IMM_REAL)>; - -// [203] "ld1d $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D)>; - -// [204] "ld1d $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1D_REAL, GLD1D_SCALED_REAL, GLD1D_SXTW_REAL, 
GLD1D_SXTW_SCALED_REAL, GLD1D_UXTW_REAL, GLD1D_UXTW_SCALED_REAL)>; - -// [205] "ld1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D_IMM_REAL)>; - -// [206] "ld1d $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1D_IMM_REAL)>; - -// [207] "ld1h $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H, LD1H_D, LD1H_S)>; - -// [208] "ld1h $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1H_D_REAL, GLD1H_D_SCALED_REAL, GLD1H_D_SXTW_REAL, GLD1H_D_SXTW_SCALED_REAL, GLD1H_D_UXTW_REAL, GLD1H_D_UXTW_SCALED_REAL, GLD1H_S_SXTW_REAL, GLD1H_S_SXTW_SCALED_REAL, GLD1H_S_UXTW_REAL, GLD1H_S_UXTW_SCALED_REAL)>; - -// [209] "ld1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H_D_IMM_REAL, LD1H_IMM_REAL, LD1H_S_IMM_REAL)>; - -// [210] "ld1h $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1H_D_IMM_REAL, GLD1H_S_IMM_REAL)>; - -// [211] "ld1rb $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RB_D_IMM, LD1RB_H_IMM, LD1RB_IMM, LD1RB_S_IMM)>; - -// [212] "ld1rd $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RD_IMM)>; - -// [213] "ld1rh $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RH_D_IMM, LD1RH_IMM, LD1RH_S_IMM)>; - -// [214] "ld1rqb $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B)>; - -// [215] "ld1rqb $Zt, $Pg/z, [$Rn, $imm4]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B_IMM)>; - -// [216] "ld1rqd $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D)>; - -// [217] "ld1rqd $Zt, $Pg/z, [$Rn, $imm4]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D_IMM)>; - -// [218] "ld1rqh $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H)>; - -// [219] "ld1rqh $Zt, $Pg/z, [$Rn, $imm4]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H_IMM)>; - -// [220] "ld1rqw $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W)>; - -// [221] "ld1rqw $Zt, $Pg/z, [$Rn, $imm4]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W_IMM)>; - -// [222] "ld1rsb $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSB_D_IMM, LD1RSB_H_IMM, LD1RSB_S_IMM)>; - -// [223] "ld1rsh $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSH_D_IMM, LD1RSH_S_IMM)>; - -// [224] "ld1rsw $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSW_IMM)>; - -// [225] "ld1rw $Zt, $Pg/z, [$Rn, $imm6]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RW_D_IMM, LD1RW_IMM)>; - -// [226] "ld1sb $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D, LD1SB_H, LD1SB_S)>; - -// [227] "ld1sb $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SB_D_REAL, GLD1SB_D_SXTW_REAL, GLD1SB_D_UXTW_REAL, GLD1SB_S_SXTW_REAL, GLD1SB_S_UXTW_REAL)>; - -// [228] "ld1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D_IMM_REAL, LD1SB_H_IMM_REAL, LD1SB_S_IMM_REAL)>; - -// [229] "ld1sb $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SB_D_IMM_REAL, GLD1SB_S_IMM_REAL)>; - -// [230] "ld1sh $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D, LD1SH_S)>; - -// [231] "ld1sh $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SH_D_REAL, 
GLD1SH_D_SCALED_REAL, GLD1SH_D_SXTW_REAL, GLD1SH_D_SXTW_SCALED_REAL, GLD1SH_D_UXTW_REAL, GLD1SH_D_UXTW_SCALED_REAL, GLD1SH_S_SXTW_REAL, GLD1SH_S_SXTW_SCALED_REAL, GLD1SH_S_UXTW_REAL, GLD1SH_S_UXTW_SCALED_REAL)>; - -// [232] "ld1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D_IMM_REAL, LD1SH_S_IMM_REAL)>; - -// [233] "ld1sh $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SH_D_IMM_REAL, GLD1SH_S_IMM_REAL)>; - -// [234] "ld1sw $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D)>; - -// [235] "ld1sw $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SW_D_REAL, GLD1SW_D_SCALED_REAL, GLD1SW_D_SXTW_REAL, GLD1SW_D_SXTW_SCALED_REAL, GLD1SW_D_UXTW_REAL, GLD1SW_D_UXTW_SCALED_REAL)>; - -// [236] "ld1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D_IMM_REAL)>; - -// [237] "ld1sw $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SW_D_IMM_REAL)>; - -// [238] "ld1w $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W, LD1W_D)>; - -// [239] "ld1w $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1W_D_REAL, GLD1W_D_SCALED_REAL, GLD1W_D_SXTW_REAL, GLD1W_D_SXTW_SCALED_REAL, GLD1W_D_UXTW_REAL, GLD1W_D_UXTW_SCALED_REAL, GLD1W_SXTW_REAL, GLD1W_SXTW_SCALED_REAL, GLD1W_UXTW_REAL, GLD1W_UXTW_SCALED_REAL)>; - -// [240] "ld1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W_D_IMM_REAL, LD1W_IMM_REAL)>; - -// [241] "ld1w $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1W_D_IMM_REAL, GLD1W_IMM_REAL)>; - -// [242] "ld2b $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B)>; - -// [243] "ld2b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B_IMM)>; - -// [244] "ld2d $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D)>; - -// [245] "ld2d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D_IMM)>; - -// [246] "ld2h $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H)>; - -// [247] "ld2h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H_IMM)>; - -// [248] "ld2w $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W)>; - -// [249] "ld2w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W_IMM)>; - -// [250] "ld3b $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B)>; - -// [251] "ld3b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B_IMM)>; - -// [252] "ld3d $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D)>; - -// [253] "ld3d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D_IMM)>; - -// [254] "ld3h $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H)>; - -// [255] "ld3h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H_IMM)>; - -// [256] "ld3w $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W)>; - -// [257] "ld3w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W_IMM)>; - -// [258] "ld4b $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B)>; - -// [259] "ld4b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def 
: InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B_IMM)>; - -// [260] "ld4d $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D)>; - -// [261] "ld4d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D_IMM)>; - -// [262] "ld4h $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H)>; - -// [263] "ld4h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H_IMM)>; - -// [264] "ld4w $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W)>; - -// [265] "ld4w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W_IMM)>; - -// [266] "ldff1b $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1B_D_REAL, LDFF1B_H_REAL, LDFF1B_REAL, LDFF1B_S_REAL)>; - -// [267] "ldff1b $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1B_D_REAL, GLDFF1B_D_SXTW_REAL, GLDFF1B_D_UXTW_REAL, GLDFF1B_S_SXTW_REAL, GLDFF1B_S_UXTW_REAL)>; - -// [268] "ldff1b $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1B_D_IMM_REAL, GLDFF1B_S_IMM_REAL)>; - -// [269] "ldff1d $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1D_REAL)>; - -// [270] "ldff1d $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1D_REAL, GLDFF1D_SCALED_REAL, GLDFF1D_SXTW_REAL, GLDFF1D_SXTW_SCALED_REAL, GLDFF1D_UXTW_REAL, GLDFF1D_UXTW_SCALED_REAL)>; - -// [271] "ldff1d $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1D_IMM_REAL)>; - -// [272] "ldff1h $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1H_D_REAL, LDFF1H_REAL, LDFF1H_S_REAL)>; - -// [273] "ldff1h $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1H_D_REAL, GLDFF1H_D_SCALED_REAL, GLDFF1H_D_SXTW_REAL, GLDFF1H_D_SXTW_SCALED_REAL, GLDFF1H_D_UXTW_REAL, GLDFF1H_D_UXTW_SCALED_REAL, GLDFF1H_S_SXTW_REAL, GLDFF1H_S_SXTW_SCALED_REAL, GLDFF1H_S_UXTW_REAL, GLDFF1H_S_UXTW_SCALED_REAL)>; - -// [274] "ldff1h $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1H_D_IMM_REAL, GLDFF1H_S_IMM_REAL)>; - -// [275] "ldff1sb $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SB_D_REAL, LDFF1SB_H_REAL, LDFF1SB_S_REAL)>; - -// [276] "ldff1sb $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SB_D_REAL, GLDFF1SB_D_SXTW_REAL, GLDFF1SB_D_UXTW_REAL, GLDFF1SB_S_SXTW_REAL, GLDFF1SB_S_UXTW_REAL)>; - -// [277] "ldff1sb $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SB_D_IMM_REAL, GLDFF1SB_S_IMM_REAL)>; - -// [278] "ldff1sh $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SH_D_REAL, LDFF1SH_S_REAL)>; - -// [279] "ldff1sh $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SH_D_REAL, GLDFF1SH_D_SCALED_REAL, GLDFF1SH_D_SXTW_REAL, GLDFF1SH_D_SXTW_SCALED_REAL, GLDFF1SH_D_UXTW_REAL, GLDFF1SH_D_UXTW_SCALED_REAL, GLDFF1SH_S_SXTW_REAL, GLDFF1SH_S_SXTW_SCALED_REAL, GLDFF1SH_S_UXTW_REAL, GLDFF1SH_S_UXTW_SCALED_REAL)>; - -// [280] "ldff1sh $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SH_D_IMM_REAL, GLDFF1SH_S_IMM_REAL)>; - -// [281] "ldff1sw $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SW_D_REAL)>; - -// [282] "ldff1sw $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SW_D_REAL, 
GLDFF1SW_D_SCALED_REAL, GLDFF1SW_D_SXTW_REAL, GLDFF1SW_D_SXTW_SCALED_REAL, GLDFF1SW_D_UXTW_REAL, GLDFF1SW_D_UXTW_SCALED_REAL)>; - -// [283] "ldff1sw $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SW_D_IMM_REAL)>; - -// [284] "ldff1w $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1W_D_REAL, LDFF1W_REAL)>; - -// [285] "ldff1w $Zt, $Pg/z, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1W_D_REAL, GLDFF1W_D_SCALED_REAL, GLDFF1W_D_SXTW_REAL, GLDFF1W_D_SXTW_SCALED_REAL, GLDFF1W_D_UXTW_REAL, GLDFF1W_D_UXTW_SCALED_REAL, GLDFF1W_SXTW_REAL, GLDFF1W_SXTW_SCALED_REAL, GLDFF1W_UXTW_REAL, GLDFF1W_UXTW_SCALED_REAL)>; - -// [286] "ldff1w $Zt, $Pg/z, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1W_D_IMM_REAL, GLDFF1W_IMM_REAL)>; - -// [287] "ldnf1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1B_D_IMM_REAL, LDNF1B_H_IMM_REAL, LDNF1B_IMM_REAL, LDNF1B_S_IMM_REAL)>; - -// [288] "ldnf1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1D_IMM_REAL)>; - -// [289] "ldnf1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1H_D_IMM_REAL, LDNF1H_IMM_REAL, LDNF1H_S_IMM_REAL)>; - -// [290] "ldnf1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SB_D_IMM_REAL, LDNF1SB_H_IMM_REAL, LDNF1SB_S_IMM_REAL)>; - -// [291] "ldnf1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SH_D_IMM_REAL, LDNF1SH_S_IMM_REAL)>; - -// [292] "ldnf1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SW_D_IMM_REAL)>; - -// [293] "ldnf1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1W_D_IMM_REAL, LDNF1W_IMM_REAL)>; - -// [294] "ldnt1b $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRR)>; - -// [295] "ldnt1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRI)>; - -// [296] "ldnt1d $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRR)>; - -// [297] "ldnt1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRI)>; - -// [298] "ldnt1h $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRR)>; - -// [299] "ldnt1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRI)>; - -// [300] "ldnt1w $Zt, $Pg/z, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRR)>; - -// [301] "ldnt1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRI)>; - -// [302] "ldr $Pt, [$Rn, $imm9, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_PXI)>; - -// [303] "ldr $Zt, [$Rn, $imm9, mul vl]"; -def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_ZXI)>; - -// [304] "lsl $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZZZ_B, LSL_WIDE_ZZZ_H, LSL_WIDE_ZZZ_S)>; - -// [305] "lsl $Zd, $Zn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZZI_B, LSL_ZZI_D, LSL_ZZI_H, LSL_ZZI_S)>; - -// [306] "lsl $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZPmZ_B, LSL_WIDE_ZPmZ_H, LSL_WIDE_ZPmZ_S, LSL_ZPmZ_B, LSL_ZPmZ_D, LSL_ZPmZ_H, LSL_ZPmZ_S)>; - -// [307] "lsl $Zdn, $Pg/m, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZPmI_B, LSL_ZPmI_D, LSL_ZPmI_H, LSL_ZPmI_S)>; - -// [308] "lslr 
$Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSLR_ZPmZ_B, LSLR_ZPmZ_D, LSLR_ZPmZ_H, LSLR_ZPmZ_S)>; - -// [309] "lsr $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZZZ_B, LSR_WIDE_ZZZ_H, LSR_WIDE_ZZZ_S)>; - -// [310] "lsr $Zd, $Zn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZZI_B, LSR_ZZI_D, LSR_ZZI_H, LSR_ZZI_S)>; - -// [311] "lsr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZPmZ_B, LSR_WIDE_ZPmZ_H, LSR_WIDE_ZPmZ_S, LSR_ZPmZ_B, LSR_ZPmZ_D, LSR_ZPmZ_H, LSR_ZPmZ_S)>; - -// [312] "lsr $Zdn, $Pg/m, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZPmI_B, LSR_ZPmI_D, LSR_ZPmI_H, LSR_ZPmI_S)>; - -// [313] "lsrr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSRR_ZPmZ_B, LSRR_ZPmZ_D, LSRR_ZPmZ_H, LSRR_ZPmZ_S)>; - -// [314] "mad $Zdn, $Pg/m, $Zm, $Za"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MAD_ZPmZZ_B, MAD_ZPmZZ_D, MAD_ZPmZZ_H, MAD_ZPmZZ_S)>; - -// [315] "mla $Zda, $Pg/m, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLA_ZPmZZ_B, MLA_ZPmZZ_D, MLA_ZPmZZ_H, MLA_ZPmZZ_S)>; - -// [316] "mls $Zda, $Pg/m, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLS_ZPmZZ_B, MLS_ZPmZZ_D, MLS_ZPmZZ_H, MLS_ZPmZZ_S)>; - -// [317] "movprfx $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPmZ_B, MOVPRFX_ZPmZ_D, MOVPRFX_ZPmZ_H, MOVPRFX_ZPmZ_S)>; - -// [318] "movprfx $Zd, $Pg/z, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPzZ_B, MOVPRFX_ZPzZ_D, MOVPRFX_ZPzZ_H, MOVPRFX_ZPzZ_S)>; - -// [319] "movprfx $Zd, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZZ)>; - -// [320] "msb $Zdn, $Pg/m, $Zm, $Za"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MSB_ZPmZZ_B, MSB_ZPmZZ_D, MSB_ZPmZZ_H, MSB_ZPmZZ_S)>; - -// [321] "mul $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MUL_ZPmZ_B, MUL_ZPmZ_D, MUL_ZPmZ_H, MUL_ZPmZ_S)>; - -// [322] "mul $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs MUL_ZI_B, MUL_ZI_D, MUL_ZI_H, MUL_ZI_S)>; - -// [323] "nand $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NAND_PPzPP)>; - -// [324] "nands $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NANDS_PPzPP)>; - -// [325] "neg $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NEG_ZPmZ_B, NEG_ZPmZ_D, NEG_ZPmZ_H, NEG_ZPmZ_S)>; - -// [326] "nor $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NOR_PPzPP)>; - -// [327] "nors $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NORS_PPzPP)>; - -// [328] "not $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NOT_ZPmZ_B, NOT_ZPmZ_D, NOT_ZPmZ_H, NOT_ZPmZ_S)>; - -// [329] "orn $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORN_PPzPP)>; - -// [330] "orns $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORNS_PPzPP)>; - -// [331] "orr $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORR_PPzPP)>; - -// [332] "orr $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZZZ)>; - -// [333] "orr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZPmZ_B, ORR_ZPmZ_D, ORR_ZPmZ_H, ORR_ZPmZ_S)>; - -// [334] "orr $Zdn, $_Zdn, $imms13"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs ORR_ZI)>; - -// [335] "orrs $Pd, $Pg/z, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORRS_PPzPP)>; - -// [336] "orv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ORV_VPZ_B, 
ORV_VPZ_D, ORV_VPZ_H, ORV_VPZ_S)>; - -// [337] "pfalse $Pd"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PFALSE)>; - -// [338] "pnext $Pdn, $Pg, $_Pdn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PNEXT_B, PNEXT_D, PNEXT_H, PNEXT_S)>; - -// [339] "prfb $prfop, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRR)>; - -// [340] "prfb $prfop, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFB_D_SCALED, PRFB_D_SXTW_SCALED, PRFB_D_UXTW_SCALED, PRFB_S_SXTW_SCALED, PRFB_S_UXTW_SCALED)>; - -// [341] "prfb $prfop, $Pg, [$Rn, $imm6, mul vl]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRI)>; - -// [342] "prfb $prfop, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFB_D_PZI, PRFB_S_PZI)>; - -// [343] "prfd $prfop, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRR)>; - -// [344] "prfd $prfop, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFD_D_SCALED, PRFD_D_SXTW_SCALED, PRFD_D_UXTW_SCALED, PRFD_S_SXTW_SCALED, PRFD_S_UXTW_SCALED)>; - -// [345] "prfd $prfop, $Pg, [$Rn, $imm6, mul vl]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRI)>; - -// [346] "prfd $prfop, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFD_D_PZI, PRFD_S_PZI)>; - -// [347] "prfh $prfop, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRR)>; - -// [348] "prfh $prfop, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFH_D_SCALED, PRFH_D_SXTW_SCALED, PRFH_D_UXTW_SCALED, PRFH_S_SXTW_SCALED, PRFH_S_UXTW_SCALED)>; - -// [349] "prfh $prfop, $Pg, [$Rn, $imm6, mul vl]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>; - -// [350] "prfh $prfop, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>; - -// [351] "prfw $prfop, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>; - -// [352] "prfw $prfop, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>; - -// [353] "prfw $prfop, $Pg, [$Rn, $imm6, mul vl]"; -def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRI)>; - -// [354] "prfw $prfop, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFW_D_PZI, PRFW_S_PZI)>; - -// [355] "ptest $Pg, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTEST_PP)>; - -// [356] "ptrue $Pd, $pattern"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUE_B, PTRUE_D, PTRUE_H, PTRUE_S)>; - -// [357] "ptrues $Pd, $pattern"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUES_B, PTRUES_D, PTRUES_H, PTRUES_S)>; - -// [358] "punpkhi $Pd, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKHI_PP)>; - -// [359] "punpklo $Pd, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKLO_PP)>; - -// [360] "rbit $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBIT_ZPmZ_B, RBIT_ZPmZ_D, RBIT_ZPmZ_H, RBIT_ZPmZ_S)>; - -// [361] "rdffr $Pd"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_P)>; - -// [362] "rdffr $Pd, $Pg/z"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_PPz)>; - -// [363] "rdffrs $Pd, $Pg/z"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFRS_PPz)>; - -// [364] "rdvl $Rd, $imm6"; -def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs RDVLI_XI)>; - -// [365] "rev $Pd, $Pn"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs REV_PP_B, REV_PP_D, REV_PP_H, REV_PP_S)>; - -// [366] "rev $Zd, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs REV_ZZ_B, REV_ZZ_D, REV_ZZ_H, 
REV_ZZ_S)>; - -// [367] "revb $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVB_ZPmZ_D, REVB_ZPmZ_H, REVB_ZPmZ_S)>; - -// [368] "revh $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVH_ZPmZ_D, REVH_ZPmZ_S)>; - -// [369] "revw $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVW_ZPmZ_D)>; - -// [370] "sabd $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SABD_ZPmZ_B, SABD_ZPmZ_D, SABD_ZPmZ_H, SABD_ZPmZ_S)>; - -// [371] "saddv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs SADDV_VPZ_B, SADDV_VPZ_H, SADDV_VPZ_S)>; - -// [372] "scvtf $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SCVTF_ZPmZ_DtoD, SCVTF_ZPmZ_DtoH, SCVTF_ZPmZ_DtoS, SCVTF_ZPmZ_HtoH, SCVTF_ZPmZ_StoD, SCVTF_ZPmZ_StoH, SCVTF_ZPmZ_StoS)>; - -// [373] "sdiv $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIV_ZPmZ_D, SDIV_ZPmZ_S)>; - -// [374] "sdivr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIVR_ZPmZ_D, SDIVR_ZPmZ_S)>; - -// [375] "sdot $Zda, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SDOT_ZZZ_D, SDOT_ZZZ_S)>; - -// [376] "sdot $Zda, $Zn, $Zm$iop"; -def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs SDOT_ZZZI_D, SDOT_ZZZI_S)>; - -// [377] "sel $Pd, $Pg, $Pn, $Pm"; -def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs SEL_PPPP)>; - -// [378] "sel $Zd, $Pg, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SEL_ZPZZ_B, SEL_ZPZZ_D, SEL_ZPZZ_H, SEL_ZPZZ_S)>; - -// [379] "setffr"; -def : InstRW<[A64FXWrite_6Cyc], (instrs SETFFR)>; - -// [380] "smax $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMAX_ZPmZ_B, SMAX_ZPmZ_D, SMAX_ZPmZ_H, SMAX_ZPmZ_S)>; - -// [381] "smax $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMAX_ZI_B, SMAX_ZI_D, SMAX_ZI_H, SMAX_ZI_S)>; - -// [382] "smaxv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMAXV_VPZ_B, SMAXV_VPZ_D, SMAXV_VPZ_H, SMAXV_VPZ_S)>; - -// [383] "smin $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMIN_ZPmZ_B, SMIN_ZPmZ_D, SMIN_ZPmZ_H, SMIN_ZPmZ_S)>; - -// [384] "smin $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMIN_ZI_B, SMIN_ZI_D, SMIN_ZI_H, SMIN_ZI_S)>; - -// [385] "sminv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMINV_VPZ_B, SMINV_VPZ_D, SMINV_VPZ_H, SMINV_VPZ_S)>; - -// [386] "smulh $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SMULH_ZPmZ_B, SMULH_ZPmZ_D, SMULH_ZPmZ_H, SMULH_ZPmZ_S)>; - -// [387] "splice $Zdn, $Pg, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SPLICE_ZPZ_B, SPLICE_ZPZ_D, SPLICE_ZPZ_H, SPLICE_ZPZ_S)>; - -// [388] "sqadd $Zd, $Zn, $Zm"; - -// [389] "sqadd $Zdn, $_Zdn, $imm"; - -// [390] "sqdecb $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiWdI)>; - -// [391] "sqdecb $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiI)>; - -// [392] "sqdecd $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiWdI)>; - -// [393] "sqdecd $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiI)>; - -// [394] "sqdecd $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECD_ZPiI)>; - -// [395] "sqdech $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiWdI)>; - -// [396] "sqdech $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiI)>; 
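For orientation while reading this block of removed InstRW lines: each bracketed, numbered comment carries the assembly syntax of a group of SVE forms, and the def that follows binds those instruction definitions to a write class whose latency, micro-op count and issue-port group are declared in the re-added half of this diff. A condensed restatement of that pattern — using only names that already appear in this hunk, and assuming the enclosing "let SchedModel = A64FXModel in { ... }" block — looks like this:

// Write class: result ready after 8 cycles, 2 micro-ops, issued on the
// EXA/EXB/PR port group (A64FXGI124, defined further down in this diff).
def A64FXWrite_8Cyc_GI124 : SchedWriteRes<[A64FXGI124]> {
  let Latency = 8;
  let NumMicroOps = 2;
}

// Per-instruction override: the scalar sqdecp forms listed at [398] just
// below are attached to that write class.
def : InstRW<[A64FXWrite_8Cyc_GI124],
             (instrs SQDECP_XP_B, SQDECP_XP_D, SQDECP_XP_H, SQDECP_XP_S)>;

Comments with no def underneath (for example [388] and [389], the sqadd forms) are left on their default scheduling classes instead of receiving an A64FX-specific override.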
- -// [397] "sqdech $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECH_ZPiI)>; - -// [398] "sqdecp $Rdn, $Pg"; -def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XP_B, SQDECP_XP_D, SQDECP_XP_H, SQDECP_XP_S)>; - -// [399] "sqdecp $Rdn, $Pg, $_Rdn"; -def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XPWd_B, SQDECP_XPWd_D, SQDECP_XPWd_H, SQDECP_XPWd_S)>; - -// [400] "sqdecp $Zdn, $Pg"; -def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQDECP_ZP_D, SQDECP_ZP_H, SQDECP_ZP_S)>; - -// [401] "sqdecw $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiWdI)>; - -// [402] "sqdecw $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiI)>; - -// [403] "sqdecw $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECW_ZPiI)>; - -// [404] "sqincb $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiWdI)>; - -// [405] "sqincb $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiI)>; - -// [406] "sqincd $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiWdI)>; - -// [407] "sqincd $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiI)>; - -// [408] "sqincd $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCD_ZPiI)>; - -// [409] "sqinch $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiWdI)>; - -// [410] "sqinch $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiI)>; - -// [411] "sqinch $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCH_ZPiI)>; - -// [412] "sqincp $Rdn, $Pg"; -def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XP_B, SQINCP_XP_D, SQINCP_XP_H, SQINCP_XP_S)>; - -// [413] "sqincp $Rdn, $Pg, $_Rdn"; -def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XPWd_B, SQINCP_XPWd_D, SQINCP_XPWd_H, SQINCP_XPWd_S)>; - -// [414] "sqincp $Zdn, $Pg"; -def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQINCP_ZP_D, SQINCP_ZP_H, SQINCP_ZP_S)>; - -// [415] "sqincw $Rdn, $_Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiWdI)>; - -// [416] "sqincw $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiI)>; - -// [417] "sqincw $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>; - -// [418] "sqsub $Zd, $Zn, $Zm"; - -// [419] "sqsub $Zdn, $_Zdn, $imm"; - -// [420] "st1b $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>; - -// [421] "st1b $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; - -// [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>; - -// [423] "st1b $Zt, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>; - -// [424] "st1d $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>; - -// [425] "st1d $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>; - -// [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D_IMM)>; - -// [427] "st1d $Zt, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_ST1W_15], 
(instrs SST1D_IMM)>; - -// [428] "st1h $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>; - -// [429] "st1h $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>; - -// [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>; - -// [431] "st1h $Zt, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>; - -// [432] "st1w $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>; - -// [433] "st1w $Zt, $Pg, [$Rn, $Zm]"; -def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>; - -// [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>; - -// [435] "st1w $Zt, $Pg, [$Zn, $imm5]"; -def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1W_D_IMM, SST1W_IMM)>; - -// [436] "st2b $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B)>; - -// [437] "st2b $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B_IMM)>; - -// [438] "st2d $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D)>; - -// [439] "st2d $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D_IMM)>; - -// [440] "st2h $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H)>; - -// [441] "st2h $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H_IMM)>; - -// [442] "st2w $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W)>; - -// [443] "st2w $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W_IMM)>; - -// [444] "st3b $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B)>; - -// [445] "st3b $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B_IMM)>; - -// [446] "st3d $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D)>; - -// [447] "st3d $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D_IMM)>; - -// [448] "st3h $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H)>; - -// [449] "st3h $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H_IMM)>; - -// [450] "st3w $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W)>; - -// [451] "st3w $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W_IMM)>; - -// [452] "st4b $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B)>; - -// [453] "st4b $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B_IMM)>; - -// [454] "st4d $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D)>; - -// [455] "st4d $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D_IMM)>; - -// [456] "st4h $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H)>; - -// [457] "st4h $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H_IMM)>; - -// [458] "st4w $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W)>; - -// [459] "st4w $Zt, $Pg, [$Rn, $imm4, mul 
vl]"; -def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W_IMM)>; - -// [460] "stnt1b $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRR)>; - -// [461] "stnt1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRI)>; - -// [462] "stnt1d $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRR)>; - -// [463] "stnt1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRI)>; - -// [464] "stnt1h $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRR)>; - -// [465] "stnt1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRI)>; - -// [466] "stnt1w $Zt, $Pg, [$Rn, $Rm]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRR)>; - -// [467] "stnt1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; -def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRI)>; - -// [468] "str $Pt, [$Rn, $imm9, mul vl]"; -def : InstRW<[A64FXWrite_6Cyc_GI15], (instrs STR_PXI)>; - -// [469] "str $Zt, [$Rn, $imm9, mul vl]"; -def : InstRW<[A64FXWrite_6Cyc_GI05], (instrs STR_ZXI)>; - -// [470] "sub $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZZZ_B, SUB_ZZZ_D, SUB_ZZZ_H, SUB_ZZZ_S)>; - -// [471] "sub $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZPmZ_B, SUB_ZPmZ_D, SUB_ZPmZ_H, SUB_ZPmZ_S)>; - -// [472] "sub $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZI_B, SUB_ZI_D, SUB_ZI_H, SUB_ZI_S)>; - -// [473] "subr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUBR_ZPmZ_B, SUBR_ZPmZ_D, SUBR_ZPmZ_H, SUBR_ZPmZ_S)>; - -// [474] "subr $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SUBR_ZI_B, SUBR_ZI_D, SUBR_ZI_H, SUBR_ZI_S)>; - -// [475] "sunpkhi $Zd, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKHI_ZZ_D, SUNPKHI_ZZ_H, SUNPKHI_ZZ_S)>; - -// [476] "sunpklo $Zd, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKLO_ZZ_D, SUNPKLO_ZZ_H, SUNPKLO_ZZ_S)>; - -// [477] "sxtb $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTB_ZPmZ_D, SXTB_ZPmZ_H, SXTB_ZPmZ_S)>; - -// [478] "sxth $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTH_ZPmZ_D, SXTH_ZPmZ_S)>; - -// [479] "sxtw $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTW_ZPmZ_D)>; - -// [480] "tbl $Zd, $Zn, $Zm"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs TBL_ZZZ_B, TBL_ZZZ_D, TBL_ZZZ_H, TBL_ZZZ_S)>; - -// [481] "trn1 $Pd, $Pn, $Pm"; - -// [482] "trn1 $Zd, $Zn, $Zm"; - -// [483] "trn2 $Pd, $Pn, $Pm"; - -// [484] "trn2 $Zd, $Zn, $Zm"; - -// [486] "uabd $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UABD_ZPmZ_B, UABD_ZPmZ_D, UABD_ZPmZ_H, UABD_ZPmZ_S)>; - -// [487] "uaddv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs UADDV_VPZ_B, UADDV_VPZ_D, UADDV_VPZ_H, UADDV_VPZ_S)>; - -// [488] "ucvtf $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UCVTF_ZPmZ_DtoD, UCVTF_ZPmZ_DtoH, UCVTF_ZPmZ_DtoS, UCVTF_ZPmZ_HtoH, UCVTF_ZPmZ_StoD, UCVTF_ZPmZ_StoH, UCVTF_ZPmZ_StoS)>; - -// [489] "udiv $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIV_ZPmZ_D, UDIV_ZPmZ_S)>; - -// [490] "udivr $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIVR_ZPmZ_D, UDIVR_ZPmZ_S)>; - -// [491] "udot $Zda, $Zn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UDOT_ZZZ_D, UDOT_ZZZ_S)>; - -// [492] "udot $Zda, $Zn, $Zm$iop"; -def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs UDOT_ZZZI_D, UDOT_ZZZI_S)>; - -// 
[493] "umax $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMAX_ZPmZ_B, UMAX_ZPmZ_D, UMAX_ZPmZ_H, UMAX_ZPmZ_S)>; - -// [494] "umax $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMAX_ZI_B, UMAX_ZI_D, UMAX_ZI_H, UMAX_ZI_S)>; - -// [495] "umaxv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMAXV_VPZ_B, UMAXV_VPZ_D, UMAXV_VPZ_H, UMAXV_VPZ_S)>; - -// [496] "umin $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMIN_ZPmZ_B, UMIN_ZPmZ_D, UMIN_ZPmZ_H, UMIN_ZPmZ_S)>; - -// [497] "umin $Zdn, $_Zdn, $imm"; -def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMIN_ZI_B, UMIN_ZI_D, UMIN_ZI_H, UMIN_ZI_S)>; - -// [498] "uminv $Vd, $Pg, $Zn"; -def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMINV_VPZ_B, UMINV_VPZ_D, UMINV_VPZ_H, UMINV_VPZ_S)>; - -// [499] "umulh $Zdn, $Pg/m, $_Zdn, $Zm"; -def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UMULH_ZPmZ_B, UMULH_ZPmZ_D, UMULH_ZPmZ_H, UMULH_ZPmZ_S)>; - -// [500] "uqadd $Zd, $Zn, $Zm"; - -// [501] "uqadd $Zdn, $_Zdn, $imm"; - -// [502] "uqdecb $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECB_WPiI, UQDECB_XPiI)>; - -// [503] "uqdecd $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECD_WPiI, UQDECD_XPiI)>; - -// [504] "uqdecd $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECD_ZPiI)>; - -// [505] "uqdech $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECH_WPiI, UQDECH_XPiI)>; - -// [506] "uqdech $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECH_ZPiI)>; - -// [507] "uqdecp $Rdn, $Pg"; -def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQDECP_WP_B, UQDECP_WP_D, UQDECP_WP_H, UQDECP_WP_S, UQDECP_XP_B, UQDECP_XP_D, UQDECP_XP_H, UQDECP_XP_S)>; - -// [508] "uqdecp $Zdn, $Pg"; -def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQDECP_ZP_D, UQDECP_ZP_H, UQDECP_ZP_S)>; - -// [509] "uqdecw $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECW_WPiI, UQDECW_XPiI)>; - -// [510] "uqdecw $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECW_ZPiI)>; - -// [511] "uqincb $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCB_WPiI, UQINCB_XPiI)>; - -// [512] "uqincd $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCD_WPiI, UQINCD_XPiI)>; - -// [513] "uqincd $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCD_ZPiI)>; - -// [514] "uqinch $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCH_WPiI, UQINCH_XPiI)>; - -// [515] "uqinch $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCH_ZPiI)>; - -// [516] "uqincp $Rdn, $Pg"; -def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQINCP_WP_B, UQINCP_WP_D, UQINCP_WP_H, UQINCP_WP_S, UQINCP_XP_B, UQINCP_XP_D, UQINCP_XP_H, UQINCP_XP_S)>; - -// [517] "uqincp $Zdn, $Pg"; -def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQINCP_ZP_D, UQINCP_ZP_H, UQINCP_ZP_S)>; - -// [518] "uqincw $Rdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCW_WPiI, UQINCW_XPiI)>; - -// [519] "uqincw $Zdn, $pattern, mul $imm4"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCW_ZPiI)>; - -// [520] "uqsub $Zd, $Zn, $Zm"; -//@@@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZZZ_B, UQSUB_ZZZ_D, UQSUB_ZZZ_H, UQSUB_ZZZ_S)>; - -// [521] "uqsub $Zdn, $_Zdn, $imm"; -//@@@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZI_B, UQSUB_ZI_D, UQSUB_ZI_H, 
UQSUB_ZI_S)>; - -// [522] "uunpkhi $Zd, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKHI_ZZ_D, UUNPKHI_ZZ_H, UUNPKHI_ZZ_S)>; - -// [523] "uunpklo $Zd, $Zn"; -def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKLO_ZZ_D, UUNPKLO_ZZ_H, UUNPKLO_ZZ_S)>; - -// [524] "uxtb $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTB_ZPmZ_D, UXTB_ZPmZ_H, UXTB_ZPmZ_S)>; - -// [525] "uxth $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTH_ZPmZ_D, UXTH_ZPmZ_S)>; - -// [526] "uxtw $Zd, $Pg/m, $Zn"; -def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTW_ZPmZ_D)>; - -// [527] "uzp1 $Pd, $Pn, $Pm"; - -// [528] "uzp1 $Zd, $Zn, $Zm"; - -// [529] "uzp2 $Pd, $Pn, $Pm"; - -// [530] "uzp2 $Zd, $Zn, $Zm"; - -// [531] "whilele $Pd, $Rn, $Rm"; -def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELE_PWW_B, WHILELE_PWW_D, WHILELE_PWW_H, WHILELE_PWW_S, WHILELE_PXX_B, WHILELE_PXX_D, WHILELE_PXX_H, WHILELE_PXX_S)>; - -// [532] "whilelo $Pd, $Rn, $Rm"; -def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELO_PWW_B, WHILELO_PWW_D, WHILELO_PWW_H, WHILELO_PWW_S, WHILELO_PXX_B, WHILELO_PXX_D, WHILELO_PXX_H, WHILELO_PXX_S)>; - -// [533] "whilels $Pd, $Rn, $Rm"; -def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELS_PWW_B, WHILELS_PWW_D, WHILELS_PWW_H, WHILELS_PWW_S, WHILELS_PXX_B, WHILELS_PXX_D, WHILELS_PXX_H, WHILELS_PXX_S)>; - -// [534] "whilelt $Pd, $Rn, $Rm"; -def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELT_PWW_B, WHILELT_PWW_D, WHILELT_PWW_H, WHILELT_PWW_S, WHILELT_PXX_B, WHILELT_PXX_D, WHILELT_PXX_H, WHILELT_PXX_S)>; - -// [535] "wrffr $Pn"; -def : InstRW<[A64FXWrite_6Cyc_NGI1], (instrs WRFFR)>; - -// [536] "zip1 $Pd, $Pn, $Pm"; - -// [537] "zip1 $Zd, $Zn, $Zm"; - -// [538] "zip2 $Pd, $Pn, $Pm"; - -// [539] "zip2 $Zd, $Zn, $Zm"; - -} // SchedModel = A64FXModel +//=- AArch64SchedA64FX.td - Fujitsu A64FX Scheduling Defs -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for the Fujitsu A64FX processors. +// +//===----------------------------------------------------------------------===// + +def A64FXModel : SchedMachineModel { + let IssueWidth = 6; // 6 micro-ops dispatched at a time. + let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. + let LoadLatency = 5; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 128; + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = + [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth]; + + let FullInstRWOverlapCheck = 0; +} + +let SchedModel = A64FXModel in { + +// Define the issue ports. + +// A64FXIP* + +// Port 0 +def A64FXIPFLA : ProcResource<1>; + +// Port 1 +def A64FXIPPR : ProcResource<1>; + +// Port 2 +def A64FXIPEXA : ProcResource<1>; + +// Port 3 +def A64FXIPFLB : ProcResource<1>; + +// Port 4 +def A64FXIPEXB : ProcResource<1>; + +// Port 5 +def A64FXIPEAGA : ProcResource<1>; + +// Port 6 +def A64FXIPEAGB : ProcResource<1>; + +// Port 7 +def A64FXIPBR : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes later on. 
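As the comment above notes, the groups defined next are what the write classes consume. A condensed sketch of that chain, restating names defined just below rather than introducing new ones:

// Ports 0 and 3 (FLA/FLB, the two FP/SIMD pipelines on A64FX) combined into
// one group: an operation tied to the group may issue on either port.
def A64FXGI03 : ProcResGroup<[A64FXIPFLA, A64FXIPFLB]>;

// A 9-cycle write class on that group; InstRW entries elsewhere in the file
// attach concrete instructions to it.
def A64FXWrite_9Cyc_GI03 : SchedWriteRes<[A64FXGI03]> {
  let Latency = 9;
}

The digits in the GI names are the port numbers from the list above (GI03 = ports 0 and 3, GI56 = ports 5 and 6, GI2456 = ports 2, 4, 5 and 6), which is why the same suffixes recur in the write-class names throughout this file.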
+ +def A64FXGI7 : ProcResGroup<[A64FXIPBR]>; + +def A64FXGI0 : ProcResGroup<[A64FXIPFLA]>; + +def A64FXGI1 : ProcResGroup<[A64FXIPPR]>; + +def A64FXGI2 : ProcResGroup<[A64FXIPEXA]>; + +def A64FXGI3 : ProcResGroup<[A64FXIPFLB]>; + +def A64FXGI4 : ProcResGroup<[A64FXIPEXB]>; + +def A64FXGI5 : ProcResGroup<[A64FXIPEAGA]>; + +def A64FXGI6 : ProcResGroup<[A64FXIPEAGB]>; + +def A64FXGI03 : ProcResGroup<[A64FXIPFLA, A64FXIPFLB]>; + +def A64FXGI01 : ProcResGroup<[A64FXIPFLA, A64FXIPPR]>; + +def A64FXGI02 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA]>; + +def A64FXGI12 : ProcResGroup<[A64FXIPEXA, A64FXIPPR]>; + +def A64FXGI15 : ProcResGroup<[A64FXIPEAGA, A64FXIPPR]>; + +def A64FXGI05 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA]>; + +def A64FXGI24 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB]>; + +def A64FXGI124 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPPR]>; + +def A64FXGI056 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXGI0256 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA, A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXGI56 : ProcResGroup<[A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXGI2456 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXAny : ProcResGroup<[A64FXIPFLA, A64FXIPPR, A64FXIPEXA, A64FXIPFLB, + A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB, A64FXIPBR]> { + let BufferSize = 60; +} + +def A64FXWrite_6Cyc : SchedWriteRes<[]> { + let Latency = 6; +} + +def A64FXWrite_1Cyc_GI7 : SchedWriteRes<[A64FXGI7]> { + let Latency = 1; +} + +def A64FXWrite_2Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 2; +} + +def A64FXWrite_4Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 4; +} + +def A64FXWrite_5Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 5; +} + +def A64FXWrite_6Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 8; +} + +def A64FXWrite_9Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 9; +} + +def A64FXWrite_13Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 13; +} + +def A64FXWrite_37Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 37; +} + +def A64FXWrite_98Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 98; +} + +def A64FXWrite_134Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 134; +} + +def A64FXWrite_154Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 154; +} + +def A64FXWrite_4Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 4; +} + +def A64FXWrite_6Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 8; +} + +def A64FXWrite_12Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 12; +} + +def A64FXWrite_10Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { + let Latency = 10; +} + +def A64FXWrite_17Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { + let Latency = 17; +} + +def A64FXWrite_21Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { + let Latency = 21; +} + +def A64FXWrite_3Cyc_GI1 : SchedWriteRes<[A64FXGI1]> { + let Latency = 3; +} + +def A64FXWrite_6Cyc_NGI1 : SchedWriteRes<[A64FXGI1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def A64FXWrite_4Cyc_GI12 : SchedWriteRes<[A64FXGI12]> { + let Latency = 4; +} + +def A64FXWrite_3Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { + let Latency = 3; +} + +def A64FXWrite_5Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { + let Latency = 5; +} + +def A64FXWrite_6Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { + let Latency = 6; +} + +def A64FXWrite_4Cyc_GI3 : SchedWriteRes<[A64FXGI3]> { + let Latency = 4; +} + +def A64FXWrite_6Cyc_GI3 : 
SchedWriteRes<[A64FXGI3]> { + let Latency = 6; +} + +def A64FXWrite_6Cyc_GI15 : SchedWriteRes<[A64FXGI15]> { + let Latency = 6; +} + +def A64FXWrite_3Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 3; +} + +def A64FXWrite_4Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 4; +} + +def A64FXWrite_6Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; +} + +def A64FXWrite_9Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 9; +} + +def A64FXWrite_10Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_12Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 12; +} + +def A64FXWrite_14Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_15Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; +} + +def A64FXWrite_15Cyc_NGI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 2; +} + +def A64FXWrite_18Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 18; +} + +def A64FXWrite_45Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 45; +} + +def A64FXWrite_60Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 60; +} + +def A64FXWrite_75Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 75; +} + +def A64FXWrite_6Cyc_GI05 : SchedWriteRes<[A64FXGI05]> { + let Latency = 6; +} + +def A64FXWrite_10Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { + let Latency = 10; +} + +def A64FXWrite_12Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { + let Latency = 12; +} + +def A64FXWrite_20Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { + let Latency = 20; +} + +def A64FXWrite_5Cyc_GI5 : SchedWriteRes<[A64FXGI5]> { + let Latency = 5; +} + +def A64FXWrite_11Cyc_GI5 : SchedWriteRes<[A64FXGI5]> { + let Latency = 11; +} + +def A64FXWrite_5Cyc_GI6 : SchedWriteRes<[A64FXGI6]> { + let Latency = 5; +} + +def A64FXWrite_1Cyc_GI24 : SchedWriteRes<[A64FXGI24]> { + let Latency = 1; +} + +def A64FXWrite_2Cyc_GI24 : SchedWriteRes<[A64FXGI24]> { + let Latency = 2; +} + +def A64FXWrite_4Cyc_NGI24 : SchedWriteRes<[A64FXGI24]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def A64FXWrite_6Cyc_GI124: SchedWriteRes<[A64FXGI124]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI124 : SchedWriteRes<[A64FXGI124]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_6Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_1Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 1; +} + +def A64FXWrite_5Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 5; +} + +def A64FXWrite_8Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 8; +} + +def A64FXWrite_11Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 11; +} + +def A64FXWrite_44Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 44; +} + +def A64FXWrite_10Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 10; +} + +def A64FXWrite_15Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 15; +} + +def A64FXWrite_19Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 19; +} + +def A64FXWrite_25Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 25; +} + +def A64FXWrite_14Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { + let Latency = 14; +} + +def A64FXWrite_19Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { + let Latency = 19; +} + +def A64FXWrite_29Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { + let Latency = 29; +} + +def A64FXWrite_LDNP: SchedWriteRes<[A64FXGI56]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def A64FXWrite_LDP01: 
SchedWriteRes<[A64FXGI2456]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def A64FXWrite_LDR01: SchedWriteRes<[A64FXGI2456]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def A64FXWrite_LD102: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_LD103: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 2; + +} + +def A64FXWrite_LD104: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_LD105: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def A64FXWrite_LD106: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def A64FXWrite_LD107: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 4; +} + +def A64FXWrite_LD108: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_LD109: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def A64FXWrite_LD110: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_LD111: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def A64FXWrite_LD112: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def A64FXWrite_LD113: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 4; +} + +def A64FXWrite_LD114: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def A64FXWrite_LD115: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 5; +} + +def A64FXWrite_LD1I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_LD1I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_LD2I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def A64FXWrite_LD2I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def A64FXWrite_LD3I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def A64FXWrite_LD3I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 7; +} + +def A64FXWrite_LD4I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def A64FXWrite_LD4I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 9; +} + +def A64FXWrite_1Cyc_GI2456 : SchedWriteRes<[A64FXGI2456]> { + let Latency = 1; +} + +def A64FXWrite_FMOV_GV : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_FMOV_VG14 : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_FMOV_VG : SchedWriteRes<[A64FXGI03]> { + let Latency = 25; +} + +def A64FXWrite_ADDLV : SchedWriteRes<[A64FXGI03]> { + let Latency = 12; +} + +def A64FXWrite_MULLE : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_MULLV : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_MADDL : SchedWriteRes<[A64FXGI03]> { + let Latency = 6; +} + +def A64FXWrite_ABA : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; +} + +def A64FXWrite_ABAL : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_ADDLV1 : SchedWriteRes<[A64FXGI03]> { + let Latency = 12; + let NumMicroOps = 6; +} + +def A64FXWrite_MINMAXV : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; + let NumMicroOps = 6; +} + +def A64FXWrite_SQRDMULH : SchedWriteRes<[A64FXGI03]> { + let Latency = 9; +} + +def A64FXWrite_PMUL : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; +} + + +def A64FXWrite_SRSRAV : 
SchedWriteRes<[A64FXGI03]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_SSRAV : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_RSHRN : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_SHRN : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 2; +} + + +def A64FXWrite_ADDP : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_FMULXE : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 2; +} + +def A64FXWrite_FADDPV : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 3; +} + +def A64FXWrite_SADALP : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_SADDLP : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def A64FXWrite_FCVTXNV : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 2; +} + +def A64FXWrite_FMAXVVH : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; + let NumMicroOps = 7; +} + +def A64FXWrite_FMAXVVS : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_BIF : SchedWriteRes<[A64FXGI03]> { + let Latency = 5; +} + +def A64FXWrite_DUPGENERAL : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_SHA00 : SchedWriteRes<[A64FXGI0]> { + let Latency = 9; +} + +def A64FXWrite_SHA01 : SchedWriteRes<[A64FXGI0]> { + let Latency = 12; +} + +def A64FXWrite_SMOV : SchedWriteRes<[A64FXGI03]> { + let Latency = 25; +} + +def A64FXWrite_TBX1 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_TBX2 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 5; +} + +def A64FXWrite_TBX3 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 7; +} + +def A64FXWrite_TBX4 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 9; +} + +def A64FXWrite_PREF0: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_PREF1: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_SWP: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_STUR: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_STNP: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_STP01: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST10: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST11: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST12: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST13: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST14: SchedWriteRes<[A64FXGI56]> { + let Latency = 1; +} + +def A64FXWrite_ST15: SchedWriteRes<[A64FXGI56]> { + let Latency = 1; +} + +def A64FXWrite_ST16: SchedWriteRes<[A64FXGI56]> { + let Latency = 1; +} + +def A64FXWrite_ST17: SchedWriteRes<[A64FXGI56]> { + let Latency = 1; +} + +def A64FXWrite_ST1W_6: SchedWriteRes<[A64FXGI056]> { + let Latency = 6; +} + +def A64FXWrite_ST2W_7: SchedWriteRes<[A64FXGI056]> { + let Latency = 7; +} + +def A64FXWrite_ST3W_8: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; +} + +def A64FXWrite_ST4W_9: SchedWriteRes<[A64FXGI056]> { + let Latency = 9; +} + +def A64FXWrite_ST1W_15: SchedWriteRes<[A64FXGI056]> { + let Latency = 15; +} + +def A64FXWrite_ST1W_19: SchedWriteRes<[A64FXGI056]> { + let Latency = 19; +} + +def A64FXWrite_CAS: SchedWriteRes<[A64FXGI56]> { + let Latency = 7; 
+} + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes<WriteBr, [A64FXGI7]> { + let Latency = 1; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [A64FXGI7]> { + let Latency = 1; +} + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { + let Latency = 4; +} + +//--- +// Branch +//--- +def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs B, BL, BR, BLR)>; +def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs RET)>; +def : InstRW<[A64FXWrite_1Cyc_GI7], (instregex "^B..$")>; +def : InstRW<[A64FXWrite_1Cyc_GI7], + (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>; + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes<WriteI, [A64FXGI2456]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes<WriteISReg, [A64FXGI2456]> { + let Latency = 2; + let ResourceCycles = [1]; +} + +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : WriteRes<WriteIEReg, [A64FXGI2456]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +// Move immed +def : WriteRes<WriteImm, [A64FXGI2456]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : InstRW<[A64FXWrite_1Cyc_GI2456], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[A64FXWrite_2Cyc_GI24], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, 
RORVWr, RORVXr)>; + +// Variable shift +def : WriteRes<WriteIS, [A64FXGI2456]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +def : WriteRes<WriteID32, [A64FXGI4]> { + let Latency = 39; + let ResourceCycles = [39]; +} + +// Divide, X-form +def : WriteRes<WriteID64, [A64FXGI4]> { + let Latency = 23; + let ResourceCycles = [23]; +} + +// Multiply accumulate, W-form +def : WriteRes<WriteIM32, [A64FXGI2456]> { + let Latency = 5; + let ResourceCycles = [1]; +} + +// Multiply accumulate, X-form +def : WriteRes<WriteIM64, [A64FXGI2456]> { + let Latency = 5; + let ResourceCycles = [1]; +} + +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[A64FXWrite_MADDL], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; + +// Bitfield extract, two reg +def : WriteRes<WriteExtr, [A64FXGI2456]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +// Multiply high +def : InstRW<[A64FXWrite_5Cyc_GI2], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[A64FXWrite_1Cyc_GI24], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; + +// Bitfield move, insert +def : InstRW<[A64FXWrite_4Cyc_NGI24], (instregex "^BFM")>; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instregex "(S|U)?BFM.*")>; + +// Count leading +def : InstRW<[A64FXWrite_2Cyc_GI0], (instregex "^CLS(W|X)r$", + "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AES[DE]")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AESI?MC")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^PMULL")>; +def : InstRW<[A64FXWrite_SHA00], (instregex "^SHA1SU0")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA1[CMP]")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU0")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU1")>; +def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA256(H|H2)")>; + +// CRC Instructions +def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32Brr, CRC32Hrr)>; +def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32Wrr)>; +def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32Xrr)>; + +def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32CBrr, CRC32CHrr)>; +def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32CWrr)>; +def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32CXrr)>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes<WriteLD, [A64FXGI56]> { + let Latency = 4; + let ResourceCycles = [3]; +} + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. +def : WriteRes<WriteAdr, [A64FXGI2456]> { + let Latency = 1; + let ResourceCycles = [1]; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. 
Thus +// the resources are handled by WriteLD. +def : WriteRes<WriteLDHi, []> { + let Latency = 5; +} + +// Load register offset, basic +// Load register, register offset, scale by 4/8 +// Load register, register offset, scale by 2 +// Load register offset, extend +// Load register, register offset, extend, scale by 4/8 +// Load register, register offset, extend, scale by 2 +def A64FXWriteLDIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [A64FXWrite_1Cyc_GI56]>, + SchedVar<NoSchedPred, [A64FXWrite_1Cyc_GI56]>]>; +def : SchedAlias<WriteLDIdx, A64FXWriteLDIdx>; + +def A64FXReadAdrBase : SchedReadVariant<[ + SchedVar<ScaledIdxPred, [ReadDefault]>, + SchedVar<NoSchedPred, [ReadDefault]>]>; +def : SchedAlias<ReadAdrBase, A64FXReadAdrBase>; + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. + +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRBui)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRDui)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRHui)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRQui)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRSui)>; + +def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRDl)>; +def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRQl)>; +def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRWl)>; +def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRXl)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRBi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRHi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRWi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRXi)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBWi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBXi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHWi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHXi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSWi)>; + +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; + +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs 
LDRSBWpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpost)>; + +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpost)>; + +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpost)>; + +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpost)>; + +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>; + +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPXpre)>; + +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>; +def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs 
LDRQroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRDroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRHroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRHHroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRQroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRSroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRSHWroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRSHXroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRWroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRXroW)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRBroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRDroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRHroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRHHroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRQroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRSroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRSHWroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRSHXroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRWroX)>; +def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], + (instrs LDRXroX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBBi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURDi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHHi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURQi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURXi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBWi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBXi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHWi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHXi)>; +def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSWi)>; + +//--- +// Prefetch +//--- +def : InstRW<[A64FXWrite_PREF0], (instrs PRFMl)>; +def : InstRW<[A64FXWrite_PREF1], (instrs PRFUMi)>; +def : InstRW<[A64FXWrite_PREF1], (instrs PRFMui)>; +def : InstRW<[A64FXWrite_PREF1], 
(instrs PRFMroW)>; +def : InstRW<[A64FXWrite_PREF1], (instrs PRFMroX)>; + +//-- +// 3.7 Store Instructions +// 3.11 FP Store Instructions +//-- + +// Store register, unscaled immed +// Store register, immed unprivileged +// Store register, unsigned immed +def : WriteRes<WriteST, [A64FXGI56]> { + let Latency = 1; +} + +// Store register, immed post-index +// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteAdr, WriteST + +// Store register, register offset, basic +// Store register, register offset, scaled by 4/8 +// Store register, register offset, scaled by 2 +// Store register, register offset, extend +// Store register, register offset, extend, scale by 4/8 +// Store register, register offset, extend, scale by 1 +def : WriteRes<WriteSTIdx, [A64FXGI56, A64FXGI2456]> { + let Latency = 1; +} + +// Store pair, immed offset, W-form +// Store pair, immed offset, X-form +def : WriteRes<WriteSTP, [A64FXGI56]> { + let Latency = 1; +} + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteAdr, WriteSTP. + +def : InstRW<[A64FXWrite_STUR], (instrs STURBi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURBBi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURDi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURHi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURHHi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURQi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURSi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURWi)>; +def : InstRW<[A64FXWrite_STUR], (instrs STURXi)>; + +def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRBi)>; +def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRHi)>; +def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRWi)>; +def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRXi)>; + +def : InstRW<[A64FXWrite_STNP], (instrs STNPDi)>; +def : InstRW<[A64FXWrite_STNP], (instrs STNPQi)>; +def : InstRW<[A64FXWrite_STNP], (instrs STNPXi)>; +def : InstRW<[A64FXWrite_STNP], (instrs STNPWi)>; + +def : InstRW<[A64FXWrite_STNP], (instrs STPDi)>; +def : InstRW<[A64FXWrite_STNP], (instrs STPQi)>; +def : InstRW<[A64FXWrite_STNP], (instrs STPXi)>; +def : InstRW<[A64FXWrite_STNP], (instrs STPWi)>; + +def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>; +def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>; + +def : InstRW<[A64FXWrite_STP01], + (instrs STPDpre, STPDpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPDpre, STPDpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPQpre, STPQpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPQpre, STPQpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[A64FXWrite_STP01], + 
(instrs STPSpre, STPSpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPSpre, STPSpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPWpre, STPWpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPWpre, STPWpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPXpre, STPXpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[A64FXWrite_STP01], + (instrs STPXpre, STPXpost)>; +def : InstRW<[A64FXWrite_STP01, ReadAdrBase], + (instrs STPXpre, STPXpost)>; + +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, 
A64FXWrite_STP01, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase], + (instrs STRXpre, STRXpost)>; + +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRXroW, STRXroX)>; +def : InstRW<[A64FXWrite_STUR, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes<WriteF, [A64FXGI03]> { + let Latency = 4; + let ResourceCycles = [2]; +} + +// FP arithmetic + +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FADDDrr, FADDHrr)>; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FSUBDrr, FSUBHrr)>; + +// FP compare +def : WriteRes<WriteFCmp, [A64FXGI03]> { + let Latency = 4; + let ResourceCycles = [2]; +} + +// FP Div, Sqrt +def : WriteRes<WriteFDiv, [A64FXGI0]> { + let Latency = 43; +} + +def A64FXXWriteFDiv : SchedWriteRes<[A64FXGI0]> { + let Latency = 38; +} + +def A64FXXWriteFDivSP : SchedWriteRes<[A64FXGI0]> { + let Latency = 29; +} + +def A64FXXWriteFDivDP : SchedWriteRes<[A64FXGI0]> { + let Latency = 43; +} + +def A64FXXWriteFSqrtSP : SchedWriteRes<[A64FXGI0]> { + let Latency = 29; +} + +def A64FXXWriteFSqrtDP : SchedWriteRes<[A64FXGI0]> { + let Latency = 43; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[A64FXXWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVSrr")>; +def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^FSQRTSr")>; + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[A64FXXWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVDrr")>; +def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^FSQRTDr")>; + +// FP multiply +// FP multiply accumulate +def : WriteRes<WriteFMul, [A64FXGI03]> { + let Latency = 9; + let ResourceCycles = [2]; +} + +def A64FXXWriteFMul : SchedWriteRes<[A64FXGI03]> { + let 
Latency = 9; + let ResourceCycles = [2]; +} + +def A64FXXWriteFMulAcc : SchedWriteRes<[A64FXGI03]> { + let Latency = 9; + let ResourceCycles = [2]; +} + +def : InstRW<[A64FXXWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[A64FXXWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; + +// FP round to integral +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes<WriteFCvt, [A64FXGI03]> { + let Latency = 9; + let ResourceCycles = [2]; +} + +// FP move, immed +// FP move, register +def : WriteRes<WriteFImm, [A64FXGI0]> { + let Latency = 4; + let ResourceCycles = [2]; +} + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes<WriteFCopy, [A64FXGI0]> { + let Latency = 4; + let ResourceCycles = [2]; +} + +def : InstRW<[A64FXWrite_FMOV_GV], (instrs FMOVXDHighr)>; +def : InstRW<[A64FXWrite_FMOV_VG14], (instrs FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes<WriteV, [A64FXGI03]> { + let Latency = 4; + let ResourceCycles = [1]; +} + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B + +// ASIMD logical (MVN (alias for NOT), ORN, ORR) +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD arith, reduce +def : InstRW<[A64FXWrite_ADDLV], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[A64FXWrite_MULLE], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[A64FXWrite_MULLV], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[A64FXWrite_ABA], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[A64FXWrite_ABA], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[A64FXWrite_ABAL], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[A64FXWrite_ADDLV1], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[A64FXWrite_ADDLV1], + (instregex 
"^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[A64FXWrite_ADDLV1], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[A64FXWrite_MINMAXV], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[A64FXWrite_MINMAXV], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[A64FXWrite_MINMAXV], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : InstRW<[A64FXWrite_PMUL], + (instregex "^(P?MUL|SQR?DMUL)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; + +// ASIMD multiply, Q-form +def : InstRW<[A64FXWrite_PMUL], + (instregex "^(P?MUL)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// ASIMD multiply, Q-form +def : InstRW<[A64FXWrite_SQRDMULH], + (instregex "^(SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// ASIMD multiply accumulate, D-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[A64FXWrite_SRSRAV], + (instregex "SRSRAv", "URSRAv")>; +def : InstRW<[A64FXWrite_SSRAV], + (instregex "SSRAv", "USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[A64FXWrite_RSHRN], + (instregex "RSHRNv", "SQRSHRNv", "SQRSHRUNv", "UQRSHRNv")>; +def : InstRW<[A64FXWrite_SHRN], + (instregex "SHRNv", "SQSHRNv", "SQSHRUNv", "UQSHRNv")>; + +def : InstRW<[A64FXWrite_6Cyc_GI3], + (instregex "SQXTNv", "SQXTUNv", "UQXTNv")>; + +// ASIMD shift by immed, complex +def : InstRW<[A64FXWrite_ABA], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[A64FXWrite_6Cyc_GI3], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[A64FXWrite_6Cyc_GI3], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[A64FXWrite_6Cyc_GI3], + (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[A64FXWrite_SHRN], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[A64FXWrite_RSHRN], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[A64FXWrite_ADDP], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : InstRW<[A64FXWrite_4Cyc_GI0], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[A64FXWrite_SADALP], (instregex "^SADALP", "^UADALP")>; +def : InstRW<[A64FXWrite_SADDLP], (instregex "^SADDLPv", "^UADDLPv")>; +def : InstRW<[A64FXWrite_ADDLV1], (instregex "^SADDLV", "^UADDLV")>; +def : InstRW<[A64FXWrite_MINMAXV], + (instregex "^ADDVv", "^SMAXVv", "^UMAXVv", "^SMINVv", "^UMINVv")>; +def : InstRW<[A64FXWrite_ABA], + (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^SQADDv", "^SQSUBv", "^UQADDv", "^UQSUBv")>; 
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^SUQADDv", "^USQADDv")>; +def : InstRW<[A64FXWrite_SHRN], + (instregex "^ADDHNv", "^SUBHNv")>; +def : InstRW<[A64FXWrite_RSHRN], + (instregex "^RADDHNv", "^RSUBHNv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUQADD", "^UQADD", "^UQSUB", + "^URHADD", "^USQADD")>; + +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", + "^CMLEv", "^CMLTv", "^CMHIv", "^CMHSv")>; +def : InstRW<[A64FXWrite_MINMAXV], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[A64FXWrite_ADDP], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^SABDv", "^UABDv")>; +def : InstRW<[A64FXWrite_TBX1], + (instregex "^SABDLv", "^UABDLv")>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith, pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[A64FXWrite_FADDPV], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; +// ASIMD FP round, D-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form + +// ASIMD FP convert, long and narrow +def : InstRW<[A64FXWrite_FCVTXNV], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[A64FXWrite_FCVTXNV], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[A64FXWrite_FCVTXNV], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVv2f32)>; +def : InstRW<[A64FXXWriteFDivSP], (instregex "FDIVv2f32")>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[A64FXXWriteFDiv], (instrs FDIVv4f32)>; +def : InstRW<[A64FXXWriteFDiv], (instregex "FDIVv4f32")>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVv2f64)>; +def : InstRW<[A64FXXWriteFDivDP], (instregex "FDIVv2f64")>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[A64FXWrite_ADDP], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[A64FXWrite_FMAXVVH], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[A64FXWrite_FMULXE], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[A64FXWrite_FMULXE], + (instregex 
"^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, Dform, FZ +// ASIMD FP multiply accumulate, Dform, no FZ +// ASIMD FP multiply accumulate, Qform, FZ +// ASIMD FP multiply accumulate, Qform, no FZ +def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[A64FXWrite_FMULXE], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[A64FXWrite_FMULXE], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP negate +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FNEGv")>; + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[A64FXWrite_1Cyc_GI2456], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[A64FXWrite_BIF], + (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[A64FXWrite_4Cyc_GI0], + (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[A64FXWrite_DUPGENERAL], (instregex "^DUPv")>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^CPY")>; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUPv.+gpr")>; + +// ASIMD extract +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^EXTv")>; + +// ASIMD extract narrow +def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^XTNv")>; + +// ASIMD extract narrow, saturating +def : InstRW<[A64FXWrite_6Cyc_GI3], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; + +// ASIMD insert, element to element +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>; + +// ASIMD move, integer immed +def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^MOVIv")>; + +// ASIMD move, FP immed +def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMOVv")>; + +// ASIMD table lookup, D-form +def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv8i8One")>; +def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv8i8Two")>; +def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv8i8Three")>; +def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv8i8Four")>; +def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv8i8One")>; +def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv8i8Two")>; +def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv8i8Three")>; +def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv8i8Four")>; + +// ASIMD table lookup, Q-form +def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv16i8One")>; +def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv16i8Two")>; +def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv16i8Three")>; +def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv16i8Four")>; +def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv16i8One")>; +def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv16i8Two")>; +def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv16i8Three")>; +def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv16i8Four")>; + +// ASIMD transpose +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[A64FXWrite_6Cyc_GI0], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[A64FXWrite_9Cyc_GI0], (instregex 
"^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TBLv", "^TBXv")>; + +// ASIMD transfer, element to word or word +def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[A64FXWrite_SMOV], (instregex "(S|U)MOVv.*")>; + +// ASIMD transfer gen reg to element +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1v", "^TRN2v", + "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[A64FXWrite_8Cyc_GI56], + (instregex "^LD1Onev(8b|4h|2s|1d|2d)$")>; +def : InstRW<[A64FXWrite_11Cyc_GI56], + (instregex "^LD1Onev(16b|8h|4s)$")>; +def : InstRW<[A64FXWrite_LD108, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|2d)_POST$")>; +def : InstRW<[A64FXWrite_LD109, WriteAdr], + (instregex "^LD1Onev(16b|8h|4s)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[A64FXWrite_LD102], + (instregex "^LD1Twov(8b|4h|2s|1d|2d)$")>; +def : InstRW<[A64FXWrite_LD103], + (instregex "^LD1Twov(16b|8h|4s)$")>; +def : InstRW<[A64FXWrite_LD110, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|2d)_POST$")>; +def : InstRW<[A64FXWrite_LD111, WriteAdr], + (instregex "^LD1Twov(16b|8h|4s)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[A64FXWrite_LD104], + (instregex "^LD1Threev(8b|4h|2s|1d|2d)$")>; +def : InstRW<[A64FXWrite_LD105], + (instregex "^LD1Threev(16b|8h|4s)$")>; +def : InstRW<[A64FXWrite_LD112, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|2d)_POST$")>; +def : InstRW<[A64FXWrite_LD113, WriteAdr], + (instregex "^LD1Threev(16b|8h|4s)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[A64FXWrite_LD106], + (instregex "^LD1Fourv(8b|4h|2s|1d|2d)$")>; +def : InstRW<[A64FXWrite_LD107], + (instregex "^LD1Fourv(16b|8h|4s)$")>; +def : InstRW<[A64FXWrite_LD114, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|2d)_POST$")>; +def : InstRW<[A64FXWrite_LD115, WriteAdr], + (instregex "^LD1Fourv(16b|8h|4s)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[A64FXWrite_LD1I0], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD1I1, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[A64FXWrite_8Cyc_GI03], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD108, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_LD103], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD111, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 
element, one lane, D +def : InstRW<[A64FXWrite_LD2I0], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD2I1, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[A64FXWrite_LD102], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD110, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_LD105], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD113, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[A64FXWrite_LD3I0], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD3I1, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[A64FXWrite_LD104], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD112, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_LD107], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD115, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[A64FXWrite_LD4I0], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD4I1, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[A64FXWrite_LD106], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD114, WriteAdr], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[A64FXWrite_ST10], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST14, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[A64FXWrite_ST11], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST15, WriteAdr], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[A64FXWrite_ST12], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST16, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, 
Q-form +def : InstRW<[A64FXWrite_ST13], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST17, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[A64FXWrite_ST10], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST14, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_ST11], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST15, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[A64FXWrite_ST11], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST15, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_ST12], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST16, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[A64FXWrite_ST12], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST16, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_ST13], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST17, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[A64FXWrite_ST13], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST17, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + +// V8.1a Atomics (LSE) +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASB, CASH, CASW, CASX)>; + +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASAB, CASAH, CASAW, CASAX)>; + +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASLB, CASLH, CASLW, CASLX)>; + +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASALB, CASALH, CASALW, CASALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; + +def : 
InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDEORB, LDEORH, LDEORW, LDEORX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDSETB, LDSETH, LDSETW, LDSETX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX, + LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX, + LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX, + LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX, + LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX, + LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX, + LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX, + LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX, + LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX, + LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX, + LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX, + LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX, + LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>; + +def : InstRW<[A64FXWrite_SWP, WriteAtomic], + (instrs SWPB, SWPH, SWPW, SWPX)>; + +def : InstRW<[A64FXWrite_SWP, WriteAtomic], + (instrs SWPAB, SWPAH, SWPAW, SWPAX)>; + +def : InstRW<[A64FXWrite_SWP, WriteAtomic], + (instrs SWPLB, SWPLH, SWPLW, SWPLX)>; + +def : InstRW<[A64FXWrite_SWP, WriteAtomic], + (instrs SWPALB, SWPALH, SWPALW, SWPALX)>; + +def : InstRW<[A64FXWrite_STUR, WriteAtomic], + (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; + +// [ 1] "abs $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ABS_ZPmZ_B, ABS_ZPmZ_D, ABS_ZPmZ_H, ABS_ZPmZ_S)>; + +// [ 2] "add $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZZZ_B, ADD_ZZZ_D, ADD_ZZZ_H, ADD_ZZZ_S)>; + +// [ 3] "add $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZPmZ_B, ADD_ZPmZ_D, ADD_ZPmZ_H, ADD_ZPmZ_S)>; + +// [ 4] "add $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZI_B, ADD_ZI_D, ADD_ZI_H, ADD_ZI_S)>; + +// [ 5] "addpl $Rd, $Rn, $imm6"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDPL_XXI)>; + +// [ 6] "addvl $Rd, $Rn, $imm6"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDVL_XXI)>; + +// [ 7] "adr $Zd, [$Zn, $Zm]"; +def : InstRW<[A64FXWrite_5Cyc_GI0], (instrs ADR_LSL_ZZZ_D_0, ADR_LSL_ZZZ_D_1, ADR_LSL_ZZZ_D_2, ADR_LSL_ZZZ_D_3, ADR_LSL_ZZZ_S_0, ADR_LSL_ZZZ_S_1, ADR_LSL_ZZZ_S_2, ADR_LSL_ZZZ_S_3, ADR_SXTW_ZZZ_D_0, ADR_SXTW_ZZZ_D_1, ADR_SXTW_ZZZ_D_2, ADR_SXTW_ZZZ_D_3, ADR_UXTW_ZZZ_D_0, ADR_UXTW_ZZZ_D_1, ADR_UXTW_ZZZ_D_2, ADR_UXTW_ZZZ_D_3)>; + +// [ 8] "and $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs AND_PPzPP)>; + +// [ 9] "and $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZZZ)>; + +// [10] "and $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZPmZ_B, AND_ZPmZ_D, AND_ZPmZ_H, AND_ZPmZ_S)>; + +// [11] 
"and $Zdn, $_Zdn, $imms13"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZI)>; + +// [12] "ands $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ANDS_PPzPP)>; + +// [13] "andv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ANDV_VPZ_B, ANDV_VPZ_D, ANDV_VPZ_H, ANDV_VPZ_S)>; + +// [14] "asr $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZZZ_B, ASR_WIDE_ZZZ_H, ASR_WIDE_ZZZ_S)>; + +// [15] "asr $Zd, $Zn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZZI_B, ASR_ZZI_D, ASR_ZZI_H, ASR_ZZI_S)>; + +// [16] "asr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZPmZ_B, ASR_WIDE_ZPmZ_H, ASR_WIDE_ZPmZ_S, ASR_ZPmZ_B, ASR_ZPmZ_D, ASR_ZPmZ_H, ASR_ZPmZ_S)>; + +// [17] "asr $Zdn, $Pg/m, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZPmI_B, ASR_ZPmI_D, ASR_ZPmI_H, ASR_ZPmI_S)>; + +// [18] "asrd $Zdn, $Pg/m, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRD_ZPmI_B, ASRD_ZPmI_D, ASRD_ZPmI_H, ASRD_ZPmI_S)>; + +// [19] "asrr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRR_ZPmZ_B, ASRR_ZPmZ_D, ASRR_ZPmZ_H, ASRR_ZPmZ_S)>; + +// [20] "bic $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BIC_PPzPP)>; + +// [21] "bic $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZZZ)>; + +// [22] "bic $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZPmZ_B, BIC_ZPmZ_D, BIC_ZPmZ_H, BIC_ZPmZ_S)>; + +// [23] "bics $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BICS_PPzPP)>; + +// [24] "brka $Pd, $Pg/m, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPmP)>; + +// [25] "brka $Pd, $Pg/z, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPzP)>; + +// [26] "brkas $Pd, $Pg/z, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKAS_PPzP)>; + +// [27] "brkb $Pd, $Pg/m, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPmP)>; + +// [28] "brkb $Pd, $Pg/z, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPzP)>; + +// [29] "brkbs $Pd, $Pg/z, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKBS_PPzP)>; + +// [30] "brkn $Pdm, $Pg/z, $Pn, $_Pdm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKN_PPzP)>; + +// [31] "brkns $Pdm, $Pg/z, $Pn, $_Pdm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKNS_PPzP)>; + +// [32] "brkpa $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPA_PPzPP)>; + +// [33] "brkpas $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPAS_PPzPP)>; + +// [34] "brkpb $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPB_PPzPP)>; + +// [35] "brkpbs $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPBS_PPzPP)>; + +// [36] "clasta $Rdn, $Pg, $_Rdn, $Zm"; +def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTA_RPZ_B, CLASTA_RPZ_D, CLASTA_RPZ_H, CLASTA_RPZ_S)>; + +// [37] "clasta $Vdn, $Pg, $_Vdn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_VPZ_B, CLASTA_VPZ_D, CLASTA_VPZ_H, CLASTA_VPZ_S)>; + +// [38] "clasta $Zdn, $Pg, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_ZPZ_B, CLASTA_ZPZ_D, CLASTA_ZPZ_H, CLASTA_ZPZ_S)>; + +// [39] "clastb $Rdn, $Pg, $_Rdn, $Zm"; +def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTB_RPZ_B, CLASTB_RPZ_D, CLASTB_RPZ_H, CLASTB_RPZ_S)>; + +// [40] "clastb $Vdn, $Pg, $_Vdn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_VPZ_B, CLASTB_VPZ_D, CLASTB_VPZ_H, CLASTB_VPZ_S)>; + +// [41] "clastb $Zdn, $Pg, $_Zdn, 
$Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_ZPZ_B, CLASTB_ZPZ_D, CLASTB_ZPZ_H, CLASTB_ZPZ_S)>; + +// [42] "cls $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLS_ZPmZ_B, CLS_ZPmZ_D, CLS_ZPmZ_H, CLS_ZPmZ_S)>; + +// [43] "clz $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLZ_ZPmZ_B, CLZ_ZPmZ_D, CLZ_ZPmZ_H, CLZ_ZPmZ_S)>; + +// [44] "cmpeq $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZZ_B, CMPEQ_PPzZZ_D, CMPEQ_PPzZZ_H, CMPEQ_PPzZZ_S, CMPEQ_WIDE_PPzZZ_B, CMPEQ_WIDE_PPzZZ_H, CMPEQ_WIDE_PPzZZ_S)>; + +// [45] "cmpeq $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZI_B, CMPEQ_PPzZI_D, CMPEQ_PPzZI_H, CMPEQ_PPzZI_S)>; + +// [46] "cmpge $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZZ_B, CMPGE_PPzZZ_D, CMPGE_PPzZZ_H, CMPGE_PPzZZ_S, CMPGE_WIDE_PPzZZ_B, CMPGE_WIDE_PPzZZ_H, CMPGE_WIDE_PPzZZ_S)>; + +// [47] "cmpge $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZI_B, CMPGE_PPzZI_D, CMPGE_PPzZI_H, CMPGE_PPzZI_S)>; + +// [48] "cmpgt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZZ_B, CMPGT_PPzZZ_D, CMPGT_PPzZZ_H, CMPGT_PPzZZ_S, CMPGT_WIDE_PPzZZ_B, CMPGT_WIDE_PPzZZ_H, CMPGT_WIDE_PPzZZ_S)>; + +// [49] "cmpgt $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZI_B, CMPGT_PPzZI_D, CMPGT_PPzZI_H, CMPGT_PPzZI_S)>; + +// [50] "cmphi $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZZ_B, CMPHI_PPzZZ_D, CMPHI_PPzZZ_H, CMPHI_PPzZZ_S, CMPHI_WIDE_PPzZZ_B, CMPHI_WIDE_PPzZZ_H, CMPHI_WIDE_PPzZZ_S)>; + +// [51] "cmphi $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZI_B, CMPHI_PPzZI_D, CMPHI_PPzZI_H, CMPHI_PPzZI_S)>; + +// [52] "cmphs $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZZ_B, CMPHS_PPzZZ_D, CMPHS_PPzZZ_H, CMPHS_PPzZZ_S, CMPHS_WIDE_PPzZZ_B, CMPHS_WIDE_PPzZZ_H, CMPHS_WIDE_PPzZZ_S)>; + +// [53] "cmphs $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZI_B, CMPHS_PPzZI_D, CMPHS_PPzZI_H, CMPHS_PPzZI_S)>; + +// [54] "cmple $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_WIDE_PPzZZ_B, CMPLE_WIDE_PPzZZ_H, CMPLE_WIDE_PPzZZ_S)>; + +// [55] "cmple $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_PPzZI_B, CMPLE_PPzZI_D, CMPLE_PPzZI_H, CMPLE_PPzZI_S)>; + +// [56] "cmplo $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_WIDE_PPzZZ_B, CMPLO_WIDE_PPzZZ_H, CMPLO_WIDE_PPzZZ_S)>; + +// [57] "cmplo $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_PPzZI_B, CMPLO_PPzZI_D, CMPLO_PPzZI_H, CMPLO_PPzZI_S)>; + +// [58] "cmpls $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_WIDE_PPzZZ_B, CMPLS_WIDE_PPzZZ_H, CMPLS_WIDE_PPzZZ_S)>; + +// [59] "cmpls $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_PPzZI_B, CMPLS_PPzZI_D, CMPLS_PPzZI_H, CMPLS_PPzZI_S)>; + +// [60] "cmplt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_WIDE_PPzZZ_B, CMPLT_WIDE_PPzZZ_H, CMPLT_WIDE_PPzZZ_S)>; + +// [61] "cmplt $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_PPzZI_B, CMPLT_PPzZI_D, CMPLT_PPzZI_H, CMPLT_PPzZI_S)>; + +// [62] "cmpne $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZZ_B, CMPNE_PPzZZ_D, CMPNE_PPzZZ_H, CMPNE_PPzZZ_S, CMPNE_WIDE_PPzZZ_B, CMPNE_WIDE_PPzZZ_H, 
CMPNE_WIDE_PPzZZ_S)>; + +// [63] "cmpne $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZI_B, CMPNE_PPzZI_D, CMPNE_PPzZI_H, CMPNE_PPzZI_S)>; + +// [64] "cnot $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs CNOT_ZPmZ_B, CNOT_ZPmZ_D, CNOT_ZPmZ_H, CNOT_ZPmZ_S)>; + +// [65] "cnt $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI3], (instrs CNT_ZPmZ_B, CNT_ZPmZ_D, CNT_ZPmZ_H, CNT_ZPmZ_S)>; + +// [66] "cntb $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTB_XPiI)>; + +// [67] "cntd $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTD_XPiI)>; + +// [68] "cnth $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTH_XPiI)>; + +// [69] "cntp $Rd, $Pg, $Pn"; +def : InstRW<[A64FXWrite_6Cyc_GI01], (instrs CNTP_XPP_B, CNTP_XPP_D, CNTP_XPP_H, CNTP_XPP_S)>; + +// [70] "cntw $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTW_XPiI)>; + +// [71] "compact $Zd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs COMPACT_ZPZ_D, COMPACT_ZPZ_S)>; + +// [72] "cpy $Zd, $Pg/m, $Rn"; +//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>; + +// [73] "cpy $Zd, $Pg/m, $Vn"; +//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>; + +// [74] "cpy $Zd, $Pg/m, $imm"; +//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>; + +// [75] "cpy $Zd, $Pg/z, $imm"; +//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>; + +// [76] "ctermeq $Rn, $Rm"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMEQ_WW, CTERMEQ_XX)>; + +// [77] "ctermne $Rn, $Rm"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMNE_WW, CTERMNE_XX)>; + +// [78] "decb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECB_XPiI)>; + +// [79] "decd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECD_XPiI)>; + +// [80] "decd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECD_ZPiI)>; + +// [81] "dech $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECH_XPiI)>; + +// [82] "dech $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECH_ZPiI)>; + +// [83] "decp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs DECP_XP_B, DECP_XP_D, DECP_XP_H, DECP_XP_S)>; + +// [84] "decp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs DECP_ZP_D, DECP_ZP_H, DECP_ZP_S)>; + +// [85] "decw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECW_XPiI)>; + +// [86] "decw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECW_ZPiI)>; + +// [87] "dup $Zd, $Rn"; +def : InstRW<[A64FXWrite_8Cyc_GI01], (instrs DUP_ZR_B, DUP_ZR_D, DUP_ZR_H, DUP_ZR_S)>; + +// [88] "dup $Zd, $Zn$idx"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs DUP_ZZI_B, DUP_ZZI_D, DUP_ZZI_H, DUP_ZZI_Q, DUP_ZZI_S)>; + +// [89] "dup $Zd, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUP_ZI_B, DUP_ZI_D, DUP_ZI_H, DUP_ZI_S)>; + +// [90] "dupm $Zd, $imms"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUPM_ZI)>; + +// [91] "eor $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EOR_PPzPP)>; + +// [92] "eor $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZZZ)>; + +// [93] "eor $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZPmZ_B, EOR_ZPmZ_D, EOR_ZPmZ_H, EOR_ZPmZ_S)>; + +// [94] "eor $Zdn, $_Zdn, 
$imms13"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs EOR_ZI)>; + +// [95] "eors $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EORS_PPzPP)>; + +// [96] "eorv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs EORV_VPZ_B, EORV_VPZ_D, EORV_VPZ_H, EORV_VPZ_S)>; + +// [97] "ext $Zdn, $_Zdn, $Zm, $imm8"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs EXT_ZZI)>; + +// [99] "fabd $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FABD_ZPmZ_D, FABD_ZPmZ_H, FABD_ZPmZ_S)>; + +// [100] "fabs $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FABS_ZPmZ_D, FABS_ZPmZ_H, FABS_ZPmZ_S)>; + +// [101] "facge $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGE_PPzZZ_D, FACGE_PPzZZ_H, FACGE_PPzZZ_S)>; + +// [102] "facgt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGT_PPzZZ_D, FACGT_PPzZZ_H, FACGT_PPzZZ_S)>; + +// [103] "fadd $Zd, $Zn, $Zm"; def is line 1638 +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZZZ_D, FADD_ZZZ_H, FADD_ZZZ_S)>; + +// [104] "fadd $Zdn, $Pg/m, $_Zdn, $Zm"; def is line 1638 +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmZ_D, FADD_ZPmZ_H, FADD_ZPmZ_S)>; + +// [105] "fadd $Zdn, $Pg/m, $_Zdn, $i1"; def is line 1638 +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmI_D, FADD_ZPmI_H, FADD_ZPmI_S)>; + +// [106] "fadda $Vdn, $Pg, $_Vdn, $Zm"; +def : InstRW<[A64FXWrite_18Cyc_GI03], (instrs FADDA_VPZ_D, FADDA_VPZ_H, FADDA_VPZ_S)>; + +// [107] "faddv $Vd, $Pg, $Zn"; +// H : 4 / 6 / ([1,2]9 / [1]6) x 4 / [1,2]9 = 75 cycle +// S : 4 / 6 / ([1,2]9 / [1]6) x 3 / [1,2]9 = 60 cycle +// D : 4 / 6 / ([1,2]9 / [1]6) x 2 / [1,2]9 = 45 cycle +def : InstRW<[A64FXWrite_75Cyc_GI03], (instrs FADDV_VPZ_H)>; +def : InstRW<[A64FXWrite_60Cyc_GI03], (instrs FADDV_VPZ_S)>; +def : InstRW<[A64FXWrite_45Cyc_GI03], (instrs FADDV_VPZ_D)>; + +// [108] "fcadd $Zdn, $Pg/m, $_Zdn, $Zm, $imm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCADD_ZPmZ_D, FCADD_ZPmZ_H, FCADD_ZPmZ_S)>; + +// [109] "fcmeq $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZ0_D, FCMEQ_PPzZ0_H, FCMEQ_PPzZ0_S)>; + +// [110] "fcmeq $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZZ_D, FCMEQ_PPzZZ_H, FCMEQ_PPzZZ_S)>; + +// [111] "fcmge $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZ0_D, FCMGE_PPzZ0_H, FCMGE_PPzZ0_S)>; + +// [112] "fcmge $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZZ_D, FCMGE_PPzZZ_H, FCMGE_PPzZZ_S)>; + +// [113] "fcmgt $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZ0_D, FCMGT_PPzZ0_H, FCMGT_PPzZ0_S)>; + +// [114] "fcmgt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZZ_D, FCMGT_PPzZZ_H, FCMGT_PPzZZ_S)>; + +// [115] "fcmla $Zda, $Pg/m, $Zn, $Zm, $imm"; +def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZPmZZ_D, FCMLA_ZPmZZ_H, FCMLA_ZPmZZ_S)>; + +// [116] "fcmla $Zda, $Zn, $Zm$iop, $imm"; +def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZZZI_H, FCMLA_ZZZI_S)>; + +// [117] "fcmle $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLE_PPzZ0_D, FCMLE_PPzZ0_H, FCMLE_PPzZ0_S)>; + +// [118] "fcmlt $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLT_PPzZ0_D, FCMLT_PPzZ0_H, FCMLT_PPzZ0_S)>; + +// [119] "fcmne $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZ0_D, FCMNE_PPzZ0_H, FCMNE_PPzZ0_S)>; + +// [120] "fcmne $Pd, $Pg/z, $Zn, $Zm"; +def : 
InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZZ_D, FCMNE_PPzZZ_H, FCMNE_PPzZZ_S)>; + +// [121] "fcmuo $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMUO_PPzZZ_D, FCMUO_PPzZZ_H, FCMUO_PPzZZ_S)>; + +// [122] "fcpy $Zd, $Pg/m, $imm8"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCPY_ZPmI_D, FCPY_ZPmI_H, FCPY_ZPmI_S)>; + +// [123] "fcvt $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVT_ZPmZ_DtoH, FCVT_ZPmZ_DtoS, FCVT_ZPmZ_HtoD, FCVT_ZPmZ_HtoS, FCVT_ZPmZ_StoD, FCVT_ZPmZ_StoH)>; + +// [124] "fcvtzs $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZS_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoS, FCVTZS_ZPmZ_HtoD, FCVTZS_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoS, FCVTZS_ZPmZ_StoD, FCVTZS_ZPmZ_StoS)>; + +// [125] "fcvtzu $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZU_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoS, FCVTZU_ZPmZ_HtoD, FCVTZU_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoS, FCVTZU_ZPmZ_StoD, FCVTZU_ZPmZ_StoS)>; + +// [126] "fdiv $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIV_ZPmZ_D)>; +def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIV_ZPmZ_H)>; +def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIV_ZPmZ_S)>; + +// [127] "fdivr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIVR_ZPmZ_D)>; +def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIVR_ZPmZ_H)>; +def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIVR_ZPmZ_S)>; + +// [128] "fdup $Zd, $imm8"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FDUP_ZI_D, FDUP_ZI_H, FDUP_ZI_S)>; + +// [129] "fexpa $Zd, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FEXPA_ZZ_D, FEXPA_ZZ_H, FEXPA_ZZ_S)>; + +// [130] "fmad $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMAD_ZPmZZ_D, FMAD_ZPmZZ_H, FMAD_ZPmZZ_S)>; + +// [131] "fmax $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAX_ZPmZ_D, FMAX_ZPmZ_H, FMAX_ZPmZ_S)>; + +// [132] "fmax $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAX_ZPmI_D, FMAX_ZPmI_H, FMAX_ZPmI_S)>; + +// [133] "fmaxnm $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAXNM_ZPmZ_D, FMAXNM_ZPmZ_H, FMAXNM_ZPmZ_S)>; + +// [134] "fmaxnm $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAXNM_ZPmI_D, FMAXNM_ZPmI_H, FMAXNM_ZPmI_S)>; + +// [135] "fmaxnmv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXNMV_VPZ_D, FMAXNMV_VPZ_H, FMAXNMV_VPZ_S)>; + +// [136] "fmaxv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXV_VPZ_D, FMAXV_VPZ_H, FMAXV_VPZ_S)>; + +// [137] "fmin $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMIN_ZPmZ_D, FMIN_ZPmZ_H, FMIN_ZPmZ_S)>; + +// [138] "fmin $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMIN_ZPmI_D, FMIN_ZPmI_H, FMIN_ZPmI_S)>; + +// [139] "fminnm $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMINNM_ZPmZ_D, FMINNM_ZPmZ_H, FMINNM_ZPmZ_S)>; + +// [140] "fminnm $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMINNM_ZPmI_D, FMINNM_ZPmI_H, FMINNM_ZPmI_S)>; + +// [141] "fminnmv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINNMV_VPZ_D, FMINNMV_VPZ_H, FMINNMV_VPZ_S)>; + +// [142] "fminv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINV_VPZ_D, FMINV_VPZ_H, FMINV_VPZ_S)>; + +// [143] "fmla $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZPmZZ_D, FMLA_ZPmZZ_H, FMLA_ZPmZZ_S)>; + +// [144] "fmla $Zda, $Zn, $Zm$iop"; +def : 
InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZZZI_D, FMLA_ZZZI_H, FMLA_ZZZI_S)>; + +// [145] "fmls $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZPmZZ_D, FMLS_ZPmZZ_H, FMLS_ZPmZZ_S)>; + +// [146] "fmls $Zda, $Zn, $Zm$iop"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZZZI_D, FMLS_ZZZI_H, FMLS_ZZZI_S)>; + +// [147] "fmsb $Zdn, $Pg/m, $Zm, $Za"; + +// [148] "fmul $Zd, $Zn, $Zm"; + +// [149] "fmul $Zd, $Zn, $Zm$iop"; + +// [150] "fmul $Zdn, $Pg/m, $_Zdn, $Zm"; + +// [151] "fmul $Zdn, $Pg/m, $_Zdn, $i1"; + +// [152] "fmulx $Zdn, $Pg/m, $_Zdn, $Zm"; + +// [153] "fneg $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FNEG_ZPmZ_D, FNEG_ZPmZ_H, FNEG_ZPmZ_S)>; + +// [154] "fnmad $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMAD_ZPmZZ_D, FNMAD_ZPmZZ_H, FNMAD_ZPmZZ_S)>; + +// [155] "fnmla $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLA_ZPmZZ_D, FNMLA_ZPmZZ_H, FNMLA_ZPmZZ_S)>; + +// [156] "fnmls $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLS_ZPmZZ_D, FNMLS_ZPmZZ_H, FNMLS_ZPmZZ_S)>; + +// [157] "fnmsb $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMSB_ZPmZZ_D, FNMSB_ZPmZZ_H, FNMSB_ZPmZZ_S)>; + +// [158] "frecpe $Zd, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPE_ZZ_D, FRECPE_ZZ_H, FRECPE_ZZ_S)>; + +// [159] "frecps $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRECPS_ZZZ_D, FRECPS_ZZZ_H, FRECPS_ZZZ_S)>; + +// [160] "frecpx $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPX_ZPmZ_D, FRECPX_ZPmZ_H, FRECPX_ZPmZ_S)>; + +// [161] "frinta $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTA_ZPmZ_D, FRINTA_ZPmZ_H, FRINTA_ZPmZ_S)>; + +// [162] "frinti $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTI_ZPmZ_D, FRINTI_ZPmZ_H, FRINTI_ZPmZ_S)>; + +// [163] "frintm $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTM_ZPmZ_D, FRINTM_ZPmZ_H, FRINTM_ZPmZ_S)>; + +// [164] "frintn $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTN_ZPmZ_D, FRINTN_ZPmZ_H, FRINTN_ZPmZ_S)>; + +// [165] "frintp $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTP_ZPmZ_D, FRINTP_ZPmZ_H, FRINTP_ZPmZ_S)>; + +// [166] "frintx $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTX_ZPmZ_D, FRINTX_ZPmZ_H, FRINTX_ZPmZ_S)>; + +// [167] "frintz $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTZ_ZPmZ_D, FRINTZ_ZPmZ_H, FRINTZ_ZPmZ_S)>; + +// [168] "frsqrte $Zd, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRSQRTE_ZZ_D, FRSQRTE_ZZ_H, FRSQRTE_ZZ_S)>; + +// [169] "frsqrts $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRSQRTS_ZZZ_D, FRSQRTS_ZZZ_H, FRSQRTS_ZZZ_S)>; + +// [170] "fscale $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSCALE_ZPmZ_D, FSCALE_ZPmZ_H, FSCALE_ZPmZ_S)>; + +// [171] "fsqrt $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FSQRT_ZPmZ_D)>; +def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FSQRT_ZPmZ_H)>; +def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FSQRT_ZPmZ_S)>; + +// [172] "fsub $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZZZ_D, FSUB_ZZZ_H, FSUB_ZZZ_S)>; + +// [173] "fsub $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZPmZ_D, FSUB_ZPmZ_H, FSUB_ZPmZ_S)>; + +// [174] "fsub $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUB_ZPmI_D, FSUB_ZPmI_H, FSUB_ZPmI_S)>; + +// [175] 
"fsubr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUBR_ZPmZ_D, FSUBR_ZPmZ_H, FSUBR_ZPmZ_S)>; + +// [176] "fsubr $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUBR_ZPmI_D, FSUBR_ZPmI_H, FSUBR_ZPmI_S)>; + +// [177] "ftmad $Zdn, $_Zdn, $Zm, $imm3"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTMAD_ZZI_D, FTMAD_ZZI_H, FTMAD_ZZI_S)>; + +// [178] "ftsmul $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTSMUL_ZZZ_D, FTSMUL_ZZZ_H, FTSMUL_ZZZ_S)>; + +// [180] "incb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCB_XPiI)>; + +// [181] "incd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCD_XPiI)>; + +// [182] "incd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCD_ZPiI)>; + +// [183] "inch $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCH_XPiI)>; + +// [184] "inch $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCH_ZPiI)>; + +// [185] "incp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs INCP_XP_B, INCP_XP_D, INCP_XP_H, INCP_XP_S)>; + +// [186] "incp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs INCP_ZP_D, INCP_ZP_H, INCP_ZP_S)>; + +// [187] "incw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCW_XPiI)>; + +// [188] "incw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCW_ZPiI)>; + +// [189] "index $Zd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_17Cyc_GI02], (instrs INDEX_RR_B, INDEX_RR_D, INDEX_RR_H, INDEX_RR_S)>; + +// [190] "index $Zd, $Rn, $imm5"; +def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_RI_B, INDEX_RI_D, INDEX_RI_H, INDEX_RI_S)>; + +// [191] "index $Zd, $imm5, $Rm"; +def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_IR_B, INDEX_IR_D, INDEX_IR_H, INDEX_IR_S)>; + +// [192] "index $Zd, $imm5, $imm5b"; +def : InstRW<[A64FXWrite_13Cyc_GI0], (instrs INDEX_II_B, INDEX_II_D, INDEX_II_H, INDEX_II_S)>; + +// [193] "insr $Zdn, $Rm"; +def : InstRW<[A64FXWrite_10Cyc_GI02], (instrs INSR_ZR_B, INSR_ZR_D, INSR_ZR_H, INSR_ZR_S)>; + +// [194] "insr $Zdn, $Vm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs INSR_ZV_B, INSR_ZV_D, INSR_ZV_H, INSR_ZV_S)>; + +// [195] "lasta $Rd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTA_RPZ_B, LASTA_RPZ_D, LASTA_RPZ_H, LASTA_RPZ_S)>; + +// [196] "lasta $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTA_VPZ_B, LASTA_VPZ_D, LASTA_VPZ_H, LASTA_VPZ_S)>; + +// [197] "lastb $Rd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTB_RPZ_B, LASTB_RPZ_D, LASTB_RPZ_H, LASTB_RPZ_S)>; + +// [198] "lastb $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTB_VPZ_B, LASTB_VPZ_D, LASTB_VPZ_H, LASTB_VPZ_S)>; + +// [199] "ld1b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B, LD1B_D, LD1B_H, LD1B_S)>; + +// [200] "ld1b $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1B_D_REAL, GLD1B_D_SXTW_REAL, GLD1B_D_UXTW_REAL, GLD1B_S_SXTW_REAL, GLD1B_S_UXTW_REAL)>; + +// [201] "ld1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B_D_IMM_REAL, LD1B_H_IMM_REAL, LD1B_IMM_REAL, LD1B_S_IMM_REAL)>; + +// [202] "ld1b $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1B_D_IMM_REAL, GLD1B_S_IMM_REAL)>; + +// [203] "ld1d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D)>; + +// [204] "ld1d $Zt, $Pg/z, [$Rn, $Zm]"; 
+def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1D_REAL, GLD1D_SCALED_REAL, GLD1D_SXTW_REAL, GLD1D_SXTW_SCALED_REAL, GLD1D_UXTW_REAL, GLD1D_UXTW_SCALED_REAL)>; + +// [205] "ld1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D_IMM_REAL)>; + +// [206] "ld1d $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1D_IMM_REAL)>; + +// [207] "ld1h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H, LD1H_D, LD1H_S)>; + +// [208] "ld1h $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1H_D_REAL, GLD1H_D_SCALED_REAL, GLD1H_D_SXTW_REAL, GLD1H_D_SXTW_SCALED_REAL, GLD1H_D_UXTW_REAL, GLD1H_D_UXTW_SCALED_REAL, GLD1H_S_SXTW_REAL, GLD1H_S_SXTW_SCALED_REAL, GLD1H_S_UXTW_REAL, GLD1H_S_UXTW_SCALED_REAL)>; + +// [209] "ld1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H_D_IMM_REAL, LD1H_IMM_REAL, LD1H_S_IMM_REAL)>; + +// [210] "ld1h $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1H_D_IMM_REAL, GLD1H_S_IMM_REAL)>; + +// [211] "ld1rb $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RB_D_IMM, LD1RB_H_IMM, LD1RB_IMM, LD1RB_S_IMM)>; + +// [212] "ld1rd $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RD_IMM)>; + +// [213] "ld1rh $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RH_D_IMM, LD1RH_IMM, LD1RH_S_IMM)>; + +// [214] "ld1rqb $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B)>; + +// [215] "ld1rqb $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B_IMM)>; + +// [216] "ld1rqd $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D)>; + +// [217] "ld1rqd $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D_IMM)>; + +// [218] "ld1rqh $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H)>; + +// [219] "ld1rqh $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H_IMM)>; + +// [220] "ld1rqw $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W)>; + +// [221] "ld1rqw $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W_IMM)>; + +// [222] "ld1rsb $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSB_D_IMM, LD1RSB_H_IMM, LD1RSB_S_IMM)>; + +// [223] "ld1rsh $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSH_D_IMM, LD1RSH_S_IMM)>; + +// [224] "ld1rsw $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSW_IMM)>; + +// [225] "ld1rw $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RW_D_IMM, LD1RW_IMM)>; + +// [226] "ld1sb $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D, LD1SB_H, LD1SB_S)>; + +// [227] "ld1sb $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SB_D_REAL, GLD1SB_D_SXTW_REAL, GLD1SB_D_UXTW_REAL, GLD1SB_S_SXTW_REAL, GLD1SB_S_UXTW_REAL)>; + +// [228] "ld1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D_IMM_REAL, LD1SB_H_IMM_REAL, LD1SB_S_IMM_REAL)>; + +// [229] "ld1sb $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SB_D_IMM_REAL, GLD1SB_S_IMM_REAL)>; + +// [230] "ld1sh $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D, LD1SH_S)>; + +// [231] "ld1sh $Zt, 
$Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SH_D_REAL, GLD1SH_D_SCALED_REAL, GLD1SH_D_SXTW_REAL, GLD1SH_D_SXTW_SCALED_REAL, GLD1SH_D_UXTW_REAL, GLD1SH_D_UXTW_SCALED_REAL, GLD1SH_S_SXTW_REAL, GLD1SH_S_SXTW_SCALED_REAL, GLD1SH_S_UXTW_REAL, GLD1SH_S_UXTW_SCALED_REAL)>; + +// [232] "ld1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D_IMM_REAL, LD1SH_S_IMM_REAL)>; + +// [233] "ld1sh $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SH_D_IMM_REAL, GLD1SH_S_IMM_REAL)>; + +// [234] "ld1sw $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D)>; + +// [235] "ld1sw $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SW_D_REAL, GLD1SW_D_SCALED_REAL, GLD1SW_D_SXTW_REAL, GLD1SW_D_SXTW_SCALED_REAL, GLD1SW_D_UXTW_REAL, GLD1SW_D_UXTW_SCALED_REAL)>; + +// [236] "ld1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D_IMM_REAL)>; + +// [237] "ld1sw $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SW_D_IMM_REAL)>; + +// [238] "ld1w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W, LD1W_D)>; + +// [239] "ld1w $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1W_D_REAL, GLD1W_D_SCALED_REAL, GLD1W_D_SXTW_REAL, GLD1W_D_SXTW_SCALED_REAL, GLD1W_D_UXTW_REAL, GLD1W_D_UXTW_SCALED_REAL, GLD1W_SXTW_REAL, GLD1W_SXTW_SCALED_REAL, GLD1W_UXTW_REAL, GLD1W_UXTW_SCALED_REAL)>; + +// [240] "ld1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W_D_IMM_REAL, LD1W_IMM_REAL)>; + +// [241] "ld1w $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1W_D_IMM_REAL, GLD1W_IMM_REAL)>; + +// [242] "ld2b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B)>; + +// [243] "ld2b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B_IMM)>; + +// [244] "ld2d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D)>; + +// [245] "ld2d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D_IMM)>; + +// [246] "ld2h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H)>; + +// [247] "ld2h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H_IMM)>; + +// [248] "ld2w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W)>; + +// [249] "ld2w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W_IMM)>; + +// [250] "ld3b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B)>; + +// [251] "ld3b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B_IMM)>; + +// [252] "ld3d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D)>; + +// [253] "ld3d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D_IMM)>; + +// [254] "ld3h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H)>; + +// [255] "ld3h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H_IMM)>; + +// [256] "ld3w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W)>; + +// [257] "ld3w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W_IMM)>; + +// [258] "ld4b $Zt, $Pg/z, [$Rn, $Rm]"; +def : 
InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B)>; + +// [259] "ld4b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B_IMM)>; + +// [260] "ld4d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D)>; + +// [261] "ld4d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D_IMM)>; + +// [262] "ld4h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H)>; + +// [263] "ld4h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H_IMM)>; + +// [264] "ld4w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W)>; + +// [265] "ld4w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W_IMM)>; + +// [266] "ldff1b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1B_D_REAL, LDFF1B_H_REAL, LDFF1B_REAL, LDFF1B_S_REAL)>; + +// [267] "ldff1b $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1B_D_REAL, GLDFF1B_D_SXTW_REAL, GLDFF1B_D_UXTW_REAL, GLDFF1B_S_SXTW_REAL, GLDFF1B_S_UXTW_REAL)>; + +// [268] "ldff1b $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1B_D_IMM_REAL, GLDFF1B_S_IMM_REAL)>; + +// [269] "ldff1d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1D_REAL)>; + +// [270] "ldff1d $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1D_REAL, GLDFF1D_SCALED_REAL, GLDFF1D_SXTW_REAL, GLDFF1D_SXTW_SCALED_REAL, GLDFF1D_UXTW_REAL, GLDFF1D_UXTW_SCALED_REAL)>; + +// [271] "ldff1d $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1D_IMM_REAL)>; + +// [272] "ldff1h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1H_D_REAL, LDFF1H_REAL, LDFF1H_S_REAL)>; + +// [273] "ldff1h $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1H_D_REAL, GLDFF1H_D_SCALED_REAL, GLDFF1H_D_SXTW_REAL, GLDFF1H_D_SXTW_SCALED_REAL, GLDFF1H_D_UXTW_REAL, GLDFF1H_D_UXTW_SCALED_REAL, GLDFF1H_S_SXTW_REAL, GLDFF1H_S_SXTW_SCALED_REAL, GLDFF1H_S_UXTW_REAL, GLDFF1H_S_UXTW_SCALED_REAL)>; + +// [274] "ldff1h $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1H_D_IMM_REAL, GLDFF1H_S_IMM_REAL)>; + +// [275] "ldff1sb $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SB_D_REAL, LDFF1SB_H_REAL, LDFF1SB_S_REAL)>; + +// [276] "ldff1sb $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SB_D_REAL, GLDFF1SB_D_SXTW_REAL, GLDFF1SB_D_UXTW_REAL, GLDFF1SB_S_SXTW_REAL, GLDFF1SB_S_UXTW_REAL)>; + +// [277] "ldff1sb $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SB_D_IMM_REAL, GLDFF1SB_S_IMM_REAL)>; + +// [278] "ldff1sh $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SH_D_REAL, LDFF1SH_S_REAL)>; + +// [279] "ldff1sh $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SH_D_REAL, GLDFF1SH_D_SCALED_REAL, GLDFF1SH_D_SXTW_REAL, GLDFF1SH_D_SXTW_SCALED_REAL, GLDFF1SH_D_UXTW_REAL, GLDFF1SH_D_UXTW_SCALED_REAL, GLDFF1SH_S_SXTW_REAL, GLDFF1SH_S_SXTW_SCALED_REAL, GLDFF1SH_S_UXTW_REAL, GLDFF1SH_S_UXTW_SCALED_REAL)>; + +// [280] "ldff1sh $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SH_D_IMM_REAL, GLDFF1SH_S_IMM_REAL)>; + +// [281] "ldff1sw $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SW_D_REAL)>; + +// [282] "ldff1sw $Zt, 
$Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SW_D_REAL, GLDFF1SW_D_SCALED_REAL, GLDFF1SW_D_SXTW_REAL, GLDFF1SW_D_SXTW_SCALED_REAL, GLDFF1SW_D_UXTW_REAL, GLDFF1SW_D_UXTW_SCALED_REAL)>; + +// [283] "ldff1sw $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SW_D_IMM_REAL)>; + +// [284] "ldff1w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1W_D_REAL, LDFF1W_REAL)>; + +// [285] "ldff1w $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1W_D_REAL, GLDFF1W_D_SCALED_REAL, GLDFF1W_D_SXTW_REAL, GLDFF1W_D_SXTW_SCALED_REAL, GLDFF1W_D_UXTW_REAL, GLDFF1W_D_UXTW_SCALED_REAL, GLDFF1W_SXTW_REAL, GLDFF1W_SXTW_SCALED_REAL, GLDFF1W_UXTW_REAL, GLDFF1W_UXTW_SCALED_REAL)>; + +// [286] "ldff1w $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1W_D_IMM_REAL, GLDFF1W_IMM_REAL)>; + +// [287] "ldnf1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1B_D_IMM_REAL, LDNF1B_H_IMM_REAL, LDNF1B_IMM_REAL, LDNF1B_S_IMM_REAL)>; + +// [288] "ldnf1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1D_IMM_REAL)>; + +// [289] "ldnf1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1H_D_IMM_REAL, LDNF1H_IMM_REAL, LDNF1H_S_IMM_REAL)>; + +// [290] "ldnf1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SB_D_IMM_REAL, LDNF1SB_H_IMM_REAL, LDNF1SB_S_IMM_REAL)>; + +// [291] "ldnf1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SH_D_IMM_REAL, LDNF1SH_S_IMM_REAL)>; + +// [292] "ldnf1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SW_D_IMM_REAL)>; + +// [293] "ldnf1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1W_D_IMM_REAL, LDNF1W_IMM_REAL)>; + +// [294] "ldnt1b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRR)>; + +// [295] "ldnt1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRI)>; + +// [296] "ldnt1d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRR)>; + +// [297] "ldnt1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRI)>; + +// [298] "ldnt1h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRR)>; + +// [299] "ldnt1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRI)>; + +// [300] "ldnt1w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRR)>; + +// [301] "ldnt1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRI)>; + +// [302] "ldr $Pt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_PXI)>; + +// [303] "ldr $Zt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_ZXI)>; + +// [304] "lsl $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZZZ_B, LSL_WIDE_ZZZ_H, LSL_WIDE_ZZZ_S)>; + +// [305] "lsl $Zd, $Zn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZZI_B, LSL_ZZI_D, LSL_ZZI_H, LSL_ZZI_S)>; + +// [306] "lsl $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZPmZ_B, LSL_WIDE_ZPmZ_H, LSL_WIDE_ZPmZ_S, LSL_ZPmZ_B, LSL_ZPmZ_D, LSL_ZPmZ_H, LSL_ZPmZ_S)>; + +// [307] "lsl $Zdn, $Pg/m, $_Zdn, $imm"; +def : 
InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZPmI_B, LSL_ZPmI_D, LSL_ZPmI_H, LSL_ZPmI_S)>; + +// [308] "lslr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSLR_ZPmZ_B, LSLR_ZPmZ_D, LSLR_ZPmZ_H, LSLR_ZPmZ_S)>; + +// [309] "lsr $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZZZ_B, LSR_WIDE_ZZZ_H, LSR_WIDE_ZZZ_S)>; + +// [310] "lsr $Zd, $Zn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZZI_B, LSR_ZZI_D, LSR_ZZI_H, LSR_ZZI_S)>; + +// [311] "lsr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZPmZ_B, LSR_WIDE_ZPmZ_H, LSR_WIDE_ZPmZ_S, LSR_ZPmZ_B, LSR_ZPmZ_D, LSR_ZPmZ_H, LSR_ZPmZ_S)>; + +// [312] "lsr $Zdn, $Pg/m, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZPmI_B, LSR_ZPmI_D, LSR_ZPmI_H, LSR_ZPmI_S)>; + +// [313] "lsrr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSRR_ZPmZ_B, LSRR_ZPmZ_D, LSRR_ZPmZ_H, LSRR_ZPmZ_S)>; + +// [314] "mad $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MAD_ZPmZZ_B, MAD_ZPmZZ_D, MAD_ZPmZZ_H, MAD_ZPmZZ_S)>; + +// [315] "mla $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLA_ZPmZZ_B, MLA_ZPmZZ_D, MLA_ZPmZZ_H, MLA_ZPmZZ_S)>; + +// [316] "mls $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLS_ZPmZZ_B, MLS_ZPmZZ_D, MLS_ZPmZZ_H, MLS_ZPmZZ_S)>; + +// [317] "movprfx $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPmZ_B, MOVPRFX_ZPmZ_D, MOVPRFX_ZPmZ_H, MOVPRFX_ZPmZ_S)>; + +// [318] "movprfx $Zd, $Pg/z, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPzZ_B, MOVPRFX_ZPzZ_D, MOVPRFX_ZPzZ_H, MOVPRFX_ZPzZ_S)>; + +// [319] "movprfx $Zd, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZZ)>; + +// [320] "msb $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MSB_ZPmZZ_B, MSB_ZPmZZ_D, MSB_ZPmZZ_H, MSB_ZPmZZ_S)>; + +// [321] "mul $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MUL_ZPmZ_B, MUL_ZPmZ_D, MUL_ZPmZ_H, MUL_ZPmZ_S)>; + +// [322] "mul $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs MUL_ZI_B, MUL_ZI_D, MUL_ZI_H, MUL_ZI_S)>; + +// [323] "nand $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NAND_PPzPP)>; + +// [324] "nands $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NANDS_PPzPP)>; + +// [325] "neg $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NEG_ZPmZ_B, NEG_ZPmZ_D, NEG_ZPmZ_H, NEG_ZPmZ_S)>; + +// [326] "nor $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NOR_PPzPP)>; + +// [327] "nors $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NORS_PPzPP)>; + +// [328] "not $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NOT_ZPmZ_B, NOT_ZPmZ_D, NOT_ZPmZ_H, NOT_ZPmZ_S)>; + +// [329] "orn $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORN_PPzPP)>; + +// [330] "orns $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORNS_PPzPP)>; + +// [331] "orr $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORR_PPzPP)>; + +// [332] "orr $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZZZ)>; + +// [333] "orr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZPmZ_B, ORR_ZPmZ_D, ORR_ZPmZ_H, ORR_ZPmZ_S)>; + +// [334] "orr $Zdn, $_Zdn, $imms13"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs ORR_ZI)>; + +// [335] "orrs $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs 
ORRS_PPzPP)>; + +// [336] "orv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ORV_VPZ_B, ORV_VPZ_D, ORV_VPZ_H, ORV_VPZ_S)>; + +// [337] "pfalse $Pd"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PFALSE)>; + +// [338] "pnext $Pdn, $Pg, $_Pdn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PNEXT_B, PNEXT_D, PNEXT_H, PNEXT_S)>; + +// [339] "prfb $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRR)>; + +// [340] "prfb $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFB_D_SCALED, PRFB_D_SXTW_SCALED, PRFB_D_UXTW_SCALED, PRFB_S_SXTW_SCALED, PRFB_S_UXTW_SCALED)>; + +// [341] "prfb $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRI)>; + +// [342] "prfb $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFB_D_PZI, PRFB_S_PZI)>; + +// [343] "prfd $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRR)>; + +// [344] "prfd $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFD_D_SCALED, PRFD_D_SXTW_SCALED, PRFD_D_UXTW_SCALED, PRFD_S_SXTW_SCALED, PRFD_S_UXTW_SCALED)>; + +// [345] "prfd $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRI)>; + +// [346] "prfd $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFD_D_PZI, PRFD_S_PZI)>; + +// [347] "prfh $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRR)>; + +// [348] "prfh $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFH_D_SCALED, PRFH_D_SXTW_SCALED, PRFH_D_UXTW_SCALED, PRFH_S_SXTW_SCALED, PRFH_S_UXTW_SCALED)>; + +// [349] "prfh $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>; + +// [350] "prfh $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>; + +// [351] "prfw $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>; + +// [352] "prfw $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>; + +// [353] "prfw $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRI)>; + +// [354] "prfw $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFW_D_PZI, PRFW_S_PZI)>; + +// [355] "ptest $Pg, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTEST_PP)>; + +// [356] "ptrue $Pd, $pattern"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUE_B, PTRUE_D, PTRUE_H, PTRUE_S)>; + +// [357] "ptrues $Pd, $pattern"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUES_B, PTRUES_D, PTRUES_H, PTRUES_S)>; + +// [358] "punpkhi $Pd, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKHI_PP)>; + +// [359] "punpklo $Pd, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKLO_PP)>; + +// [360] "rbit $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBIT_ZPmZ_B, RBIT_ZPmZ_D, RBIT_ZPmZ_H, RBIT_ZPmZ_S)>; + +// [361] "rdffr $Pd"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_P)>; + +// [362] "rdffr $Pd, $Pg/z"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_PPz)>; + +// [363] "rdffrs $Pd, $Pg/z"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFRS_PPz)>; + +// [364] "rdvl $Rd, $imm6"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs RDVLI_XI)>; + +// [365] "rev $Pd, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs REV_PP_B, REV_PP_D, REV_PP_H, REV_PP_S)>; + 
+// [366] "rev $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs REV_ZZ_B, REV_ZZ_D, REV_ZZ_H, REV_ZZ_S)>; + +// [367] "revb $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVB_ZPmZ_D, REVB_ZPmZ_H, REVB_ZPmZ_S)>; + +// [368] "revh $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVH_ZPmZ_D, REVH_ZPmZ_S)>; + +// [369] "revw $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVW_ZPmZ_D)>; + +// [370] "sabd $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SABD_ZPmZ_B, SABD_ZPmZ_D, SABD_ZPmZ_H, SABD_ZPmZ_S)>; + +// [371] "saddv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs SADDV_VPZ_B, SADDV_VPZ_H, SADDV_VPZ_S)>; + +// [372] "scvtf $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SCVTF_ZPmZ_DtoD, SCVTF_ZPmZ_DtoH, SCVTF_ZPmZ_DtoS, SCVTF_ZPmZ_HtoH, SCVTF_ZPmZ_StoD, SCVTF_ZPmZ_StoH, SCVTF_ZPmZ_StoS)>; + +// [373] "sdiv $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIV_ZPmZ_D, SDIV_ZPmZ_S)>; + +// [374] "sdivr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIVR_ZPmZ_D, SDIVR_ZPmZ_S)>; + +// [375] "sdot $Zda, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SDOT_ZZZ_D, SDOT_ZZZ_S)>; + +// [376] "sdot $Zda, $Zn, $Zm$iop"; +def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs SDOT_ZZZI_D, SDOT_ZZZI_S)>; + +// [377] "sel $Pd, $Pg, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs SEL_PPPP)>; + +// [378] "sel $Zd, $Pg, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SEL_ZPZZ_B, SEL_ZPZZ_D, SEL_ZPZZ_H, SEL_ZPZZ_S)>; + +// [379] "setffr"; +def : InstRW<[A64FXWrite_6Cyc], (instrs SETFFR)>; + +// [380] "smax $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMAX_ZPmZ_B, SMAX_ZPmZ_D, SMAX_ZPmZ_H, SMAX_ZPmZ_S)>; + +// [381] "smax $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMAX_ZI_B, SMAX_ZI_D, SMAX_ZI_H, SMAX_ZI_S)>; + +// [382] "smaxv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMAXV_VPZ_B, SMAXV_VPZ_D, SMAXV_VPZ_H, SMAXV_VPZ_S)>; + +// [383] "smin $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMIN_ZPmZ_B, SMIN_ZPmZ_D, SMIN_ZPmZ_H, SMIN_ZPmZ_S)>; + +// [384] "smin $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMIN_ZI_B, SMIN_ZI_D, SMIN_ZI_H, SMIN_ZI_S)>; + +// [385] "sminv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMINV_VPZ_B, SMINV_VPZ_D, SMINV_VPZ_H, SMINV_VPZ_S)>; + +// [386] "smulh $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SMULH_ZPmZ_B, SMULH_ZPmZ_D, SMULH_ZPmZ_H, SMULH_ZPmZ_S)>; + +// [387] "splice $Zdn, $Pg, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SPLICE_ZPZ_B, SPLICE_ZPZ_D, SPLICE_ZPZ_H, SPLICE_ZPZ_S)>; + +// [388] "sqadd $Zd, $Zn, $Zm"; + +// [389] "sqadd $Zdn, $_Zdn, $imm"; + +// [390] "sqdecb $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiWdI)>; + +// [391] "sqdecb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiI)>; + +// [392] "sqdecd $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiWdI)>; + +// [393] "sqdecd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiI)>; + +// [394] "sqdecd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECD_ZPiI)>; + +// [395] "sqdech $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiWdI)>; + +// 
[396] "sqdech $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiI)>; + +// [397] "sqdech $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECH_ZPiI)>; + +// [398] "sqdecp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XP_B, SQDECP_XP_D, SQDECP_XP_H, SQDECP_XP_S)>; + +// [399] "sqdecp $Rdn, $Pg, $_Rdn"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XPWd_B, SQDECP_XPWd_D, SQDECP_XPWd_H, SQDECP_XPWd_S)>; + +// [400] "sqdecp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQDECP_ZP_D, SQDECP_ZP_H, SQDECP_ZP_S)>; + +// [401] "sqdecw $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiWdI)>; + +// [402] "sqdecw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiI)>; + +// [403] "sqdecw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECW_ZPiI)>; + +// [404] "sqincb $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiWdI)>; + +// [405] "sqincb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiI)>; + +// [406] "sqincd $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiWdI)>; + +// [407] "sqincd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiI)>; + +// [408] "sqincd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCD_ZPiI)>; + +// [409] "sqinch $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiWdI)>; + +// [410] "sqinch $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiI)>; + +// [411] "sqinch $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCH_ZPiI)>; + +// [412] "sqincp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XP_B, SQINCP_XP_D, SQINCP_XP_H, SQINCP_XP_S)>; + +// [413] "sqincp $Rdn, $Pg, $_Rdn"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XPWd_B, SQINCP_XPWd_D, SQINCP_XPWd_H, SQINCP_XPWd_S)>; + +// [414] "sqincp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQINCP_ZP_D, SQINCP_ZP_H, SQINCP_ZP_S)>; + +// [415] "sqincw $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiWdI)>; + +// [416] "sqincw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiI)>; + +// [417] "sqincw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>; + +// [418] "sqsub $Zd, $Zn, $Zm"; + +// [419] "sqsub $Zdn, $_Zdn, $imm"; + +// [420] "st1b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>; + +// [421] "st1b $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; + +// [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>; + +// [423] "st1b $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>; + +// [424] "st1d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>; + +// [425] "st1d $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>; + +// [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], 
(instrs ST1D_IMM)>; + +// [427] "st1d $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1D_IMM)>; + +// [428] "st1h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>; + +// [429] "st1h $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>; + +// [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>; + +// [431] "st1h $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>; + +// [432] "st1w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>; + +// [433] "st1w $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>; + +// [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>; + +// [435] "st1w $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1W_D_IMM, SST1W_IMM)>; + +// [436] "st2b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B)>; + +// [437] "st2b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B_IMM)>; + +// [438] "st2d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D)>; + +// [439] "st2d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D_IMM)>; + +// [440] "st2h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H)>; + +// [441] "st2h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H_IMM)>; + +// [442] "st2w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W)>; + +// [443] "st2w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W_IMM)>; + +// [444] "st3b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B)>; + +// [445] "st3b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B_IMM)>; + +// [446] "st3d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D)>; + +// [447] "st3d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D_IMM)>; + +// [448] "st3h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H)>; + +// [449] "st3h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H_IMM)>; + +// [450] "st3w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W)>; + +// [451] "st3w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W_IMM)>; + +// [452] "st4b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B)>; + +// [453] "st4b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B_IMM)>; + +// [454] "st4d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D)>; + +// [455] "st4d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D_IMM)>; + +// [456] "st4h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H)>; + +// [457] "st4h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H_IMM)>; + +// [458] "st4w $Zt, $Pg, [$Rn, $Rm]"; 
+def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W)>; + +// [459] "st4w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W_IMM)>; + +// [460] "stnt1b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRR)>; + +// [461] "stnt1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRI)>; + +// [462] "stnt1d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRR)>; + +// [463] "stnt1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRI)>; + +// [464] "stnt1h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRR)>; + +// [465] "stnt1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRI)>; + +// [466] "stnt1w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRR)>; + +// [467] "stnt1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRI)>; + +// [468] "str $Pt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI15], (instrs STR_PXI)>; + +// [469] "str $Zt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI05], (instrs STR_ZXI)>; + +// [470] "sub $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZZZ_B, SUB_ZZZ_D, SUB_ZZZ_H, SUB_ZZZ_S)>; + +// [471] "sub $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZPmZ_B, SUB_ZPmZ_D, SUB_ZPmZ_H, SUB_ZPmZ_S)>; + +// [472] "sub $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZI_B, SUB_ZI_D, SUB_ZI_H, SUB_ZI_S)>; + +// [473] "subr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUBR_ZPmZ_B, SUBR_ZPmZ_D, SUBR_ZPmZ_H, SUBR_ZPmZ_S)>; + +// [474] "subr $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SUBR_ZI_B, SUBR_ZI_D, SUBR_ZI_H, SUBR_ZI_S)>; + +// [475] "sunpkhi $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKHI_ZZ_D, SUNPKHI_ZZ_H, SUNPKHI_ZZ_S)>; + +// [476] "sunpklo $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKLO_ZZ_D, SUNPKLO_ZZ_H, SUNPKLO_ZZ_S)>; + +// [477] "sxtb $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTB_ZPmZ_D, SXTB_ZPmZ_H, SXTB_ZPmZ_S)>; + +// [478] "sxth $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTH_ZPmZ_D, SXTH_ZPmZ_S)>; + +// [479] "sxtw $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTW_ZPmZ_D)>; + +// [480] "tbl $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs TBL_ZZZ_B, TBL_ZZZ_D, TBL_ZZZ_H, TBL_ZZZ_S)>; + +// [481] "trn1 $Pd, $Pn, $Pm"; + +// [482] "trn1 $Zd, $Zn, $Zm"; + +// [483] "trn2 $Pd, $Pn, $Pm"; + +// [484] "trn2 $Zd, $Zn, $Zm"; + +// [486] "uabd $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UABD_ZPmZ_B, UABD_ZPmZ_D, UABD_ZPmZ_H, UABD_ZPmZ_S)>; + +// [487] "uaddv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs UADDV_VPZ_B, UADDV_VPZ_D, UADDV_VPZ_H, UADDV_VPZ_S)>; + +// [488] "ucvtf $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UCVTF_ZPmZ_DtoD, UCVTF_ZPmZ_DtoH, UCVTF_ZPmZ_DtoS, UCVTF_ZPmZ_HtoH, UCVTF_ZPmZ_StoD, UCVTF_ZPmZ_StoH, UCVTF_ZPmZ_StoS)>; + +// [489] "udiv $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIV_ZPmZ_D, UDIV_ZPmZ_S)>; + +// [490] "udivr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIVR_ZPmZ_D, UDIVR_ZPmZ_S)>; + +// [491] "udot $Zda, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UDOT_ZZZ_D, UDOT_ZZZ_S)>; + +// [492] "udot $Zda, $Zn, 
$Zm$iop"; +def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs UDOT_ZZZI_D, UDOT_ZZZI_S)>; + +// [493] "umax $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMAX_ZPmZ_B, UMAX_ZPmZ_D, UMAX_ZPmZ_H, UMAX_ZPmZ_S)>; + +// [494] "umax $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMAX_ZI_B, UMAX_ZI_D, UMAX_ZI_H, UMAX_ZI_S)>; + +// [495] "umaxv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMAXV_VPZ_B, UMAXV_VPZ_D, UMAXV_VPZ_H, UMAXV_VPZ_S)>; + +// [496] "umin $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMIN_ZPmZ_B, UMIN_ZPmZ_D, UMIN_ZPmZ_H, UMIN_ZPmZ_S)>; + +// [497] "umin $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMIN_ZI_B, UMIN_ZI_D, UMIN_ZI_H, UMIN_ZI_S)>; + +// [498] "uminv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMINV_VPZ_B, UMINV_VPZ_D, UMINV_VPZ_H, UMINV_VPZ_S)>; + +// [499] "umulh $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UMULH_ZPmZ_B, UMULH_ZPmZ_D, UMULH_ZPmZ_H, UMULH_ZPmZ_S)>; + +// [500] "uqadd $Zd, $Zn, $Zm"; + +// [501] "uqadd $Zdn, $_Zdn, $imm"; + +// [502] "uqdecb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECB_WPiI, UQDECB_XPiI)>; + +// [503] "uqdecd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECD_WPiI, UQDECD_XPiI)>; + +// [504] "uqdecd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECD_ZPiI)>; + +// [505] "uqdech $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECH_WPiI, UQDECH_XPiI)>; + +// [506] "uqdech $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECH_ZPiI)>; + +// [507] "uqdecp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQDECP_WP_B, UQDECP_WP_D, UQDECP_WP_H, UQDECP_WP_S, UQDECP_XP_B, UQDECP_XP_D, UQDECP_XP_H, UQDECP_XP_S)>; + +// [508] "uqdecp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQDECP_ZP_D, UQDECP_ZP_H, UQDECP_ZP_S)>; + +// [509] "uqdecw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECW_WPiI, UQDECW_XPiI)>; + +// [510] "uqdecw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECW_ZPiI)>; + +// [511] "uqincb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCB_WPiI, UQINCB_XPiI)>; + +// [512] "uqincd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCD_WPiI, UQINCD_XPiI)>; + +// [513] "uqincd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCD_ZPiI)>; + +// [514] "uqinch $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCH_WPiI, UQINCH_XPiI)>; + +// [515] "uqinch $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCH_ZPiI)>; + +// [516] "uqincp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQINCP_WP_B, UQINCP_WP_D, UQINCP_WP_H, UQINCP_WP_S, UQINCP_XP_B, UQINCP_XP_D, UQINCP_XP_H, UQINCP_XP_S)>; + +// [517] "uqincp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQINCP_ZP_D, UQINCP_ZP_H, UQINCP_ZP_S)>; + +// [518] "uqincw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCW_WPiI, UQINCW_XPiI)>; + +// [519] "uqincw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCW_ZPiI)>; + +// [520] "uqsub $Zd, $Zn, $Zm"; +//@@@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZZZ_B, UQSUB_ZZZ_D, UQSUB_ZZZ_H, UQSUB_ZZZ_S)>; + +// [521] "uqsub $Zdn, $_Zdn, $imm"; +//@@@ 
def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZI_B, UQSUB_ZI_D, UQSUB_ZI_H, UQSUB_ZI_S)>; + +// [522] "uunpkhi $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKHI_ZZ_D, UUNPKHI_ZZ_H, UUNPKHI_ZZ_S)>; + +// [523] "uunpklo $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKLO_ZZ_D, UUNPKLO_ZZ_H, UUNPKLO_ZZ_S)>; + +// [524] "uxtb $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTB_ZPmZ_D, UXTB_ZPmZ_H, UXTB_ZPmZ_S)>; + +// [525] "uxth $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTH_ZPmZ_D, UXTH_ZPmZ_S)>; + +// [526] "uxtw $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTW_ZPmZ_D)>; + +// [527] "uzp1 $Pd, $Pn, $Pm"; + +// [528] "uzp1 $Zd, $Zn, $Zm"; + +// [529] "uzp2 $Pd, $Pn, $Pm"; + +// [530] "uzp2 $Zd, $Zn, $Zm"; + +// [531] "whilele $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELE_PWW_B, WHILELE_PWW_D, WHILELE_PWW_H, WHILELE_PWW_S, WHILELE_PXX_B, WHILELE_PXX_D, WHILELE_PXX_H, WHILELE_PXX_S)>; + +// [532] "whilelo $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELO_PWW_B, WHILELO_PWW_D, WHILELO_PWW_H, WHILELO_PWW_S, WHILELO_PXX_B, WHILELO_PXX_D, WHILELO_PXX_H, WHILELO_PXX_S)>; + +// [533] "whilels $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELS_PWW_B, WHILELS_PWW_D, WHILELS_PWW_H, WHILELS_PWW_S, WHILELS_PXX_B, WHILELS_PXX_D, WHILELS_PXX_H, WHILELS_PXX_S)>; + +// [534] "whilelt $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELT_PWW_B, WHILELT_PWW_D, WHILELT_PWW_H, WHILELT_PWW_S, WHILELT_PXX_B, WHILELT_PXX_D, WHILELT_PXX_H, WHILELT_PXX_S)>; + +// [535] "wrffr $Pn"; +def : InstRW<[A64FXWrite_6Cyc_NGI1], (instrs WRFFR)>; + +// [536] "zip1 $Pd, $Pn, $Pm"; + +// [537] "zip1 $Zd, $Zn, $Zm"; + +// [538] "zip2 $Pd, $Pn, $Pm"; + +// [539] "zip2 $Zd, $Zn, $Zm"; + +} // SchedModel = A64FXModel diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedTSV110.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedTSV110.td index 438371c1b6..0828d8a8c9 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SchedTSV110.td @@ -1,745 +1,745 @@ -//==- AArch64SchedTSV110.td - Huawei TSV110 Scheduling Definitions -*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for Huawei TSV110 to support -// instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details. - -// Huawei TSV110 scheduling machine model. -def TSV110Model : SchedMachineModel { - let IssueWidth = 4; // 4 micro-ops dispatched per cycle. - let MicroOpBufferSize = 128; // 128 micro-op re-order buffer - let LoopMicroOpBufferSize = 16; - let LoadLatency = 4; // Optimistic load latency. 
- let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch - let CompleteModel = 1; - - list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, - PAUnsupported.F); -} - -// Define each kind of processor resource and number available on the TSV110, -// which has 8 pipelines, each with its own queue where micro-ops wait for -// their operands and issue out-of-order to one of eight execution pipelines. -let SchedModel = TSV110Model in { - def TSV110UnitALU : ProcResource<1>; // Int ALU - def TSV110UnitAB : ProcResource<2>; // Int ALU/BRU - def TSV110UnitMDU : ProcResource<1>; // Multi-Cycle - def TSV110UnitFSU1 : ProcResource<1>; // FP/ASIMD - def TSV110UnitFSU2 : ProcResource<1>; // FP/ASIMD - def TSV110UnitLdSt : ProcResource<2>; // Load/Store - - def TSV110UnitF : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2]>; - def TSV110UnitALUAB : ProcResGroup<[TSV110UnitALU, TSV110UnitAB]>; - def TSV110UnitFLdSt : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2, TSV110UnitLdSt]>; -} - -let SchedModel = TSV110Model in { - -//===----------------------------------------------------------------------===// -// Map the target-defined scheduler read/write resources and latency for -// TSV110 - -// Integer ALU -def : WriteRes<WriteImm, [TSV110UnitALUAB]> { let Latency = 1; } -def : WriteRes<WriteI, [TSV110UnitALUAB]> { let Latency = 1; } -def : WriteRes<WriteISReg, [TSV110UnitMDU]> { let Latency = 2; } -def : WriteRes<WriteIEReg, [TSV110UnitMDU]> { let Latency = 2; } -def : WriteRes<WriteExtr, [TSV110UnitALUAB]> { let Latency = 1; } -def : WriteRes<WriteIS, [TSV110UnitALUAB]> { let Latency = 1; } - -// Integer Mul/MAC/Div -def : WriteRes<WriteID32, [TSV110UnitMDU]> { let Latency = 12; - let ResourceCycles = [12]; } -def : WriteRes<WriteID64, [TSV110UnitMDU]> { let Latency = 20; - let ResourceCycles = [20]; } -def : WriteRes<WriteIM32, [TSV110UnitMDU]> { let Latency = 3; } -def : WriteRes<WriteIM64, [TSV110UnitMDU]> { let Latency = 4; } - -// Load -def : WriteRes<WriteLD, [TSV110UnitLdSt]> { let Latency = 4; } -def : WriteRes<WriteLDIdx, [TSV110UnitLdSt]> { let Latency = 4; } -def : WriteRes<WriteLDHi, []> { let Latency = 4; } - -// Pre/Post Indexing -def : WriteRes<WriteAdr, [TSV110UnitALUAB]> { let Latency = 1; } - -// Store -def : WriteRes<WriteST, [TSV110UnitLdSt]> { let Latency = 1; } -def : WriteRes<WriteSTP, [TSV110UnitLdSt]> { let Latency = 1; } -def : WriteRes<WriteSTIdx, [TSV110UnitLdSt]> { let Latency = 1; } - -// FP -def : WriteRes<WriteF, [TSV110UnitF]> { let Latency = 2; } -def : WriteRes<WriteFCmp, [TSV110UnitF]> { let Latency = 3; } -def : WriteRes<WriteFCvt, [TSV110UnitF]> { let Latency = 3; } -def : WriteRes<WriteFCopy, [TSV110UnitF]> { let Latency = 2; } -def : WriteRes<WriteFImm, [TSV110UnitF]> { let Latency = 2; } -def : WriteRes<WriteFMul, [TSV110UnitF]> { let Latency = 5; } - -// FP Div, Sqrt -def : WriteRes<WriteFDiv, [TSV110UnitFSU1]> { let Latency = 18; } - -def : WriteRes<WriteV, [TSV110UnitF]> { let Latency = 4; } -def : WriteRes<WriteVLD, [TSV110UnitFLdSt]> { let Latency = 5; } -def : WriteRes<WriteVST, [TSV110UnitF]> { let Latency = 1; } - -// Branch -def : WriteRes<WriteBr, [TSV110UnitAB]> { let Latency = 1; } -def : WriteRes<WriteBrReg, [TSV110UnitAB]> { let Latency = 1; } -def : WriteRes<WriteSys, []> { let Latency = 1; } -def : WriteRes<WriteBarrier, []> { let Latency = 1; } -def : WriteRes<WriteHint, []> { let Latency = 1; } - -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } - -// Forwarding logic is modeled only for multiply and accumulate. 
-def : ReadAdvance<ReadI, 0>; -def : ReadAdvance<ReadISReg, 0>; -def : ReadAdvance<ReadIEReg, 0>; -def : ReadAdvance<ReadIM, 0>; -def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; -def : ReadAdvance<ReadID, 0>; -def : ReadAdvance<ReadExtrHi, 0>; -def : ReadAdvance<ReadAdrBase, 0>; -def : ReadAdvance<ReadVLD, 0>; - -def : InstRW<[WriteI], (instrs COPY)>; - -// Detailed Refinements -//===----------------------------------------------------------------------===// - -// Contains all of the TSV110 specific SchedWriteRes types. The approach -// below is to define a generic SchedWriteRes for every combination of -// latency and microOps. The naming conventions is to use a prefix, one field -// for latency, and one or more microOp count/type designators. -// Prefix: TSV110Wr -// Latency: #cyc -// MicroOp Count/Types: #(ALU|AB|MDU|FSU1|FSU2|LdSt|ALUAB|F|FLdSt) -// -// e.g. TSV110Wr_6cyc_1ALU_6MDU_4LdSt means the total latency is 6 and there are -// 1 micro-ops to be issued down one ALU pipe, six MDU pipes and four LdSt pipes. -// - -//===----------------------------------------------------------------------===// -// Define Generic 1 micro-op types - -def TSV110Wr_1cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 1; } -def TSV110Wr_1cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 1; } -def TSV110Wr_1cyc_1ALUAB : SchedWriteRes<[TSV110UnitALUAB]> { let Latency = 1; } -def TSV110Wr_1cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 1; } - -def TSV110Wr_2cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 2; } -def TSV110Wr_2cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 2; } -def TSV110Wr_2cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 2; } -def TSV110Wr_2cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 2; } -def TSV110Wr_2cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 2; } -def TSV110Wr_2cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 2; } - -def TSV110Wr_3cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 3; } -def TSV110Wr_3cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 3; } -def TSV110Wr_3cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 3; } - -def TSV110Wr_4cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 4; } -def TSV110Wr_4cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 4; } -def TSV110Wr_4cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 4; } -def TSV110Wr_4cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 4; } - -def TSV110Wr_5cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 5; } -def TSV110Wr_5cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 5; } -def TSV110Wr_5cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 5; } -def TSV110Wr_5cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 5; } - -def TSV110Wr_6cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 6; } - -def TSV110Wr_7cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 7; } - -def TSV110Wr_8cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 8; } - -def TSV110Wr_11cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 11; } - -def TSV110Wr_12cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 12; } - -def TSV110Wr_17cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 17; } - -def TSV110Wr_18cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 18; } - -def TSV110Wr_20cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 20; } - -def TSV110Wr_24cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 24; } - -def 
TSV110Wr_31cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 31; } - -def TSV110Wr_36cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 36; } - -def TSV110Wr_38cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 38; } - -def TSV110Wr_64cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 64; } - -//===----------------------------------------------------------------------===// -// Define Generic 2 micro-op types - -def TSV110Wr_1cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, - TSV110UnitALUAB]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def TSV110Wr_2cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, - TSV110UnitALUAB]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def TSV110Wr_2cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt, - TSV110UnitLdSt]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def TSV110Wr_2cyc_2F : SchedWriteRes<[TSV110UnitF, - TSV110UnitF]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def TSV110Wr_2cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, - TSV110UnitFSU2]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def TSV110Wr_4cyc_2F : SchedWriteRes<[TSV110UnitF, - TSV110UnitF]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def TSV110Wr_4cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, - TSV110UnitFSU2]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def TSV110Wr_4cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, - TSV110UnitALUAB]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def TSV110Wr_5cyc_1ALU_1F : SchedWriteRes<[TSV110UnitALU, - TSV110UnitF]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def TSV110Wr_6cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt, - TSV110UnitLdSt]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def TSV110Wr_6cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, - TSV110UnitALUAB]> { - let Latency = 6; - let NumMicroOps = 2; -} - -def TSV110Wr_7cyc_1F_1LdSt : SchedWriteRes<[TSV110UnitF, - TSV110UnitLdSt]> { - let Latency = 7; - let NumMicroOps = 2; -} - -def TSV110Wr_8cyc_2FSU1 : SchedWriteRes<[TSV110UnitFSU1, - TSV110UnitFSU1]> { - let Latency = 8; - let NumMicroOps = 2; -} - - -def TSV110Wr_8cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, - TSV110UnitFSU2]> { - let Latency = 8; - let NumMicroOps = 2; -} - -//===----------------------------------------------------------------------===// -// Define Generic 3 micro-op types - -def TSV110Wr_6cyc_3F : SchedWriteRes<[TSV110UnitF, TSV110UnitF, - TSV110UnitF]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def TSV110Wr_6cyc_3LdSt : SchedWriteRes<[TSV110UnitLdSt, TSV110UnitLdSt, - TSV110UnitLdSt]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def TSV110Wr_7cyc_2F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, - TSV110UnitLdSt]> { - let Latency = 7; - let NumMicroOps = 3; -} - -//===----------------------------------------------------------------------===// -// Define Generic 4 micro-op types - -def TSV110Wr_8cyc_4F : SchedWriteRes<[TSV110UnitF, TSV110UnitF, - TSV110UnitF, TSV110UnitF]> { - let Latency = 8; - let NumMicroOps = 4; -} - -def TSV110Wr_8cyc_3F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, - TSV110UnitF, TSV110UnitLdSt]> { - let Latency = 8; - let NumMicroOps = 4; -} - -//===----------------------------------------------------------------------===// -// Define Generic 5 micro-op types - -def TSV110Wr_8cyc_3F_2LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, TSV110UnitF, - TSV110UnitLdSt, TSV110UnitLdSt]> { - let Latency = 8; - let NumMicroOps = 5; -} - 
-//===----------------------------------------------------------------------===// -// Define Generic 8 micro-op types - -def TSV110Wr_10cyc_4F_4LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, - TSV110UnitF, TSV110UnitF, - TSV110UnitLdSt, TSV110UnitLdSt, - TSV110UnitLdSt, TSV110UnitLdSt]> { - let Latency = 10; - let NumMicroOps = 8; -} - - -// Branch Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_1cyc_1AB], (instrs B)>; -def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BL)>; -def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BLR)>; -def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ))$")>; - - -// Cryptography Extensions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AES[DE]")>; -def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AESI?MC")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA1SU1")>; -def : InstRW<[TSV110Wr_2cyc_2F], (instregex "^SHA1(H|SU0)")>; -def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA1[CMP]")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA256SU0")>; -def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^SHA256SU1")>; -def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA256(H|H2)")>; -def TSV110ReadCRC: SchedReadAdvance<1, [TSV110Wr_2cyc_1MDU]>; -def : InstRW<[TSV110Wr_2cyc_1MDU, TSV110ReadCRC], (instregex "^CRC32.*$")>; - - -// Arithmetic and Logical Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(BIC|EON|ORN)[WX]rr")>; -def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(BIC)S[WX]rr")>; - -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(ADD|AND|EOR|ORR|SUB)[WX]r(r|i)")>; -def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(ADD|AND|EOR|ORR|SUB)S[WX]r(r|i)")>; - -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(ADC|SBC|BIC)[WX]r$")>; -def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(ADC|SBC)S[WX]r$")>; - -def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)[WX]rs$")>; -def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)S[WX]rs$")>; -def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(ADD|SUB)[WX]r(s|x|x64)$")>; -def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(ADD|SUB)S[WX]r(s|x|x64)$")>; - -def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; - - -// Move and Shift Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instrs ADR, ADRP)>; -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^MOV[NZK][WX]i")>; -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(LSLV|LSRV|ASRV|RORV)(W|X)r")>; - - -// Divide and Multiply Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>; -def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>; - -def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; -def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>; -def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; -def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>; 
- - -// Miscellaneous Data-Processing Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^EXTR(W|X)rri$")>; -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(S|U)?BFM(W|X)ri$")>; -def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CLS|CLZ|RBIT|REV(16|32)?)(W|X)r$")>; - - -// Load Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(W|X)l$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs LDRSWl)>; - -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(BB|HH|W|X)ui$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; - -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; - -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDUR(BB|HH|W|X)i$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; - -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDNP(W|X)i$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDP(W|X)i$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt_1ALUAB, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; - -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instrs LDPSWi)>; -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpost)>; -def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpre)>; - -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFMl)>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFUMi)>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMui$")>; -def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMro(W|X)$")>; - - -// Store Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STN?P(W|X)i$")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR(BB|HH|W|X)i$")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR(BB|HH|W|X)ui$")>; - -def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)(post|pre)$")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; - - -// FP Data Processing Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "F(ABS|NEG)(D|S)r")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCCMP(E)?(S|D)rr$")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCSEL(S|D)rrr$")>; - -def : InstRW<[TSV110Wr_11cyc_1FSU1], (instrs FDIVSrr)>; -def : InstRW<[TSV110Wr_18cyc_1FSU1], (instrs FDIVDrr)>; -def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTSr)>; -def : InstRW<[TSV110Wr_31cyc_1FSU2], (instrs FSQRTDr)>; - -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN).+rr")>; - -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^FN?M(ADD|SUB)Hrrr")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FN?M(ADD|SUB)Srrr")>; -def : InstRW<[TSV110Wr_7cyc_1F], (instregex "^FN?M(ADD|SUB)Drrr")>; - -def : InstRW<[TSV110Wr_4cyc_1F], 
(instregex "^F(ADD|SUB)Hrr")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|SUB)Srr")>; -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Drr")>; - -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(N)?MULHrr$")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULSrr$")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULDrr$")>; - -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT.+r")>; - - -// FP Miscellaneous Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_5cyc_1ALU_1F], (instregex "^[SU]CVTF[SU][WX][SD]ri")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT[HSD][HSD]r")>; - -def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^FMOV(DX|WS|XD|SW|DXHigh|XDHigh)r$")>; -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOV[SD][ir]$")>; - - -// FP Load Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[DSQ]l")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDUR[BDHSQ]i")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[TSV110Wr_6cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi], (instregex "^LDN?P[DQS]i")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi, WriteAdr], (instregex "^LDP[DQS](post|pre)")>; - - -// FP Store Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR[BHSDQ]i")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ](post|pre)")>; -def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR[BHSDQ]ui")>; -def : InstRW<[TSV110Wr_2cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ]ro[WX]")>; -def : InstRW<[TSV110Wr_2cyc_2LdSt], (instregex "^STN?P[SDQ]i")>; -def : InstRW<[TSV110Wr_2cyc_2LdSt, WriteAdr], (instregex "^STP[SDQ](post|pre)")>; - - -// ASIMD Integer Instructions -// ----------------------------------------------------------------------------- - -// Reference for forms in this group -// D form - v8i8, v4i16, v2i32 -// Q form - v16i8, v8i16, v4i32 -// D form - v1i8, v1i16, v1i32, v1i64 -// Q form - v16i8, v8i16, v4i32, v2i64 -// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 -// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 - -// ASIMD simple arithmetic -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(ABS|ADD(P)?|NEG|SUB)v")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](ADD(L|LP|W)|SUB(L|W))v")>; - -// ASIMD complex arithmetic -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]H(ADD|SUB)v")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^R?(ADD|SUB)HN2?v")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]Q(ADD|SUB)v")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^(SU|US)QADDv")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]RHADDv")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABAL?v")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABDL?v")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ADALPv")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^((SQ)(ABS|NEG))v")>; - -// ASIMD compare -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex 
"^CM(EQ|GE|GT|HI|HS|LE|LT|TST)v")>; - -// ASIMD max/min -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)P?v")>; - -// ASIMD logical -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(AND|BIC|BIF|BIT|BSL|EOR|MVN|NOT|ORN|ORR)v")>; - -// ASIMD multiply accumulate, D-form -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)")>; -// ASIMD multiply accumulate, Q-form -def : InstRW<[TSV110Wr_8cyc_2FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v16i8|v8i16|v4i32)")>; - -// ASIMD multiply accumulate long -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v8i8|v16i8)")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v1i64|v2i64)")>; - -// ASIMD shift -// ASIMD shift accumulate -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(S|SR|U|UR)SRA")>; -// ASIMD shift by immed, basic -def : InstRW<[TSV110Wr_4cyc_1FSU1], - (instregex "SHLv","SLIv","SRIv","SHRNv","SQXTNv","SQXTUNv","UQXTNv")>; -// ASIMD shift by immed, complex -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]?(Q|R){1,2}SHR")>; -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^SQSHLU")>; -// ASIMD shift by register, basic, Q-form -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; -// ASIMD shift by register, complex, D-form -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; -// ASIMD shift by register, complex, Q-form -def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; - -// ASIMD reduction -// ASIMD arith, reduce, 4H/4S -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; -// ASIMD arith, reduce, 8B/8H -def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; -// ASIMD arith, reduce, 16B -def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?Vv16i8v$")>; - -// ASIMD max/min, reduce, 4H/4S -def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; -// ASIMD max/min, reduce, 8B/8H -def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; -// ASIMD max/min, reduce, 16B -def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; - - -// Vector - Floating Point -// ----------------------------------------------------------------------------- - -// Reference for forms in this group -// D form - v2f32 -// Q form - v4f32, v2f64 -// D form - 32, 64 -// D form - v1i32, v1i64 -// D form - v2i32 -// Q form - v4i32, v2i64 - -// ASIMD FP sign manipulation -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FABSv")>; -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FNEGv")>; - -// ASIMD FP compare -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v")>; - -// ASIMD FP convert -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FCVT[AMNPZ][SU]v")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT(L)v")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FCVT(N|XN)v")>; - -// ASIMD FP divide, D-form, F32 -def : InstRW<[TSV110Wr_11cyc_1FSU1], (instregex "FDIVv2f32")>; -// ASIMD FP divide, Q-form, F32 -def : InstRW<[TSV110Wr_24cyc_1FSU1], (instregex "FDIVv4f32")>; -// ASIMD FP divide, Q-form, F64 -def : InstRW<[TSV110Wr_38cyc_1FSU1], (instregex "FDIVv2f64")>; - -// ASIMD FP SQRT -def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTv2f32)>; -def : 
InstRW<[TSV110Wr_36cyc_1FSU2], (instrs FSQRTv4f32)>; -def : InstRW<[TSV110Wr_64cyc_1FSU2], (instrs FSQRTv2f64)>; - -// ASIMD FP max,min -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?v")>; -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?Pv")>; -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(MAX|MIN)(NM)?Vv")>; - -// ASIMD FP add -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|ADDP|SUB)v")>; - -// ASIMD FP multiply -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FMULX?v")>; - - -// ASIMD Miscellaneous Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(CLS|CLZ|CNT)v")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(DUP|INS)v.+lane")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^REV(16|32|64)v")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(UZP|ZIP)[12]v")>; - -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^EXTv")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^XTNv")>; -def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^RBITv")>; - -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^(INS|DUP)v.+gpr")>; - -def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^[SU]MOVv")>; - -// ASIMD table lookup, D-form -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v8i8One")>; -def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v8i8Two")>; -def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v8i8Three")>; -def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v8i8Four")>; -// ASIMD table lookup, Q-form -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v16i8One")>; -def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v16i8Two")>; -def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v16i8Three")>; -def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v16i8Four")>; - -def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOVv")>; - -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT[AIMNPXZ]v")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[SU]CVTFv")>; -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[FU](RECP|RSQRT)(E|X)v")>; - - -// ASIMD Load Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "LD1i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; -def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "LD2i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>; -def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "LD3i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; -def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], 
(instregex "LD4i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; - -def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_6cyc_3LdSt], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_6cyc_3LdSt, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_6cyc_2LdSt], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_6cyc_2LdSt, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_10cyc_4F_4LdSt], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_10cyc_4F_4LdSt, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - - -// ASIMD Store Instructions -// ----------------------------------------------------------------------------- - -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "ST1i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "ST2i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "ST3i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; -def : InstRW<[TSV110Wr_6cyc_1F], (instregex "ST4i(8|16|32|64)$")>; -def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; - -def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[TSV110Wr_6cyc_1F], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[TSV110Wr_8cyc_1F], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[TSV110Wr_8cyc_1F, WriteAdr], (instregex 
"^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -} // SchedModel = TSV110Model +//==- AArch64SchedTSV110.td - Huawei TSV110 Scheduling Definitions -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Huawei TSV110 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details. + +// Huawei TSV110 scheduling machine model. +def TSV110Model : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched per cycle. + let MicroOpBufferSize = 128; // 128 micro-op re-order buffer + let LoopMicroOpBufferSize = 16; + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); +} + +// Define each kind of processor resource and number available on the TSV110, +// which has 8 pipelines, each with its own queue where micro-ops wait for +// their operands and issue out-of-order to one of eight execution pipelines. +let SchedModel = TSV110Model in { + def TSV110UnitALU : ProcResource<1>; // Int ALU + def TSV110UnitAB : ProcResource<2>; // Int ALU/BRU + def TSV110UnitMDU : ProcResource<1>; // Multi-Cycle + def TSV110UnitFSU1 : ProcResource<1>; // FP/ASIMD + def TSV110UnitFSU2 : ProcResource<1>; // FP/ASIMD + def TSV110UnitLdSt : ProcResource<2>; // Load/Store + + def TSV110UnitF : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2]>; + def TSV110UnitALUAB : ProcResGroup<[TSV110UnitALU, TSV110UnitAB]>; + def TSV110UnitFLdSt : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2, TSV110UnitLdSt]>; +} + +let SchedModel = TSV110Model in { + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// TSV110 + +// Integer ALU +def : WriteRes<WriteImm, [TSV110UnitALUAB]> { let Latency = 1; } +def : WriteRes<WriteI, [TSV110UnitALUAB]> { let Latency = 1; } +def : WriteRes<WriteISReg, [TSV110UnitMDU]> { let Latency = 2; } +def : WriteRes<WriteIEReg, [TSV110UnitMDU]> { let Latency = 2; } +def : WriteRes<WriteExtr, [TSV110UnitALUAB]> { let Latency = 1; } +def : WriteRes<WriteIS, [TSV110UnitALUAB]> { let Latency = 1; } + +// Integer Mul/MAC/Div +def : WriteRes<WriteID32, [TSV110UnitMDU]> { let Latency = 12; + let ResourceCycles = [12]; } +def : WriteRes<WriteID64, [TSV110UnitMDU]> { let Latency = 20; + let ResourceCycles = [20]; } +def : WriteRes<WriteIM32, [TSV110UnitMDU]> { let Latency = 3; } +def : WriteRes<WriteIM64, [TSV110UnitMDU]> { let Latency = 4; } + +// Load +def : WriteRes<WriteLD, [TSV110UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteLDIdx, [TSV110UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +// Pre/Post Indexing +def : WriteRes<WriteAdr, [TSV110UnitALUAB]> { let Latency = 1; } + +// Store +def : WriteRes<WriteST, [TSV110UnitLdSt]> { let Latency = 1; } +def 
: WriteRes<WriteSTP, [TSV110UnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTIdx, [TSV110UnitLdSt]> { let Latency = 1; } + +// FP +def : WriteRes<WriteF, [TSV110UnitF]> { let Latency = 2; } +def : WriteRes<WriteFCmp, [TSV110UnitF]> { let Latency = 3; } +def : WriteRes<WriteFCvt, [TSV110UnitF]> { let Latency = 3; } +def : WriteRes<WriteFCopy, [TSV110UnitF]> { let Latency = 2; } +def : WriteRes<WriteFImm, [TSV110UnitF]> { let Latency = 2; } +def : WriteRes<WriteFMul, [TSV110UnitF]> { let Latency = 5; } + +// FP Div, Sqrt +def : WriteRes<WriteFDiv, [TSV110UnitFSU1]> { let Latency = 18; } + +def : WriteRes<WriteV, [TSV110UnitF]> { let Latency = 4; } +def : WriteRes<WriteVLD, [TSV110UnitFLdSt]> { let Latency = 5; } +def : WriteRes<WriteVST, [TSV110UnitF]> { let Latency = 1; } + +// Branch +def : WriteRes<WriteBr, [TSV110UnitAB]> { let Latency = 1; } +def : WriteRes<WriteBrReg, [TSV110UnitAB]> { let Latency = 1; } +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Forwarding logic is modeled only for multiply and accumulate. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +def : InstRW<[WriteI], (instrs COPY)>; + +// Detailed Refinements +//===----------------------------------------------------------------------===// + +// Contains all of the TSV110 specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming conventions is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: TSV110Wr +// Latency: #cyc +// MicroOp Count/Types: #(ALU|AB|MDU|FSU1|FSU2|LdSt|ALUAB|F|FLdSt) +// +// e.g. TSV110Wr_6cyc_1ALU_6MDU_4LdSt means the total latency is 6 and there are +// 1 micro-ops to be issued down one ALU pipe, six MDU pipes and four LdSt pipes. 
+// + +//===----------------------------------------------------------------------===// +// Define Generic 1 micro-op types + +def TSV110Wr_1cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 1; } +def TSV110Wr_1cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 1; } +def TSV110Wr_1cyc_1ALUAB : SchedWriteRes<[TSV110UnitALUAB]> { let Latency = 1; } +def TSV110Wr_1cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 1; } + +def TSV110Wr_2cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 2; } +def TSV110Wr_2cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 2; } +def TSV110Wr_2cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 2; } +def TSV110Wr_2cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 2; } +def TSV110Wr_2cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 2; } +def TSV110Wr_2cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 2; } + +def TSV110Wr_3cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 3; } +def TSV110Wr_3cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 3; } +def TSV110Wr_3cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 3; } + +def TSV110Wr_4cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 4; } +def TSV110Wr_4cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 4; } +def TSV110Wr_4cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 4; } +def TSV110Wr_4cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 4; } + +def TSV110Wr_5cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 5; } +def TSV110Wr_5cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 5; } +def TSV110Wr_5cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 5; } +def TSV110Wr_5cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 5; } + +def TSV110Wr_6cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 6; } + +def TSV110Wr_7cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 7; } + +def TSV110Wr_8cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 8; } + +def TSV110Wr_11cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 11; } + +def TSV110Wr_12cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 12; } + +def TSV110Wr_17cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 17; } + +def TSV110Wr_18cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 18; } + +def TSV110Wr_20cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 20; } + +def TSV110Wr_24cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 24; } + +def TSV110Wr_31cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 31; } + +def TSV110Wr_36cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 36; } + +def TSV110Wr_38cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 38; } + +def TSV110Wr_64cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 64; } + +//===----------------------------------------------------------------------===// +// Define Generic 2 micro-op types + +def TSV110Wr_1cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitLdSt]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_2F : SchedWriteRes<[TSV110UnitF, + TSV110UnitF]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU2]> 
{ + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_4cyc_2F : SchedWriteRes<[TSV110UnitF, + TSV110UnitF]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def TSV110Wr_4cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU2]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def TSV110Wr_4cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def TSV110Wr_5cyc_1ALU_1F : SchedWriteRes<[TSV110UnitALU, + TSV110UnitF]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def TSV110Wr_6cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitLdSt]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def TSV110Wr_6cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def TSV110Wr_7cyc_1F_1LdSt : SchedWriteRes<[TSV110UnitF, + TSV110UnitLdSt]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def TSV110Wr_8cyc_2FSU1 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU1]> { + let Latency = 8; + let NumMicroOps = 2; +} + + +def TSV110Wr_8cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU2]> { + let Latency = 8; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def TSV110Wr_6cyc_3F : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def TSV110Wr_6cyc_3LdSt : SchedWriteRes<[TSV110UnitLdSt, TSV110UnitLdSt, + TSV110UnitLdSt]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def TSV110Wr_7cyc_2F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitLdSt]> { + let Latency = 7; + let NumMicroOps = 3; +} + +//===----------------------------------------------------------------------===// +// Define Generic 4 micro-op types + +def TSV110Wr_8cyc_4F : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF, TSV110UnitF]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def TSV110Wr_8cyc_3F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF, TSV110UnitLdSt]> { + let Latency = 8; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define Generic 5 micro-op types + +def TSV110Wr_8cyc_3F_2LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, TSV110UnitF, + TSV110UnitLdSt, TSV110UnitLdSt]> { + let Latency = 8; + let NumMicroOps = 5; +} + +//===----------------------------------------------------------------------===// +// Define Generic 8 micro-op types + +def TSV110Wr_10cyc_4F_4LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF, TSV110UnitF, + TSV110UnitLdSt, TSV110UnitLdSt, + TSV110UnitLdSt, TSV110UnitLdSt]> { + let Latency = 10; + let NumMicroOps = 8; +} + + +// Branch Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1AB], (instrs B)>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BL)>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BLR)>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ))$")>; + + +// Cryptography Extensions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AES[DE]")>; +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AESI?MC")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA1SU1")>; +def : InstRW<[TSV110Wr_2cyc_2F], (instregex "^SHA1(H|SU0)")>; +def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA1[CMP]")>; +def : 
InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA256SU0")>; +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^SHA256SU1")>; +def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA256(H|H2)")>; +def TSV110ReadCRC: SchedReadAdvance<1, [TSV110Wr_2cyc_1MDU]>; +def : InstRW<[TSV110Wr_2cyc_1MDU, TSV110ReadCRC], (instregex "^CRC32.*$")>; + + +// Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(BIC|EON|ORN)[WX]rr")>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(BIC)S[WX]rr")>; + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(ADD|AND|EOR|ORR|SUB)[WX]r(r|i)")>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(ADD|AND|EOR|ORR|SUB)S[WX]r(r|i)")>; + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(ADC|SBC|BIC)[WX]r$")>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(ADC|SBC)S[WX]r$")>; + +def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)[WX]rs$")>; +def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)S[WX]rs$")>; +def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(ADD|SUB)[WX]r(s|x|x64)$")>; +def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(ADD|SUB)S[WX]r(s|x|x64)$")>; + +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; + + +// Move and Shift Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instrs ADR, ADRP)>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^MOV[NZK][WX]i")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(LSLV|LSRV|ASRV|RORV)(W|X)r")>; + + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>; + +def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>; +def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; +def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>; +def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>; + + +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^EXTR(W|X)rri$")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(S|U)?BFM(W|X)ri$")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CLS|CLZ|RBIT|REV(16|32)?)(W|X)r$")>; + + +// Load Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(W|X)l$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs LDRSWl)>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], 
(instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDNP(W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDP(W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt_1ALUAB, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpost)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpre)>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFMl)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFUMi)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMui$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMro(W|X)$")>; + + +// Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STN?P(W|X)i$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR(BB|HH|W|X)i$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR(BB|HH|W|X)ui$")>; + +def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; + + +// FP Data Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "F(ABS|NEG)(D|S)r")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCCMP(E)?(S|D)rr$")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCSEL(S|D)rrr$")>; + +def : InstRW<[TSV110Wr_11cyc_1FSU1], (instrs FDIVSrr)>; +def : InstRW<[TSV110Wr_18cyc_1FSU1], (instrs FDIVDrr)>; +def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTSr)>; +def : InstRW<[TSV110Wr_31cyc_1FSU2], (instrs FSQRTDr)>; + +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN).+rr")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^FN?M(ADD|SUB)Hrrr")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FN?M(ADD|SUB)Srrr")>; +def : InstRW<[TSV110Wr_7cyc_1F], (instregex "^FN?M(ADD|SUB)Drrr")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Hrr")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|SUB)Srr")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Drr")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(N)?MULHrr$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULSrr$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULDrr$")>; + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT.+r")>; + + +// FP Miscellaneous Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_5cyc_1ALU_1F], (instregex "^[SU]CVTF[SU][WX][SD]ri")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT[HSD][HSD]r")>; + +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^FMOV(DX|WS|XD|SW|DXHigh|XDHigh)r$")>; +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOV[SD][ir]$")>; + + +// FP Load Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[DSQ]l")>; +def : 
InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDUR[BDHSQ]i")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[BDHSQ]ui")>; +def : InstRW<[TSV110Wr_6cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi], (instregex "^LDN?P[DQS]i")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi, WriteAdr], (instregex "^LDP[DQS](post|pre)")>; + + +// FP Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR[BHSDQ]i")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ](post|pre)")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR[BHSDQ]ui")>; +def : InstRW<[TSV110Wr_2cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ]ro[WX]")>; +def : InstRW<[TSV110Wr_2cyc_2LdSt], (instregex "^STN?P[SDQ]i")>; +def : InstRW<[TSV110Wr_2cyc_2LdSt, WriteAdr], (instregex "^STP[SDQ](post|pre)")>; + + +// ASIMD Integer Instructions +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v8i8, v4i16, v2i32 +// Q form - v16i8, v8i16, v4i32 +// D form - v1i8, v1i16, v1i32, v1i64 +// Q form - v16i8, v8i16, v4i32, v2i64 +// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 +// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 + +// ASIMD simple arithmetic +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(ABS|ADD(P)?|NEG|SUB)v")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](ADD(L|LP|W)|SUB(L|W))v")>; + +// ASIMD complex arithmetic +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]H(ADD|SUB)v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^R?(ADD|SUB)HN2?v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]Q(ADD|SUB)v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^(SU|US)QADDv")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]RHADDv")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABAL?v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABDL?v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ADALPv")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^((SQ)(ABS|NEG))v")>; + +// ASIMD compare +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT|TST)v")>; + +// ASIMD max/min +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)P?v")>; + +// ASIMD logical +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(AND|BIC|BIF|BIT|BSL|EOR|MVN|NOT|ORN|ORR)v")>; + +// ASIMD multiply accumulate, D-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[TSV110Wr_8cyc_2FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v16i8|v8i16|v4i32)")>; + +// ASIMD multiply accumulate long +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD shift +// ASIMD shift accumulate +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(S|SR|U|UR)SRA")>; +// ASIMD shift by immed, basic +def : InstRW<[TSV110Wr_4cyc_1FSU1], + (instregex "SHLv","SLIv","SRIv","SHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : 
InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD reduction +// ASIMD arith, reduce, 4H/4S +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B/8H +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?Vv16i8v$")>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; + + +// Vector - Floating Point +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v2f32 +// Q form - v4f32, v2f64 +// D form - 32, 64 +// D form - v1i32, v1i64 +// D form - v2i32 +// Q form - v4i32, v2i64 + +// ASIMD FP sign manipulation +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FABSv")>; +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FNEGv")>; + +// ASIMD FP compare +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v")>; + +// ASIMD FP convert +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FCVT[AMNPZ][SU]v")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT(L)v")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FCVT(N|XN)v")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[TSV110Wr_11cyc_1FSU1], (instregex "FDIVv2f32")>; +// ASIMD FP divide, Q-form, F32 +def : InstRW<[TSV110Wr_24cyc_1FSU1], (instregex "FDIVv4f32")>; +// ASIMD FP divide, Q-form, F64 +def : InstRW<[TSV110Wr_38cyc_1FSU1], (instregex "FDIVv2f64")>; + +// ASIMD FP SQRT +def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTv2f32)>; +def : InstRW<[TSV110Wr_36cyc_1FSU2], (instrs FSQRTv4f32)>; +def : InstRW<[TSV110Wr_64cyc_1FSU2], (instrs FSQRTv2f64)>; + +// ASIMD FP max,min +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?v")>; +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?Pv")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(MAX|MIN)(NM)?Vv")>; + +// ASIMD FP add +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|ADDP|SUB)v")>; + +// ASIMD FP multiply +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FMULX?v")>; + + +// ASIMD Miscellaneous Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(CLS|CLZ|CNT)v")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(DUP|INS)v.+lane")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^REV(16|32|64)v")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(UZP|ZIP)[12]v")>; + +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^EXTv")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^XTNv")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex 
"^RBITv")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^(INS|DUP)v.+gpr")>; + +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^[SU]MOVv")>; + +// ASIMD table lookup, D-form +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v8i8Four")>; +// ASIMD table lookup, Q-form +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v16i8Four")>; + +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOVv")>; + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT[AIMNPXZ]v")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[SU]CVTFv")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[FU](RECP|RSQRT)(E|X)v")>; + + +// ASIMD Load Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; + +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_3LdSt], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_6cyc_3LdSt, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_2LdSt], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_6cyc_2LdSt, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : 
InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_10cyc_4F_4LdSt], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_10cyc_4F_4LdSt, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + + +// ASIMD Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_1F], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_1F], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_8cyc_1F], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_1F, WriteAdr], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +} // SchedModel = TSV110Model diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index a5bc3668ed..38ab512c56 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -82,8 +82,8 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl, unsigned OffsetScaled = 0; while (OffsetScaled < ObjSizeScaled) { if (ObjSizeScaled - OffsetScaled >= 2) { - SDValue AddrNode = - DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(OffsetScaled * 16), dl); + SDValue AddrNode = + DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(OffsetScaled * 16), dl); SDValue St = DAG.getMemIntrinsicNode( OpCode2, dl, DAG.getVTList(MVT::Other), {Chain, TagSrc, AddrNode}, @@ -95,8 +95,8 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl, } if (ObjSizeScaled - OffsetScaled > 0) { - SDValue AddrNode = - DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(OffsetScaled * 16), dl); + SDValue AddrNode = + 
DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(OffsetScaled * 16), dl); SDValue St = DAG.getMemIntrinsicNode( OpCode1, dl, DAG.getVTList(MVT::Other), {Chain, TagSrc, AddrNode}, diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTagging.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTagging.cpp index ab49e0c3f9..93dfda439d 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTagging.cpp @@ -59,7 +59,7 @@ using namespace llvm; -#define DEBUG_TYPE "aarch64-stack-tagging" +#define DEBUG_TYPE "aarch64-stack-tagging" static cl::opt<bool> ClMergeInit( "stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore, @@ -73,10 +73,10 @@ static cl::opt<bool> static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit", cl::init(40), cl::Hidden); -static cl::opt<unsigned> - ClMergeInitSizeLimit("stack-tagging-merge-init-size-limit", cl::init(272), - cl::Hidden); - +static cl::opt<unsigned> + ClMergeInitSizeLimit("stack-tagging-merge-init-size-limit", cl::init(272), + cl::Hidden); + static const Align kTagGranuleSize = Align(16); namespace { @@ -107,10 +107,10 @@ public: SetTagZeroFn(SetTagZeroFn), StgpFn(StgpFn) {} bool addRange(uint64_t Start, uint64_t End, Instruction *Inst) { - auto I = - llvm::lower_bound(Ranges, Start, [](const Range &LHS, uint64_t RHS) { - return LHS.End <= RHS; - }); + auto I = + llvm::lower_bound(Ranges, Start, [](const Range &LHS, uint64_t RHS) { + return LHS.End <= RHS; + }); if (I != Ranges.end() && End > I->Start) { // Overlap - bail. return false; @@ -439,8 +439,8 @@ void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, bool LittleEndian = Triple(AI->getModule()->getTargetTriple()).isLittleEndian(); // Current implementation of initializer merging assumes little endianness. 
- if (MergeInit && !F->hasOptNone() && LittleEndian && - Size < ClMergeInitSizeLimit) { + if (MergeInit && !F->hasOptNone() && LittleEndian && + Size < ClMergeInitSizeLimit) { LLVM_DEBUG(dbgs() << "collecting initializers for " << *AI << ", size = " << Size << "\n"); InsertBefore = collectInitializers(InsertBefore, Ptr, Size, IB); @@ -571,7 +571,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { auto *II = dyn_cast<IntrinsicInst>(I); if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end)) { - AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); + AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); if (!AI) { UnrecognizedLifetimes.push_back(I); continue; @@ -659,7 +659,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { IntrinsicInst *Start = Info.LifetimeStart[0]; IntrinsicInst *End = Info.LifetimeEnd[0]; uint64_t Size = - cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); + cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue(); Size = alignTo(Size, kTagGranuleSize); tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size); // We need to ensure that if we tag some object, we certainly untag it diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp index 41096a9613..4e64b6116e 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -49,12 +49,12 @@ cl::opt<UncheckedLdStMode> ClUncheckedLdSt( "apply unchecked-ld-st when the target is definitely within range"), clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st"))); -static cl::opt<bool> - ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true), - cl::ZeroOrMore, - cl::desc("Apply first slot optimization for stack tagging " - "(eliminate ADDG Rt, Rn, 0, 0).")); - +static cl::opt<bool> + ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true), + cl::ZeroOrMore, + cl::desc("Apply first slot optimization for stack tagging " + "(eliminate ADDG Rt, Rn, 0, 0).")); + namespace { class AArch64StackTaggingPreRA : public MachineFunctionPass { @@ -76,7 +76,7 @@ public: bool mayUseUncheckedLoadStore(); void uncheckUsesOf(unsigned TaggedReg, int FI); void uncheckLoadsAndStores(); - Optional<int> findFirstSlotCandidate(); + Optional<int> findFirstSlotCandidate(); bool runOnMachineFunction(MachineFunction &Func) override; StringRef getPassName() const override { @@ -203,141 +203,141 @@ void AArch64StackTaggingPreRA::uncheckLoadsAndStores() { } } -struct SlotWithTag { - int FI; - int Tag; - SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {} - explicit SlotWithTag(const MachineInstr &MI) - : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {} - bool operator==(const SlotWithTag &Other) const { - return FI == Other.FI && Tag == Other.Tag; - } -}; - -namespace llvm { -template <> struct DenseMapInfo<SlotWithTag> { - static inline SlotWithTag getEmptyKey() { return {-2, -2}; } - static inline SlotWithTag getTombstoneKey() { return {-3, -3}; } - static unsigned getHashValue(const SlotWithTag &V) { - return hash_combine(DenseMapInfo<int>::getHashValue(V.FI), - DenseMapInfo<int>::getHashValue(V.Tag)); - } - static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) { - return A == B; - } -}; -} // namespace llvm - -static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) { - return 
MFI->getUseLocalStackAllocationBlock() && - MFI->isObjectPreAllocated(FI); -} - -// Pin one of the tagged slots to offset 0 from the tagged base pointer. -// This would make its address available in a virtual register (IRG's def), as -// opposed to requiring an ADDG instruction to materialize. This effectively -// eliminates a vreg (by replacing it with direct uses of IRG, which is usually -// live almost everywhere anyway), and therefore needs to happen before -// regalloc. -Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() { - // Find the best (FI, Tag) pair to pin to offset 0. - // Looking at the possible uses of a tagged address, the advantage of pinning - // is: - // - COPY to physical register. - // Does not matter, this would trade a MOV instruction for an ADDG. - // - ST*G matter, but those mostly appear near the function prologue where all - // the tagged addresses need to be materialized anyway; also, counting ST*G - // uses would overweight large allocas that require more than one ST*G - // instruction. - // - Load/Store instructions in the address operand do not require a tagged - // pointer, so they also do not benefit. These operands have already been - // eliminated (see uncheckLoadsAndStores) so all remaining load/store - // instructions count. - // - Any other instruction may benefit from being pinned to offset 0. - LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n"); - if (!ClFirstSlot) - return None; - - DenseMap<SlotWithTag, int> RetagScore; - SlotWithTag MaxScoreST{-1, -1}; - int MaxScore = -1; - for (auto *I : ReTags) { - SlotWithTag ST{*I}; - if (isSlotPreAllocated(MFI, ST.FI)) - continue; - - Register RetagReg = I->getOperand(0).getReg(); - if (!Register::isVirtualRegister(RetagReg)) - continue; - - int Score = 0; - SmallVector<Register, 8> WorkList; - WorkList.push_back(RetagReg); - - while (!WorkList.empty()) { - Register UseReg = WorkList.back(); - WorkList.pop_back(); - for (auto &UseI : MRI->use_instructions(UseReg)) { - unsigned Opcode = UseI.getOpcode(); - if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset || - Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset || - Opcode == AArch64::STGPi || Opcode == AArch64::STGloop || - Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback || - Opcode == AArch64::STZGloop_wback) - continue; - if (UseI.isCopy()) { - Register DstReg = UseI.getOperand(0).getReg(); - if (Register::isVirtualRegister(DstReg)) - WorkList.push_back(DstReg); - continue; - } - LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %" - << Register::virtReg2Index(UseReg) << " in " << UseI - << "\n"); - Score++; - } - } - - int TotalScore = RetagScore[ST] += Score; - if (TotalScore > MaxScore || - (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) { - MaxScore = TotalScore; - MaxScoreST = ST; - } - } - - if (MaxScoreST.FI < 0) - return None; - - // If FI's tag is already 0, we are done. - if (MaxScoreST.Tag == 0) - return MaxScoreST.FI; - - // Otherwise, find a random victim pair (FI, Tag) where Tag == 0. - SlotWithTag SwapST{-1, -1}; - for (auto *I : ReTags) { - SlotWithTag ST{*I}; - if (ST.Tag == 0) { - SwapST = ST; - break; - } - } - - // Swap tags between the victim and the highest scoring pair. - // If SwapWith is still (-1, -1), that's fine, too - we'll simply take tag for - // the highest score slot without changing anything else. 
- for (auto *&I : ReTags) { - SlotWithTag ST{*I}; - MachineOperand &TagOp = I->getOperand(4); - if (ST == MaxScoreST) { - TagOp.setImm(0); - } else if (ST == SwapST) { - TagOp.setImm(MaxScoreST.Tag); - } - } - return MaxScoreST.FI; -} - +struct SlotWithTag { + int FI; + int Tag; + SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {} + explicit SlotWithTag(const MachineInstr &MI) + : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {} + bool operator==(const SlotWithTag &Other) const { + return FI == Other.FI && Tag == Other.Tag; + } +}; + +namespace llvm { +template <> struct DenseMapInfo<SlotWithTag> { + static inline SlotWithTag getEmptyKey() { return {-2, -2}; } + static inline SlotWithTag getTombstoneKey() { return {-3, -3}; } + static unsigned getHashValue(const SlotWithTag &V) { + return hash_combine(DenseMapInfo<int>::getHashValue(V.FI), + DenseMapInfo<int>::getHashValue(V.Tag)); + } + static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) { + return A == B; + } +}; +} // namespace llvm + +static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) { + return MFI->getUseLocalStackAllocationBlock() && + MFI->isObjectPreAllocated(FI); +} + +// Pin one of the tagged slots to offset 0 from the tagged base pointer. +// This would make its address available in a virtual register (IRG's def), as +// opposed to requiring an ADDG instruction to materialize. This effectively +// eliminates a vreg (by replacing it with direct uses of IRG, which is usually +// live almost everywhere anyway), and therefore needs to happen before +// regalloc. +Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() { + // Find the best (FI, Tag) pair to pin to offset 0. + // Looking at the possible uses of a tagged address, the advantage of pinning + // is: + // - COPY to physical register. + // Does not matter, this would trade a MOV instruction for an ADDG. + // - ST*G matter, but those mostly appear near the function prologue where all + // the tagged addresses need to be materialized anyway; also, counting ST*G + // uses would overweight large allocas that require more than one ST*G + // instruction. + // - Load/Store instructions in the address operand do not require a tagged + // pointer, so they also do not benefit. These operands have already been + // eliminated (see uncheckLoadsAndStores) so all remaining load/store + // instructions count. + // - Any other instruction may benefit from being pinned to offset 0. 
+ LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n"); + if (!ClFirstSlot) + return None; + + DenseMap<SlotWithTag, int> RetagScore; + SlotWithTag MaxScoreST{-1, -1}; + int MaxScore = -1; + for (auto *I : ReTags) { + SlotWithTag ST{*I}; + if (isSlotPreAllocated(MFI, ST.FI)) + continue; + + Register RetagReg = I->getOperand(0).getReg(); + if (!Register::isVirtualRegister(RetagReg)) + continue; + + int Score = 0; + SmallVector<Register, 8> WorkList; + WorkList.push_back(RetagReg); + + while (!WorkList.empty()) { + Register UseReg = WorkList.back(); + WorkList.pop_back(); + for (auto &UseI : MRI->use_instructions(UseReg)) { + unsigned Opcode = UseI.getOpcode(); + if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset || + Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset || + Opcode == AArch64::STGPi || Opcode == AArch64::STGloop || + Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback || + Opcode == AArch64::STZGloop_wback) + continue; + if (UseI.isCopy()) { + Register DstReg = UseI.getOperand(0).getReg(); + if (Register::isVirtualRegister(DstReg)) + WorkList.push_back(DstReg); + continue; + } + LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %" + << Register::virtReg2Index(UseReg) << " in " << UseI + << "\n"); + Score++; + } + } + + int TotalScore = RetagScore[ST] += Score; + if (TotalScore > MaxScore || + (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) { + MaxScore = TotalScore; + MaxScoreST = ST; + } + } + + if (MaxScoreST.FI < 0) + return None; + + // If FI's tag is already 0, we are done. + if (MaxScoreST.Tag == 0) + return MaxScoreST.FI; + + // Otherwise, find a random victim pair (FI, Tag) where Tag == 0. + SlotWithTag SwapST{-1, -1}; + for (auto *I : ReTags) { + SlotWithTag ST{*I}; + if (ST.Tag == 0) { + SwapST = ST; + break; + } + } + + // Swap tags between the victim and the highest scoring pair. + // If SwapWith is still (-1, -1), that's fine, too - we'll simply take tag for + // the highest score slot without changing anything else. + for (auto *&I : ReTags) { + SlotWithTag ST{*I}; + MachineOperand &TagOp = I->getOperand(4); + if (ST == MaxScoreST) { + TagOp.setImm(0); + } else if (ST == SwapST) { + TagOp.setImm(MaxScoreST.Tag); + } + } + return MaxScoreST.FI; +} + bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) { MF = &Func; MRI = &MF->getRegInfo(); @@ -366,35 +366,35 @@ bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) { } } - // Take over from SSP. It does nothing for tagged slots, and should not really - // have been enabled in the first place. - for (int FI : TaggedSlots) - MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None); - + // Take over from SSP. It does nothing for tagged slots, and should not really + // have been enabled in the first place. + for (int FI : TaggedSlots) + MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None); + if (ReTags.empty()) return false; if (mayUseUncheckedLoadStore()) uncheckLoadsAndStores(); - // Find a slot that is used with zero tag offset, like ADDG #fi, 0. - // If the base tagged pointer is set up to the address of this slot, - // the ADDG instruction can be eliminated. 
- Optional<int> BaseSlot = findFirstSlotCandidate(); - if (BaseSlot) - AFI->setTaggedBasePointerIndex(*BaseSlot); - - for (auto *I : ReTags) { - int FI = I->getOperand(1).getIndex(); - int Tag = I->getOperand(4).getImm(); - Register Base = I->getOperand(3).getReg(); - if (Tag == 0 && FI == BaseSlot) { - BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY), - I->getOperand(0).getReg()) - .addReg(Base); - I->eraseFromParent(); - } - } - + // Find a slot that is used with zero tag offset, like ADDG #fi, 0. + // If the base tagged pointer is set up to the address of this slot, + // the ADDG instruction can be eliminated. + Optional<int> BaseSlot = findFirstSlotCandidate(); + if (BaseSlot) + AFI->setTaggedBasePointerIndex(*BaseSlot); + + for (auto *I : ReTags) { + int FI = I->getOperand(1).getIndex(); + int Tag = I->getOperand(4).getImm(); + Register Base = I->getOperand(3).getReg(); + if (Tag == 0 && FI == BaseSlot) { + BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY), + I->getOperand(0).getReg()) + .addReg(Base); + I->eraseFromParent(); + } + } + return true; } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.cpp index 71b2bb1964..f78643d8e7 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.cpp @@ -67,7 +67,7 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, if (CPUString.empty()) CPUString = "generic"; - ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS); + ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS); initializeProperties(); return *this; @@ -103,26 +103,26 @@ void AArch64Subtarget::initializeProperties() { case CortexA76: case CortexA77: case CortexA78: - case CortexA78C: - case CortexR82: + case CortexA78C: + case CortexR82: case CortexX1: PrefFunctionLogAlignment = 4; break; case A64FX: CacheLineSize = 256; - PrefFunctionLogAlignment = 3; - PrefLoopLogAlignment = 2; - MaxInterleaveFactor = 4; - PrefetchDistance = 128; - MinPrefetchStride = 1024; - MaxPrefetchIterationsAhead = 4; + PrefFunctionLogAlignment = 3; + PrefLoopLogAlignment = 2; + MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 4; break; case AppleA7: case AppleA10: case AppleA11: case AppleA12: case AppleA13: - case AppleA14: + case AppleA14: CacheLineSize = 64; PrefetchDistance = 280; MinPrefetchStride = 2048; @@ -157,8 +157,8 @@ void AArch64Subtarget::initializeProperties() { PrefFunctionLogAlignment = 3; break; case NeoverseN1: - case NeoverseN2: - case NeoverseV1: + case NeoverseN2: + case NeoverseV1: PrefFunctionLogAlignment = 4; break; case Saphira: @@ -209,7 +209,7 @@ void AArch64Subtarget::initializeProperties() { AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), + : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()), CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()), IsLittle(LittleEndian), @@ -375,8 +375,8 @@ unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const { return (SVEVectorBitsMin / 128) * 128; return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; } - -bool AArch64Subtarget::useSVEForFixedLengthVectors() const { - // Prefer NEON unless larger SVE registers are available. 
- return hasSVE() && getMinSVEVectorSizeInBits() >= 256; -} + +bool AArch64Subtarget::useSVEForFixedLengthVectors() const { + // Prefer NEON unless larger SVE registers are available. + return hasSVE() && getMinSVEVectorSizeInBits() >= 256; +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.h index 8fe2f12598..ce401f4986 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64Subtarget.h @@ -45,7 +45,7 @@ public: AppleA11, AppleA12, AppleA13, - AppleA14, + AppleA14, Carmel, CortexA35, CortexA53, @@ -58,24 +58,24 @@ public: CortexA76, CortexA77, CortexA78, - CortexA78C, - CortexR82, + CortexA78C, + CortexR82, CortexX1, ExynosM3, Falkor, Kryo, NeoverseE1, NeoverseN1, - NeoverseN2, - NeoverseV1, + NeoverseN2, + NeoverseV1, Saphira, ThunderX2T99, ThunderX, ThunderXT81, ThunderXT83, ThunderXT88, - ThunderX3T110, - TSV110 + ThunderX3T110, + TSV110 }; protected: @@ -88,11 +88,11 @@ protected: bool HasV8_4aOps = false; bool HasV8_5aOps = false; bool HasV8_6aOps = false; - bool HasV8_7aOps = false; - - bool HasV8_0rOps = false; - bool HasCONTEXTIDREL2 = false; + bool HasV8_7aOps = false; + bool HasV8_0rOps = false; + bool HasCONTEXTIDREL2 = false; + bool HasFPARMv8 = false; bool HasNEON = false; bool HasCrypto = false; @@ -127,7 +127,7 @@ protected: bool HasAES = false; // ARMv8.3 extensions - bool HasPAuth = false; + bool HasPAuth = false; bool HasJS = false; bool HasCCIDX = false; bool HasComplxNum = false; @@ -141,7 +141,7 @@ protected: bool HasSEL2 = false; bool HasPMU = false; bool HasTLB_RMI = false; - bool HasFlagM = false; + bool HasFlagM = false; bool HasRCPC_IMMO = false; bool HasLSLFast = false; @@ -170,12 +170,12 @@ protected: bool HasFineGrainedTraps = false; bool HasEnhancedCounterVirtualization = false; - // Armv8.7-A Extensions - bool HasXS = false; - bool HasWFxT = false; - bool HasHCX = false; - bool HasLS64 = false; - + // Armv8.7-A Extensions + bool HasXS = false; + bool HasWFxT = false; + bool HasHCX = false; + bool HasLS64 = false; + // Arm SVE2 extensions bool HasSVE2 = false; bool HasSVE2AES = false; @@ -186,9 +186,9 @@ protected: // Future architecture extensions. bool HasETE = false; bool HasTRBE = false; - bool HasBRBE = false; - bool HasPAUTH = false; - bool HasSPE_EEF = false; + bool HasBRBE = false; + bool HasPAUTH = false; + bool HasSPE_EEF = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; @@ -208,7 +208,7 @@ protected: // Enable 64-bit vectorization in SLP. 
unsigned MinVectorRegisterBitWidth = 64; - bool OutlineAtomics = false; + bool OutlineAtomics = false; bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -221,7 +221,7 @@ protected: bool UseAlternateSExtLoadCVTF32Pattern = false; bool HasArithmeticBccFusion = false; bool HasArithmeticCbzFusion = false; - bool HasCmpBccFusion = false; + bool HasCmpBccFusion = false; bool HasFuseAddress = false; bool HasFuseAES = false; bool HasFuseArithmeticLogic = false; @@ -325,7 +325,7 @@ public: bool hasV8_3aOps() const { return HasV8_3aOps; } bool hasV8_4aOps() const { return HasV8_4aOps; } bool hasV8_5aOps() const { return HasV8_5aOps; } - bool hasV8_0rOps() const { return HasV8_0rOps; } + bool hasV8_0rOps() const { return HasV8_0rOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } @@ -363,7 +363,7 @@ public: bool hasSHA3() const { return HasSHA3; } bool hasSHA2() const { return HasSHA2; } bool hasAES() const { return HasAES; } - bool hasCONTEXTIDREL2() const { return HasCONTEXTIDREL2; } + bool hasCONTEXTIDREL2() const { return HasCONTEXTIDREL2; } bool balanceFPOps() const { return BalanceFPOps; } bool predictableSelectIsExpensive() const { return PredictableSelectIsExpensive; @@ -378,7 +378,7 @@ public: } bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } - bool hasCmpBccFusion() const { return HasCmpBccFusion; } + bool hasCmpBccFusion() const { return HasCmpBccFusion; } bool hasFuseAddress() const { return HasFuseAddress; } bool hasFuseAES() const { return HasFuseAES; } bool hasFuseArithmeticLogic() const { return HasFuseArithmeticLogic; } @@ -454,7 +454,7 @@ public: bool hasRandGen() const { return HasRandGen; } bool hasMTE() const { return HasMTE; } bool hasTME() const { return HasTME; } - bool hasPAUTH() const { return HasPAUTH; } + bool hasPAUTH() const { return HasPAUTH; } // Arm SVE2 extensions bool hasSVE2AES() const { return HasSVE2AES; } bool hasSVE2SM4() const { return HasSVE2SM4; } @@ -484,15 +484,15 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isTargetILP32() const { - return TargetTriple.isArch32Bit() || - TargetTriple.getEnvironment() == Triple::GNUILP32; - } + bool isTargetILP32() const { + return TargetTriple.isArch32Bit() || + TargetTriple.getEnvironment() == Triple::GNUILP32; + } bool useAA() const override { return UseAA; } - bool outlineAtomics() const { return OutlineAtomics; } - + bool outlineAtomics() const { return OutlineAtomics; } + bool hasVH() const { return HasVH; } bool hasPAN() const { return HasPAN; } bool hasLOR() const { return HasLOR; } @@ -501,7 +501,7 @@ public: bool hasPAN_RWV() const { return HasPAN_RWV; } bool hasCCPP() const { return HasCCPP; } - bool hasPAuth() const { return HasPAuth; } + bool hasPAuth() const { return HasPAuth; } bool hasJS() const { return HasJS; } bool hasCCIDX() const { return HasCCIDX; } bool hasComplxNum() const { return HasComplxNum; } @@ -512,14 +512,14 @@ public: bool hasTRACEV8_4() const { return HasTRACEV8_4; } bool hasAM() const { return HasAM; } bool hasAMVS() const { return HasAMVS; } - bool hasXS() const { return HasXS; } - bool hasWFxT() const { return HasWFxT; } - bool hasHCX() const { return HasHCX; } - bool hasLS64() const { return HasLS64; } + bool hasXS() const { return HasXS; } + bool hasWFxT() const { return HasWFxT; } + bool hasHCX() const { return 
HasHCX; } + bool hasLS64() const { return HasLS64; } bool hasSEL2() const { return HasSEL2; } bool hasPMU() const { return HasPMU; } bool hasTLB_RMI() const { return HasTLB_RMI; } - bool hasFlagM() const { return HasFlagM; } + bool hasFlagM() const { return HasFlagM; } bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } bool addrSinkUsingGEPs() const override { @@ -542,7 +542,7 @@ public: /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); /// ClassifyGlobalReference - Find the target operand flags that describe /// how a global value should be referenced for the current subtarget. @@ -581,7 +581,7 @@ public: // implied by the architecture. unsigned getMaxSVEVectorSizeInBits() const; unsigned getMinSVEVectorSizeInBits() const; - bool useSVEForFixedLengthVectors() const; + bool useSVEForFixedLengthVectors() const; }; } // End llvm namespace diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SystemOperands.td index 01ac52bd87..0b9c53a72f 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64SystemOperands.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64SystemOperands.td @@ -32,11 +32,11 @@ def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, AssemblerPredicate<(all_of FeaturePAN_RWV), "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; -def HasCONTEXTIDREL2 - : Predicate<"Subtarget->hasCONTEXTIDREL2()">, - AssemblerPredicate<(all_of FeatureCONTEXTIDREL2), - "Target contains CONTEXTIDR_EL2 RW operand">; - +def HasCONTEXTIDREL2 + : Predicate<"Subtarget->hasCONTEXTIDREL2()">, + AssemblerPredicate<(all_of FeatureCONTEXTIDREL2), + "Target contains CONTEXTIDR_EL2 RW operand">; + //===----------------------------------------------------------------------===// // AT (address translate) instruction options. //===----------------------------------------------------------------------===// @@ -98,21 +98,21 @@ def : DB<"ld", 0xd>; def : DB<"st", 0xe>; def : DB<"sy", 0xf>; -class DBnXS<string name, bits<4> encoding, bits<5> immValue> : SearchableTable { - let SearchableFields = ["Name", "Encoding", "ImmValue"]; - let EnumValueField = "Encoding"; - - string Name = name; - bits<4> Encoding = encoding; - bits<5> ImmValue = immValue; - code Requires = [{ {AArch64::FeatureXS} }]; -} - -def : DBnXS<"oshnxs", 0x3, 0x10>; -def : DBnXS<"nshnxs", 0x7, 0x14>; -def : DBnXS<"ishnxs", 0xb, 0x18>; -def : DBnXS<"synxs", 0xf, 0x1c>; - +class DBnXS<string name, bits<4> encoding, bits<5> immValue> : SearchableTable { + let SearchableFields = ["Name", "Encoding", "ImmValue"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding = encoding; + bits<5> ImmValue = immValue; + code Requires = [{ {AArch64::FeatureXS} }]; +} + +def : DBnXS<"oshnxs", 0x3, 0x10>; +def : DBnXS<"nshnxs", 0x7, 0x14>; +def : DBnXS<"ishnxs", 0xb, 0x18>; +def : DBnXS<"synxs", 0xf, 0x1c>; + //===----------------------------------------------------------------------===// // DC (data cache maintenance) instruction options. //===----------------------------------------------------------------------===// @@ -404,8 +404,8 @@ def : BTI<"jc", 0b11>; // TLBI (translation lookaside buffer invalidate) instruction options. 
//===----------------------------------------------------------------------===// -class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg> { +class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg> { string Name = name; bits<14> Encoding; let Encoding{13-11} = op1; @@ -413,122 +413,122 @@ class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, let Encoding{6-3} = crm; let Encoding{2-0} = op2; bit NeedsReg = needsreg; - list<string> Requires = []; - list<string> ExtraRequires = []; - code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }]; -} - -def TLBITable : GenericTable { - let FilterClass = "TLBIEntry"; - let CppTypeName = "TLBI"; - let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; -} - -def lookupTLBIByName : SearchIndex { - let Table = TLBITable; - let Key = ["Name"]; + list<string> Requires = []; + list<string> ExtraRequires = []; + code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }]; } -def lookupTLBIByEncoding : SearchIndex { - let Table = TLBITable; - let Key = ["Encoding"]; -} - -multiclass TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg = 1> { - def : TLBIEntry<name, op1, crn, crm, op2, needsreg>; - def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> { - let Encoding{7} = 1; - let ExtraRequires = ["AArch64::FeatureXS"]; - } -} - -defm : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>; -defm : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>; -defm : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>; -defm : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>; -defm : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>; -defm : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>; -defm : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>; -defm : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>; -defm : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>; -defm : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>; -defm : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>; -defm : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>; -defm : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>; -defm : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>; -defm : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>; -defm : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>; -defm : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>; -defm : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>; -defm : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>; -defm : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>; -defm : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>; -defm : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>; -defm : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>; -defm : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>; -defm : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>; -defm : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; -defm : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; -defm : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; - +def TLBITable : GenericTable { + let FilterClass = "TLBIEntry"; + let CppTypeName = "TLBI"; + let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; +} + +def lookupTLBIByName : SearchIndex { + let Table = TLBITable; + let Key = ["Name"]; +} + +def lookupTLBIByEncoding : 
SearchIndex { + let Table = TLBITable; + let Key = ["Encoding"]; +} + +multiclass TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg = 1> { + def : TLBIEntry<name, op1, crn, crm, op2, needsreg>; + def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> { + let Encoding{7} = 1; + let ExtraRequires = ["AArch64::FeatureXS"]; + } +} + +defm : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>; +defm : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>; +defm : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>; +defm : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>; +defm : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>; +defm : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>; +defm : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>; +defm : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>; +defm : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>; +defm : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>; +defm : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>; +defm : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>; +defm : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>; +defm : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>; +defm : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>; +defm : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>; +defm : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>; +defm : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>; +defm : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>; +defm : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>; +defm : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>; +defm : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>; +defm : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>; +defm : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; +defm : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; +defm : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; + // Armv8.4-A Translation Lookaside Buffer Instructions (TLBI) -let Requires = ["AArch64::FeatureTLB_RMI"] in { +let Requires = ["AArch64::FeatureTLB_RMI"] in { // Armv8.4-A Outer Sharable TLB Maintenance instructions: // op1 CRn CRm op2 -defm : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>; -defm : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>; -defm : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>; -defm : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>; -defm : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>; -defm : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>; -defm : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>; -defm : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>; -defm : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>; -defm : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>; -defm : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>; -defm : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>; -defm : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>; -defm : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>; -defm : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>; -defm : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>; +defm : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>; +defm : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>; +defm : 
TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>; +defm : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>; +defm : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>; +defm : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>; +defm : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>; +defm : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>; +defm : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>; +defm : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>; +defm : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>; +defm : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>; +defm : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>; // Armv8.4-A TLB Range Maintenance instructions: // op1 CRn CRm op2 -defm : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>; -defm : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>; -defm : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>; -defm : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>; -defm : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>; -defm : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>; -defm : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>; -defm : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>; -defm : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>; -defm : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>; -defm : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>; -defm : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>; -defm : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>; -defm : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>; -defm : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>; -defm : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>; -defm : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>; -defm : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>; -defm : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>; -defm : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>; -defm : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>; -defm : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>; -defm : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>; -defm : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>; -defm : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>; -defm : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>; -defm : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>; -defm : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>; -defm : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>; -defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>; +defm : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>; +defm : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>; +defm : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>; +defm : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>; +defm : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>; +defm : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>; +defm : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>; +defm : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>; +defm : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>; +defm : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>; +defm : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>; +defm : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>; +defm : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>; +defm : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>; +defm : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>; +defm : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>; +defm : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>; +defm : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>; +defm : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>; +defm : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>; +defm : TLBI<"RVAE2IS", 
0b100, 0b1000, 0b0010, 0b001>; +defm : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>; +defm : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>; +defm : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>; +defm : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>; +defm : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>; +defm : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>; +defm : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>; +defm : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>; +defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>; } //FeatureTLB_RMI // Armv8.5-A Prediction Restriction by Context instruction options: @@ -643,7 +643,7 @@ def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>; def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>; def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>; def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>; -def : ROSysReg<"ID_AA64ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b010>; +def : ROSysReg<"ID_AA64ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b010>; def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>; def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>; def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010>; @@ -859,9 +859,9 @@ def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>; def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>; def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>; def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>; -def : RWSysReg<"HCRX_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b010> { - let Requires = [{ {AArch64::FeatureHCX} }]; -} +def : RWSysReg<"HCRX_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b010> { + let Requires = [{ {AArch64::FeatureHCX} }]; +} def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>; def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>; def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>; @@ -1293,10 +1293,10 @@ def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>; def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>; def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>; def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>; -let Requires = [{ {AArch64::FeatureCONTEXTIDREL2} }] in { - def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; -} +let Requires = [{ {AArch64::FeatureCONTEXTIDREL2} }] in { + def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; } +} // v8.2a registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::FeaturePsUAO} }] in @@ -1336,7 +1336,7 @@ def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; // v8.3a "Pointer authentication extension" registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::FeaturePAuth} }] in { +let Requires = [{ {AArch64::FeaturePAuth} }] in { def : RWSysReg<"APIAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b000>; def : RWSysReg<"APIAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b001>; def : RWSysReg<"APIBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b010>; @@ -1570,33 +1570,33 @@ def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>; def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>; } -// v8.7a LD64B/ST64B Accelerator Extension system register -let Requires = [{ {AArch64::FeatureLS64} }] in -def : RWSysReg<"ACCDATA_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b101>; - -// Branch Record Buffer system registers -let Requires = [{ {AArch64::FeatureBRBE} }] in { -def : 
RWSysReg<"BRBCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b000>; -def : RWSysReg<"BRBCR_EL12", 0b10, 0b101, 0b1001, 0b0000, 0b000>; -def : RWSysReg<"BRBCR_EL2", 0b10, 0b100, 0b1001, 0b0000, 0b000>; -def : RWSysReg<"BRBFCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b001>; -def : ROSysReg<"BRBIDR0_EL1", 0b10, 0b001, 0b1001, 0b0010, 0b000>; -def : RWSysReg<"BRBINFINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b000>; -def : RWSysReg<"BRBSRCINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b001>; -def : RWSysReg<"BRBTGTINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b010>; -def : RWSysReg<"BRBTS_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b010>; -foreach n = 0-31 in { - defvar nb = !cast<bits<5>>(n); - def : ROSysReg<"BRBINF"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b00}>; - def : ROSysReg<"BRBSRC"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b01}>; - def : ROSysReg<"BRBTGT"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b10}>; -} -} - -// Statistical Profiling Extension system register -let Requires = [{ {AArch64::FeatureSPE_EEF} }] in -def : RWSysReg<"PMSNEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b001>; - +// v8.7a LD64B/ST64B Accelerator Extension system register +let Requires = [{ {AArch64::FeatureLS64} }] in +def : RWSysReg<"ACCDATA_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b101>; + +// Branch Record Buffer system registers +let Requires = [{ {AArch64::FeatureBRBE} }] in { +def : RWSysReg<"BRBCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b000>; +def : RWSysReg<"BRBCR_EL12", 0b10, 0b101, 0b1001, 0b0000, 0b000>; +def : RWSysReg<"BRBCR_EL2", 0b10, 0b100, 0b1001, 0b0000, 0b000>; +def : RWSysReg<"BRBFCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b001>; +def : ROSysReg<"BRBIDR0_EL1", 0b10, 0b001, 0b1001, 0b0010, 0b000>; +def : RWSysReg<"BRBINFINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b000>; +def : RWSysReg<"BRBSRCINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b001>; +def : RWSysReg<"BRBTGTINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b010>; +def : RWSysReg<"BRBTS_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b010>; +foreach n = 0-31 in { + defvar nb = !cast<bits<5>>(n); + def : ROSysReg<"BRBINF"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b00}>; + def : ROSysReg<"BRBSRC"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b01}>; + def : ROSysReg<"BRBTGT"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b10}>; +} +} + +// Statistical Profiling Extension system register +let Requires = [{ {AArch64::FeatureSPE_EEF} }] in +def : RWSysReg<"PMSNEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b001>; + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 let Requires = [{ {AArch64::ProcAppleA7} }] in diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.cpp index bec1758a93..5635b07fd6 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -148,10 +148,10 @@ static cl::opt<int> EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); -static cl::opt<bool> - EnableSVEIntrinsicOpts("aarch64-enable-sve-intrinsic-opts", cl::Hidden, - cl::desc("Enable SVE intrinsic opts"), - cl::init(true)); +static cl::opt<bool> + EnableSVEIntrinsicOpts("aarch64-enable-sve-intrinsic-opts", cl::Hidden, + cl::desc("Enable SVE intrinsic opts"), + cl::init(true)); static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); @@ -184,8 +184,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { 
initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); initializeAArch64PostLegalizerCombinerPass(*PR); - initializeAArch64PostLegalizerLoweringPass(*PR); - initializeAArch64PostSelectOptimizePass(*PR); + initializeAArch64PostLegalizerLoweringPass(*PR); + initializeAArch64PostSelectOptimizePass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); @@ -222,18 +222,18 @@ static std::string computeDataLayout(const Triple &TT, } if (TT.isOSBinFormatCOFF()) return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; - std::string Endian = LittleEndian ? "e" : "E"; - std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? "-p:32:32" : ""; - return Endian + "-m:e" + Ptr32 + - "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; -} - -static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) { - if (CPU.empty() && TT.isArm64e()) - return "apple-a12"; - return CPU; + std::string Endian = LittleEndian ? "e" : "E"; + std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? "-p:32:32" : ""; + return Endian + "-m:e" + Ptr32 + + "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; } +static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) { + if (CPU.empty() && TT.isArm64e()) + return "apple-a12"; + return CPU; +} + static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional<Reloc::Model> RM) { // AArch64 Darwin and Windows are always PIC. @@ -281,8 +281,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, bool LittleEndian) : LLVMTargetMachine(T, computeDataLayout(TT, Options.MCOptions, LittleEndian), - TT, computeDefaultCPU(TT, CPU), FS, Options, - getEffectiveRelocModel(TT, RM), + TT, computeDefaultCPU(TT, CPU), FS, Options, + getEffectiveRelocModel(TT, RM), getEffectiveAArch64CodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { initAsmInfo(); @@ -317,7 +317,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // MachO/CodeModel::Large, which GlobalISel does not support. if (getOptLevel() <= EnableGlobalISelAtO && TT.getArch() != Triple::aarch64_32 && - TT.getEnvironment() != Triple::GNUILP32 && + TT.getEnvironment() != Triple::GNUILP32 && !(getCodeModel() == CodeModel::Large && TT.isOSBinFormatMachO())) { setGlobalISel(true); setGlobalISelAbort(GlobalISelAbortMode::Disable); @@ -340,10 +340,10 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); Attribute FSAttr = F.getFnAttribute("target-features"); - std::string CPU = - CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; - std::string FS = - FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; + std::string CPU = + CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; + std::string FS = + FSAttr.isValid() ? FSAttr.getValueAsString().str() : TargetFS; auto &I = SubtargetMap[CPU + FS]; if (!I) { @@ -460,12 +460,12 @@ void AArch64PassConfig::addIRPasses() { // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. 
if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) - addPass(createCFGSimplificationPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - .sinkCommonInsts(true))); + addPass(createCFGSimplificationPass(SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); // Run LoopDataPrefetch // @@ -553,13 +553,13 @@ bool AArch64PassConfig::addInstSelector() { } bool AArch64PassConfig::addIRTranslator() { - addPass(new IRTranslator(getOptLevel())); + addPass(new IRTranslator(getOptLevel())); return false; } void AArch64PassConfig::addPreLegalizeMachineIR() { bool IsOptNone = getOptLevel() == CodeGenOpt::None; - addPass(createAArch64PreLegalizerCombiner(IsOptNone)); + addPass(createAArch64PreLegalizerCombiner(IsOptNone)); } bool AArch64PassConfig::addLegalizeMachineIR() { @@ -570,8 +570,8 @@ bool AArch64PassConfig::addLegalizeMachineIR() { void AArch64PassConfig::addPreRegBankSelect() { bool IsOptNone = getOptLevel() == CodeGenOpt::None; if (!IsOptNone) - addPass(createAArch64PostLegalizerCombiner(IsOptNone)); - addPass(createAArch64PostLegalizerLowering()); + addPass(createAArch64PostLegalizerCombiner(IsOptNone)); + addPass(createAArch64PostLegalizerLowering()); } bool AArch64PassConfig::addRegBankSelect() { @@ -585,8 +585,8 @@ void AArch64PassConfig::addPreGlobalInstructionSelect() { bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); - if (getOptLevel() != CodeGenOpt::None) - addPass(createAArch64PostSelectOptimize()); + if (getOptLevel() != CodeGenOpt::None) + addPass(createAArch64PostSelectOptimize()); return false; } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.h index 25e6261343..2420658743 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetMachine.h @@ -57,12 +57,12 @@ public: SMDiagnostic &Error, SMRange &SourceRange) const override; - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } - + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. 
+ return true; + } + private: bool isLittle; }; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7fda6b8fb6..d9f700a966 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "AArch64TargetTransformInfo.h" +#include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/LoopInfo.h" @@ -16,11 +16,11 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAArch64.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include <algorithm> using namespace llvm; -using namespace llvm::PatternMatch; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "aarch64tti" @@ -86,8 +86,8 @@ int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind, - Instruction *Inst) { + TTI::TargetCostKind CostKind, + Instruction *Inst) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -195,10 +195,10 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) return TTI::TCC_Free; break; - case Intrinsic::experimental_gc_statepoint: - if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TTI::TCC_Free; - break; + case Intrinsic::experimental_gc_statepoint: + if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; } return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); } @@ -212,43 +212,43 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned -AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind) { - auto *RetTy = ICA.getReturnType(); - switch (ICA.getID()) { - case Intrinsic::umin: - case Intrinsic::umax: { - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); - // umin(x,y) -> sub(x,usubsat(x,y)) - // umax(x,y) -> add(x,usubsat(y,x)) - if (LT.second == MVT::v2i64) - return LT.first * 2; - LLVM_FALLTHROUGH; - } - case Intrinsic::smin: - case Intrinsic::smax: { - static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, - MVT::v8i16, MVT::v2i32, MVT::v4i32}; - auto LT = TLI->getTypeLegalizationCost(DL, RetTy); - if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) - return LT.first; - break; - } - default: - break; - } - return BaseT::getIntrinsicInstrCost(ICA, CostKind); -} - +unsigned +AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { + auto *RetTy = ICA.getReturnType(); + switch (ICA.getID()) { + case Intrinsic::umin: + case Intrinsic::umax: { + auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + // umin(x,y) -> sub(x,usubsat(x,y)) + // umax(x,y) -> add(x,usubsat(y,x)) + if (LT.second == MVT::v2i64) + return LT.first * 2; + LLVM_FALLTHROUGH; + } + case Intrinsic::smin: + case Intrinsic::smax: { + static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, + MVT::v8i16, MVT::v2i32, MVT::v4i32}; 
+ auto LT = TLI->getTypeLegalizationCost(DL, RetTy); + if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) + return LT.first; + break; + } + default: + break; + } + return BaseT::getIntrinsicInstrCost(ICA, CostKind); +} + bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, ArrayRef<const Value *> Args) { // A helper that returns a vector type from the given type. The number of // elements in type Ty determine the vector width. auto toVectorTy = [&](Type *ArgTy) { - return VectorType::get(ArgTy->getScalarType(), - cast<VectorType>(DstTy)->getElementCount()); + return VectorType::get(ArgTy->getScalarType(), + cast<VectorType>(DstTy)->getElementCount()); }; // Exit early if DstTy is not a vector type whose elements are at least @@ -297,8 +297,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return false; // Get the total number of vector elements in the legalized types. - unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorMinNumElements(); - unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); + unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorMinNumElements(); + unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); // Return true if the legalized types have the same number of vector elements // and the destination element type size is twice that of the source type. @@ -306,7 +306,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, } int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -343,8 +343,8 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost( - BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); static const TypeConversionCostTblEntry ConversionTbl[] = { @@ -448,8 +448,8 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, SrcTy.getSimpleVT())) return AdjustCost(Entry->Cost); - return AdjustCost( - BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); + return AdjustCost( + BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, @@ -481,14 +481,14 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, // we may get the extension for free. If not, get the default cost for the // extend. if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) - return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, - CostKind); + return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, + CostKind); // The destination type should be larger than the element type. If not, get // the default cost for the extend. 
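The getIntrinsicInstrCost hook re-added above prices a v2i64 umin/umax as two legalized operations, because a 64-bit-element UMIN/UMAX is not available as a single NEON instruction and the comment's rewrite through a saturating unsigned subtract needs exactly two instructions. A minimal standalone sketch, assuming nothing beyond standard C++ (the helper names are invented here; this is not LLVM code), that checks the identity the comment relies on:

#include <cassert>
#include <cstdint>

// Saturating unsigned subtract: returns a - b, clamped at 0.
static uint64_t usubsat(uint64_t a, uint64_t b) { return a > b ? a - b : 0; }

// umin(x, y) -> sub(x, usubsat(x, y))
static uint64_t umin_via_usubsat(uint64_t x, uint64_t y) { return x - usubsat(x, y); }

// umax(x, y) -> add(x, usubsat(y, x))
static uint64_t umax_via_usubsat(uint64_t x, uint64_t y) { return x + usubsat(y, x); }

int main() {
  assert(umin_via_usubsat(7, 3) == 3 && umin_via_usubsat(3, 7) == 3);
  assert(umax_via_usubsat(7, 3) == 7 && umax_via_usubsat(3, 7) == 7);
  return 0;
}

Either way the min/max costs two data-processing operations per legalized part, which is why the MVT::v2i64 case above returns LT.first * 2.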
- if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) - return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, - CostKind); + if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) + return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, + CostKind); switch (Opcode) { default: @@ -507,8 +507,8 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, } // If we are unable to perform the extend for free, get the default cost. - return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, - CostKind); + return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, + CostKind); } unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode, @@ -644,19 +644,19 @@ int AArch64TTIImpl::getArithmeticInstrCost( } return Cost; - case ISD::MUL: - if (LT.second != MVT::v2i64) - return (Cost + 1) * LT.first; - // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive - // as elements are extracted from the vectors and the muls scalarized. - // As getScalarizationOverhead is a bit too pessimistic, we estimate the - // cost for a i64 vector directly here, which is: - // - four i64 extracts, - // - two i64 inserts, and - // - two muls. - // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with - // LT.first = 2 the cost is 16. - return LT.first * 8; + case ISD::MUL: + if (LT.second != MVT::v2i64) + return (Cost + 1) * LT.first; + // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive + // as elements are extracted from the vectors and the muls scalarized. + // As getScalarizationOverhead is a bit too pessimistic, we estimate the + // cost for a i64 vector directly here, which is: + // - four i64 extracts, + // - two i64 inserts, and + // - two muls. + // So, for a v2i64 with LT.First = 1 the cost is 8, and for a v4i64 with + // LT.first = 2 the cost is 16. + return LT.first * 8; case ISD::ADD: case ISD::XOR: case ISD::OR: @@ -696,40 +696,40 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy, CmpInst::Predicate VecPred, + Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, - I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, + I); int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register // width. - if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { + if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { // We would need this many instructions to hide the scalarization happening. const int AmortizationCost = 20; - - // If VecPred is not set, check if we can get a predicate from the context - // instruction, if its type matches the requested ValTy. - if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { - CmpInst::Predicate CurrentPred; - if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), - m_Value()))) - VecPred = CurrentPred; - } - // Check if we have a compare/select chain that can be lowered using CMxx & - // BFI pair. 
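The ISD::MUL case restored above justifies the constant 8 as a per-part estimate: without a MUL.2d instruction, each legalized v2i64 multiply is modeled as four i64 lane extracts, two lane inserts and two scalar multiplies. A hedged sketch that merely replays that arithmetic (plain C++, helper name invented for illustration, not LLVM API):

#include <cassert>

// Reproduces the cost comment: per legalized part, 4 extracts + 2 inserts + 2 scalar muls.
static int mulOfI64VectorCost(int NumLegalizedParts /* LT.first */) {
  const int Extracts = 4, Inserts = 2, ScalarMuls = 2;
  return NumLegalizedParts * (Extracts + Inserts + ScalarMuls);
}

int main() {
  assert(mulOfI64VectorCost(1) == 8);  // v2i64: LT.first == 1
  assert(mulOfI64VectorCost(2) == 16); // v4i64: LT.first == 2
  return 0;
}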
- if (CmpInst::isIntPredicate(VecPred)) { - static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, - MVT::v8i16, MVT::v2i32, MVT::v4i32, - MVT::v2i64}; - auto LT = TLI->getTypeLegalizationCost(DL, ValTy); - if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) - return LT.first; - } - + + // If VecPred is not set, check if we can get a predicate from the context + // instruction, if its type matches the requested ValTy. + if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { + CmpInst::Predicate CurrentPred; + if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), + m_Value()))) + VecPred = CurrentPred; + } + // Check if we have a compare/select chain that can be lowered using CMxx & + // BFI pair. + if (CmpInst::isIntPredicate(VecPred)) { + static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, + MVT::v8i16, MVT::v2i32, MVT::v4i32, + MVT::v2i64}; + auto LT = TLI->getTypeLegalizationCost(DL, ValTy); + if (any_of(ValidMinMaxTys, [<](MVT M) { return M == LT.second; })) + return LT.first; + } + static const TypeConversionCostTblEntry VectorSelectTbl[] = { { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, @@ -749,9 +749,9 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } } - // The base case handles scalable vectors fine for now, since it treats the - // cost as 1 * legalization cost. - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); + // The base case handles scalable vectors fine for now, since it treats the + // cost as 1 * legalization cost. + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); } AArch64TTIImpl::TTI::MemCmpExpansionOptions @@ -772,30 +772,30 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { return Options; } -unsigned AArch64TTIImpl::getGatherScatterOpCost( - unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, - Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { - - if (!isa<ScalableVectorType>(DataTy)) - return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, - Alignment, CostKind, I); - auto *VT = cast<VectorType>(DataTy); - auto LT = TLI->getTypeLegalizationCost(DL, DataTy); - ElementCount LegalVF = LT.second.getVectorElementCount(); - Optional<unsigned> MaxNumVScale = getMaxVScale(); - assert(MaxNumVScale && "Expected valid max vscale value"); - - unsigned MemOpCost = - getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); - unsigned MaxNumElementsPerGather = - MaxNumVScale.getValue() * LegalVF.getKnownMinValue(); - return LT.first * MaxNumElementsPerGather * MemOpCost; -} - -bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { - return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); -} - +unsigned AArch64TTIImpl::getGatherScatterOpCost( + unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { + + if (!isa<ScalableVectorType>(DataTy)) + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + auto *VT = cast<VectorType>(DataTy); + auto LT = TLI->getTypeLegalizationCost(DL, DataTy); + ElementCount LegalVF = LT.second.getVectorElementCount(); + Optional<unsigned> MaxNumVScale = getMaxVScale(); + assert(MaxNumVScale && "Expected valid max vscale value"); + + unsigned MemOpCost = + getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, 
CostKind, I); + unsigned MaxNumElementsPerGather = + MaxNumVScale.getValue() * LegalVF.getKnownMinValue(); + return LT.first * MaxNumElementsPerGather * MemOpCost; +} + +bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { + return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); +} + int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -823,7 +823,7 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, return LT.first * 2 * AmortizationCost; } - if (useNeonVector(Ty) && + if (useNeonVector(Ty) && cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) { unsigned ProfitableNumElements; if (Opcode == Instruction::Store) @@ -1098,70 +1098,70 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, return false; } -int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, - TTI::TargetCostKind CostKind) { - if (!isa<ScalableVectorType>(Ty)) - return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, - CostKind); - assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) && - "Both vector needs to be scalable"); - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - int LegalizationCost = 0; - if (LT.first > 1) { - Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); - unsigned CmpOpcode = - Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp; - LegalizationCost = - getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind) + - getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); - LegalizationCost *= LT.first - 1; - } - - return LegalizationCost + /*Cost of horizontal reduction*/ 2; -} - -int AArch64TTIImpl::getArithmeticReductionCostSVE( - unsigned Opcode, VectorType *ValTy, bool IsPairwise, - TTI::TargetCostKind CostKind) { - assert(!IsPairwise && "Cannot be pair wise to continue"); - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); - int LegalizationCost = 0; - if (LT.first > 1) { - Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); - LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); - LegalizationCost *= LT.first - 1; - } - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - // Add the final reduction cost for the legal horizontal reduction - switch (ISD) { - case ISD::ADD: - case ISD::AND: - case ISD::OR: - case ISD::XOR: - case ISD::FADD: - return LegalizationCost + 2; - default: - // TODO: Replace for invalid when InstructionCost is used - // cases not supported by SVE - return 16; - } -} - +int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind) { + if (!isa<ScalableVectorType>(Ty)) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, + CostKind); + assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) && + "Both vector needs to be scalable"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + int LegalizationCost = 0; + if (LT.first > 1) { + Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); + unsigned CmpOpcode = + Ty->isFPOrFPVectorTy() ? 
Instruction::FCmp : Instruction::ICmp; + LegalizationCost = + getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind) + + getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + LegalizationCost *= LT.first - 1; + } + + return LegalizationCost + /*Cost of horizontal reduction*/ 2; +} + +int AArch64TTIImpl::getArithmeticReductionCostSVE( + unsigned Opcode, VectorType *ValTy, bool IsPairwise, + TTI::TargetCostKind CostKind) { + assert(!IsPairwise && "Cannot be pair wise to continue"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + int LegalizationCost = 0; + if (LT.first > 1) { + Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); + LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); + LegalizationCost *= LT.first - 1; + } + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + // Add the final reduction cost for the legal horizontal reduction + switch (ISD) { + case ISD::ADD: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::FADD: + return LegalizationCost + 2; + default: + // TODO: Replace for invalid when InstructionCost is used + // cases not supported by SVE + return 16; + } +} + int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, TTI::TargetCostKind CostKind) { - if (isa<ScalableVectorType>(ValTy)) - return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm, - CostKind); + if (isa<ScalableVectorType>(ValTy)) + return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm, + CostKind); if (IsPairwiseForm) return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, CostKind); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7c9360ada9..f669e3f595 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -74,8 +74,8 @@ public: int getIntImmCost(int64_t Val); int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty, TTI::TargetCostKind CostKind, - Instruction *Inst = nullptr); + Type *Ty, TTI::TargetCostKind CostKind, + Instruction *Inst = nullptr); int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); @@ -97,9 +97,9 @@ public: return 31; } - unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind); - + unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + unsigned getRegisterBitWidth(bool Vector) const { if (Vector) { if (ST->hasSVE()) @@ -115,21 +115,21 @@ public: return ST->getMinVectorRegisterBitWidth(); } - Optional<unsigned> getMaxVScale() const { - if (ST->hasSVE()) - return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; - return BaseT::getMaxVScale(); - } - + Optional<unsigned> getMaxVScale() const { + if (ST->hasSVE()) + return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + return BaseT::getMaxVScale(); + } + unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, - const Value *Ptr, bool VariableMask, - Align Alignment, 
TTI::TargetCostKind CostKind, - const Instruction *I = nullptr); - + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, @@ -139,14 +139,14 @@ public: int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsPairwise, bool IsUnsigned, - TTI::TargetCostKind CostKind); - - int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind); - + int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind); + + int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind); + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, @@ -160,13 +160,13 @@ public: int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; - bool useNeonVector(const Type *Ty) const; + bool useNeonVector(const Type *Ty) const; int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, @@ -191,9 +191,9 @@ public: return false; Type *Ty = cast<ScalableVectorType>(DataType)->getElementType(); - if (Ty->isPointerTy()) - return true; - + if (Ty->isPointerTy()) + return true; + if (Ty->isBFloatTy() || Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true; @@ -241,14 +241,14 @@ public: shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader); - bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } + bool shouldExpandReduction(const IntrinsicInst *II) const { return false; } unsigned getGISelRematGlobalCost() const { return 2; } - bool supportsScalableVectors() const { return ST->hasSVE(); } - + bool supportsScalableVectors() const { return ST->hasSVE(); } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 96c50ff3f8..d69e2b127c 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AArch64AddressingModes.h" -#include "MCTargetDesc/AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64InstPrinter.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" @@ -159,13 +159,13 @@ private: bool parseSymbolicImmVal(const MCExpr *&ImmVal); bool parseNeonVectorList(OperandVector &Operands); bool 
parseOptionalMulOperand(OperandVector &Operands); - bool parseKeywordOperand(OperandVector &Operands); + bool parseKeywordOperand(OperandVector &Operands); bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - bool parseImmExpr(int64_t &Out); - bool parseComma(); - bool parseRegisterInRange(unsigned &Out, unsigned Base, unsigned First, - unsigned Last); + bool parseImmExpr(int64_t &Out); + bool parseComma(); + bool parseRegisterInRange(unsigned &Out, unsigned Base, unsigned First, + unsigned Last); bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo, OperandVector &Operands); @@ -187,31 +187,31 @@ private: bool parseDirectiveVariantPCS(SMLoc L); - bool parseDirectiveSEHAllocStack(SMLoc L); - bool parseDirectiveSEHPrologEnd(SMLoc L); - bool parseDirectiveSEHSaveR19R20X(SMLoc L); - bool parseDirectiveSEHSaveFPLR(SMLoc L); - bool parseDirectiveSEHSaveFPLRX(SMLoc L); - bool parseDirectiveSEHSaveReg(SMLoc L); - bool parseDirectiveSEHSaveRegX(SMLoc L); - bool parseDirectiveSEHSaveRegP(SMLoc L); - bool parseDirectiveSEHSaveRegPX(SMLoc L); - bool parseDirectiveSEHSaveLRPair(SMLoc L); - bool parseDirectiveSEHSaveFReg(SMLoc L); - bool parseDirectiveSEHSaveFRegX(SMLoc L); - bool parseDirectiveSEHSaveFRegP(SMLoc L); - bool parseDirectiveSEHSaveFRegPX(SMLoc L); - bool parseDirectiveSEHSetFP(SMLoc L); - bool parseDirectiveSEHAddFP(SMLoc L); - bool parseDirectiveSEHNop(SMLoc L); - bool parseDirectiveSEHSaveNext(SMLoc L); - bool parseDirectiveSEHEpilogStart(SMLoc L); - bool parseDirectiveSEHEpilogEnd(SMLoc L); - bool parseDirectiveSEHTrapFrame(SMLoc L); - bool parseDirectiveSEHMachineFrame(SMLoc L); - bool parseDirectiveSEHContext(SMLoc L); - bool parseDirectiveSEHClearUnwoundToCall(SMLoc L); - + bool parseDirectiveSEHAllocStack(SMLoc L); + bool parseDirectiveSEHPrologEnd(SMLoc L); + bool parseDirectiveSEHSaveR19R20X(SMLoc L); + bool parseDirectiveSEHSaveFPLR(SMLoc L); + bool parseDirectiveSEHSaveFPLRX(SMLoc L); + bool parseDirectiveSEHSaveReg(SMLoc L); + bool parseDirectiveSEHSaveRegX(SMLoc L); + bool parseDirectiveSEHSaveRegP(SMLoc L); + bool parseDirectiveSEHSaveRegPX(SMLoc L); + bool parseDirectiveSEHSaveLRPair(SMLoc L); + bool parseDirectiveSEHSaveFReg(SMLoc L); + bool parseDirectiveSEHSaveFRegX(SMLoc L); + bool parseDirectiveSEHSaveFRegP(SMLoc L); + bool parseDirectiveSEHSaveFRegPX(SMLoc L); + bool parseDirectiveSEHSetFP(SMLoc L); + bool parseDirectiveSEHAddFP(SMLoc L); + bool parseDirectiveSEHNop(SMLoc L); + bool parseDirectiveSEHSaveNext(SMLoc L); + bool parseDirectiveSEHEpilogStart(SMLoc L); + bool parseDirectiveSEHEpilogEnd(SMLoc L); + bool parseDirectiveSEHTrapFrame(SMLoc L); + bool parseDirectiveSEHMachineFrame(SMLoc L); + bool parseDirectiveSEHContext(SMLoc L); + bool parseDirectiveSEHClearUnwoundToCall(SMLoc L); + bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, SmallVectorImpl<SMLoc> &Loc); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -231,7 +231,7 @@ private: RegKind MatchKind); OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); - OperandMatchResultTy tryParseBarriernXSOperand(OperandVector &Operands); + OperandMatchResultTy tryParseBarriernXSOperand(OperandVector &Operands); OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); @@ -258,7 +258,7 @@ private: OperandMatchResultTy 
tryParseVectorList(OperandVector &Operands, bool ExpectMatch = false); OperandMatchResultTy tryParseSVEPattern(OperandVector &Operands); - OperandMatchResultTy tryParseGPR64x8(OperandVector &Operands); + OperandMatchResultTy tryParseGPR64x8(OperandVector &Operands); public: enum AArch64MatchResultTy { @@ -271,7 +271,7 @@ public: AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI, MII) { - IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32; + IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32; MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); if (S.getTargetStreamer() == nullptr) @@ -404,7 +404,7 @@ private: const char *Data; unsigned Length; unsigned Val; // Not the enum since not all values have names. - bool HasnXSModifier; + bool HasnXSModifier; }; struct SysRegOp { @@ -574,11 +574,11 @@ public: return StringRef(Barrier.Data, Barrier.Length); } - bool getBarriernXSModifier() const { - assert(Kind == k_Barrier && "Invalid access!"); - return Barrier.HasnXSModifier; - } - + bool getBarriernXSModifier() const { + assert(Kind == k_Barrier && "Invalid access!"); + return Barrier.HasnXSModifier; + } + unsigned getReg() const override { assert(Kind == k_Register && "Invalid access!"); return Reg.RegNum; @@ -750,8 +750,8 @@ public: ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12 || ELFRefKind == AArch64MCExpr::VK_SECREL_LO12 || - ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 || - ELFRefKind == AArch64MCExpr::VK_GOT_PAGE_LO15) { + ELFRefKind == AArch64MCExpr::VK_SECREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_GOT_PAGE_LO15) { // Note that we don't range-check the addend. It's adjusted modulo page // size when converted, so there is no "out of range" condition when using // @pageoff. 
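The operand-class plumbing above (the HasnXSModifier field, its getBarriernXSModifier accessor and the tryParseBarriernXSOperand declaration) supports the new v8.7-A `dsb ... nXS` barrier form whose parser is added further down in this diff. As a hedged aside, a standalone sketch of the immediate constraint that parser enforces (plain C++; the helper name is invented here and this is not the assembler's own validation):

#include <cassert>

// Per the comment in tryParseBarriernXSOperand below, the DSB nXS variant
// only accepts the immediate values 16, 20, 24 and 28.
static bool isValidDsbNXSImmediate(long Value) {
  return Value == 16 || Value == 20 || Value == 24 || Value == 28;
}

int main() {
  assert(isValidDsbNXSImmediate(16) && isValidDsbNXSImmediate(28));
  assert(!isValidDsbNXSImmediate(15) && !isValidDsbNXSImmediate(21));
  return 0;
}

Anything outside that set is rejected with "barrier operand out of range", while values 0-15 continue to go through the plain tryParseBarrierOperand path, which is also updated below to return a no-match for dsb immediates above 15 so the nXS variant can take over.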
@@ -897,8 +897,8 @@ public: if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm()))) return DiagnosticPredicateTy::NoMatch; - bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value || - std::is_same<int8_t, T>::value; + bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value || + std::is_same<int8_t, T>::value; if (auto ShiftedImm = getShiftedVal<8>()) if (!(IsByte && ShiftedImm->second) && AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first) @@ -915,8 +915,8 @@ public: if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm()))) return DiagnosticPredicateTy::NoMatch; - bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value || - std::is_same<int8_t, T>::value; + bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value || + std::is_same<int8_t, T>::value; if (auto ShiftedImm = getShiftedVal<8>()) if (!(IsByte && ShiftedImm->second) && AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first @@ -1041,12 +1041,12 @@ public: AArch64_AM::getFP64Imm(getFPImm().bitcastToAPInt()) != -1; } - bool isBarrier() const { - return Kind == k_Barrier && !getBarriernXSModifier(); - } - bool isBarriernXS() const { - return Kind == k_Barrier && getBarriernXSModifier(); - } + bool isBarrier() const { + return Kind == k_Barrier && !getBarriernXSModifier(); + } + bool isBarriernXS() const { + return Kind == k_Barrier && getBarriernXSModifier(); + } bool isSysReg() const { return Kind == k_SysReg; } bool isMRSSystemRegister() const { @@ -1173,12 +1173,12 @@ public: AArch64MCRegisterClasses[AArch64::GPR32RegClassID].contains(Reg.RegNum); } - bool isGPR64x8() const { - return Kind == k_Register && Reg.Kind == RegKind::Scalar && - AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains( - Reg.RegNum); - } - + bool isGPR64x8() const { + return Kind == k_Register && Reg.Kind == RegKind::Scalar && + AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID].contains( + Reg.RegNum); + } + bool isWSeqPair() const { return Kind == k_Register && Reg.Kind == RegKind::Scalar && AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( @@ -1742,11 +1742,11 @@ public: Inst.addOperand(MCOperand::createImm(getBarrier())); } - void addBarriernXSOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(getBarrier())); - } - + void addBarriernXSOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getBarrier())); + } + void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); @@ -1982,13 +1982,13 @@ public: static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, StringRef Str, SMLoc S, - MCContext &Ctx, - bool HasnXSModifier) { + MCContext &Ctx, + bool HasnXSModifier) { auto Op = std::make_unique<AArch64Operand>(k_Barrier, Ctx); Op->Barrier.Val = Val; Op->Barrier.Data = Str.data(); Op->Barrier.Length = Str.size(); - Op->Barrier.HasnXSModifier = HasnXSModifier; + Op->Barrier.HasnXSModifier = HasnXSModifier; Op->StartLoc = S; Op->EndLoc = S; return Op; @@ -2133,9 +2133,9 @@ void AArch64Operand::print(raw_ostream &OS) const { case k_PSBHint: OS << getPSBHintName(); break; - case k_BTIHint: - OS << getBTIHintName(); - break; + case k_BTIHint: + OS << getBTIHintName(); + break; case k_Register: OS << "<register " << getReg() << ">"; if (!getShiftExtendAmount() && !hasShiftExtendAmount()) @@ -2570,7 +2570,7 @@ 
AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && ELFRefKind != AArch64MCExpr::VK_ABS_PAGE_NC && ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && - ELFRefKind != AArch64MCExpr::VK_GOT_PAGE_LO15 && + ELFRefKind != AArch64MCExpr::VK_GOT_PAGE_LO15 && ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { // The operand must be an @page or @gotpage qualified symbolref. @@ -2904,7 +2904,7 @@ static const struct Extension { {"predres", {AArch64::FeaturePredRes}}, {"ccdp", {AArch64::FeatureCacheDeepPersist}}, {"mte", {AArch64::FeatureMTE}}, - {"memtag", {AArch64::FeatureMTE}}, + {"memtag", {AArch64::FeatureMTE}}, {"tlb-rmi", {AArch64::FeatureTLB_RMI}}, {"pan-rwv", {AArch64::FeaturePAN_RWV}}, {"ccpp", {AArch64::FeatureCCPP}}, @@ -2915,10 +2915,10 @@ static const struct Extension { {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, - {"ls64", {AArch64::FeatureLS64}}, - {"xs", {AArch64::FeatureXS}}, - {"pauth", {AArch64::FeaturePAuth}}, - {"flagm", {AArch64::FeatureFlagM}}, + {"ls64", {AArch64::FeatureLS64}}, + {"xs", {AArch64::FeatureXS}}, + {"pauth", {AArch64::FeaturePAuth}}, + {"flagm", {AArch64::FeatureFlagM}}, // FIXME: Unsupported extensions {"pan", {}}, {"lor", {}}, @@ -2939,16 +2939,16 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { Str += "ARMv8.5a"; else if (FBS[AArch64::HasV8_6aOps]) Str += "ARMv8.6a"; - else if (FBS[AArch64::HasV8_7aOps]) - Str += "ARMv8.7a"; + else if (FBS[AArch64::HasV8_7aOps]) + Str += "ARMv8.7a"; else { - SmallVector<std::string, 2> ExtMatches; - for (const auto& Ext : ExtensionMap) { + SmallVector<std::string, 2> ExtMatches; + for (const auto& Ext : ExtensionMap) { // Use & in case multiple features are enabled - if ((FBS & Ext.Features) != FeatureBitset()) - ExtMatches.push_back(Ext.Name); - } - Str += !ExtMatches.empty() ? llvm::join(ExtMatches, ", ") : "(unknown)"; + if ((FBS & Ext.Features) != FeatureBitset()) + ExtMatches.push_back(Ext.Name); + } + Str += !ExtMatches.empty() ? 
llvm::join(ExtMatches, ", ") : "(unknown)"; } } @@ -2993,7 +2993,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, if (!IC) return TokError("invalid operand for IC instruction"); else if (!IC->haveFeatures(getSTI().getFeatureBits())) { - std::string Str("IC " + std::string(IC->Name) + " requires: "); + std::string Str("IC " + std::string(IC->Name) + " requires: "); setRequiredFeatureString(IC->getRequiredFeatures(), Str); return TokError(Str.c_str()); } @@ -3003,7 +3003,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, if (!DC) return TokError("invalid operand for DC instruction"); else if (!DC->haveFeatures(getSTI().getFeatureBits())) { - std::string Str("DC " + std::string(DC->Name) + " requires: "); + std::string Str("DC " + std::string(DC->Name) + " requires: "); setRequiredFeatureString(DC->getRequiredFeatures(), Str); return TokError(Str.c_str()); } @@ -3013,7 +3013,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, if (!AT) return TokError("invalid operand for AT instruction"); else if (!AT->haveFeatures(getSTI().getFeatureBits())) { - std::string Str("AT " + std::string(AT->Name) + " requires: "); + std::string Str("AT " + std::string(AT->Name) + " requires: "); setRequiredFeatureString(AT->getRequiredFeatures(), Str); return TokError(Str.c_str()); } @@ -3023,7 +3023,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, if (!TLBI) return TokError("invalid operand for TLBI instruction"); else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) { - std::string Str("TLBI " + std::string(TLBI->Name) + " requires: "); + std::string Str("TLBI " + std::string(TLBI->Name) + " requires: "); setRequiredFeatureString(TLBI->getRequiredFeatures(), Str); return TokError(Str.c_str()); } @@ -3034,7 +3034,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, return TokError("invalid operand for prediction restriction instruction"); else if (!PRCTX->haveFeatures(getSTI().getFeatureBits())) { std::string Str( - Mnemonic.upper() + std::string(PRCTX->Name) + " requires: "); + Mnemonic.upper() + std::string(PRCTX->Name) + " requires: "); setRequiredFeatureString(PRCTX->getRequiredFeatures(), Str); return TokError(Str.c_str()); } @@ -3082,7 +3082,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { // Immediate operand. const MCExpr *ImmVal; SMLoc ExprLoc = getLoc(); - AsmToken IntTok = Tok; + AsmToken IntTok = Tok; if (getParser().parseExpression(ImmVal)) return MatchOperand_ParseFail; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); @@ -3090,22 +3090,22 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { Error(ExprLoc, "immediate value expected for barrier operand"); return MatchOperand_ParseFail; } - int64_t Value = MCE->getValue(); - if (Mnemonic == "dsb" && Value > 15) { - // This case is a no match here, but it might be matched by the nXS - // variant. Deliberately not unlex the optional '#' as it is not necessary - // to characterize an integer immediate. - Parser.getLexer().UnLex(IntTok); - return MatchOperand_NoMatch; - } - if (Value < 0 || Value > 15) { + int64_t Value = MCE->getValue(); + if (Mnemonic == "dsb" && Value > 15) { + // This case is a no match here, but it might be matched by the nXS + // variant. Deliberately not unlex the optional '#' as it is not necessary + // to characterize an integer immediate. 
+ Parser.getLexer().UnLex(IntTok); + return MatchOperand_NoMatch; + } + if (Value < 0 || Value > 15) { Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } - auto DB = AArch64DB::lookupDBByEncoding(Value); - Operands.push_back(AArch64Operand::CreateBarrier(Value, DB ? DB->Name : "", - ExprLoc, getContext(), - false /*hasnXSModifier*/)); + auto DB = AArch64DB::lookupDBByEncoding(Value); + Operands.push_back(AArch64Operand::CreateBarrier(Value, DB ? DB->Name : "", + ExprLoc, getContext(), + false /*hasnXSModifier*/)); return MatchOperand_Success; } @@ -3114,9 +3114,9 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - StringRef Operand = Tok.getString(); - auto TSB = AArch64TSB::lookupTSBByName(Operand); - auto DB = AArch64DB::lookupDBByName(Operand); + StringRef Operand = Tok.getString(); + auto TSB = AArch64TSB::lookupTSBByName(Operand); + auto DB = AArch64DB::lookupDBByName(Operand); // The only valid named option for ISB is 'sy' if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) { TokError("'sy' or #imm operand expected"); @@ -3126,79 +3126,79 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { TokError("'csync' operand expected"); return MatchOperand_ParseFail; } else if (!DB && !TSB) { - if (Mnemonic == "dsb") { - // This case is a no match here, but it might be matched by the nXS - // variant. - return MatchOperand_NoMatch; - } + if (Mnemonic == "dsb") { + // This case is a no match here, but it might be matched by the nXS + // variant. + return MatchOperand_NoMatch; + } TokError("invalid barrier option name"); return MatchOperand_ParseFail; } Operands.push_back(AArch64Operand::CreateBarrier( - DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(), - getContext(), false /*hasnXSModifier*/)); - Parser.Lex(); // Consume the option - - return MatchOperand_Success; -} - -OperandMatchResultTy -AArch64AsmParser::tryParseBarriernXSOperand(OperandVector &Operands) { - MCAsmParser &Parser = getParser(); - const AsmToken &Tok = Parser.getTok(); - - assert(Mnemonic == "dsb" && "Instruction does not accept nXS operands"); - if (Mnemonic != "dsb") - return MatchOperand_ParseFail; - - if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) { - // Immediate operand. - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); - if (!MCE) { - Error(ExprLoc, "immediate value expected for barrier operand"); - return MatchOperand_ParseFail; - } - int64_t Value = MCE->getValue(); - // v8.7-A DSB in the nXS variant accepts only the following immediate - // values: 16, 20, 24, 28. 
- if (Value != 16 && Value != 20 && Value != 24 && Value != 28) { - Error(ExprLoc, "barrier operand out of range"); - return MatchOperand_ParseFail; - } - auto DB = AArch64DBnXS::lookupDBnXSByImmValue(Value); - Operands.push_back(AArch64Operand::CreateBarrier(DB->Encoding, DB->Name, - ExprLoc, getContext(), - true /*hasnXSModifier*/)); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("invalid operand for instruction"); - return MatchOperand_ParseFail; - } - - StringRef Operand = Tok.getString(); - auto DB = AArch64DBnXS::lookupDBnXSByName(Operand); - - if (!DB) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - - Operands.push_back( - AArch64Operand::CreateBarrier(DB->Encoding, Tok.getString(), getLoc(), - getContext(), true /*hasnXSModifier*/)); + DB ? DB->Encoding : TSB->Encoding, Tok.getString(), getLoc(), + getContext(), false /*hasnXSModifier*/)); Parser.Lex(); // Consume the option return MatchOperand_Success; } OperandMatchResultTy +AArch64AsmParser::tryParseBarriernXSOperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + + assert(Mnemonic == "dsb" && "Instruction does not accept nXS operands"); + if (Mnemonic != "dsb") + return MatchOperand_ParseFail; + + if (parseOptionalToken(AsmToken::Hash) || Tok.is(AsmToken::Integer)) { + // Immediate operand. + const MCExpr *ImmVal; + SMLoc ExprLoc = getLoc(); + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + Error(ExprLoc, "immediate value expected for barrier operand"); + return MatchOperand_ParseFail; + } + int64_t Value = MCE->getValue(); + // v8.7-A DSB in the nXS variant accepts only the following immediate + // values: 16, 20, 24, 28. 
+ if (Value != 16 && Value != 20 && Value != 24 && Value != 28) { + Error(ExprLoc, "barrier operand out of range"); + return MatchOperand_ParseFail; + } + auto DB = AArch64DBnXS::lookupDBnXSByImmValue(Value); + Operands.push_back(AArch64Operand::CreateBarrier(DB->Encoding, DB->Name, + ExprLoc, getContext(), + true /*hasnXSModifier*/)); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + StringRef Operand = Tok.getString(); + auto DB = AArch64DBnXS::lookupDBnXSByName(Operand); + + if (!DB) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateBarrier(DB->Encoding, Tok.getString(), getLoc(), + getContext(), true /*hasnXSModifier*/)); + Parser.Lex(); // Consume the option + + return MatchOperand_Success; +} + +OperandMatchResultTy AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); @@ -3438,7 +3438,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC) .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12) .Case("got", AArch64MCExpr::VK_GOT_PAGE) - .Case("gotpage_lo15", AArch64MCExpr::VK_GOT_PAGE_LO15) + .Case("gotpage_lo15", AArch64MCExpr::VK_GOT_PAGE_LO15) .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) @@ -3707,17 +3707,17 @@ bool AArch64AsmParser::parseOptionalMulOperand(OperandVector &Operands) { return Error(getLoc(), "expected 'vl' or '#<imm>'"); } -bool AArch64AsmParser::parseKeywordOperand(OperandVector &Operands) { - MCAsmParser &Parser = getParser(); - auto Tok = Parser.getTok(); - if (Tok.isNot(AsmToken::Identifier)) - return true; - Operands.push_back(AArch64Operand::CreateToken(Tok.getString(), false, - Tok.getLoc(), getContext())); - Parser.Lex(); - return false; -} - +bool AArch64AsmParser::parseKeywordOperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + auto Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return true; + Operands.push_back(AArch64Operand::CreateToken(Tok.getString(), false, + Tok.getLoc(), getContext())); + Parser.Lex(); + return false; +} + /// parseOperand - Parse a arm instruction operand. For now this parses the /// operand regardless of the mnemonic. bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, @@ -3782,11 +3782,11 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, if (GotShift != MatchOperand_NoMatch) return GotShift; - // If this is a two-word mnemonic, parse its special keyword - // operand as an identifier. - if (Mnemonic == "brb") - return parseKeywordOperand(Operands); - + // If this is a two-word mnemonic, parse its special keyword + // operand as an identifier. + if (Mnemonic == "brb") + return parseKeywordOperand(Operands); + // This was not a register so parse other operands that start with an // identifier (like labels) as expressions and create them as immediates. 
const MCExpr *IdVal; @@ -3895,66 +3895,66 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, } } -bool AArch64AsmParser::parseImmExpr(int64_t &Out) { - const MCExpr *Expr = nullptr; - SMLoc L = getLoc(); - if (check(getParser().parseExpression(Expr), L, "expected expression")) - return true; - const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); - if (check(!Value, L, "expected constant expression")) - return true; - Out = Value->getValue(); - return false; -} - -bool AArch64AsmParser::parseComma() { - if (check(getParser().getTok().isNot(AsmToken::Comma), getLoc(), - "expected comma")) - return true; - // Eat the comma - getParser().Lex(); - return false; -} - -bool AArch64AsmParser::parseRegisterInRange(unsigned &Out, unsigned Base, - unsigned First, unsigned Last) { - unsigned Reg; - SMLoc Start, End; - if (check(ParseRegister(Reg, Start, End), getLoc(), "expected register")) - return true; - - // Special handling for FP and LR; they aren't linearly after x28 in - // the registers enum. - unsigned RangeEnd = Last; - if (Base == AArch64::X0) { - if (Last == AArch64::FP) { - RangeEnd = AArch64::X28; - if (Reg == AArch64::FP) { - Out = 29; - return false; - } - } - if (Last == AArch64::LR) { - RangeEnd = AArch64::X28; - if (Reg == AArch64::FP) { - Out = 29; - return false; - } else if (Reg == AArch64::LR) { - Out = 30; - return false; - } - } - } - - if (check(Reg < First || Reg > RangeEnd, Start, - Twine("expected register in range ") + - AArch64InstPrinter::getRegisterName(First) + " to " + - AArch64InstPrinter::getRegisterName(Last))) - return true; - Out = Reg - Base; - return false; -} - +bool AArch64AsmParser::parseImmExpr(int64_t &Out) { + const MCExpr *Expr = nullptr; + SMLoc L = getLoc(); + if (check(getParser().parseExpression(Expr), L, "expected expression")) + return true; + const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); + if (check(!Value, L, "expected constant expression")) + return true; + Out = Value->getValue(); + return false; +} + +bool AArch64AsmParser::parseComma() { + if (check(getParser().getTok().isNot(AsmToken::Comma), getLoc(), + "expected comma")) + return true; + // Eat the comma + getParser().Lex(); + return false; +} + +bool AArch64AsmParser::parseRegisterInRange(unsigned &Out, unsigned Base, + unsigned First, unsigned Last) { + unsigned Reg; + SMLoc Start, End; + if (check(ParseRegister(Reg, Start, End), getLoc(), "expected register")) + return true; + + // Special handling for FP and LR; they aren't linearly after x28 in + // the registers enum. 
+ unsigned RangeEnd = Last; + if (Base == AArch64::X0) { + if (Last == AArch64::FP) { + RangeEnd = AArch64::X28; + if (Reg == AArch64::FP) { + Out = 29; + return false; + } + } + if (Last == AArch64::LR) { + RangeEnd = AArch64::X28; + if (Reg == AArch64::FP) { + Out = 29; + return false; + } else if (Reg == AArch64::LR) { + Out = 30; + return false; + } + } + } + + if (check(Reg < First || Reg > RangeEnd, Start, + Twine("expected register in range ") + + AArch64InstPrinter::getRegisterName(First) + " to " + + AArch64InstPrinter::getRegisterName(Last))) + return true; + Out = Reg - Base; + return false; +} + bool AArch64AsmParser::regsEqual(const MCParsedAsmOperand &Op1, const MCParsedAsmOperand &Op2) const { auto &AOp1 = static_cast<const AArch64Operand&>(Op1); @@ -5273,7 +5273,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { const MCObjectFileInfo::Environment Format = getContext().getObjectFileInfo()->getObjectFileType(); bool IsMachO = Format == MCObjectFileInfo::IsMachO; - bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; + bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; auto IDVal = DirectiveID.getIdentifier().lower(); SMLoc Loc = DirectiveID.getLoc(); @@ -5302,57 +5302,57 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveLOH(IDVal, Loc); else return true; - } else if (IsCOFF) { - if (IDVal == ".seh_stackalloc") - parseDirectiveSEHAllocStack(Loc); - else if (IDVal == ".seh_endprologue") - parseDirectiveSEHPrologEnd(Loc); - else if (IDVal == ".seh_save_r19r20_x") - parseDirectiveSEHSaveR19R20X(Loc); - else if (IDVal == ".seh_save_fplr") - parseDirectiveSEHSaveFPLR(Loc); - else if (IDVal == ".seh_save_fplr_x") - parseDirectiveSEHSaveFPLRX(Loc); - else if (IDVal == ".seh_save_reg") - parseDirectiveSEHSaveReg(Loc); - else if (IDVal == ".seh_save_reg_x") - parseDirectiveSEHSaveRegX(Loc); - else if (IDVal == ".seh_save_regp") - parseDirectiveSEHSaveRegP(Loc); - else if (IDVal == ".seh_save_regp_x") - parseDirectiveSEHSaveRegPX(Loc); - else if (IDVal == ".seh_save_lrpair") - parseDirectiveSEHSaveLRPair(Loc); - else if (IDVal == ".seh_save_freg") - parseDirectiveSEHSaveFReg(Loc); - else if (IDVal == ".seh_save_freg_x") - parseDirectiveSEHSaveFRegX(Loc); - else if (IDVal == ".seh_save_fregp") - parseDirectiveSEHSaveFRegP(Loc); - else if (IDVal == ".seh_save_fregp_x") - parseDirectiveSEHSaveFRegPX(Loc); - else if (IDVal == ".seh_set_fp") - parseDirectiveSEHSetFP(Loc); - else if (IDVal == ".seh_add_fp") - parseDirectiveSEHAddFP(Loc); - else if (IDVal == ".seh_nop") - parseDirectiveSEHNop(Loc); - else if (IDVal == ".seh_save_next") - parseDirectiveSEHSaveNext(Loc); - else if (IDVal == ".seh_startepilogue") - parseDirectiveSEHEpilogStart(Loc); - else if (IDVal == ".seh_endepilogue") - parseDirectiveSEHEpilogEnd(Loc); - else if (IDVal == ".seh_trap_frame") - parseDirectiveSEHTrapFrame(Loc); - else if (IDVal == ".seh_pushframe") - parseDirectiveSEHMachineFrame(Loc); - else if (IDVal == ".seh_context") - parseDirectiveSEHContext(Loc); - else if (IDVal == ".seh_clear_unwound_to_call") - parseDirectiveSEHClearUnwoundToCall(Loc); - else - return true; + } else if (IsCOFF) { + if (IDVal == ".seh_stackalloc") + parseDirectiveSEHAllocStack(Loc); + else if (IDVal == ".seh_endprologue") + parseDirectiveSEHPrologEnd(Loc); + else if (IDVal == ".seh_save_r19r20_x") + parseDirectiveSEHSaveR19R20X(Loc); + else if (IDVal == ".seh_save_fplr") + parseDirectiveSEHSaveFPLR(Loc); + else if (IDVal == ".seh_save_fplr_x") + parseDirectiveSEHSaveFPLRX(Loc); + else if (IDVal 
== ".seh_save_reg") + parseDirectiveSEHSaveReg(Loc); + else if (IDVal == ".seh_save_reg_x") + parseDirectiveSEHSaveRegX(Loc); + else if (IDVal == ".seh_save_regp") + parseDirectiveSEHSaveRegP(Loc); + else if (IDVal == ".seh_save_regp_x") + parseDirectiveSEHSaveRegPX(Loc); + else if (IDVal == ".seh_save_lrpair") + parseDirectiveSEHSaveLRPair(Loc); + else if (IDVal == ".seh_save_freg") + parseDirectiveSEHSaveFReg(Loc); + else if (IDVal == ".seh_save_freg_x") + parseDirectiveSEHSaveFRegX(Loc); + else if (IDVal == ".seh_save_fregp") + parseDirectiveSEHSaveFRegP(Loc); + else if (IDVal == ".seh_save_fregp_x") + parseDirectiveSEHSaveFRegPX(Loc); + else if (IDVal == ".seh_set_fp") + parseDirectiveSEHSetFP(Loc); + else if (IDVal == ".seh_add_fp") + parseDirectiveSEHAddFP(Loc); + else if (IDVal == ".seh_nop") + parseDirectiveSEHNop(Loc); + else if (IDVal == ".seh_save_next") + parseDirectiveSEHSaveNext(Loc); + else if (IDVal == ".seh_startepilogue") + parseDirectiveSEHEpilogStart(Loc); + else if (IDVal == ".seh_endepilogue") + parseDirectiveSEHEpilogEnd(Loc); + else if (IDVal == ".seh_trap_frame") + parseDirectiveSEHTrapFrame(Loc); + else if (IDVal == ".seh_pushframe") + parseDirectiveSEHMachineFrame(Loc); + else if (IDVal == ".seh_context") + parseDirectiveSEHContext(Loc); + else if (IDVal == ".seh_clear_unwound_to_call") + parseDirectiveSEHClearUnwoundToCall(Loc); + else + return true; } else return true; return false; @@ -5360,8 +5360,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, SmallVector<StringRef, 4> &RequestedExtensions) { - const bool NoCrypto = llvm::is_contained(RequestedExtensions, "nocrypto"); - const bool Crypto = llvm::is_contained(RequestedExtensions, "crypto"); + const bool NoCrypto = llvm::is_contained(RequestedExtensions, "nocrypto"); + const bool Crypto = llvm::is_contained(RequestedExtensions, "crypto"); if (!NoCrypto && Crypto) { switch (ArchKind) { @@ -5377,8 +5377,8 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, case AArch64::ArchKind::ARMV8_4A: case AArch64::ArchKind::ARMV8_5A: case AArch64::ArchKind::ARMV8_6A: - case AArch64::ArchKind::ARMV8_7A: - case AArch64::ArchKind::ARMV8R: + case AArch64::ArchKind::ARMV8_7A: + case AArch64::ArchKind::ARMV8R: RequestedExtensions.push_back("sm4"); RequestedExtensions.push_back("sha3"); RequestedExtensions.push_back("sha2"); @@ -5399,7 +5399,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind, case AArch64::ArchKind::ARMV8_4A: case AArch64::ArchKind::ARMV8_5A: case AArch64::ArchKind::ARMV8_6A: - case AArch64::ArchKind::ARMV8_7A: + case AArch64::ArchKind::ARMV8_7A: RequestedExtensions.push_back("nosm4"); RequestedExtensions.push_back("nosha3"); RequestedExtensions.push_back("nosha2"); @@ -5433,8 +5433,8 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { MCSubtargetInfo &STI = copySTI(); std::vector<std::string> ArchFeatures(AArch64Features.begin(), AArch64Features.end()); - STI.setDefaultFeatures("generic", /*TuneCPU*/ "generic", - join(ArchFeatures.begin(), ArchFeatures.end(), ",")); + STI.setDefaultFeatures("generic", /*TuneCPU*/ "generic", + join(ArchFeatures.begin(), ArchFeatures.end(), ",")); SmallVector<StringRef, 4> RequestedExtensions; if (!ExtensionString.empty()) @@ -5536,7 +5536,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { } MCSubtargetInfo &STI = copySTI(); - STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, ""); + STI.setDefaultFeatures(CPU, /*TuneCPU*/ CPU, ""); CurLoc = incrementLoc(CurLoc, CPU.size()); 
ExpandCryptoAEK(llvm::AArch64::getCPUArchKind(CPU), RequestedExtensions); @@ -5804,238 +5804,238 @@ bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) { return false; } -/// parseDirectiveSEHAllocStack -/// ::= .seh_stackalloc -bool AArch64AsmParser::parseDirectiveSEHAllocStack(SMLoc L) { - int64_t Size; - if (parseImmExpr(Size)) - return true; - getTargetStreamer().EmitARM64WinCFIAllocStack(Size); - return false; -} - -/// parseDirectiveSEHPrologEnd -/// ::= .seh_endprologue -bool AArch64AsmParser::parseDirectiveSEHPrologEnd(SMLoc L) { - getTargetStreamer().EmitARM64WinCFIPrologEnd(); - return false; -} - -/// parseDirectiveSEHSaveR19R20X -/// ::= .seh_save_r19r20_x -bool AArch64AsmParser::parseDirectiveSEHSaveR19R20X(SMLoc L) { - int64_t Offset; - if (parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveR19R20X(Offset); - return false; -} - -/// parseDirectiveSEHSaveFPLR -/// ::= .seh_save_fplr -bool AArch64AsmParser::parseDirectiveSEHSaveFPLR(SMLoc L) { - int64_t Offset; - if (parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveFPLR(Offset); - return false; -} - -/// parseDirectiveSEHSaveFPLRX -/// ::= .seh_save_fplr_x -bool AArch64AsmParser::parseDirectiveSEHSaveFPLRX(SMLoc L) { - int64_t Offset; - if (parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveFPLRX(Offset); - return false; -} - -/// parseDirectiveSEHSaveReg -/// ::= .seh_save_reg -bool AArch64AsmParser::parseDirectiveSEHSaveReg(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveReg(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveRegX -/// ::= .seh_save_reg_x -bool AArch64AsmParser::parseDirectiveSEHSaveRegX(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveRegX(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveRegP -/// ::= .seh_save_regp -bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveRegP(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveRegPX -/// ::= .seh_save_regp_x -bool AArch64AsmParser::parseDirectiveSEHSaveRegPX(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveRegPX(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveLRPair -/// ::= .seh_save_lrpair -bool AArch64AsmParser::parseDirectiveSEHSaveLRPair(SMLoc L) { - unsigned Reg; - int64_t Offset; - L = getLoc(); - if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || - parseComma() || parseImmExpr(Offset)) - return true; - if (check(((Reg - 19) % 2 != 0), L, - "expected register with even offset from x19")) - return true; - getTargetStreamer().EmitARM64WinCFISaveLRPair(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveFReg -/// ::= .seh_save_freg -bool AArch64AsmParser::parseDirectiveSEHSaveFReg(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) 
|| - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveFReg(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveFRegX -/// ::= .seh_save_freg_x -bool AArch64AsmParser::parseDirectiveSEHSaveFRegX(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveFRegX(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveFRegP -/// ::= .seh_save_fregp -bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveFRegP(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSaveFRegPX -/// ::= .seh_save_fregp_x -bool AArch64AsmParser::parseDirectiveSEHSaveFRegPX(SMLoc L) { - unsigned Reg; - int64_t Offset; - if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) || - parseComma() || parseImmExpr(Offset)) - return true; - getTargetStreamer().EmitARM64WinCFISaveFRegPX(Reg, Offset); - return false; -} - -/// parseDirectiveSEHSetFP -/// ::= .seh_set_fp -bool AArch64AsmParser::parseDirectiveSEHSetFP(SMLoc L) { - getTargetStreamer().EmitARM64WinCFISetFP(); - return false; -} - -/// parseDirectiveSEHAddFP -/// ::= .seh_add_fp -bool AArch64AsmParser::parseDirectiveSEHAddFP(SMLoc L) { - int64_t Size; - if (parseImmExpr(Size)) - return true; - getTargetStreamer().EmitARM64WinCFIAddFP(Size); - return false; -} - -/// parseDirectiveSEHNop -/// ::= .seh_nop -bool AArch64AsmParser::parseDirectiveSEHNop(SMLoc L) { - getTargetStreamer().EmitARM64WinCFINop(); - return false; -} - -/// parseDirectiveSEHSaveNext -/// ::= .seh_save_next -bool AArch64AsmParser::parseDirectiveSEHSaveNext(SMLoc L) { - getTargetStreamer().EmitARM64WinCFISaveNext(); - return false; -} - -/// parseDirectiveSEHEpilogStart -/// ::= .seh_startepilogue -bool AArch64AsmParser::parseDirectiveSEHEpilogStart(SMLoc L) { - getTargetStreamer().EmitARM64WinCFIEpilogStart(); - return false; -} - -/// parseDirectiveSEHEpilogEnd -/// ::= .seh_endepilogue -bool AArch64AsmParser::parseDirectiveSEHEpilogEnd(SMLoc L) { - getTargetStreamer().EmitARM64WinCFIEpilogEnd(); - return false; -} - -/// parseDirectiveSEHTrapFrame -/// ::= .seh_trap_frame -bool AArch64AsmParser::parseDirectiveSEHTrapFrame(SMLoc L) { - getTargetStreamer().EmitARM64WinCFITrapFrame(); - return false; -} - -/// parseDirectiveSEHMachineFrame -/// ::= .seh_pushframe -bool AArch64AsmParser::parseDirectiveSEHMachineFrame(SMLoc L) { - getTargetStreamer().EmitARM64WinCFIMachineFrame(); - return false; -} - -/// parseDirectiveSEHContext -/// ::= .seh_context -bool AArch64AsmParser::parseDirectiveSEHContext(SMLoc L) { - getTargetStreamer().EmitARM64WinCFIContext(); - return false; -} - -/// parseDirectiveSEHClearUnwoundToCall -/// ::= .seh_clear_unwound_to_call -bool AArch64AsmParser::parseDirectiveSEHClearUnwoundToCall(SMLoc L) { - getTargetStreamer().EmitARM64WinCFIClearUnwoundToCall(); - return false; -} - +/// parseDirectiveSEHAllocStack +/// ::= .seh_stackalloc +bool AArch64AsmParser::parseDirectiveSEHAllocStack(SMLoc L) { + int64_t Size; + if (parseImmExpr(Size)) + return true; + getTargetStreamer().EmitARM64WinCFIAllocStack(Size); + return false; +} + +/// parseDirectiveSEHPrologEnd +/// ::= .seh_endprologue +bool 
AArch64AsmParser::parseDirectiveSEHPrologEnd(SMLoc L) { + getTargetStreamer().EmitARM64WinCFIPrologEnd(); + return false; +} + +/// parseDirectiveSEHSaveR19R20X +/// ::= .seh_save_r19r20_x +bool AArch64AsmParser::parseDirectiveSEHSaveR19R20X(SMLoc L) { + int64_t Offset; + if (parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveR19R20X(Offset); + return false; +} + +/// parseDirectiveSEHSaveFPLR +/// ::= .seh_save_fplr +bool AArch64AsmParser::parseDirectiveSEHSaveFPLR(SMLoc L) { + int64_t Offset; + if (parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveFPLR(Offset); + return false; +} + +/// parseDirectiveSEHSaveFPLRX +/// ::= .seh_save_fplr_x +bool AArch64AsmParser::parseDirectiveSEHSaveFPLRX(SMLoc L) { + int64_t Offset; + if (parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveFPLRX(Offset); + return false; +} + +/// parseDirectiveSEHSaveReg +/// ::= .seh_save_reg +bool AArch64AsmParser::parseDirectiveSEHSaveReg(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveReg(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveRegX +/// ::= .seh_save_reg_x +bool AArch64AsmParser::parseDirectiveSEHSaveRegX(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveRegX(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveRegP +/// ::= .seh_save_regp +bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveRegP(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveRegPX +/// ::= .seh_save_regp_x +bool AArch64AsmParser::parseDirectiveSEHSaveRegPX(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveRegPX(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveLRPair +/// ::= .seh_save_lrpair +bool AArch64AsmParser::parseDirectiveSEHSaveLRPair(SMLoc L) { + unsigned Reg; + int64_t Offset; + L = getLoc(); + if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) || + parseComma() || parseImmExpr(Offset)) + return true; + if (check(((Reg - 19) % 2 != 0), L, + "expected register with even offset from x19")) + return true; + getTargetStreamer().EmitARM64WinCFISaveLRPair(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveFReg +/// ::= .seh_save_freg +bool AArch64AsmParser::parseDirectiveSEHSaveFReg(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveFReg(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveFRegX +/// ::= .seh_save_freg_x +bool AArch64AsmParser::parseDirectiveSEHSaveFRegX(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveFRegX(Reg, Offset); + return 
false; +} + +/// parseDirectiveSEHSaveFRegP +/// ::= .seh_save_fregp +bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveFRegP(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSaveFRegPX +/// ::= .seh_save_fregp_x +bool AArch64AsmParser::parseDirectiveSEHSaveFRegPX(SMLoc L) { + unsigned Reg; + int64_t Offset; + if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) || + parseComma() || parseImmExpr(Offset)) + return true; + getTargetStreamer().EmitARM64WinCFISaveFRegPX(Reg, Offset); + return false; +} + +/// parseDirectiveSEHSetFP +/// ::= .seh_set_fp +bool AArch64AsmParser::parseDirectiveSEHSetFP(SMLoc L) { + getTargetStreamer().EmitARM64WinCFISetFP(); + return false; +} + +/// parseDirectiveSEHAddFP +/// ::= .seh_add_fp +bool AArch64AsmParser::parseDirectiveSEHAddFP(SMLoc L) { + int64_t Size; + if (parseImmExpr(Size)) + return true; + getTargetStreamer().EmitARM64WinCFIAddFP(Size); + return false; +} + +/// parseDirectiveSEHNop +/// ::= .seh_nop +bool AArch64AsmParser::parseDirectiveSEHNop(SMLoc L) { + getTargetStreamer().EmitARM64WinCFINop(); + return false; +} + +/// parseDirectiveSEHSaveNext +/// ::= .seh_save_next +bool AArch64AsmParser::parseDirectiveSEHSaveNext(SMLoc L) { + getTargetStreamer().EmitARM64WinCFISaveNext(); + return false; +} + +/// parseDirectiveSEHEpilogStart +/// ::= .seh_startepilogue +bool AArch64AsmParser::parseDirectiveSEHEpilogStart(SMLoc L) { + getTargetStreamer().EmitARM64WinCFIEpilogStart(); + return false; +} + +/// parseDirectiveSEHEpilogEnd +/// ::= .seh_endepilogue +bool AArch64AsmParser::parseDirectiveSEHEpilogEnd(SMLoc L) { + getTargetStreamer().EmitARM64WinCFIEpilogEnd(); + return false; +} + +/// parseDirectiveSEHTrapFrame +/// ::= .seh_trap_frame +bool AArch64AsmParser::parseDirectiveSEHTrapFrame(SMLoc L) { + getTargetStreamer().EmitARM64WinCFITrapFrame(); + return false; +} + +/// parseDirectiveSEHMachineFrame +/// ::= .seh_pushframe +bool AArch64AsmParser::parseDirectiveSEHMachineFrame(SMLoc L) { + getTargetStreamer().EmitARM64WinCFIMachineFrame(); + return false; +} + +/// parseDirectiveSEHContext +/// ::= .seh_context +bool AArch64AsmParser::parseDirectiveSEHContext(SMLoc L) { + getTargetStreamer().EmitARM64WinCFIContext(); + return false; +} + +/// parseDirectiveSEHClearUnwoundToCall +/// ::= .seh_clear_unwound_to_call +bool AArch64AsmParser::parseDirectiveSEHClearUnwoundToCall(SMLoc L) { + getTargetStreamer().EmitARM64WinCFIClearUnwoundToCall(); + return false; +} + bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, AArch64MCExpr::VariantKind &ELFRefKind, @@ -6323,26 +6323,26 @@ AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) { return MatchOperand_Success; } - -OperandMatchResultTy -AArch64AsmParser::tryParseGPR64x8(OperandVector &Operands) { - SMLoc SS = getLoc(); - - unsigned XReg; - if (tryParseScalarRegister(XReg) != MatchOperand_Success) - return MatchOperand_NoMatch; - - MCContext &ctx = getContext(); - const MCRegisterInfo *RI = ctx.getRegisterInfo(); - int X8Reg = RI->getMatchingSuperReg( - XReg, AArch64::x8sub_0, - &AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID]); - if (!X8Reg) { - Error(SS, "expected an even-numbered x-register in the range [x0,x22]"); - return MatchOperand_ParseFail; - } - - Operands.push_back( - AArch64Operand::CreateReg(X8Reg, RegKind::Scalar, 
SS, getLoc(), ctx)); - return MatchOperand_Success; -} + +OperandMatchResultTy +AArch64AsmParser::tryParseGPR64x8(OperandVector &Operands) { + SMLoc SS = getLoc(); + + unsigned XReg; + if (tryParseScalarRegister(XReg) != MatchOperand_Success) + return MatchOperand_NoMatch; + + MCContext &ctx = getContext(); + const MCRegisterInfo *RI = ctx.getRegisterInfo(); + int X8Reg = RI->getMatchingSuperReg( + XReg, AArch64::x8sub_0, + &AArch64MCRegisterClasses[AArch64::GPR64x8ClassRegClassID]); + if (!X8Reg) { + Error(SS, "expected an even-numbered x-register in the range [x0,x22]"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateReg(X8Reg, RegKind::Scalar, SS, getLoc(), ctx)); + return MatchOperand_Success; +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/ya.make b/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/ya.make index 512f510d85..c9421c4c06 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/ya.make +++ b/contrib/libs/llvm12/lib/Target/AArch64/AsmParser/ya.make @@ -12,20 +12,20 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/MC/MCParser - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc - contrib/libs/llvm12/lib/Target/AArch64/TargetInfo - contrib/libs/llvm12/lib/Target/AArch64/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/MC/MCParser + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc + contrib/libs/llvm12/lib/Target/AArch64/TargetInfo + contrib/libs/llvm12/lib/Target/AArch64/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64/AsmParser + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64/AsmParser ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index dca76f8457..72f9968681 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -62,10 +62,10 @@ static DecodeStatus DecodeGPR64commonRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); +static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -271,16 +271,16 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, uint32_t Insn = (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); - const uint8_t *Tables[] = {DecoderTable32, DecoderTableFallback32}; - - for (auto Table : Tables) { - DecodeStatus Result = - decodeInstruction(Table, MI, Insn, Address, this, STI); - if (Result != MCDisassembler::Fail) - return Result; - } - - return MCDisassembler::Fail; + const uint8_t 
*Tables[] = {DecoderTable32, DecoderTableFallback32}; + + for (auto Table : Tables) { + DecodeStatus Result = + decodeInstruction(Table, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) + return Result; + } + + return MCDisassembler::Fail; } static MCSymbolizer * @@ -461,35 +461,35 @@ static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, return Success; } -static const unsigned GPR64x8DecoderTable[] = { - AArch64::X0_X1_X2_X3_X4_X5_X6_X7, - AArch64::X2_X3_X4_X5_X6_X7_X8_X9, - AArch64::X4_X5_X6_X7_X8_X9_X10_X11, - AArch64::X6_X7_X8_X9_X10_X11_X12_X13, - AArch64::X8_X9_X10_X11_X12_X13_X14_X15, - AArch64::X10_X11_X12_X13_X14_X15_X16_X17, - AArch64::X12_X13_X14_X15_X16_X17_X18_X19, - AArch64::X14_X15_X16_X17_X18_X19_X20_X21, - AArch64::X16_X17_X18_X19_X20_X21_X22_X23, - AArch64::X18_X19_X20_X21_X22_X23_X24_X25, - AArch64::X20_X21_X22_X23_X24_X25_X26_X27, - AArch64::X22_X23_X24_X25_X26_X27_X28_FP, -}; - -static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder) { - if (RegNo > 22) - return Fail; - if (RegNo & 1) - return Fail; - - unsigned Register = GPR64x8DecoderTable[RegNo >> 1]; - Inst.addOperand(MCOperand::createReg(Register)); - return Success; -} - +static const unsigned GPR64x8DecoderTable[] = { + AArch64::X0_X1_X2_X3_X4_X5_X6_X7, + AArch64::X2_X3_X4_X5_X6_X7_X8_X9, + AArch64::X4_X5_X6_X7_X8_X9_X10_X11, + AArch64::X6_X7_X8_X9_X10_X11_X12_X13, + AArch64::X8_X9_X10_X11_X12_X13_X14_X15, + AArch64::X10_X11_X12_X13_X14_X15_X16_X17, + AArch64::X12_X13_X14_X15_X16_X17_X18_X19, + AArch64::X14_X15_X16_X17_X18_X19_X20_X21, + AArch64::X16_X17_X18_X19_X20_X21_X22_X23, + AArch64::X18_X19_X20_X21_X22_X23_X24_X25, + AArch64::X20_X21_X22_X23_X24_X25_X26_X27, + AArch64::X22_X23_X24_X25_X26_X27_X28_FP, +}; + +static DecodeStatus DecodeGPR64x8ClassRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 22) + return Fail; + if (RegNo & 1) + return Fail; + + unsigned Register = GPR64x8DecoderTable[RegNo >> 1]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { diff --git a/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/ya.make b/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/ya.make index 096b55cd68..e4da353a77 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/ya.make +++ b/contrib/libs/llvm12/lib/Target/AArch64/Disassembler/ya.make @@ -12,20 +12,20 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/MC/MCDisassembler - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc - contrib/libs/llvm12/lib/Target/AArch64/TargetInfo - contrib/libs/llvm12/lib/Target/AArch64/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/MC/MCDisassembler + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc + contrib/libs/llvm12/lib/Target/AArch64/TargetInfo + contrib/libs/llvm12/lib/Target/AArch64/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64/Disassembler + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 + 
contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64/Disassembler ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 0f8b1d6584..7b05f70a73 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -52,10 +52,10 @@ AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) : CallLowering(&TLI) {} namespace { -struct IncomingArgHandler : public CallLowering::IncomingValueHandler { +struct IncomingArgHandler : public CallLowering::IncomingValueHandler { IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, CCAssignFn *AssignFn) - : IncomingValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {} + : IncomingValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {} Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -101,7 +101,7 @@ struct IncomingArgHandler : public CallLowering::IncomingValueHandler { /// How the physical register gets marked varies between formal /// parameters (it's a basic-block live-in), and a call instruction /// (it's an implicit-def of the BL). - virtual void markPhysRegUsed(MCRegister PhysReg) = 0; + virtual void markPhysRegUsed(MCRegister PhysReg) = 0; uint64_t StackUsed; }; @@ -111,7 +111,7 @@ struct FormalArgHandler : public IncomingArgHandler { CCAssignFn *AssignFn) : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} - void markPhysRegUsed(MCRegister PhysReg) override { + void markPhysRegUsed(MCRegister PhysReg) override { MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } @@ -122,19 +122,19 @@ struct CallReturnHandler : public IncomingArgHandler { MachineInstrBuilder MIB, CCAssignFn *AssignFn) : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} - void markPhysRegUsed(MCRegister PhysReg) override { + void markPhysRegUsed(MCRegister PhysReg) override { MIB.addDef(PhysReg, RegState::Implicit); } MachineInstrBuilder MIB; }; -struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { +struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, MachineInstrBuilder MIB, CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg, bool IsTailCall = false, int FPDiff = 0) - : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), + : OutgoingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff), StackSize(0), SPReg(0) {} @@ -187,8 +187,8 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { if (!Arg.IsFixed) MaxSize = 0; - assert(Arg.Regs.size() == 1); - + assert(Arg.Regs.size() == 1); + Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt ? 
extendRegister(Arg.Regs[0], VA, MaxSize) : Arg.Regs[0]; @@ -274,7 +274,7 @@ void AArch64CallLowering::splitToValueTypes( bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, ArrayRef<Register> VRegs, - FunctionLoweringInfo &FLI, + FunctionLoweringInfo &FLI, Register SwiftErrorVReg) const { auto MIB = MIRBuilder.buildInstrNoInsert(AArch64::RET_ReallyLR); assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && @@ -420,7 +420,7 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, // Conservatively forward X8, since it might be used for an aggregate // return. if (!CCInfo.isAllocated(AArch64::X8)) { - Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); + Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass); Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); } @@ -441,7 +441,7 @@ bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const { bool AArch64CallLowering::lowerFormalArguments( MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const { + ArrayRef<ArrayRef<Register>> VRegs, FunctionLoweringInfo &FLI) const { MachineFunction &MF = MIRBuilder.getMF(); MachineBasicBlock &MBB = MIRBuilder.getMBB(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -623,25 +623,25 @@ bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable( const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC); MachineRegisterInfo &MRI = MF.getRegInfo(); - if (Info.IsVarArg) { - // Be conservative and disallow variadic memory operands to match SDAG's - // behaviour. - // FIXME: If the caller's calling convention is C, then we can - // potentially use its argument area. However, for cases like fastcc, - // we can't do anything. - for (unsigned i = 0; i < OutLocs.size(); ++i) { - auto &ArgLoc = OutLocs[i]; - if (ArgLoc.isRegLoc()) - continue; + if (Info.IsVarArg) { + // Be conservative and disallow variadic memory operands to match SDAG's + // behaviour. + // FIXME: If the caller's calling convention is C, then we can + // potentially use its argument area. However, for cases like fastcc, + // we can't do anything. + for (unsigned i = 0; i < OutLocs.size(); ++i) { + auto &ArgLoc = OutLocs[i]; + if (ArgLoc.isRegLoc()) + continue; LLVM_DEBUG( dbgs() - << "... Cannot tail call vararg function with stack arguments\n"); + << "... Cannot tail call vararg function with stack arguments\n"); return false; } } - return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs); + return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs); } bool AArch64CallLowering::isEligibleForTailCallOptimization( @@ -756,7 +756,7 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use // x16 or x17. - if (CallerF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) + if (CallerF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) return AArch64::TCRETURNriBTI; return AArch64::TCRETURNri; @@ -776,7 +776,7 @@ bool AArch64CallLowering::lowerTailCall( // TODO: Right now, regbankselect doesn't know how to handle the rtcGPR64 // register class. Until we can do that, we should fall back here. 
- if (MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) { + if (MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement()) { LLVM_DEBUG( dbgs() << "Cannot lower indirect tail calls with BTI enabled yet.\n"); return false; @@ -894,9 +894,9 @@ bool AArch64CallLowering::lowerTailCall( // If Callee is a reg, since it is used by a target specific instruction, // it must have a register class matching the constraint of that instruction. if (Info.Callee.isReg()) - constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, - MIB->getDesc(), Info.Callee, 0); + constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, + MIB->getDesc(), Info.Callee, 0); MF.getFrameInfo().setHasTailCall(); Info.LoweredTailCall = true; @@ -978,9 +978,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // instruction, it must have a register class matching the // constraint of that instruction. if (Info.Callee.isReg()) - constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, - MIB->getDesc(), Info.Callee, 0); + constrainOperandRegClass(MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, + MIB->getDesc(), Info.Callee, 0); // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.h b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.h index 1f45c9ebc0..8054cf6b99 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64CallLowering.h @@ -34,14 +34,14 @@ public: AArch64CallLowering(const AArch64TargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, - ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI, + ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI, Register SwiftErrorVReg) const override; bool fallBackToDAGISel(const Function &F) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, - ArrayRef<ArrayRef<Register>> VRegs, - FunctionLoweringInfo &FLI) const override; + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const override; bool lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info) const override; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h index bed1136c7a..9536f0a596 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.h @@ -1,29 +1,29 @@ -//===- AArch64GlobalISelUtils.h ----------------------------------*- C++ -*-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// \file APIs for AArch64-specific helper functions used in the GlobalISel -/// pipeline. 
-//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H -#define LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H - -#include <cstdint> - -namespace llvm { -namespace AArch64GISelUtils { - -/// \returns true if \p C is a legal immediate operand for an arithmetic -/// instruction. -constexpr bool isLegalArithImmed(const uint64_t C) { - return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); -} - -} // namespace AArch64GISelUtils -} // namespace llvm - -#endif +//===- AArch64GlobalISelUtils.h ----------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file APIs for AArch64-specific helper functions used in the GlobalISel +/// pipeline. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H +#define LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H + +#include <cstdint> + +namespace llvm { +namespace AArch64GISelUtils { + +/// \returns true if \p C is a legal immediate operand for an arithmetic +/// instruction. +constexpr bool isLegalArithImmed(const uint64_t C) { + return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); +} + +} // namespace AArch64GISelUtils +} // namespace llvm + +#endif diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index fc5ef02e84..72f92065f3 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -18,7 +18,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" @@ -34,18 +34,18 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/IntrinsicsAArch64.h" -#include "llvm/Pass.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "aarch64-isel" using namespace llvm; -using namespace MIPatternMatch; +using namespace MIPatternMatch; namespace { @@ -103,23 +103,23 @@ private: bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; - ///@{ - /// Helper functions for selectCompareBranch. 
- bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, - MachineIRBuilder &MIB) const; - bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, - MachineIRBuilder &MIB) const; - bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, - MachineIRBuilder &MIB) const; - bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, + ///@{ + /// Helper functions for selectCompareBranch. + bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp, + MachineIRBuilder &MIB) const; + bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, + MachineIRBuilder &MIB) const; + bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp, + MachineIRBuilder &MIB) const; + bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const; - ///@} - + ///@} + bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const; - bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const; // Helper to generate an equivalent of scalar_to_vector into a new register, @@ -160,7 +160,7 @@ private: bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI) const; unsigned emitConstantPoolEntry(const Constant *CPVal, MachineFunction &MF) const; @@ -173,72 +173,72 @@ private: MachineIRBuilder &MIRBuilder) const; // Emit an integer compare between LHS and RHS, which checks for Predicate. - MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, - MachineOperand &Predicate, - MachineIRBuilder &MIRBuilder) const; - - /// Emit a floating point comparison between \p LHS and \p RHS. - /// \p Pred if given is the intended predicate to use. - MachineInstr *emitFPCompare(Register LHS, Register RHS, - MachineIRBuilder &MIRBuilder, - Optional<CmpInst::Predicate> = None) const; - - MachineInstr *emitInstr(unsigned Opcode, - std::initializer_list<llvm::DstOp> DstOps, - std::initializer_list<llvm::SrcOp> SrcOps, - MachineIRBuilder &MIRBuilder, - const ComplexRendererFns &RenderFns = None) const; - /// Helper function to emit an add or sub instruction. - /// - /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above - /// in a specific order. - /// - /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. - /// - /// \code - /// const std::array<std::array<unsigned, 2>, 4> Table { - /// {{AArch64::ADDXri, AArch64::ADDWri}, - /// {AArch64::ADDXrs, AArch64::ADDWrs}, - /// {AArch64::ADDXrr, AArch64::ADDWrr}, - /// {AArch64::SUBXri, AArch64::SUBWri}, - /// {AArch64::ADDXrx, AArch64::ADDWrx}}}; - /// \endcode - /// - /// Each row in the table corresponds to a different addressing mode. Each - /// column corresponds to a different register size. 
- /// - /// \attention Rows must be structured as follows: - /// - Row 0: The ri opcode variants - /// - Row 1: The rs opcode variants - /// - Row 2: The rr opcode variants - /// - Row 3: The ri opcode variants for negative immediates - /// - Row 4: The rx opcode variants - /// - /// \attention Columns must be structured as follows: - /// - Column 0: The 64-bit opcode variants - /// - Column 1: The 32-bit opcode variants - /// - /// \p Dst is the destination register of the binop to emit. - /// \p LHS is the left-hand operand of the binop to emit. - /// \p RHS is the right-hand operand of the binop to emit. - MachineInstr *emitAddSub( - const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, - Register Dst, MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, - MachineOperand &RHS, + MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS, + MachineOperand &Predicate, + MachineIRBuilder &MIRBuilder) const; + + /// Emit a floating point comparison between \p LHS and \p RHS. + /// \p Pred if given is the intended predicate to use. + MachineInstr *emitFPCompare(Register LHS, Register RHS, + MachineIRBuilder &MIRBuilder, + Optional<CmpInst::Predicate> = None) const; + + MachineInstr *emitInstr(unsigned Opcode, + std::initializer_list<llvm::DstOp> DstOps, + std::initializer_list<llvm::SrcOp> SrcOps, + MachineIRBuilder &MIRBuilder, + const ComplexRendererFns &RenderFns = None) const; + /// Helper function to emit an add or sub instruction. + /// + /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above + /// in a specific order. + /// + /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode. + /// + /// \code + /// const std::array<std::array<unsigned, 2>, 4> Table { + /// {{AArch64::ADDXri, AArch64::ADDWri}, + /// {AArch64::ADDXrs, AArch64::ADDWrs}, + /// {AArch64::ADDXrr, AArch64::ADDWrr}, + /// {AArch64::SUBXri, AArch64::SUBWri}, + /// {AArch64::ADDXrx, AArch64::ADDWrx}}}; + /// \endcode + /// + /// Each row in the table corresponds to a different addressing mode. Each + /// column corresponds to a different register size. + /// + /// \attention Rows must be structured as follows: + /// - Row 0: The ri opcode variants + /// - Row 1: The rs opcode variants + /// - Row 2: The rr opcode variants + /// - Row 3: The ri opcode variants for negative immediates + /// - Row 4: The rx opcode variants + /// + /// \attention Columns must be structured as follows: + /// - Column 0: The 64-bit opcode variants + /// - Column 1: The 32-bit opcode variants + /// + /// \p Dst is the destination register of the binop to emit. + /// \p LHS is the left-hand operand of the binop to emit. + /// \p RHS is the right-hand operand of the binop to emit. 
+ MachineInstr *emitAddSub( + const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, + Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, + MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS, + MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; - MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS, - AArch64CC::CondCode CC, - MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS, + AArch64CC::CondCode CC, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitExtractVectorElt(Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy, Register VecReg, unsigned LaneIdx, @@ -250,25 +250,25 @@ private: MachineInstr *emitFMovForFConstant(MachineInstr &MI, MachineRegisterInfo &MRI) const; - /// Emit a CSet for an integer compare. - /// - /// \p DefReg is expected to be a 32-bit scalar register. + /// Emit a CSet for an integer compare. + /// + /// \p DefReg is expected to be a 32-bit scalar register. MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred, MachineIRBuilder &MIRBuilder) const; - /// Emit a CSet for a FP compare. - /// - /// \p Dst is expected to be a 32-bit scalar register. - MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred, - MachineIRBuilder &MIRBuilder) const; - - /// Emit the overflow op for \p Opcode. - /// - /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO, - /// G_USUBO, etc. - std::pair<MachineInstr *, AArch64CC::CondCode> - emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, - MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; - + /// Emit a CSet for a FP compare. + /// + /// \p Dst is expected to be a 32-bit scalar register. + MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred, + MachineIRBuilder &MIRBuilder) const; + + /// Emit the overflow op for \p Opcode. + /// + /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO, + /// G_USUBO, etc. + std::pair<MachineInstr *, AArch64CC::CondCode> + emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS, + MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; + /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg. /// \p IsNegative is true if the test should be "not zero". /// This will also optimize the test bit instruction when possible. @@ -276,11 +276,11 @@ private: MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const; - /// Emit a CB(N)Z instruction which branches to \p DestMBB. - MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, - MachineBasicBlock *DestMBB, - MachineIRBuilder &MIB) const; - + /// Emit a CB(N)Z instruction which branches to \p DestMBB. 
+ MachineInstr *emitCBZ(Register CompareReg, bool IsNegative, + MachineBasicBlock *DestMBB, + MachineIRBuilder &MIB) const; + // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td. // We use these manually instead of using the importer since it doesn't // support SDNodeXForm. @@ -577,7 +577,7 @@ static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) { getConstantVRegValWithLookThrough(Root.getReg(), MRI, true); if (!ValAndVReg) return None; - Immed = ValAndVReg->Value.getSExtValue(); + Immed = ValAndVReg->Value.getSExtValue(); } else return None; return Immed; @@ -865,7 +865,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, #ifndef NDEBUG ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI); assert(ValidCopy && "Invalid copy."); - (void)KnownValid; + (void)KnownValid; #endif return ValidCopy; }; @@ -1012,173 +1012,173 @@ static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { return GenericOpc; } -MachineInstr * -AArch64InstructionSelector::emitSelect(Register Dst, Register True, - Register False, AArch64CC::CondCode CC, - MachineIRBuilder &MIB) const { - MachineRegisterInfo &MRI = *MIB.getMRI(); - assert(RBI.getRegBank(False, MRI, TRI)->getID() == - RBI.getRegBank(True, MRI, TRI)->getID() && - "Expected both select operands to have the same regbank?"); - LLT Ty = MRI.getType(True); - if (Ty.isVector()) - return nullptr; - const unsigned Size = Ty.getSizeInBits(); - assert((Size == 32 || Size == 64) && - "Expected 32 bit or 64 bit select only?"); - const bool Is32Bit = Size == 32; - if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { - unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; - auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); - constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); - return &*FCSel; - } - - // By default, we'll try and emit a CSEL. - unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr; - bool Optimized = false; - auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, - &Optimized](Register &Reg, Register &OtherReg, - bool Invert) { - if (Optimized) - return false; - - // Attempt to fold: - // - // %sub = G_SUB 0, %x - // %select = G_SELECT cc, %reg, %sub - // - // Into: - // %select = CSNEG %reg, %x, cc - Register MatchReg; - if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { - Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; - Reg = MatchReg; - if (Invert) { - CC = AArch64CC::getInvertedCondCode(CC); - std::swap(Reg, OtherReg); - } - return true; - } - - // Attempt to fold: - // - // %xor = G_XOR %x, -1 - // %select = G_SELECT cc, %reg, %xor - // - // Into: - // %select = CSINV %reg, %x, cc - if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { - Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; - Reg = MatchReg; - if (Invert) { - CC = AArch64CC::getInvertedCondCode(CC); - std::swap(Reg, OtherReg); - } - return true; - } - - // Attempt to fold: - // - // %add = G_ADD %x, 1 - // %select = G_SELECT cc, %reg, %add - // - // Into: - // %select = CSINC %reg, %x, cc - if (mi_match(Reg, MRI, m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)))) { - Opc = Is32Bit ? 
AArch64::CSINCWr : AArch64::CSINCXr; - Reg = MatchReg; - if (Invert) { - CC = AArch64CC::getInvertedCondCode(CC); - std::swap(Reg, OtherReg); - } - return true; - } - +MachineInstr * +AArch64InstructionSelector::emitSelect(Register Dst, Register True, + Register False, AArch64CC::CondCode CC, + MachineIRBuilder &MIB) const { + MachineRegisterInfo &MRI = *MIB.getMRI(); + assert(RBI.getRegBank(False, MRI, TRI)->getID() == + RBI.getRegBank(True, MRI, TRI)->getID() && + "Expected both select operands to have the same regbank?"); + LLT Ty = MRI.getType(True); + if (Ty.isVector()) + return nullptr; + const unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "Expected 32 bit or 64 bit select only?"); + const bool Is32Bit = Size == 32; + if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) { + unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr; + auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); + constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI); + return &*FCSel; + } + + // By default, we'll try and emit a CSEL. + unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr; + bool Optimized = false; + auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI, + &Optimized](Register &Reg, Register &OtherReg, + bool Invert) { + if (Optimized) + return false; + + // Attempt to fold: + // + // %sub = G_SUB 0, %x + // %select = G_SELECT cc, %reg, %sub + // + // Into: + // %select = CSNEG %reg, %x, cc + Register MatchReg; + if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) { + Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr; + Reg = MatchReg; + if (Invert) { + CC = AArch64CC::getInvertedCondCode(CC); + std::swap(Reg, OtherReg); + } + return true; + } + + // Attempt to fold: + // + // %xor = G_XOR %x, -1 + // %select = G_SELECT cc, %reg, %xor + // + // Into: + // %select = CSINV %reg, %x, cc + if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) { + Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; + Reg = MatchReg; + if (Invert) { + CC = AArch64CC::getInvertedCondCode(CC); + std::swap(Reg, OtherReg); + } + return true; + } + + // Attempt to fold: + // + // %add = G_ADD %x, 1 + // %select = G_SELECT cc, %reg, %add + // + // Into: + // %select = CSINC %reg, %x, cc + if (mi_match(Reg, MRI, m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)))) { + Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; + Reg = MatchReg; + if (Invert) { + CC = AArch64CC::getInvertedCondCode(CC); + std::swap(Reg, OtherReg); + } + return true; + } + return false; - }; - - // Helper lambda which tries to use CSINC/CSINV for the instruction when its - // true/false values are constants. - // FIXME: All of these patterns already exist in tablegen. We should be - // able to import these. - auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, - &Optimized]() { - if (Optimized) - return false; - auto TrueCst = getConstantVRegValWithLookThrough(True, MRI); - auto FalseCst = getConstantVRegValWithLookThrough(False, MRI); - if (!TrueCst && !FalseCst) - return false; - - Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; - if (TrueCst && FalseCst) { - int64_t T = TrueCst->Value.getSExtValue(); - int64_t F = FalseCst->Value.getSExtValue(); - - if (T == 0 && F == 1) { - // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc - Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; - True = ZReg; - False = ZReg; - return true; - } - - if (T == 0 && F == -1) { - // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc - Opc = Is32Bit ? 
AArch64::CSINVWr : AArch64::CSINVXr; - True = ZReg; - False = ZReg; - return true; - } - } - - if (TrueCst) { - int64_t T = TrueCst->Value.getSExtValue(); - if (T == 1) { - // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc - Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; - True = False; - False = ZReg; - CC = AArch64CC::getInvertedCondCode(CC); - return true; - } - - if (T == -1) { - // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc - Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; - True = False; - False = ZReg; - CC = AArch64CC::getInvertedCondCode(CC); - return true; - } - } - - if (FalseCst) { - int64_t F = FalseCst->Value.getSExtValue(); - if (F == 1) { - // G_SELECT cc, t, 1 -> CSINC t, zreg, cc - Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; - False = ZReg; - return true; - } - - if (F == -1) { - // G_SELECT cc, t, -1 -> CSINC t, zreg, cc - Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; - False = ZReg; - return true; - } - } - return false; - }; - - Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false); - Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true); - Optimized |= TryOptSelectCst(); - auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); - constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI); - return &*SelectInst; + }; + + // Helper lambda which tries to use CSINC/CSINV for the instruction when its + // true/false values are constants. + // FIXME: All of these patterns already exist in tablegen. We should be + // able to import these. + auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI, + &Optimized]() { + if (Optimized) + return false; + auto TrueCst = getConstantVRegValWithLookThrough(True, MRI); + auto FalseCst = getConstantVRegValWithLookThrough(False, MRI); + if (!TrueCst && !FalseCst) + return false; + + Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR; + if (TrueCst && FalseCst) { + int64_t T = TrueCst->Value.getSExtValue(); + int64_t F = FalseCst->Value.getSExtValue(); + + if (T == 0 && F == 1) { + // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc + Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; + True = ZReg; + False = ZReg; + return true; + } + + if (T == 0 && F == -1) { + // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc + Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; + True = ZReg; + False = ZReg; + return true; + } + } + + if (TrueCst) { + int64_t T = TrueCst->Value.getSExtValue(); + if (T == 1) { + // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc + Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; + True = False; + False = ZReg; + CC = AArch64CC::getInvertedCondCode(CC); + return true; + } + + if (T == -1) { + // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc + Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr; + True = False; + False = ZReg; + CC = AArch64CC::getInvertedCondCode(CC); + return true; + } + } + + if (FalseCst) { + int64_t F = FalseCst->Value.getSExtValue(); + if (F == 1) { + // G_SELECT cc, t, 1 -> CSINC t, zreg, cc + Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr; + False = ZReg; + return true; + } + + if (F == -1) { + // G_SELECT cc, t, -1 -> CSINC t, zreg, cc + Opc = Is32Bit ? 
AArch64::CSINVWr : AArch64::CSINVXr; + False = ZReg; + return true; + } + } + return false; + }; + + Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false); + Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true); + Optimized |= TryOptSelectCst(); + auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC); + constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI); + return &*SelectInst; } static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { @@ -1308,7 +1308,7 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); } if (VRegAndVal) - C = VRegAndVal->Value.getSExtValue(); + C = VRegAndVal->Value.getSExtValue(); break; } case TargetOpcode::G_ASHR: @@ -1318,7 +1318,7 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert, auto VRegAndVal = getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI); if (VRegAndVal) - C = VRegAndVal->Value.getSExtValue(); + C = VRegAndVal->Value.getSExtValue(); break; } } @@ -1420,9 +1420,9 @@ MachineInstr *AArch64InstructionSelector::emitTestBit( } bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( - MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, - MachineIRBuilder &MIB) const { - assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); + MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB, + MachineIRBuilder &MIB) const { + assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?"); // Given something like this: // // %x = ...Something... @@ -1444,92 +1444,92 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch( // Check if the AND has a constant on its RHS which we can use as a mask. // If it's a power of 2, then it's the same as checking a specific bit. // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set) - auto MaybeBit = getConstantVRegValWithLookThrough( - AndInst.getOperand(2).getReg(), *MIB.getMRI()); - if (!MaybeBit) + auto MaybeBit = getConstantVRegValWithLookThrough( + AndInst.getOperand(2).getReg(), *MIB.getMRI()); + if (!MaybeBit) return false; - int32_t Bit = MaybeBit->Value.exactLogBase2(); - if (Bit < 0) - return false; - - Register TestReg = AndInst.getOperand(1).getReg(); + int32_t Bit = MaybeBit->Value.exactLogBase2(); + if (Bit < 0) + return false; + Register TestReg = AndInst.getOperand(1).getReg(); + // Emit a TB(N)Z. 
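// [Editorial sketch, not part of the upstream diff] The fold above relies on the fact
// that a power-of-two AND mask tests exactly one bit, whose index is log2 of the mask
// (what exactLogBase2() returns, with -1 for non-power-of-two values). A minimal
// standalone C++ illustration of that mapping, assuming no LLVM APIs:
#include <cstdint>
// Returns the bit index tested by a single-bit mask, or -1 if Mask is not a power of two.
inline int maskToTestBit(uint64_t Mask) {
  if (Mask == 0 || (Mask & (Mask - 1)) != 0)
    return -1;                 // zero or more than one bit set
  int Bit = 0;
  while ((Mask & 1) == 0) {    // count trailing zeros
    Mask >>= 1;
    ++Bit;
  }
  return Bit;                  // e.g. maskToTestBit(8) == 3, matching "ANDing with 8 tests bit 3"
}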
emitTestBit(TestReg, Bit, Invert, DstMBB, MIB); return true; } -MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, - bool IsNegative, - MachineBasicBlock *DestMBB, - MachineIRBuilder &MIB) const { - assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); - MachineRegisterInfo &MRI = *MIB.getMRI(); - assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == - AArch64::GPRRegBankID && - "Expected GPRs only?"); - auto Ty = MRI.getType(CompareReg); - unsigned Width = Ty.getSizeInBits(); - assert(!Ty.isVector() && "Expected scalar only?"); - assert(Width <= 64 && "Expected width to be at most 64?"); - static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, - {AArch64::CBNZW, AArch64::CBNZX}}; - unsigned Opc = OpcTable[IsNegative][Width == 64]; - auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); - constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); - return &*BranchMI; -} - -bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( - MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { - assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); - assert(I.getOpcode() == TargetOpcode::G_BRCOND); - // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't - // totally clean. Some of them require two branches to implement. - auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); - emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, - Pred); - AArch64CC::CondCode CC1, CC2; - changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); +MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg, + bool IsNegative, + MachineBasicBlock *DestMBB, + MachineIRBuilder &MIB) const { + assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!"); + MachineRegisterInfo &MRI = *MIB.getMRI(); + assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() == + AArch64::GPRRegBankID && + "Expected GPRs only?"); + auto Ty = MRI.getType(CompareReg); + unsigned Width = Ty.getSizeInBits(); + assert(!Ty.isVector() && "Expected scalar only?"); + assert(Width <= 64 && "Expected width to be at most 64?"); + static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX}, + {AArch64::CBNZW, AArch64::CBNZX}}; + unsigned Opc = OpcTable[IsNegative][Width == 64]; + auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB); + constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI); + return &*BranchMI; +} + +bool AArch64InstructionSelector::selectCompareBranchFedByFCmp( + MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const { + assert(FCmp.getOpcode() == TargetOpcode::G_FCMP); + assert(I.getOpcode() == TargetOpcode::G_BRCOND); + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't + // totally clean. Some of them require two branches to implement. 
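// [Editorial sketch, not part of the upstream diff] The comment above notes that some
// LLVM FP predicates need two AArch64 condition codes (the CC1/CC2 pair below), hence
// two Bcc branches. For example, "ordered and not equal" has no single condition but is
// equivalent to (a < b) || (a > b): both comparisons are false for NaN operands and for
// equal ordered operands. Hypothetical helper for illustration only, not an LLVM API:
inline bool fcmpOrderedNotEqual(double A, double B) {
  return (A < B) || (A > B);   // two conditions, mirroring the two conditional branches
}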
+ auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate(); + emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB, + Pred); + AArch64CC::CondCode CC1, CC2; + changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); - if (CC2 != AArch64CC::AL) - MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( - MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { - assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); - assert(I.getOpcode() == TargetOpcode::G_BRCOND); - // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. - // - // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z - // instructions will not be produced, as they are conditional branch - // instructions that do not set flags. - if (!ProduceNonFlagSettingCondBr) + MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB); + if (CC2 != AArch64CC::AL) + MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( + MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { + assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); + assert(I.getOpcode() == TargetOpcode::G_BRCOND); + // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z. + // + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z + // instructions will not be produced, as they are conditional branch + // instructions that do not set flags. + if (!ProduceNonFlagSettingCondBr) return false; - MachineRegisterInfo &MRI = *MIB.getMRI(); - MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - auto Pred = - static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); - Register LHS = ICmp.getOperand(2).getReg(); - Register RHS = ICmp.getOperand(3).getReg(); - - // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. + MachineRegisterInfo &MRI = *MIB.getMRI(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + auto Pred = + static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate()); + Register LHS = ICmp.getOperand(2).getReg(); + Register RHS = ICmp.getOperand(3).getReg(); + + // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that. auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); - MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); + MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); // When we can emit a TB(N)Z, prefer that. // // Handle non-commutative condition codes first. // Note that we don't want to do this when we have a G_AND because it can // become a tst. The tst will make the test bit in the TB(N)Z redundant. - if (VRegAndVal && !AndInst) { - int64_t C = VRegAndVal->Value.getSExtValue(); + if (VRegAndVal && !AndInst) { + int64_t C = VRegAndVal->Value.getSExtValue(); // When we have a greater-than comparison, we can just test if the msb is // zero. @@ -1550,97 +1550,97 @@ bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp( } } - // Attempt to handle commutative condition codes. Right now, that's only - // eq/ne. 
- if (ICmpInst::isEquality(Pred)) { - if (!VRegAndVal) { - std::swap(RHS, LHS); - VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); - AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); - } - - if (VRegAndVal && VRegAndVal->Value == 0) { - // If there's a G_AND feeding into this branch, try to fold it away by - // emitting a TB(N)Z instead. - // - // Note: If we have LT, then it *is* possible to fold, but it wouldn't be - // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding - // would be redundant. - if (AndInst && - tryOptAndIntoCompareBranch( - *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { - I.eraseFromParent(); - return true; - } - - // Otherwise, try to emit a CB(N)Z instead. - auto LHSTy = MRI.getType(LHS); - if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { - emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); - I.eraseFromParent(); - return true; - } - } - } - - return false; -} - -bool AArch64InstructionSelector::selectCompareBranchFedByICmp( - MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { - assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); - assert(I.getOpcode() == TargetOpcode::G_BRCOND); - if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) + // Attempt to handle commutative condition codes. Right now, that's only + // eq/ne. + if (ICmpInst::isEquality(Pred)) { + if (!VRegAndVal) { + std::swap(RHS, LHS); + VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI); + AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI); + } + + if (VRegAndVal && VRegAndVal->Value == 0) { + // If there's a G_AND feeding into this branch, try to fold it away by + // emitting a TB(N)Z instead. + // + // Note: If we have LT, then it *is* possible to fold, but it wouldn't be + // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding + // would be redundant. + if (AndInst && + tryOptAndIntoCompareBranch( + *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) { + I.eraseFromParent(); + return true; + } + + // Otherwise, try to emit a CB(N)Z instead. + auto LHSTy = MRI.getType(LHS); + if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) { + emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB); + I.eraseFromParent(); + return true; + } + } + } + + return false; +} + +bool AArch64InstructionSelector::selectCompareBranchFedByICmp( + MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const { + assert(ICmp.getOpcode() == TargetOpcode::G_ICMP); + assert(I.getOpcode() == TargetOpcode::G_BRCOND); + if (tryOptCompareBranchFedByICmp(I, ICmp, MIB)) return true; - - // Couldn't optimize. Emit a compare + a Bcc. - MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); - auto PredOp = ICmp.getOperand(1); - emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); - const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( - static_cast<CmpInst::Predicate>(PredOp.getPredicate())); - MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); - I.eraseFromParent(); - return true; -} - -bool AArch64InstructionSelector::selectCompareBranch( - MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { - Register CondReg = I.getOperand(0).getReg(); - MachineInstr *CCMI = MRI.getVRegDef(CondReg); - if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) { - CondReg = CCMI->getOperand(1).getReg(); - CCMI = MRI.getVRegDef(CondReg); - } - - // Try to select the G_BRCOND using whatever is feeding the condition if - // possible. 
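// [Editorial sketch, not part of the upstream diff] When neither a G_FCMP nor a G_ICMP
// feeds the condition, the code further below falls back to branching on bit 0 of the
// boolean condition register (a TB(N)Z on bit 0, or an ANDS with 1 followed by a
// conditional branch). A minimal standalone C++ illustration of that test, assuming no
// LLVM APIs:
#include <cstdint>
inline bool brcondTaken(uint64_t CondReg) {
  return (CondReg & 1u) != 0;  // the branch is taken iff bit 0 of the i1 condition is set
}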
- MachineIRBuilder MIB(I); - unsigned CCMIOpc = CCMI->getOpcode(); - if (CCMIOpc == TargetOpcode::G_FCMP) - return selectCompareBranchFedByFCmp(I, *CCMI, MIB); - if (CCMIOpc == TargetOpcode::G_ICMP) - return selectCompareBranchFedByICmp(I, *CCMI, MIB); - - // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z - // instructions will not be produced, as they are conditional branch - // instructions that do not set flags. - if (ProduceNonFlagSettingCondBr) { - emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, - I.getOperand(1).getMBB(), MIB); + + // Couldn't optimize. Emit a compare + a Bcc. + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + auto PredOp = ICmp.getOperand(1); + emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB); + const AArch64CC::CondCode CC = changeICMPPredToAArch64CC( + static_cast<CmpInst::Predicate>(PredOp.getPredicate())); + MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectCompareBranch( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + Register CondReg = I.getOperand(0).getReg(); + MachineInstr *CCMI = MRI.getVRegDef(CondReg); + if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) { + CondReg = CCMI->getOperand(1).getReg(); + CCMI = MRI.getVRegDef(CondReg); + } + + // Try to select the G_BRCOND using whatever is feeding the condition if + // possible. + MachineIRBuilder MIB(I); + unsigned CCMIOpc = CCMI->getOpcode(); + if (CCMIOpc == TargetOpcode::G_FCMP) + return selectCompareBranchFedByFCmp(I, *CCMI, MIB); + if (CCMIOpc == TargetOpcode::G_ICMP) + return selectCompareBranchFedByICmp(I, *CCMI, MIB); + + // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z + // instructions will not be produced, as they are conditional branch + // instructions that do not set flags. + if (ProduceNonFlagSettingCondBr) { + emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true, + I.getOperand(1).getMBB(), MIB); I.eraseFromParent(); return true; } - // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. - auto TstMI = - MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); - constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - auto Bcc = MIB.buildInstr(AArch64::Bcc) - .addImm(AArch64CC::EQ) - .addMBB(I.getOperand(1).getMBB()); + // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead. + auto TstMI = + MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1); + constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); + auto Bcc = MIB.buildInstr(AArch64::Bcc) + .addImm(AArch64CC::EQ) + .addMBB(I.getOperand(1).getMBB()); I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); + return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI); } /// Returns the element immediate value of a vector shift operand if found. @@ -1661,8 +1661,8 @@ static Optional<int64_t> getVectorShiftImm(Register Reg, return None; if (Idx == 1) - ImmVal = VRegAndVal->Value.getSExtValue(); - if (ImmVal != VRegAndVal->Value.getSExtValue()) + ImmVal = VRegAndVal->Value.getSExtValue(); + if (ImmVal != VRegAndVal->Value.getSExtValue()) return None; } @@ -1725,14 +1725,14 @@ bool AArch64InstructionSelector::selectVectorSHL( Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; } else if (Ty == LLT::vector(2, 32)) { Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; - } else if (Ty == LLT::vector(4, 16)) { - Opc = ImmVal ? 
AArch64::SHLv4i16_shift : AArch64::USHLv4i16; - } else if (Ty == LLT::vector(8, 16)) { - Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; - } else if (Ty == LLT::vector(16, 8)) { - Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; - } else if (Ty == LLT::vector(8, 8)) { - Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; + } else if (Ty == LLT::vector(4, 16)) { + Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16; + } else if (Ty == LLT::vector(8, 16)) { + Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16; + } else if (Ty == LLT::vector(16, 8)) { + Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8; + } else if (Ty == LLT::vector(8, 8)) { + Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8; } else { LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); return false; @@ -1749,10 +1749,10 @@ bool AArch64InstructionSelector::selectVectorSHL( return true; } -bool AArch64InstructionSelector::selectVectorAshrLshr( +bool AArch64InstructionSelector::selectVectorAshrLshr( MachineInstr &I, MachineRegisterInfo &MRI) const { - assert(I.getOpcode() == TargetOpcode::G_ASHR || - I.getOpcode() == TargetOpcode::G_LSHR); + assert(I.getOpcode() == TargetOpcode::G_ASHR || + I.getOpcode() == TargetOpcode::G_LSHR); Register DstReg = I.getOperand(0).getReg(); const LLT Ty = MRI.getType(DstReg); Register Src1Reg = I.getOperand(1).getReg(); @@ -1761,40 +1761,40 @@ bool AArch64InstructionSelector::selectVectorAshrLshr( if (!Ty.isVector()) return false; - bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; - - // We expect the immediate case to be lowered in the PostLegalCombiner to - // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. - + bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR; + + // We expect the immediate case to be lowered in the PostLegalCombiner to + // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents. + // There is not a shift right register instruction, but the shift left // register instruction takes a signed value, where negative numbers specify a // right shift. unsigned Opc = 0; unsigned NegOpc = 0; - const TargetRegisterClass *RC = - getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI); + const TargetRegisterClass *RC = + getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI); if (Ty == LLT::vector(2, 64)) { - Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; + Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64; NegOpc = AArch64::NEGv2i64; } else if (Ty == LLT::vector(4, 32)) { - Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; + Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32; NegOpc = AArch64::NEGv4i32; } else if (Ty == LLT::vector(2, 32)) { - Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; + Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32; NegOpc = AArch64::NEGv2i32; - } else if (Ty == LLT::vector(4, 16)) { - Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16; - NegOpc = AArch64::NEGv4i16; - } else if (Ty == LLT::vector(8, 16)) { - Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; - NegOpc = AArch64::NEGv8i16; - } else if (Ty == LLT::vector(16, 8)) { - Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; - NegOpc = AArch64::NEGv16i8; - } else if (Ty == LLT::vector(8, 8)) { - Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8; - NegOpc = AArch64::NEGv8i8; + } else if (Ty == LLT::vector(4, 16)) { + Opc = IsASHR ? 
AArch64::SSHLv4i16 : AArch64::USHLv4i16; + NegOpc = AArch64::NEGv4i16; + } else if (Ty == LLT::vector(8, 16)) { + Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16; + NegOpc = AArch64::NEGv8i16; + } else if (Ty == LLT::vector(16, 8)) { + Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8; + NegOpc = AArch64::NEGv16i8; + } else if (Ty == LLT::vector(8, 8)) { + Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8; + NegOpc = AArch64::NEGv8i8; } else { LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type"); return false; @@ -1931,40 +1931,40 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) { MRI.setType(DstReg, LLT::scalar(64)); return true; } - case AArch64::G_DUP: { - // Convert the type from p0 to s64 to help selection. - LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - if (!DstTy.getElementType().isPointer()) - return false; - MachineIRBuilder MIB(I); - auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); - MRI.setType(I.getOperand(0).getReg(), - DstTy.changeElementType(LLT::scalar(64))); - MRI.setRegBank(NewSrc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); - I.getOperand(1).setReg(NewSrc.getReg(0)); - return true; - } - case TargetOpcode::G_UITOFP: - case TargetOpcode::G_SITOFP: { - // If both source and destination regbanks are FPR, then convert the opcode - // to G_SITOF so that the importer can select it to an fpr variant. - // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank - // copy. - Register SrcReg = I.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - LLT DstTy = MRI.getType(I.getOperand(0).getReg()); - if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) - return false; - - if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { - if (I.getOpcode() == TargetOpcode::G_SITOFP) - I.setDesc(TII.get(AArch64::G_SITOF)); - else - I.setDesc(TII.get(AArch64::G_UITOF)); - return true; - } - return false; - } + case AArch64::G_DUP: { + // Convert the type from p0 to s64 to help selection. + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + if (!DstTy.getElementType().isPointer()) + return false; + MachineIRBuilder MIB(I); + auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg()); + MRI.setType(I.getOperand(0).getReg(), + DstTy.changeElementType(LLT::scalar(64))); + MRI.setRegBank(NewSrc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID)); + I.getOperand(1).setReg(NewSrc.getReg(0)); + return true; + } + case TargetOpcode::G_UITOFP: + case TargetOpcode::G_SITOFP: { + // If both source and destination regbanks are FPR, then convert the opcode + // to G_SITOF so that the importer can select it to an fpr variant. + // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank + // copy. + Register SrcReg = I.getOperand(1).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits()) + return false; + + if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) { + if (I.getOpcode() == TargetOpcode::G_SITOFP) + I.setDesc(TII.get(AArch64::G_SITOF)); + else + I.setDesc(TII.get(AArch64::G_UITOF)); + return true; + } + return false; + } default: return false; } @@ -2005,14 +2005,14 @@ bool AArch64InstructionSelector::convertPtrAddToAdd( LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd"); return false; } - - // Also take the opportunity here to try to do some optimization. 
- // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. - Register NegatedReg; - if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) - return true; - I.getOperand(2).setReg(NegatedReg); - I.setDesc(TII.get(TargetOpcode::G_SUB)); + + // Also take the opportunity here to try to do some optimization. + // Try to convert this into a G_SUB if the offset is a 0-x negate idiom. + Register NegatedReg; + if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg)))) + return true; + I.getOperand(2).setReg(NegatedReg); + I.setDesc(TII.get(TargetOpcode::G_SUB)); return true; } @@ -2102,17 +2102,17 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const { MachineRegisterInfo &MRI = MF.getRegInfo(); switch (I.getOpcode()) { - case TargetOpcode::G_BR: { - // If the branch jumps to the fallthrough block, don't bother emitting it. - // Only do this for -O0 for a good code size improvement, because when - // optimizations are enabled we want to leave this choice to - // MachineBlockPlacement. - bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None; - if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB())) - return false; - I.eraseFromParent(); - return true; - } + case TargetOpcode::G_BR: { + // If the branch jumps to the fallthrough block, don't bother emitting it. + // Only do this for -O0 for a good code size improvement, because when + // optimizations are enabled we want to leave this choice to + // MachineBlockPlacement. + bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None; + if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB())) + return false; + I.eraseFromParent(); + return true; + } case TargetOpcode::G_SHL: return earlySelectSHL(I, MRI); case TargetOpcode::G_CONSTANT: { @@ -2232,8 +2232,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { MachineIRBuilder MIB(I); switch (Opcode) { - case TargetOpcode::G_BRCOND: - return selectCompareBranch(I, MF, MRI); + case TargetOpcode::G_BRCOND: + return selectCompareBranch(I, MF, MRI); case TargetOpcode::G_BRINDIRECT: { I.setDesc(TII.get(AArch64::BR)); @@ -2313,7 +2313,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); - const LLT s128 = LLT::scalar(128); + const LLT s128 = LLT::scalar(128); const LLT p0 = LLT::pointer(0, 64); const Register DefReg = I.getOperand(0).getReg(); @@ -2323,10 +2323,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // FIXME: Redundant check, but even less readable when factored out. if (isFP) { - if (Ty != s32 && Ty != s64 && Ty != s128) { + if (Ty != s32 && Ty != s64 && Ty != s128) { LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty << " constant, expected: " << s32 << " or " << s64 - << " or " << s128 << '\n'); + << " or " << s128 << '\n'); return false; } @@ -2339,9 +2339,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // The case when we have 0.0 is covered by tablegen. Reject it here so we // can be sure tablegen works correctly and isn't rescued by this code. - // 0.0 is not covered by tablegen for FP128. So we will handle this - // scenario in the code here. - if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) + // 0.0 is not covered by tablegen for FP128. So we will handle this + // scenario in the code here. 
+ if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0)) return false; } else { // s32 and s64 are covered by tablegen. @@ -2368,17 +2368,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // Either emit a FMOV, or emit a copy to emit a normal mov. const TargetRegisterClass &GPRRC = DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass; - const TargetRegisterClass &FPRRC = - DefSize == 32 ? AArch64::FPR32RegClass - : (DefSize == 64 ? AArch64::FPR64RegClass - : AArch64::FPR128RegClass); + const TargetRegisterClass &FPRRC = + DefSize == 32 ? AArch64::FPR32RegClass + : (DefSize == 64 ? AArch64::FPR64RegClass + : AArch64::FPR128RegClass); // Can we use a FMOV instruction to represent the immediate? if (emitFMovForFConstant(I, MRI)) return true; // For 64b values, emit a constant pool load instead. - if (DefSize == 64 || DefSize == 128) { + if (DefSize == 64 || DefSize == 128) { auto *FPImm = I.getOperand(1).getFPImm(); MachineIRBuilder MIB(I); auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB); @@ -2571,21 +2571,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } auto &MemOp = **I.memoperands_begin(); - uint64_t MemSizeInBytes = MemOp.getSize(); + uint64_t MemSizeInBytes = MemOp.getSize(); if (MemOp.isAtomic()) { // For now we just support s8 acquire loads to be able to compile stack // protector code. if (MemOp.getOrdering() == AtomicOrdering::Acquire && - MemSizeInBytes == 1) { + MemSizeInBytes == 1) { I.setDesc(TII.get(AArch64::LDARB)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } LLVM_DEBUG(dbgs() << "Atomic load/store not fully supported yet\n"); return false; } - unsigned MemSizeInBits = MemSizeInBytes * 8; + unsigned MemSizeInBits = MemSizeInBytes * 8; -#ifndef NDEBUG +#ifndef NDEBUG const Register PtrReg = I.getOperand(1).getReg(); const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); // Sanity-check the pointer register. @@ -2598,78 +2598,78 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { const Register ValReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); - // Helper lambda for partially selecting I. Either returns the original - // instruction with an updated opcode, or a new instruction. - auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { - bool IsStore = I.getOpcode() == TargetOpcode::G_STORE; - const unsigned NewOpc = - selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); - if (NewOpc == I.getOpcode()) - return nullptr; - // Check if we can fold anything into the addressing mode. - auto AddrModeFns = - selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); - if (!AddrModeFns) { - // Can't fold anything. Use the original instruction. - I.setDesc(TII.get(NewOpc)); - I.addOperand(MachineOperand::CreateImm(0)); - return &I; + // Helper lambda for partially selecting I. Either returns the original + // instruction with an updated opcode, or a new instruction. + auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * { + bool IsStore = I.getOpcode() == TargetOpcode::G_STORE; + const unsigned NewOpc = + selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits); + if (NewOpc == I.getOpcode()) + return nullptr; + // Check if we can fold anything into the addressing mode. + auto AddrModeFns = + selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes); + if (!AddrModeFns) { + // Can't fold anything. Use the original instruction. 
+ I.setDesc(TII.get(NewOpc)); + I.addOperand(MachineOperand::CreateImm(0)); + return &I; } - // Folded something. Create a new instruction and return it. - auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); - IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg); - NewInst.cloneMemRefs(I); - for (auto &Fn : *AddrModeFns) - Fn(NewInst); - I.eraseFromParent(); - return &*NewInst; - }; + // Folded something. Create a new instruction and return it. + auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags()); + IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg); + NewInst.cloneMemRefs(I); + for (auto &Fn : *AddrModeFns) + Fn(NewInst); + I.eraseFromParent(); + return &*NewInst; + }; - MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); - if (!LoadStore) - return false; + MachineInstr *LoadStore = SelectLoadStoreAddressingMode(); + if (!LoadStore) + return false; // If we're storing a 0, use WZR/XZR. - if (Opcode == TargetOpcode::G_STORE) { - auto CVal = getConstantVRegValWithLookThrough( - LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true, - /*HandleFConstants = */ false); - if (CVal && CVal->Value == 0) { - switch (LoadStore->getOpcode()) { - case AArch64::STRWui: - case AArch64::STRHHui: - case AArch64::STRBBui: - LoadStore->getOperand(0).setReg(AArch64::WZR); - break; - case AArch64::STRXui: - LoadStore->getOperand(0).setReg(AArch64::XZR); - break; - } + if (Opcode == TargetOpcode::G_STORE) { + auto CVal = getConstantVRegValWithLookThrough( + LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true, + /*HandleFConstants = */ false); + if (CVal && CVal->Value == 0) { + switch (LoadStore->getOpcode()) { + case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: + LoadStore->getOperand(0).setReg(AArch64::WZR); + break; + case AArch64::STRXui: + LoadStore->getOperand(0).setReg(AArch64::XZR); + break; + } } } if (IsZExtLoad) { - // The zextload from a smaller type to i32 should be handled by the - // importer. - if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) + // The zextload from a smaller type to i32 should be handled by the + // importer. + if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64) return false; // If we have a ZEXTLOAD then change the load's type to be a narrower reg - // and zero_extend with SUBREG_TO_REG. + // and zero_extend with SUBREG_TO_REG. Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - Register DstReg = LoadStore->getOperand(0).getReg(); - LoadStore->getOperand(0).setReg(LdReg); + Register DstReg = LoadStore->getOperand(0).getReg(); + LoadStore->getOperand(0).setReg(LdReg); - MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); + MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator())); MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {}) .addImm(0) .addUse(LdReg) .addImm(AArch64::sub_32); - constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); + constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass, MRI); } - return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); + return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI); } case TargetOpcode::G_SMULH: @@ -2700,21 +2700,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { // operands to use appropriate classes. 
return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - case TargetOpcode::G_LSHR: + case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: if (MRI.getType(I.getOperand(0).getReg()).isVector()) - return selectVectorAshrLshr(I, MRI); + return selectVectorAshrLshr(I, MRI); LLVM_FALLTHROUGH; case TargetOpcode::G_SHL: if (Opcode == TargetOpcode::G_SHL && MRI.getType(I.getOperand(0).getReg()).isVector()) return selectVectorSHL(I, MRI); LLVM_FALLTHROUGH; - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_OR: { + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_OR: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; @@ -2743,24 +2743,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { I.eraseFromParent(); return true; } - case TargetOpcode::G_SADDO: - case TargetOpcode::G_UADDO: - case TargetOpcode::G_SSUBO: - case TargetOpcode::G_USUBO: { - // Emit the operation and get the correct condition code. + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SSUBO: + case TargetOpcode::G_USUBO: { + // Emit the operation and get the correct condition code. MachineIRBuilder MIRBuilder(I); - auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(), - I.getOperand(2), I.getOperand(3), MIRBuilder); + auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(), + I.getOperand(2), I.getOperand(3), MIRBuilder); // Now, put the overflow result in the register given by the first operand - // to the overflow op. CSINC increments the result when the predicate is - // false, so to get the increment when it's true, we need to use the - // inverse. In this case, we want to increment when carry is set. - Register ZReg = AArch64::WZR; + // to the overflow op. CSINC increments the result when the predicate is + // false, so to get the increment when it's true, we need to use the + // inverse. In this case, we want to increment when carry is set. + Register ZReg = AArch64::WZR; auto CsetMI = MIRBuilder .buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()}, - {ZReg, ZReg}) - .addImm(getInvertedCondCode(OpAndCC.second)); + {ZReg, ZReg}) + .addImm(getInvertedCondCode(OpAndCC.second)); constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI); I.eraseFromParent(); return true; @@ -2768,7 +2768,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_PTRMASK: { Register MaskReg = I.getOperand(2).getReg(); - Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI); + Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI); // TODO: Implement arbitrary cases if (!MaskVal || !isShiftedMask_64(*MaskVal)) return false; @@ -3059,15 +3059,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { if (tryOptSelect(I)) return true; - // Make sure to use an unused vreg instead of wzr, so that the peephole - // optimizations will be able to optimize these. 
- MachineIRBuilder MIB(I); - Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); - constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) - return false; + // Make sure to use an unused vreg instead of wzr, so that the peephole + // optimizations will be able to optimize these. + MachineIRBuilder MIB(I); + Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg}) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); + if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB)) + return false; I.eraseFromParent(); return true; } @@ -3082,21 +3082,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } MachineIRBuilder MIRBuilder(I); - auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); - emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), - MIRBuilder); + auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); + emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), + MIRBuilder); emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder); I.eraseFromParent(); return true; } case TargetOpcode::G_FCMP: { - MachineIRBuilder MIRBuilder(I); - CmpInst::Predicate Pred = - static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); - if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), - MIRBuilder, Pred) || - !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder)) + MachineIRBuilder MIRBuilder(I); + CmpInst::Predicate Pred = + static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate()); + if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), + MIRBuilder, Pred) || + !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIRBuilder)) return false; I.eraseFromParent(); return true; @@ -3136,24 +3136,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI); } } - case AArch64::G_DUP: { - // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by - // imported patterns. Do it manually here. Avoiding generating s16 gpr is - // difficult because at RBS we may end up pessimizing the fpr case if we - // decided to add an anyextend to fix this. Manual selection is the most - // robust solution for now. - Register SrcReg = I.getOperand(1).getReg(); - if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::GPRRegBankID) - return false; // We expect the fpr regbank case to be imported. - LLT SrcTy = MRI.getType(SrcReg); - if (SrcTy.getSizeInBits() == 16) - I.setDesc(TII.get(AArch64::DUPv8i16gpr)); - else if (SrcTy.getSizeInBits() == 8) - I.setDesc(TII.get(AArch64::DUPv16i8gpr)); - else - return false; - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } + case AArch64::G_DUP: { + // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by + // imported patterns. Do it manually here. Avoiding generating s16 gpr is + // difficult because at RBS we may end up pessimizing the fpr case if we + // decided to add an anyextend to fix this. Manual selection is the most + // robust solution for now. 
+ Register SrcReg = I.getOperand(1).getReg(); + if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::GPRRegBankID) + return false; // We expect the fpr regbank case to be imported. + LLT SrcTy = MRI.getType(SrcReg); + if (SrcTy.getSizeInBits() == 16) + I.setDesc(TII.get(AArch64::DUPv8i16gpr)); + else if (SrcTy.getSizeInBits() == 8) + I.setDesc(TII.get(AArch64::DUPv16i8gpr)); + else + return false; + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } case TargetOpcode::G_INTRINSIC_TRUNC: return selectIntrinsicTrunc(I, MRI); case TargetOpcode::G_INTRINSIC_ROUND: @@ -3174,52 +3174,52 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return selectConcatVectors(I, MRI); case TargetOpcode::G_JUMP_TABLE: return selectJumpTable(I, MRI); - case TargetOpcode::G_VECREDUCE_FADD: - case TargetOpcode::G_VECREDUCE_ADD: - return selectReduction(I, MRI); - } - - return false; -} - -bool AArch64InstructionSelector::selectReduction( - MachineInstr &I, MachineRegisterInfo &MRI) const { - Register VecReg = I.getOperand(1).getReg(); - LLT VecTy = MRI.getType(VecReg); - if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) { - unsigned Opc = 0; - if (VecTy == LLT::vector(16, 8)) - Opc = AArch64::ADDVv16i8v; - else if (VecTy == LLT::vector(8, 16)) - Opc = AArch64::ADDVv8i16v; - else if (VecTy == LLT::vector(4, 32)) - Opc = AArch64::ADDVv4i32v; - else if (VecTy == LLT::vector(2, 64)) - Opc = AArch64::ADDPv2i64p; - else { - LLVM_DEBUG(dbgs() << "Unhandled type for add reduction"); - return false; - } - I.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + case TargetOpcode::G_VECREDUCE_FADD: + case TargetOpcode::G_VECREDUCE_ADD: + return selectReduction(I, MRI); } - if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) { - unsigned Opc = 0; - if (VecTy == LLT::vector(2, 32)) - Opc = AArch64::FADDPv2i32p; - else if (VecTy == LLT::vector(2, 64)) - Opc = AArch64::FADDPv2i64p; - else { - LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction"); - return false; - } - I.setDesc(TII.get(Opc)); - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); - } return false; } +bool AArch64InstructionSelector::selectReduction( + MachineInstr &I, MachineRegisterInfo &MRI) const { + Register VecReg = I.getOperand(1).getReg(); + LLT VecTy = MRI.getType(VecReg); + if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) { + unsigned Opc = 0; + if (VecTy == LLT::vector(16, 8)) + Opc = AArch64::ADDVv16i8v; + else if (VecTy == LLT::vector(8, 16)) + Opc = AArch64::ADDVv8i16v; + else if (VecTy == LLT::vector(4, 32)) + Opc = AArch64::ADDVv4i32v; + else if (VecTy == LLT::vector(2, 64)) + Opc = AArch64::ADDPv2i64p; + else { + LLVM_DEBUG(dbgs() << "Unhandled type for add reduction"); + return false; + } + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + + if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) { + unsigned Opc = 0; + if (VecTy == LLT::vector(2, 32)) + Opc = AArch64::FADDPv2i32p; + else if (VecTy == LLT::vector(2, 64)) + Opc = AArch64::FADDPv2i64p; + else { + LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction"); + return false; + } + I.setDesc(TII.get(Opc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + return false; +} + bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT"); @@ -3230,8 +3230,8 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I, Register TargetReg = 
MRI.createVirtualRegister(&AArch64::GPR64RegClass); Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - - MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); + + MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr); auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg}, {JTAddr, Index}) .addJumpTableIndex(JTI); @@ -3268,20 +3268,20 @@ bool AArch64InstructionSelector::selectTLSGlobalValue( const GlobalValue &GV = *I.getOperand(1).getGlobal(); MachineIRBuilder MIB(I); - auto LoadGOT = - MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) - .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); + auto LoadGOT = + MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {}) + .addGlobalAddress(&GV, 0, AArch64II::MO_TLS); auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass}, - {LoadGOT.getReg(0)}) + {LoadGOT.getReg(0)}) .addImm(0); - MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); + MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0)); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load}) - .addUse(AArch64::X0, RegState::Implicit) + .addUse(AArch64::X0, RegState::Implicit) .addDef(AArch64::X0, RegState::Implicit) .addRegMask(TRI.getTLSCallPreservedMask()); @@ -3767,7 +3767,7 @@ bool AArch64InstructionSelector::selectExtractElt( (void)WideTy; assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() && "source register size too small!"); - assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); + assert(!NarrowTy.isVector() && "cannot extract vector into vector!"); // Need the lane index to determine the correct copy opcode. 
MachineOperand &LaneIdxOp = I.getOperand(2); @@ -3782,7 +3782,7 @@ bool AArch64InstructionSelector::selectExtractElt( auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI); if (!VRegAndVal) return false; - unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); + unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); MachineIRBuilder MIRBuilder(I); @@ -4005,10 +4005,10 @@ static std::pair<unsigned, unsigned> getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { unsigned Opc, SubregIdx; if (RB.getID() == AArch64::GPRRegBankID) { - if (EltSize == 16) { - Opc = AArch64::INSvi16gpr; - SubregIdx = AArch64::ssub; - } else if (EltSize == 32) { + if (EltSize == 16) { + Opc = AArch64::INSvi16gpr; + SubregIdx = AArch64::ssub; + } else if (EltSize == 32) { Opc = AArch64::INSvi32gpr; SubregIdx = AArch64::ssub; } else if (EltSize == 64) { @@ -4037,93 +4037,93 @@ getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) { return std::make_pair(Opc, SubregIdx); } -MachineInstr *AArch64InstructionSelector::emitInstr( - unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, - std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, - const ComplexRendererFns &RenderFns) const { - assert(Opcode && "Expected an opcode?"); - assert(!isPreISelGenericOpcode(Opcode) && - "Function should only be used to produce selected instructions!"); - auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); - if (RenderFns) - for (auto &Fn : *RenderFns) - Fn(MI); - constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); - return &*MI; -} - -MachineInstr *AArch64InstructionSelector::emitAddSub( - const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, - Register Dst, MachineOperand &LHS, MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const { - MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); - auto Ty = MRI.getType(LHS.getReg()); - assert(!Ty.isVector() && "Expected a scalar or pointer?"); - unsigned Size = Ty.getSizeInBits(); - assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); - bool Is32Bit = Size == 32; - - // INSTRri form with positive arithmetic immediate. - if (auto Fns = selectArithImmed(RHS)) - return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, - MIRBuilder, Fns); - - // INSTRri form with negative arithmetic immediate. - if (auto Fns = selectNegArithImmed(RHS)) - return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, - MIRBuilder, Fns); - - // INSTRrx form. - if (auto Fns = selectArithExtendedRegister(RHS)) - return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, - MIRBuilder, Fns); - - // INSTRrs form. 
- if (auto Fns = selectShiftedRegister(RHS)) - return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, - MIRBuilder, Fns); - return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, - MIRBuilder); -} - +MachineInstr *AArch64InstructionSelector::emitInstr( + unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps, + std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder, + const ComplexRendererFns &RenderFns) const { + assert(Opcode && "Expected an opcode?"); + assert(!isPreISelGenericOpcode(Opcode) && + "Function should only be used to produce selected instructions!"); + auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps); + if (RenderFns) + for (auto &Fn : *RenderFns) + Fn(MI); + constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); + return &*MI; +} + +MachineInstr *AArch64InstructionSelector::emitAddSub( + const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode, + Register Dst, MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); + auto Ty = MRI.getType(LHS.getReg()); + assert(!Ty.isVector() && "Expected a scalar or pointer?"); + unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only"); + bool Is32Bit = Size == 32; + + // INSTRri form with positive arithmetic immediate. + if (auto Fns = selectArithImmed(RHS)) + return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS}, + MIRBuilder, Fns); + + // INSTRri form with negative arithmetic immediate. + if (auto Fns = selectNegArithImmed(RHS)) + return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS}, + MIRBuilder, Fns); + + // INSTRrx form. + if (auto Fns = selectArithExtendedRegister(RHS)) + return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS}, + MIRBuilder, Fns); + + // INSTRrs form. 
+ if (auto Fns = selectShiftedRegister(RHS)) + return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS}, + MIRBuilder, Fns); + return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS}, + MIRBuilder); +} + MachineInstr * AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { - const std::array<std::array<unsigned, 2>, 5> OpcTable{ - {{AArch64::ADDXri, AArch64::ADDWri}, - {AArch64::ADDXrs, AArch64::ADDWrs}, - {AArch64::ADDXrr, AArch64::ADDWrr}, - {AArch64::SUBXri, AArch64::SUBWri}, - {AArch64::ADDXrx, AArch64::ADDWrx}}}; - return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); -} - -MachineInstr * -AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, - MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const { - const std::array<std::array<unsigned, 2>, 5> OpcTable{ - {{AArch64::ADDSXri, AArch64::ADDSWri}, - {AArch64::ADDSXrs, AArch64::ADDSWrs}, - {AArch64::ADDSXrr, AArch64::ADDSWrr}, - {AArch64::SUBSXri, AArch64::SUBSWri}, - {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; - return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); -} - -MachineInstr * -AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, - MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const { - const std::array<std::array<unsigned, 2>, 5> OpcTable{ - {{AArch64::SUBSXri, AArch64::SUBSWri}, - {AArch64::SUBSXrs, AArch64::SUBSWrs}, - {AArch64::SUBSXrr, AArch64::SUBSWrr}, - {AArch64::ADDSXri, AArch64::ADDSWri}, - {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; - return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); + const std::array<std::array<unsigned, 2>, 5> OpcTable{ + {{AArch64::ADDXri, AArch64::ADDWri}, + {AArch64::ADDXrs, AArch64::ADDWrs}, + {AArch64::ADDXrr, AArch64::ADDWrr}, + {AArch64::SUBXri, AArch64::SUBWri}, + {AArch64::ADDXrx, AArch64::ADDWrx}}}; + return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder); +} + +MachineInstr * +AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + const std::array<std::array<unsigned, 2>, 5> OpcTable{ + {{AArch64::ADDSXri, AArch64::ADDSWri}, + {AArch64::ADDSXrs, AArch64::ADDSWrs}, + {AArch64::ADDSXrr, AArch64::ADDSWrr}, + {AArch64::SUBSXri, AArch64::SUBSWri}, + {AArch64::ADDSXrx, AArch64::ADDSWrx}}}; + return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); +} + +MachineInstr * +AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + const std::array<std::array<unsigned, 2>, 5> OpcTable{ + {{AArch64::SUBSXri, AArch64::SUBSWri}, + {AArch64::SUBSXrs, AArch64::SUBSWrs}, + {AArch64::SUBSXrr, AArch64::SUBSWrr}, + {AArch64::ADDSXri, AArch64::ADDSWri}, + {AArch64::SUBSXrx, AArch64::SUBSWrx}}}; + return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder); } MachineInstr * @@ -4131,129 +4131,129 @@ AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32); - auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass; - return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); + auto RC = Is32Bit ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass; + return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); } MachineInstr * -AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, +AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { - assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); + assert(LHS.isReg() && RHS.isReg() && "Expected register operands?"); MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - LLT Ty = MRI.getType(LHS.getReg()); - unsigned RegSize = Ty.getSizeInBits(); + LLT Ty = MRI.getType(LHS.getReg()); + unsigned RegSize = Ty.getSizeInBits(); bool Is32Bit = (RegSize == 32); - const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, - {AArch64::ANDSXrs, AArch64::ANDSWrs}, - {AArch64::ANDSXrr, AArch64::ANDSWrr}}; - // ANDS needs a logical immediate for its immediate form. Check if we can - // fold one in. - if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { - int64_t Imm = ValAndVReg->Value.getSExtValue(); - - if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { - auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); - TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); - constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); - return &*TstMI; - } - } - - if (auto Fns = selectLogicalShiftedRegister(RHS)) - return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); - return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); + const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri}, + {AArch64::ANDSXrs, AArch64::ANDSWrs}, + {AArch64::ANDSXrr, AArch64::ANDSWrr}}; + // ANDS needs a logical immediate for its immediate form. Check if we can + // fold one in. + if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) { + int64_t Imm = ValAndVReg->Value.getSExtValue(); + + if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) { + auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS}); + TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); + constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI); + return &*TstMI; + } + } + + if (auto Fns = selectLogicalShiftedRegister(RHS)) + return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns); + return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder); } -MachineInstr *AArch64InstructionSelector::emitIntegerCompare( +MachineInstr *AArch64InstructionSelector::emitIntegerCompare( MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate, MachineIRBuilder &MIRBuilder) const { assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!"); assert(Predicate.isPredicate() && "Expected predicate?"); MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); - LLT CmpTy = MRI.getType(LHS.getReg()); - assert(!CmpTy.isVector() && "Expected scalar or pointer"); - unsigned Size = CmpTy.getSizeInBits(); - (void)Size; - assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); - // Fold the compare into a cmn or tst if possible. 
- if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) - return FoldCmp; - auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); - return emitSUBS(Dst, LHS, RHS, MIRBuilder); -} - -MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( - Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); -#ifndef NDEBUG - LLT Ty = MRI.getType(Dst); - assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && - "Expected a 32-bit scalar register?"); -#endif - const Register ZeroReg = AArch64::WZR; - auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) { - auto CSet = - MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg}) - .addImm(getInvertedCondCode(CC)); - constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI); - return &*CSet; - }; - - AArch64CC::CondCode CC1, CC2; - changeFCMPPredToAArch64CC(Pred, CC1, CC2); - if (CC2 == AArch64CC::AL) - return EmitCSet(Dst, CC1); - - const TargetRegisterClass *RC = &AArch64::GPR32RegClass; - Register Def1Reg = MRI.createVirtualRegister(RC); - Register Def2Reg = MRI.createVirtualRegister(RC); - EmitCSet(Def1Reg, CC1); - EmitCSet(Def2Reg, CC2); - auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); - constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); - return &*OrMI; -} - -MachineInstr * -AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS, - MachineIRBuilder &MIRBuilder, - Optional<CmpInst::Predicate> Pred) const { - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - LLT Ty = MRI.getType(LHS); - if (Ty.isVector()) - return nullptr; - unsigned OpSize = Ty.getSizeInBits(); - if (OpSize != 32 && OpSize != 64) - return nullptr; - - // If this is a compare against +0.0, then we don't have - // to explicitly materialize a constant. - const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); - bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); - - auto IsEqualityPred = [](CmpInst::Predicate P) { - return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || - P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; - }; - if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { - // Try commutating the operands. - const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); - if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { - ShouldUseImm = true; - std::swap(LHS, RHS); - } - } - unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, - {AArch64::FCMPSri, AArch64::FCMPDri}}; - unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; - - // Partially build the compare. Decide if we need to add a use for the - // third operand based off whether or not we're comparing against 0.0. - auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); - if (!ShouldUseImm) - CmpMI.addUse(RHS); + LLT CmpTy = MRI.getType(LHS.getReg()); + assert(!CmpTy.isVector() && "Expected scalar or pointer"); + unsigned Size = CmpTy.getSizeInBits(); + (void)Size; + assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?"); + // Fold the compare into a cmn or tst if possible. 
+ if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) + return FoldCmp; + auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); + return emitSUBS(Dst, LHS, RHS, MIRBuilder); +} + +MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( + Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); +#ifndef NDEBUG + LLT Ty = MRI.getType(Dst); + assert(!Ty.isVector() && Ty.getSizeInBits() == 32 && + "Expected a 32-bit scalar register?"); +#endif + const Register ZeroReg = AArch64::WZR; + auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) { + auto CSet = + MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg}) + .addImm(getInvertedCondCode(CC)); + constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI); + return &*CSet; + }; + + AArch64CC::CondCode CC1, CC2; + changeFCMPPredToAArch64CC(Pred, CC1, CC2); + if (CC2 == AArch64CC::AL) + return EmitCSet(Dst, CC1); + + const TargetRegisterClass *RC = &AArch64::GPR32RegClass; + Register Def1Reg = MRI.createVirtualRegister(RC); + Register Def2Reg = MRI.createVirtualRegister(RC); + EmitCSet(Def1Reg, CC1); + EmitCSet(Def2Reg, CC2); + auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg}); + constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI); + return &*OrMI; +} + +MachineInstr * +AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS, + MachineIRBuilder &MIRBuilder, + Optional<CmpInst::Predicate> Pred) const { + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + LLT Ty = MRI.getType(LHS); + if (Ty.isVector()) + return nullptr; + unsigned OpSize = Ty.getSizeInBits(); + if (OpSize != 32 && OpSize != 64) + return nullptr; + + // If this is a compare against +0.0, then we don't have + // to explicitly materialize a constant. + const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI); + bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative()); + + auto IsEqualityPred = [](CmpInst::Predicate P) { + return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE || + P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE; + }; + if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) { + // Try commutating the operands. + const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI); + if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) { + ShouldUseImm = true; + std::swap(LHS, RHS); + } + } + unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr}, + {AArch64::FCMPSri, AArch64::FCMPDri}}; + unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64]; + + // Partially build the compare. Decide if we need to add a use for the + // third operand based off whether or not we're comparing against 0.0. 
+ auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS); + if (!ShouldUseImm) + CmpMI.addUse(RHS); constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI); - return &*CmpMI; + return &*CmpMI; } MachineInstr *AArch64InstructionSelector::emitVectorConcat( @@ -4363,25 +4363,25 @@ AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred, return &*I; } -std::pair<MachineInstr *, AArch64CC::CondCode> -AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, - MachineOperand &LHS, - MachineOperand &RHS, - MachineIRBuilder &MIRBuilder) const { - switch (Opcode) { - default: - llvm_unreachable("Unexpected opcode!"); - case TargetOpcode::G_SADDO: - return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); - case TargetOpcode::G_UADDO: - return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); - case TargetOpcode::G_SSUBO: - return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); - case TargetOpcode::G_USUBO: - return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); - } -} - +std::pair<MachineInstr *, AArch64CC::CondCode> +AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst, + MachineOperand &LHS, + MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + switch (Opcode) { + default: + llvm_unreachable("Unexpected opcode!"); + case TargetOpcode::G_SADDO: + return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); + case TargetOpcode::G_UADDO: + return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS); + case TargetOpcode::G_SSUBO: + return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS); + case TargetOpcode::G_USUBO: + return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO); + } +} + bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { MachineIRBuilder MIB(I); MachineRegisterInfo &MRI = *MIB.getMRI(); @@ -4441,17 +4441,17 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { AArch64CC::CondCode CondCode; if (CondOpc == TargetOpcode::G_ICMP) { - auto Pred = - static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); + auto Pred = + static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); CondCode = changeICMPPredToAArch64CC(Pred); - emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), - CondDef->getOperand(1), MIB); + emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), + CondDef->getOperand(1), MIB); } else { // Get the condition code for the select. - auto Pred = - static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); + auto Pred = + static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate()); AArch64CC::CondCode CondCode2; - changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); + changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2); // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two // instructions to emit the comparison. @@ -4460,16 +4460,16 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const { if (CondCode2 != AArch64CC::AL) return false; - if (!emitFPCompare(CondDef->getOperand(2).getReg(), - CondDef->getOperand(3).getReg(), MIB)) { - LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); + if (!emitFPCompare(CondDef->getOperand(2).getReg(), + CondDef->getOperand(3).getReg(), MIB)) { + LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n"); return false; - } + } } // Emit the select. 
- emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), - I.getOperand(3).getReg(), CondCode, MIB); + emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(), + I.getOperand(3).getReg(), CondCode, MIB); I.eraseFromParent(); return true; } @@ -4552,15 +4552,15 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare( // Produce this if the compare is signed: // // tst x, y - if (!CmpInst::isUnsigned(P) && LHSDef && + if (!CmpInst::isUnsigned(P) && LHSDef && LHSDef->getOpcode() == TargetOpcode::G_AND) { // Make sure that the RHS is 0. auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI); if (!ValAndVReg || ValAndVReg->Value != 0) return nullptr; - return emitTST(LHSDef->getOperand(1), - LHSDef->getOperand(2), MIRBuilder); + return emitTST(LHSDef->getOperand(1), + LHSDef->getOperand(2), MIRBuilder); } return nullptr; @@ -4708,7 +4708,7 @@ bool AArch64InstructionSelector::selectInsertElt( auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI); if (!VRegAndVal) return false; - unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); + unsigned LaneIdx = VRegAndVal->Value.getSExtValue(); // Perform the lane insert. Register SrcReg = I.getOperand(1).getReg(); @@ -4765,9 +4765,9 @@ bool AArch64InstructionSelector::selectInsertElt( bool AArch64InstructionSelector::tryOptConstantBuildVec( MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR); - unsigned DstSize = DstTy.getSizeInBits(); - assert(DstSize <= 128 && "Unexpected build_vec type!"); - if (DstSize < 32) + unsigned DstSize = DstTy.getSizeInBits(); + assert(DstSize <= 128 && "Unexpected build_vec type!"); + if (DstSize < 32) return false; // Check if we're building a constant vector, in which case we want to // generate a constant pool load instead of a vector insert sequence. @@ -4788,24 +4788,24 @@ bool AArch64InstructionSelector::tryOptConstantBuildVec( } Constant *CV = ConstantVector::get(Csts); MachineIRBuilder MIB(I); - if (CV->isNullValue()) { - // Until the importer can support immAllZerosV in pattern leaf nodes, - // select a zero move manually here. - Register DstReg = I.getOperand(0).getReg(); - if (DstSize == 128) { - auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); - } else if (DstSize == 64) { - auto Mov = - MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) - .addImm(0); - MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) - .addReg(Mov.getReg(0), 0, AArch64::dsub); - I.eraseFromParent(); - return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI); - } - } + if (CV->isNullValue()) { + // Until the importer can support immAllZerosV in pattern leaf nodes, + // select a zero move manually here. 
+ Register DstReg = I.getOperand(0).getReg(); + if (DstSize == 128) { + auto Mov = MIB.buildInstr(AArch64::MOVIv2d_ns, {DstReg}, {}).addImm(0); + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI); + } else if (DstSize == 64) { + auto Mov = + MIB.buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {}) + .addImm(0); + MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}) + .addReg(Mov.getReg(0), 0, AArch64::dsub); + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AArch64::FPR64RegClass, MRI); + } + } auto *CPLoad = emitLoadFromConstantPool(CV, MIB); if (!CPLoad) { LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector"); @@ -4927,10 +4927,10 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects( case Intrinsic::debugtrap: MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000); break; - case Intrinsic::ubsantrap: - MIRBuilder.buildInstr(AArch64::BRK, {}, {}) - .addImm(I.getOperand(1).getImm() | ('U' << 8)); - break; + case Intrinsic::ubsantrap: + MIRBuilder.buildInstr(AArch64::BRK, {}, {}) + .addImm(I.getOperand(1).getImm() | ('U' << 8)); + break; } I.eraseFromParent(); @@ -4996,22 +4996,22 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI); if (Depth == 0 && IntrinID == Intrinsic::returnaddress) { - if (!MFReturnAddr) { - // Insert the copy from LR/X30 into the entry block, before it can be - // clobbered by anything. - MFI.setReturnAddressIsTaken(true); - MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR, - AArch64::GPR64RegClass); - } - - if (STI.hasPAuth()) { - MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); - } else { - MIRBuilder.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); - MIRBuilder.buildInstr(AArch64::XPACLRI); - MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + if (!MFReturnAddr) { + // Insert the copy from LR/X30 into the entry block, before it can be + // clobbered by anything. 
+ MFI.setReturnAddressIsTaken(true); + MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR, + AArch64::GPR64RegClass); } - + + if (STI.hasPAuth()) { + MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr}); + } else { + MIRBuilder.buildCopy({Register(AArch64::LR)}, {MFReturnAddr}); + MIRBuilder.buildInstr(AArch64::XPACLRI); + MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + } + I.eraseFromParent(); return true; } @@ -5031,16 +5031,16 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I, MIRBuilder.buildCopy({DstReg}, {FrameAddr}); else { MFI.setReturnAddressIsTaken(true); - - if (STI.hasPAuth()) { - Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - MIRBuilder.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); - MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); - } else { - MIRBuilder.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}).addImm(1); - MIRBuilder.buildInstr(AArch64::XPACLRI); - MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); - } + + if (STI.hasPAuth()) { + Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + MIRBuilder.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1); + MIRBuilder.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg}); + } else { + MIRBuilder.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr}).addImm(1); + MIRBuilder.buildInstr(AArch64::XPACLRI); + MIRBuilder.buildCopy({DstReg}, {Register(AArch64::LR)}); + } } I.eraseFromParent(); @@ -5248,7 +5248,7 @@ AArch64InstructionSelector::selectExtendedSHL( // The value must fit into 3 bits, and must be positive. Make sure that is // true. - int64_t ImmVal = ValAndVReg->Value.getSExtValue(); + int64_t ImmVal = ValAndVReg->Value.getSExtValue(); // Since we're going to pull this into a shift, the constant value must be // a power of 2. If we got a multiply, then we need to check this. @@ -5388,60 +5388,60 @@ InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, unsigned SizeInBytes) const { MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - if (!Root.isReg()) - return None; - MachineInstr *PtrAdd = - getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); - if (!PtrAdd) + if (!Root.isReg()) return None; - - // Check for an immediates which cannot be encoded in the [base + imm] - // addressing mode, and can't be encoded in an add/sub. If this happens, we'll - // end up with code like: - // - // mov x0, wide - // add x1 base, x0 - // ldr x2, [x1, x0] - // - // In this situation, we can use the [base, xreg] addressing mode to save an - // add/sub: - // - // mov x0, wide - // ldr x2, [base, x0] - auto ValAndVReg = - getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); - if (ValAndVReg) { - unsigned Scale = Log2_32(SizeInBytes); - int64_t ImmOff = ValAndVReg->Value.getSExtValue(); - - // Skip immediates that can be selected in the load/store addresing - // mode. - if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && - ImmOff < (0x1000 << Scale)) - return None; - - // Helper lambda to decide whether or not it is preferable to emit an add. - auto isPreferredADD = [](int64_t ImmOff) { - // Constants in [0x0, 0xfff] can be encoded in an add. - if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) - return true; - - // Can it be encoded in an add lsl #12? - if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) - return false; - - // It can be encoded in an add lsl #12, but we may not want to. 
If it is - // possible to select this as a single movz, then prefer that. A single - // movz is faster than an add with a shift. - return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && - (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; - }; - - // If the immediate can be encoded in a single add/sub, then bail out. - if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) - return None; - } - + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd) + return None; + + // Check for an immediates which cannot be encoded in the [base + imm] + // addressing mode, and can't be encoded in an add/sub. If this happens, we'll + // end up with code like: + // + // mov x0, wide + // add x1 base, x0 + // ldr x2, [x1, x0] + // + // In this situation, we can use the [base, xreg] addressing mode to save an + // add/sub: + // + // mov x0, wide + // ldr x2, [base, x0] + auto ValAndVReg = + getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI); + if (ValAndVReg) { + unsigned Scale = Log2_32(SizeInBytes); + int64_t ImmOff = ValAndVReg->Value.getSExtValue(); + + // Skip immediates that can be selected in the load/store addresing + // mode. + if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 && + ImmOff < (0x1000 << Scale)) + return None; + + // Helper lambda to decide whether or not it is preferable to emit an add. + auto isPreferredADD = [](int64_t ImmOff) { + // Constants in [0x0, 0xfff] can be encoded in an add. + if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) + return true; + + // Can it be encoded in an add lsl #12? + if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL) + return false; + + // It can be encoded in an add lsl #12, but we may not want to. If it is + // possible to select this as a single movz, then prefer that. A single + // movz is faster than an add with a shift. + return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && + (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; + }; + + // If the immediate can be encoded in a single add/sub, then bail out. + if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) + return None; + } + // Try to fold shifts into the addressing mode. 
auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes); if (AddrModeFns) @@ -5871,8 +5871,8 @@ void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && "Expected G_CONSTANT"); - Optional<int64_t> CstVal = - getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI); + Optional<int64_t> CstVal = + getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI); assert(CstVal && "Expected constant value"); MIB.addImm(CstVal.getValue()); } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 5a6c904e3f..af24267bf2 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -14,7 +14,7 @@ #include "AArch64LegalizerInfo.h" #include "AArch64Subtarget.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" -#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstr.h" @@ -23,8 +23,8 @@ #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" -#include <initializer_list> -#include "llvm/Support/MathExtras.h" +#include <initializer_list> +#include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "aarch64-legalinfo" @@ -56,13 +56,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) const LLT v2s64 = LLT::vector(2, 64); const LLT v2p0 = LLT::vector(2, p0); - std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */ - v16s8, v8s16, v4s32, - v2s64, v2p0, - /* End 128bit types */ - /* Begin 64bit types */ - v8s8, v4s16, v2s32}; - + std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */ + v16s8, v8s16, v4s32, + v2s64, v2p0, + /* End 128bit types */ + /* Begin 64bit types */ + v8s8, v4s16, v2s32}; + const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine(); // FIXME: support subtargets which have neon/fp-armv8 disabled. @@ -71,31 +71,31 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return; } - // Some instructions only support s16 if the subtarget has full 16-bit FP - // support. - const bool HasFP16 = ST.hasFullFP16(); - const LLT &MinFPScalar = HasFP16 ? s16 : s32; - + // Some instructions only support s16 if the subtarget has full 16-bit FP + // support. + const bool HasFP16 = ST.hasFullFP16(); + const LLT &MinFPScalar = HasFP16 ? 
s16 : s32; + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({p0, s1, s8, s16, s32, s64}) - .legalFor(PackedVectorAllTypeList) - .clampScalar(0, s1, s64) - .widenScalarToNextPow2(0, 8) - .fewerElementsIf( - [=](const LegalityQuery &Query) { - return Query.Types[0].isVector() && - (Query.Types[0].getElementType() != s64 || - Query.Types[0].getNumElements() != 2); - }, - [=](const LegalityQuery &Query) { - LLT EltTy = Query.Types[0].getElementType(); - if (EltTy == s64) - return std::make_pair(0, LLT::vector(2, 64)); - return std::make_pair(0, EltTy); - }); - - getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64}) - .legalFor(PackedVectorAllTypeList) + .legalFor({p0, s1, s8, s16, s32, s64}) + .legalFor(PackedVectorAllTypeList) + .clampScalar(0, s1, s64) + .widenScalarToNextPow2(0, 8) + .fewerElementsIf( + [=](const LegalityQuery &Query) { + return Query.Types[0].isVector() && + (Query.Types[0].getElementType() != s64 || + Query.Types[0].getNumElements() != 2); + }, + [=](const LegalityQuery &Query) { + LLT EltTy = Query.Types[0].getElementType(); + if (EltTy == s64) + return std::make_pair(0, LLT::vector(2, 64)); + return std::make_pair(0, EltTy); + }); + + getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64}) + .legalFor(PackedVectorAllTypeList) .clampScalar(0, s16, s64) .widenScalarToNextPow2(0); @@ -105,38 +105,38 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0); getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) - .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8}) - .scalarizeIf( - [=](const LegalityQuery &Query) { - return Query.Opcode == G_MUL && Query.Types[0] == v2s64; - }, - 0) - .legalFor({v2s64}) + .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8}) + .scalarizeIf( + [=](const LegalityQuery &Query) { + return Query.Opcode == G_MUL && Query.Types[0] == v2s64; + }, + 0) + .legalFor({v2s64}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) .moreElementsToNextPow2(0); - getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) + getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) .customIf([=](const LegalityQuery &Query) { const auto &SrcTy = Query.Types[0]; const auto &AmtTy = Query.Types[1]; return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && AmtTy.getSizeInBits() == 32; }) - .legalFor({ - {s32, s32}, - {s32, s64}, - {s64, s64}, - {v8s8, v8s8}, - {v16s8, v16s8}, - {v4s16, v4s16}, - {v8s16, v8s16}, - {v2s32, v2s32}, - {v4s32, v4s32}, - {v2s64, v2s64}, - }) + .legalFor({ + {s32, s32}, + {s32, s64}, + {s64, s64}, + {v8s8, v8s8}, + {v16s8, v16s8}, + {v4s16, v4s16}, + {v8s16, v8s16}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}, + }) .clampScalar(1, s32, s64) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) @@ -161,25 +161,25 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SREM, G_UREM}) .lowerFor({s1, s8, s16, s32, s64}); - getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}}); + getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}}); getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); - getActionDefinitionsBuilder( - {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) + getActionDefinitionsBuilder( + {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO}) .legalFor({{s32, s1}, {s64, s1}}) .minScalar(0, s32); getActionDefinitionsBuilder({G_FADD, G_FSUB, 
G_FMUL, G_FDIV, G_FNEG}) - .legalFor({s32, s64, v2s64, v4s32, v2s32}) - .clampNumElements(0, v2s32, v4s32) - .clampNumElements(0, v2s64, v2s64); + .legalFor({s32, s64, v2s64, v4s32, v2s32}) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64); getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64}); getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT, G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, - G_FNEARBYINT, G_INTRINSIC_LRINT}) + G_FNEARBYINT, G_INTRINSIC_LRINT}) // If we don't have full FP16 support, then scalarize the elements of // vectors containing fp16 types. .fewerElementsIf( @@ -285,7 +285,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v4s32, p0, 128, 8}, {v2s64, p0, 128, 8}}) // These extends are also legal - .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}}) + .legalForTypesWithMemDesc({{s32, p0, 8, 8}, {s32, p0, 16, 8}}) .clampScalar(0, s8, s64) .lowerIfMemSizeNotPow2() // Lower any any-extending loads left into G_ANYEXT and G_LOAD @@ -307,7 +307,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {p0, p0, 64, 8}, {s128, p0, 128, 8}, {v16s8, p0, 128, 8}, - {v8s8, p0, 64, 8}, + {v8s8, p0, 64, 8}, {v4s16, p0, 64, 8}, {v8s16, p0, 128, 8}, {v2s32, p0, 64, 8}, @@ -325,19 +325,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Constants getActionDefinitionsBuilder(G_CONSTANT) - .legalFor({p0, s8, s16, s32, s64}) + .legalFor({p0, s8, s16, s32, s64}) .clampScalar(0, s8, s64) .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_FCONSTANT) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - if (HasFP16 && Ty == s16) - return true; - return Ty == s32 || Ty == s64 || Ty == s128; - }) - .clampScalar(0, MinFPScalar, s128); - - getActionDefinitionsBuilder({G_ICMP, G_FCMP}) + .legalIf([=](const LegalityQuery &Query) { + const auto &Ty = Query.Types[0]; + if (HasFP16 && Ty == s16) + return true; + return Ty == s32 || Ty == s64 || Ty == s128; + }) + .clampScalar(0, MinFPScalar, s128); + + getActionDefinitionsBuilder({G_ICMP, G_FCMP}) .legalFor({{s32, s32}, {s32, s64}, {s32, p0}, @@ -365,8 +365,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .minScalarOrEltIf( [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0, s64) - .widenScalarOrEltToNextPow2(1) - .clampNumElements(0, v2s32, v4s32); + .widenScalarOrEltToNextPow2(1) + .clampNumElements(0, v2s32, v4s32); // Extensions auto ExtLegalFunc = [=](const LegalityQuery &Query) { @@ -374,7 +374,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) if (DstSize == 128 && !Query.Types[0].isVector()) return false; // Extending to a scalar s128 needs narrowing. - + // Make sure that we have something that will fit in a register, and // make sure it's a power of 2. if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize)) @@ -399,28 +399,28 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalIf(ExtLegalFunc) .clampScalar(0, s64, s64); // Just for s128, others are handled above. 
- getActionDefinitionsBuilder(G_TRUNC) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); }, - 0, s8) - .customIf([=](const LegalityQuery &Query) { - LLT DstTy = Query.Types[0]; - LLT SrcTy = Query.Types[1]; - return DstTy == v8s8 && SrcTy.getSizeInBits() > 128; - }) - .alwaysLegal(); + getActionDefinitionsBuilder(G_TRUNC) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); }, + 0, s8) + .customIf([=](const LegalityQuery &Query) { + LLT DstTy = Query.Types[0]; + LLT SrcTy = Query.Types[1]; + return DstTy == v8s8 && SrcTy.getSizeInBits() > 128; + }) + .alwaysLegal(); - getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower(); + getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower(); // FP conversions - getActionDefinitionsBuilder(G_FPTRUNC) - .legalFor( - {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) - .clampMaxNumElements(0, s32, 2); - getActionDefinitionsBuilder(G_FPEXT) - .legalFor( - {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) - .clampMaxNumElements(0, s64, 2); + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor( + {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}}) + .clampMaxNumElements(0, s32, 2); + getActionDefinitionsBuilder(G_FPEXT) + .legalFor( + {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}}) + .clampMaxNumElements(0, s64, 2); // Conversions getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) @@ -433,7 +433,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32}) .clampScalar(1, s32, s64) - .minScalarSameAs(1, 0) + .minScalarSameAs(1, 0) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0); @@ -445,8 +445,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor({{s32, s1}, {s64, s1}, {p0, s1}}) .clampScalar(0, s32, s64) .widenScalarToNextPow2(0) - .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) - .lowerIf(isVector(0)); + .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0) + .lowerIf(isVector(0)); // Pointer-handling getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0}); @@ -576,8 +576,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0; }) // Any vectors left are the wrong size. Scalarize them. - .scalarize(0) - .scalarize(1); + .scalarize(0) + .scalarize(1); } getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) @@ -589,40 +589,40 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalIf([=](const LegalityQuery &Query) { const LLT &VecTy = Query.Types[1]; return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 || - VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || - VecTy == v16s8 || VecTy == v2s32 || VecTy == v2p0; - }) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { - // We want to promote to <M x s1> to <M x s64> if that wouldn't - // cause the total vec size to be > 128b. 
- return Query.Types[1].getNumElements() <= 2; - }, - 0, s64) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { - return Query.Types[1].getNumElements() <= 4; - }, - 0, s32) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { - return Query.Types[1].getNumElements() <= 8; - }, - 0, s16) - .minScalarOrEltIf( - [=](const LegalityQuery &Query) { - return Query.Types[1].getNumElements() <= 16; - }, - 0, s8) - .minScalarOrElt(0, s8); // Worst case, we need at least s8. + VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 || + VecTy == v16s8 || VecTy == v2s32 || VecTy == v2p0; + }) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + // We want to promote to <M x s1> to <M x s64> if that wouldn't + // cause the total vec size to be > 128b. + return Query.Types[1].getNumElements() <= 2; + }, + 0, s64) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].getNumElements() <= 4; + }, + 0, s32) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].getNumElements() <= 8; + }, + 0, s16) + .minScalarOrEltIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].getNumElements() <= 16; + }, + 0, s8) + .minScalarOrElt(0, s8); // Worst case, we need at least s8. getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) - .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64})); + .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64})); getActionDefinitionsBuilder(G_BUILD_VECTOR) - .legalFor({{v8s8, s8}, - {v16s8, s8}, - {v4s16, s16}, + .legalFor({{v8s8, s8}, + {v16s8, s8}, + {v4s16, s16}, {v8s16, s16}, {v2s32, s32}, {v4s32, s32}, @@ -638,9 +638,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) }) .minScalarSameAs(1, 0); - getActionDefinitionsBuilder(G_CTLZ) - .legalForCartesianProduct( - {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + getActionDefinitionsBuilder(G_CTLZ) + .legalForCartesianProduct( + {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) .scalarize(1); getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) @@ -651,7 +651,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // to be the same size as the dest. if (DstTy != SrcTy) return false; - for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) { + for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) { if (DstTy == Ty) return true; } @@ -668,7 +668,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalFor({{v4s32, v2s32}, {v8s16, v4s16}}); - getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}}); + getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}}); getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) { return Query.Types[0] == p0 && Query.Types[1] == s64; @@ -676,20 +676,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); - getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); - - getActionDefinitionsBuilder(G_ABS).lowerIf( - [=](const LegalityQuery &Query) { return Query.Types[0].isScalar(); }); - - getActionDefinitionsBuilder(G_VECREDUCE_FADD) - // We only have FADDP to do reduction-like operations. Lower the rest. 
- .legalFor({{s32, v2s32}, {s64, v2s64}}) - .lower(); - - getActionDefinitionsBuilder(G_VECREDUCE_ADD) - .legalFor({{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s64, v2s64}}) - .lower(); - + getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + + getActionDefinitionsBuilder(G_ABS).lowerIf( + [=](const LegalityQuery &Query) { return Query.Types[0].isScalar(); }); + + getActionDefinitionsBuilder(G_VECREDUCE_FADD) + // We only have FADDP to do reduction-like operations. Lower the rest. + .legalFor({{s32, v2s32}, {s64, v2s64}}) + .lower(); + + getActionDefinitionsBuilder(G_VECREDUCE_ADD) + .legalFor({{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s64, v2s64}}) + .lower(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -714,63 +714,63 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer); case TargetOpcode::G_GLOBAL_VALUE: return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer); - case TargetOpcode::G_TRUNC: - return legalizeVectorTrunc(MI, Helper); + case TargetOpcode::G_TRUNC: + return legalizeVectorTrunc(MI, Helper); } llvm_unreachable("expected switch to return"); } -static void extractParts(Register Reg, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts, - SmallVectorImpl<Register> &VRegs) { - for (int I = 0; I < NumParts; ++I) - VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); - MIRBuilder.buildUnmerge(VRegs, Reg); -} - -bool AArch64LegalizerInfo::legalizeVectorTrunc( - MachineInstr &MI, LegalizerHelper &Helper) const { - MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - // Similar to how operand splitting is done in SelectiondDAG, we can handle - // %res(v8s8) = G_TRUNC %in(v8s32) by generating: - // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>) - // %lo16(<4 x s16>) = G_TRUNC %inlo - // %hi16(<4 x s16>) = G_TRUNC %inhi - // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16 - // %res(<8 x s8>) = G_TRUNC %in16 - - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); - assert(isPowerOf2_32(DstTy.getSizeInBits()) && - isPowerOf2_32(SrcTy.getSizeInBits())); - - // Split input type. - LLT SplitSrcTy = SrcTy.changeNumElements(SrcTy.getNumElements() / 2); - // First, split the source into two smaller vectors. - SmallVector<Register, 2> SplitSrcs; - extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs); - - // Truncate the splits into intermediate narrower elements. 
- LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2); - for (unsigned I = 0; I < SplitSrcs.size(); ++I) - SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0); - - auto Concat = MIRBuilder.buildConcatVectors( - DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs); - - Helper.Observer.changingInstr(MI); - MI.getOperand(1).setReg(Concat.getReg(0)); - Helper.Observer.changedInstr(MI); - return true; -} - -bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { +static void extractParts(Register Reg, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts, + SmallVectorImpl<Register> &VRegs) { + for (int I = 0; I < NumParts; ++I) + VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); + MIRBuilder.buildUnmerge(VRegs, Reg); +} + +bool AArch64LegalizerInfo::legalizeVectorTrunc( + MachineInstr &MI, LegalizerHelper &Helper) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + // Similar to how operand splitting is done in SelectiondDAG, we can handle + // %res(v8s8) = G_TRUNC %in(v8s32) by generating: + // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>) + // %lo16(<4 x s16>) = G_TRUNC %inlo + // %hi16(<4 x s16>) = G_TRUNC %inhi + // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16 + // %res(<8 x s8>) = G_TRUNC %in16 + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = MRI.getType(SrcReg); + assert(isPowerOf2_32(DstTy.getSizeInBits()) && + isPowerOf2_32(SrcTy.getSizeInBits())); + + // Split input type. + LLT SplitSrcTy = SrcTy.changeNumElements(SrcTy.getNumElements() / 2); + // First, split the source into two smaller vectors. + SmallVector<Register, 2> SplitSrcs; + extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs); + + // Truncate the splits into intermediate narrower elements. + LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2); + for (unsigned I = 0; I < SplitSrcs.size(); ++I) + SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0); + + auto Concat = MIRBuilder.buildConcatVectors( + DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs); + + Helper.Observer.changingInstr(MI); + MI.getOperand(1).setReg(Concat.getReg(0)); + Helper.Observer.changedInstr(MI); + return true; +} + +bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, + GISelChangeObserver &Observer) const { assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP + // G_ADD_LOW instructions. @@ -792,27 +792,27 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( // Set the regclass on the dest reg too. MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); - // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so - // by creating a MOVK that sets bits 48-63 of the register to (global address - // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to - // prevent an incorrect tag being generated during relocation when the the - // global appears before the code section. Without the offset, a global at - // `0x0f00'0000'0000'1000` (i.e. 
at `0x1000` with tag `0xf`) that's referenced - // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 = - // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe` - // instead of `0xf`. - // This assumes that we're in the small code model so we can assume a binary - // size of <= 4GB, which makes the untagged PC relative offset positive. The - // binary must also be loaded into address range [0, 2^48). Both of these - // properties need to be ensured at runtime when using tagged addresses. - if (OpFlags & AArch64II::MO_TAGGED) { - ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP}) - .addGlobalAddress(GV, 0x100000000, - AArch64II::MO_PREL | AArch64II::MO_G3) - .addImm(48); - MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); - } - + // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so + // by creating a MOVK that sets bits 48-63 of the register to (global address + // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to + // prevent an incorrect tag being generated during relocation when the the + // global appears before the code section. Without the offset, a global at + // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced + // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 = + // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe` + // instead of `0xf`. + // This assumes that we're in the small code model so we can assume a binary + // size of <= 4GB, which makes the untagged PC relative offset positive. The + // binary must also be loaded into address range [0, 2^48). Both of these + // properties need to be ensured at runtime when using tagged addresses. + if (OpFlags & AArch64II::MO_TAGGED) { + ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP}) + .addGlobalAddress(GV, 0x100000000, + AArch64II::MO_PREL | AArch64II::MO_G3) + .addImm(48); + MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); + } + MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); @@ -820,8 +820,8 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( return true; } -bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const { return true; } @@ -838,13 +838,13 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr( if (!VRegAndVal) return true; // Check the shift amount is in range for an immediate form. - int64_t Amount = VRegAndVal->Value.getSExtValue(); + int64_t Amount = VRegAndVal->Value.getSExtValue(); if (Amount > 31) return true; // This will have to remain a register variant. 
auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount); - Observer.changingInstr(MI); + Observer.changingInstr(MI); MI.getOperand(2).setReg(ExtCst.getReg(0)); - Observer.changedInstr(MI); + Observer.changedInstr(MI); return true; } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 8217e37c85..c22cb26608 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -15,7 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINELEGALIZER_H #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" -#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" namespace llvm { @@ -46,7 +46,7 @@ private: bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; - bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const; + bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const; const AArch64Subtarget *ST; }; } // End llvm namespace. diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index fdd04cb77f..bf3190ce93 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -1,22 +1,22 @@ -//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===// +//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -/// -/// \file -/// Post-legalization combines on generic MachineInstrs. -/// -/// The combines here must preserve instruction legality. -/// -/// Lowering combines (e.g. pseudo matching) should be handled by -/// AArch64PostLegalizerLowering. -/// -/// Combines which don't rely on instruction legality should go in the -/// AArch64PreLegalizerCombiner. -/// +/// +/// \file +/// Post-legalization combines on generic MachineInstrs. +/// +/// The combines here must preserve instruction legality. +/// +/// Lowering combines (e.g. pseudo matching) should be handled by +/// AArch64PostLegalizerLowering. +/// +/// Combines which don't rely on instruction legality should go in the +/// AArch64PreLegalizerCombiner. 
+/// //===----------------------------------------------------------------------===// #include "AArch64TargetMachine.h" @@ -24,12 +24,12 @@ #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/Debug.h" @@ -37,202 +37,202 @@ using namespace llvm; -/// This combine tries do what performExtractVectorEltCombine does in SDAG. -/// Rewrite for pairwise fadd pattern -/// (s32 (g_extract_vector_elt -/// (g_fadd (vXs32 Other) -/// (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0)) -/// -> -/// (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0) -/// (g_extract_vector_elt (vXs32 Other) 1)) -bool matchExtractVecEltPairwiseAdd( - MachineInstr &MI, MachineRegisterInfo &MRI, - std::tuple<unsigned, LLT, Register> &MatchInfo) { - Register Src1 = MI.getOperand(1).getReg(); - Register Src2 = MI.getOperand(2).getReg(); - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - - auto Cst = getConstantVRegValWithLookThrough(Src2, MRI); - if (!Cst || Cst->Value != 0) +/// This combine tries do what performExtractVectorEltCombine does in SDAG. +/// Rewrite for pairwise fadd pattern +/// (s32 (g_extract_vector_elt +/// (g_fadd (vXs32 Other) +/// (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0)) +/// -> +/// (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0) +/// (g_extract_vector_elt (vXs32 Other) 1)) +bool matchExtractVecEltPairwiseAdd( + MachineInstr &MI, MachineRegisterInfo &MRI, + std::tuple<unsigned, LLT, Register> &MatchInfo) { + Register Src1 = MI.getOperand(1).getReg(); + Register Src2 = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + auto Cst = getConstantVRegValWithLookThrough(Src2, MRI); + if (!Cst || Cst->Value != 0) return false; - // SDAG also checks for FullFP16, but this looks to be beneficial anyway. + // SDAG also checks for FullFP16, but this looks to be beneficial anyway. - // Now check for an fadd operation. TODO: expand this for integer add? - auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI); - if (!FAddMI) + // Now check for an fadd operation. TODO: expand this for integer add? + auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI); + if (!FAddMI) return false; - // If we add support for integer add, must restrict these types to just s64. - unsigned DstSize = DstTy.getSizeInBits(); - if (DstSize != 16 && DstSize != 32 && DstSize != 64) + // If we add support for integer add, must restrict these types to just s64. 
+ unsigned DstSize = DstTy.getSizeInBits(); + if (DstSize != 16 && DstSize != 32 && DstSize != 64) return false; - Register Src1Op1 = FAddMI->getOperand(1).getReg(); - Register Src1Op2 = FAddMI->getOperand(2).getReg(); - MachineInstr *Shuffle = - getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI); - MachineInstr *Other = MRI.getVRegDef(Src1Op1); - if (!Shuffle) { - Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI); - Other = MRI.getVRegDef(Src1Op2); + Register Src1Op1 = FAddMI->getOperand(1).getReg(); + Register Src1Op2 = FAddMI->getOperand(2).getReg(); + MachineInstr *Shuffle = + getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI); + MachineInstr *Other = MRI.getVRegDef(Src1Op1); + if (!Shuffle) { + Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI); + Other = MRI.getVRegDef(Src1Op2); } - // We're looking for a shuffle that moves the second element to index 0. - if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 && - Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) { - std::get<0>(MatchInfo) = TargetOpcode::G_FADD; - std::get<1>(MatchInfo) = DstTy; - std::get<2>(MatchInfo) = Other->getOperand(0).getReg(); + // We're looking for a shuffle that moves the second element to index 0. + if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 && + Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) { + std::get<0>(MatchInfo) = TargetOpcode::G_FADD; + std::get<1>(MatchInfo) = DstTy; + std::get<2>(MatchInfo) = Other->getOperand(0).getReg(); return true; } return false; } -bool applyExtractVecEltPairwiseAdd( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, - std::tuple<unsigned, LLT, Register> &MatchInfo) { - unsigned Opc = std::get<0>(MatchInfo); - assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!"); - // We want to generate two extracts of elements 0 and 1, and add them. - LLT Ty = std::get<1>(MatchInfo); - Register Src = std::get<2>(MatchInfo); - LLT s64 = LLT::scalar(64); - B.setInstrAndDebugLoc(MI); - auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0)); - auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1)); - B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1}); - MI.eraseFromParent(); +bool applyExtractVecEltPairwiseAdd( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + std::tuple<unsigned, LLT, Register> &MatchInfo) { + unsigned Opc = std::get<0>(MatchInfo); + assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!"); + // We want to generate two extracts of elements 0 and 1, and add them. + LLT Ty = std::get<1>(MatchInfo); + Register Src = std::get<2>(MatchInfo); + LLT s64 = LLT::scalar(64); + B.setInstrAndDebugLoc(MI); + auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0)); + auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1)); + B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1}); + MI.eraseFromParent(); return true; } -static bool isSignExtended(Register R, MachineRegisterInfo &MRI) { - // TODO: check if extended build vector as well. - unsigned Opc = MRI.getVRegDef(R)->getOpcode(); - return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG; +static bool isSignExtended(Register R, MachineRegisterInfo &MRI) { + // TODO: check if extended build vector as well. 
+ unsigned Opc = MRI.getVRegDef(R)->getOpcode(); + return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG; } -static bool isZeroExtended(Register R, MachineRegisterInfo &MRI) { - // TODO: check if extended build vector as well. - return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT; +static bool isZeroExtended(Register R, MachineRegisterInfo &MRI) { + // TODO: check if extended build vector as well. + return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT; } -bool matchAArch64MulConstCombine( - MachineInstr &MI, MachineRegisterInfo &MRI, - std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) { - assert(MI.getOpcode() == TargetOpcode::G_MUL); - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); - Register Dst = MI.getOperand(0).getReg(); - const LLT Ty = MRI.getType(LHS); - - // The below optimizations require a constant RHS. - auto Const = getConstantVRegValWithLookThrough(RHS, MRI); - if (!Const) +bool matchAArch64MulConstCombine( + MachineInstr &MI, MachineRegisterInfo &MRI, + std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) { + assert(MI.getOpcode() == TargetOpcode::G_MUL); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + Register Dst = MI.getOperand(0).getReg(); + const LLT Ty = MRI.getType(LHS); + + // The below optimizations require a constant RHS. + auto Const = getConstantVRegValWithLookThrough(RHS, MRI); + if (!Const) return false; - const APInt ConstValue = Const->Value.sextOrSelf(Ty.getSizeInBits()); - // The following code is ported from AArch64ISelLowering. - // Multiplication of a power of two plus/minus one can be done more - // cheaply as as shift+add/sub. For now, this is true unilaterally. If - // future CPUs have a cheaper MADD instruction, this may need to be - // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and - // 64-bit is 5 cycles, so this is always a win. - // More aggressively, some multiplications N0 * C can be lowered to - // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, - // e.g. 6=3*2=(2+1)*2. - // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 - // which equals to (1+2)*16-(1+2). - // TrailingZeroes is used to test if the mul can be lowered to - // shift+add+shift. - unsigned TrailingZeroes = ConstValue.countTrailingZeros(); - if (TrailingZeroes) { - // Conservatively do not lower to shift+add+shift if the mul might be - // folded into smul or umul. - if (MRI.hasOneNonDBGUse(LHS) && - (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI))) - return false; - // Conservatively do not lower to shift+add+shift if the mul might be - // folded into madd or msub. - if (MRI.hasOneNonDBGUse(Dst)) { - MachineInstr &UseMI = *MRI.use_instr_begin(Dst); - if (UseMI.getOpcode() == TargetOpcode::G_ADD || - UseMI.getOpcode() == TargetOpcode::G_SUB) - return false; - } - } - // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub - // and shift+add+shift. - APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); - - unsigned ShiftAmt, AddSubOpc; - // Is the shifted value the LHS operand of the add/sub? - bool ShiftValUseIsLHS = true; - // Do we need to negate the result? 
- bool NegateResult = false; - - if (ConstValue.isNonNegative()) { - // (mul x, 2^N + 1) => (add (shl x, N), x) - // (mul x, 2^N - 1) => (sub (shl x, N), x) - // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) - APInt SCVMinus1 = ShiftedConstValue - 1; - APInt CVPlus1 = ConstValue + 1; - if (SCVMinus1.isPowerOf2()) { - ShiftAmt = SCVMinus1.logBase2(); - AddSubOpc = TargetOpcode::G_ADD; - } else if (CVPlus1.isPowerOf2()) { - ShiftAmt = CVPlus1.logBase2(); - AddSubOpc = TargetOpcode::G_SUB; - } else - return false; - } else { - // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) - // (mul x, -(2^N + 1)) => - (add (shl x, N), x) - APInt CVNegPlus1 = -ConstValue + 1; - APInt CVNegMinus1 = -ConstValue - 1; - if (CVNegPlus1.isPowerOf2()) { - ShiftAmt = CVNegPlus1.logBase2(); - AddSubOpc = TargetOpcode::G_SUB; - ShiftValUseIsLHS = false; - } else if (CVNegMinus1.isPowerOf2()) { - ShiftAmt = CVNegMinus1.logBase2(); - AddSubOpc = TargetOpcode::G_ADD; - NegateResult = true; - } else - return false; - } - - if (NegateResult && TrailingZeroes) + const APInt ConstValue = Const->Value.sextOrSelf(Ty.getSizeInBits()); + // The following code is ported from AArch64ISelLowering. + // Multiplication of a power of two plus/minus one can be done more + // cheaply as as shift+add/sub. For now, this is true unilaterally. If + // future CPUs have a cheaper MADD instruction, this may need to be + // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and + // 64-bit is 5 cycles, so this is always a win. + // More aggressively, some multiplications N0 * C can be lowered to + // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, + // e.g. 6=3*2=(2+1)*2. + // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 + // which equals to (1+2)*16-(1+2). + // TrailingZeroes is used to test if the mul can be lowered to + // shift+add+shift. + unsigned TrailingZeroes = ConstValue.countTrailingZeros(); + if (TrailingZeroes) { + // Conservatively do not lower to shift+add+shift if the mul might be + // folded into smul or umul. + if (MRI.hasOneNonDBGUse(LHS) && + (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI))) + return false; + // Conservatively do not lower to shift+add+shift if the mul might be + // folded into madd or msub. + if (MRI.hasOneNonDBGUse(Dst)) { + MachineInstr &UseMI = *MRI.use_instr_begin(Dst); + if (UseMI.getOpcode() == TargetOpcode::G_ADD || + UseMI.getOpcode() == TargetOpcode::G_SUB) + return false; + } + } + // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub + // and shift+add+shift. + APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); + + unsigned ShiftAmt, AddSubOpc; + // Is the shifted value the LHS operand of the add/sub? + bool ShiftValUseIsLHS = true; + // Do we need to negate the result? 
+ bool NegateResult = false; + + if (ConstValue.isNonNegative()) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) + APInt SCVMinus1 = ShiftedConstValue - 1; + APInt CVPlus1 = ConstValue + 1; + if (SCVMinus1.isPowerOf2()) { + ShiftAmt = SCVMinus1.logBase2(); + AddSubOpc = TargetOpcode::G_ADD; + } else if (CVPlus1.isPowerOf2()) { + ShiftAmt = CVPlus1.logBase2(); + AddSubOpc = TargetOpcode::G_SUB; + } else + return false; + } else { + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + APInt CVNegPlus1 = -ConstValue + 1; + APInt CVNegMinus1 = -ConstValue - 1; + if (CVNegPlus1.isPowerOf2()) { + ShiftAmt = CVNegPlus1.logBase2(); + AddSubOpc = TargetOpcode::G_SUB; + ShiftValUseIsLHS = false; + } else if (CVNegMinus1.isPowerOf2()) { + ShiftAmt = CVNegMinus1.logBase2(); + AddSubOpc = TargetOpcode::G_ADD; + NegateResult = true; + } else + return false; + } + + if (NegateResult && TrailingZeroes) return false; - ApplyFn = [=](MachineIRBuilder &B, Register DstReg) { - auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt); - auto ShiftedVal = B.buildShl(Ty, LHS, Shift); - - Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS; - Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0); - auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS}); - assert(!(NegateResult && TrailingZeroes) && - "NegateResult and TrailingZeroes cannot both be true for now."); - // Negate the result. - if (NegateResult) { - B.buildSub(DstReg, B.buildConstant(Ty, 0), Res); - return; - } - // Shift the result. - if (TrailingZeroes) { - B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes)); - return; - } - B.buildCopy(DstReg, Res.getReg(0)); - }; + ApplyFn = [=](MachineIRBuilder &B, Register DstReg) { + auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt); + auto ShiftedVal = B.buildShl(Ty, LHS, Shift); + + Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS; + Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0); + auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS}); + assert(!(NegateResult && TrailingZeroes) && + "NegateResult and TrailingZeroes cannot both be true for now."); + // Negate the result. + if (NegateResult) { + B.buildSub(DstReg, B.buildConstant(Ty, 0), Res); + return; + } + // Shift the result. 
+ if (TrailingZeroes) { + B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes)); + return; + } + B.buildCopy(DstReg, Res.getReg(0)); + }; return true; } -bool applyAArch64MulConstCombine( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, - std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) { - B.setInstrAndDebugLoc(MI); - ApplyFn(B, MI.getOperand(0).getReg()); +bool applyAArch64MulConstCombine( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) { + B.setInstrAndDebugLoc(MI); + ApplyFn(B, MI.getOperand(0).getReg()); MI.eraseFromParent(); return true; } @@ -348,7 +348,7 @@ INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE, false) namespace llvm { -FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) { +FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) { return new AArch64PostLegalizerCombiner(IsOptNone); } } // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index a06ff4b541..0447c3e8a0 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -1,704 +1,704 @@ -//=== AArch64PostLegalizerLowering.cpp --------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// Post-legalization lowering for instructions. -/// -/// This is used to offload pattern matching from the selector. -/// -/// For example, this combiner will notice that a G_SHUFFLE_VECTOR is actually -/// a G_ZIP, G_UZP, etc. -/// -/// General optimization combines should be handled by either the -/// AArch64PostLegalizerCombiner or the AArch64PreLegalizerCombiner. -/// -//===----------------------------------------------------------------------===// - -#include "AArch64TargetMachine.h" -#include "AArch64GlobalISelUtils.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/CodeGen/GlobalISel/Combiner.h" -#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" -#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" -#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "aarch64-postlegalizer-lowering" - -using namespace llvm; -using namespace MIPatternMatch; -using namespace AArch64GISelUtils; - -/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR. -/// -/// Used for matching target-supported shuffles before codegen. -struct ShuffleVectorPseudo { - unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1) - Register Dst; ///< Destination register. - SmallVector<SrcOp, 2> SrcOps; ///< Source registers. 
- ShuffleVectorPseudo(unsigned Opc, Register Dst, - std::initializer_list<SrcOp> SrcOps) - : Opc(Opc), Dst(Dst), SrcOps(SrcOps){}; - ShuffleVectorPseudo() {} -}; - -/// Check if a vector shuffle corresponds to a REV instruction with the -/// specified blocksize. -static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts, - unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); - assert(EltSize != 64 && "EltSize cannot be 64 for REV mask."); - - unsigned BlockElts = M[0] + 1; - - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSize; - - if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize) - return false; - - for (unsigned i = 0; i < NumElts; ++i) { - // Ignore undef indices. - if (M[i] < 0) - continue; - if (static_cast<unsigned>(M[i]) != - (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } - - return true; -} - -/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts. -/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult. -static bool isTRNMask(ArrayRef<int> M, unsigned NumElts, - unsigned &WhichResult) { - if (NumElts % 2 != 0) - return false; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) || - (M[i + 1] >= 0 && - static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult)) - return false; - } - return true; -} - -/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector -/// sources of the shuffle are different. -static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, - unsigned NumElts) { - // Look for the first non-undef element. - auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); - if (FirstRealElt == M.end()) - return None; - - // Use APInt to handle overflow when calculating expected element. - unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); - - // The following shuffle indices must be the successive elements after the - // first real element. - if (any_of( - make_range(std::next(FirstRealElt), M.end()), - [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; })) - return None; - - // The index of an EXT is the first element if it is not UNDEF. - // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. - // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. - // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. - // ExpectedElt is the last mask index plus 1. - uint64_t Imm = ExpectedElt.getZExtValue(); - bool ReverseExt = false; - - // There are two difference cases requiring to reverse input vectors. - // For example, for vector <4 x i32> we have the following cases, - // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) - // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) - // For both cases, we finally use mask <5, 6, 7, 0>, which requires - // to reverse two input vectors. - if (Imm < NumElts) - ReverseExt = true; - else - Imm -= NumElts; - return std::make_pair(ReverseExt, Imm); -} - -/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts. -/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult. 
-static bool isUZPMask(ArrayRef<int> M, unsigned NumElts, - unsigned &WhichResult) { - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - // Skip undef indices. - if (M[i] < 0) - continue; - if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult) - return false; - } - return true; -} - -/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts. -/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult. -static bool isZipMask(ArrayRef<int> M, unsigned NumElts, - unsigned &WhichResult) { - if (NumElts % 2 != 0) - return false; - - // 0 means use ZIP1, 1 means use ZIP2. - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) || - (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts)) - return false; - Idx += 1; - } - return true; -} - -/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a -/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc. -static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - LLT Ty = MRI.getType(Dst); - unsigned EltSize = Ty.getScalarSizeInBits(); - - // Element size for a rev cannot be 64. - if (EltSize == 64) - return false; - - unsigned NumElts = Ty.getNumElements(); - - // Try to produce G_REV64 - if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) { - MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src}); - return true; - } - - // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support. - // This should be identical to above, but with a constant 32 and constant - // 16. - return false; -} - -/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with -/// a G_TRN1 or G_TRN2 instruction. -static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - unsigned WhichResult; - ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); - Register Dst = MI.getOperand(0).getReg(); - unsigned NumElts = MRI.getType(Dst).getNumElements(); - if (!isTRNMask(ShuffleMask, NumElts, WhichResult)) - return false; - unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2; - Register V1 = MI.getOperand(1).getReg(); - Register V2 = MI.getOperand(2).getReg(); - MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); - return true; -} - -/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with -/// a G_UZP1 or G_UZP2 instruction. -/// -/// \param [in] MI - The shuffle vector instruction. -/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success. -static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - unsigned WhichResult; - ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); - Register Dst = MI.getOperand(0).getReg(); - unsigned NumElts = MRI.getType(Dst).getNumElements(); - if (!isUZPMask(ShuffleMask, NumElts, WhichResult)) - return false; - unsigned Opc = (WhichResult == 0) ? 
AArch64::G_UZP1 : AArch64::G_UZP2; - Register V1 = MI.getOperand(1).getReg(); - Register V2 = MI.getOperand(2).getReg(); - MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); - return true; -} - -static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - unsigned WhichResult; - ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); - Register Dst = MI.getOperand(0).getReg(); - unsigned NumElts = MRI.getType(Dst).getNumElements(); - if (!isZipMask(ShuffleMask, NumElts, WhichResult)) - return false; - unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2; - Register V1 = MI.getOperand(1).getReg(); - Register V2 = MI.getOperand(2).getReg(); - MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); - return true; -} - -/// Helper function for matchDup. -static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI, - MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - if (Lane != 0) - return false; - - // Try to match a vector splat operation into a dup instruction. - // We're looking for this pattern: - // - // %scalar:gpr(s64) = COPY $x0 - // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF - // %cst0:gpr(s32) = G_CONSTANT i32 0 - // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) - // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) - // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>) - // - // ...into: - // %splat = G_DUP %scalar - - // Begin matching the insert. - auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT, - MI.getOperand(1).getReg(), MRI); - if (!InsMI) - return false; - // Match the undef vector operand. - if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), - MRI)) - return false; - - // Match the index constant 0. - if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ZeroInt())) - return false; - - MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), - {InsMI->getOperand(2).getReg()}); - return true; -} - -/// Helper function for matchDup. -static bool matchDupFromBuildVector(int Lane, MachineInstr &MI, - MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(Lane >= 0 && "Expected positive lane?"); - // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the - // lane's definition directly. - auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, - MI.getOperand(1).getReg(), MRI); - if (!BuildVecMI) - return false; - Register Reg = BuildVecMI->getOperand(Lane + 1).getReg(); - MatchInfo = - ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg}); - return true; -} - -static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - auto MaybeLane = getSplatIndex(MI); - if (!MaybeLane) - return false; - int Lane = *MaybeLane; - // If this is undef splat, generate it via "just" vdup, if possible. 
- if (Lane < 0) - Lane = 0; - if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo)) - return true; - if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo)) - return true; - return false; -} - -static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI, - ShuffleVectorPseudo &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - Register Dst = MI.getOperand(0).getReg(); - auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(), - MRI.getType(Dst).getNumElements()); - if (!ExtInfo) - return false; - bool ReverseExt; - uint64_t Imm; - std::tie(ReverseExt, Imm) = *ExtInfo; - Register V1 = MI.getOperand(1).getReg(); - Register V2 = MI.getOperand(2).getReg(); - if (ReverseExt) - std::swap(V1, V2); - uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8; - Imm *= ExtFactor; - MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm}); - return true; -} - -/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo. -/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR. -static bool applyShuffleVectorPseudo(MachineInstr &MI, - ShuffleVectorPseudo &MatchInfo) { - MachineIRBuilder MIRBuilder(MI); - MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps); - MI.eraseFromParent(); - return true; -} - -/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT. -/// Special-cased because the constant operand must be emitted as a G_CONSTANT -/// for the imported tablegen patterns to work. -static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) { - MachineIRBuilder MIRBuilder(MI); - // Tablegen patterns expect an i32 G_CONSTANT as the final op. - auto Cst = - MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm()); - MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, - {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst}); - MI.eraseFromParent(); - return true; -} - -/// isVShiftRImm - Check if this is a valid vector for the immediate -/// operand of a vector shift right operation. The value must be in the range: -/// 1 <= Value <= ElementBits for a right shift. -static bool isVShiftRImm(Register Reg, MachineRegisterInfo &MRI, LLT Ty, - int64_t &Cnt) { - assert(Ty.isVector() && "vector shift count is not a vector type"); - MachineInstr *MI = MRI.getVRegDef(Reg); - auto Cst = getBuildVectorConstantSplat(*MI, MRI); - if (!Cst) - return false; - Cnt = *Cst; - int64_t ElementBits = Ty.getScalarSizeInBits(); - return Cnt >= 1 && Cnt <= ElementBits; -} - -/// Match a vector G_ASHR or G_LSHR with a valid immediate shift. -static bool matchVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI, - int64_t &Imm) { - assert(MI.getOpcode() == TargetOpcode::G_ASHR || - MI.getOpcode() == TargetOpcode::G_LSHR); - LLT Ty = MRI.getType(MI.getOperand(1).getReg()); - if (!Ty.isVector()) - return false; - return isVShiftRImm(MI.getOperand(2).getReg(), MRI, Ty, Imm); -} - -static bool applyVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI, - int64_t &Imm) { - unsigned Opc = MI.getOpcode(); - assert(Opc == TargetOpcode::G_ASHR || Opc == TargetOpcode::G_LSHR); - unsigned NewOpc = - Opc == TargetOpcode::G_ASHR ? AArch64::G_VASHR : AArch64::G_VLSHR; - MachineIRBuilder MIB(MI); - auto ImmDef = MIB.buildConstant(LLT::scalar(32), Imm); - MIB.buildInstr(NewOpc, {MI.getOperand(0)}, {MI.getOperand(1), ImmDef}); - MI.eraseFromParent(); - return true; -} - -/// Determine if it is possible to modify the \p RHS and predicate \p P of a -/// G_ICMP instruction such that the right-hand side is an arithmetic immediate. 
-/// -/// \returns A pair containing the updated immediate and predicate which may -/// be used to optimize the instruction. -/// -/// \note This assumes that the comparison has been legalized. -Optional<std::pair<uint64_t, CmpInst::Predicate>> -tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, - const MachineRegisterInfo &MRI) { - const auto &Ty = MRI.getType(RHS); - if (Ty.isVector()) - return None; - unsigned Size = Ty.getSizeInBits(); - assert((Size == 32 || Size == 64) && "Expected 32 or 64 bit compare only?"); - - // If the RHS is not a constant, or the RHS is already a valid arithmetic - // immediate, then there is nothing to change. - auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI); - if (!ValAndVReg) - return None; - uint64_t C = ValAndVReg->Value.getZExtValue(); - if (isLegalArithImmed(C)) - return None; - - // We have a non-arithmetic immediate. Check if adjusting the immediate and - // adjusting the predicate will result in a legal arithmetic immediate. - switch (P) { - default: - return None; - case CmpInst::ICMP_SLT: - case CmpInst::ICMP_SGE: - // Check for - // - // x slt c => x sle c - 1 - // x sge c => x sgt c - 1 - // - // When c is not the smallest possible negative number. - if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) || - (Size == 32 && static_cast<int32_t>(C) == INT32_MIN)) - return None; - P = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT; - C -= 1; - break; - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_UGE: - // Check for - // - // x ult c => x ule c - 1 - // x uge c => x ugt c - 1 - // - // When c is not zero. - if (C == 0) - return None; - P = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; - C -= 1; - break; - case CmpInst::ICMP_SLE: - case CmpInst::ICMP_SGT: - // Check for - // - // x sle c => x slt c + 1 - // x sgt c => s sge c + 1 - // - // When c is not the largest possible signed integer. - if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) || - (Size == 64 && static_cast<int64_t>(C) == INT64_MAX)) - return None; - P = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE; - C += 1; - break; - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_UGT: - // Check for - // - // x ule c => x ult c + 1 - // x ugt c => s uge c + 1 - // - // When c is not the largest possible unsigned integer. - if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) || - (Size == 64 && C == UINT64_MAX)) - return None; - P = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE; - C += 1; - break; - } - - // Check if the new constant is valid, and return the updated constant and - // predicate if it is. - if (Size == 32) - C = static_cast<uint32_t>(C); - if (!isLegalArithImmed(C)) - return None; - return {{C, P}}; -} - -/// Determine whether or not it is possible to update the RHS and predicate of -/// a G_ICMP instruction such that the RHS will be selected as an arithmetic -/// immediate. -/// -/// \p MI - The G_ICMP instruction -/// \p MatchInfo - The new RHS immediate and predicate on success -/// -/// See tryAdjustICmpImmAndPred for valid transformations. 
-bool matchAdjustICmpImmAndPred( - MachineInstr &MI, const MachineRegisterInfo &MRI, - std::pair<uint64_t, CmpInst::Predicate> &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_ICMP); - Register RHS = MI.getOperand(3).getReg(); - auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); - if (auto MaybeNewImmAndPred = tryAdjustICmpImmAndPred(RHS, Pred, MRI)) { - MatchInfo = *MaybeNewImmAndPred; - return true; - } - return false; -} - -bool applyAdjustICmpImmAndPred( - MachineInstr &MI, std::pair<uint64_t, CmpInst::Predicate> &MatchInfo, - MachineIRBuilder &MIB, GISelChangeObserver &Observer) { - MIB.setInstrAndDebugLoc(MI); - MachineOperand &RHS = MI.getOperand(3); - MachineRegisterInfo &MRI = *MIB.getMRI(); - auto Cst = MIB.buildConstant(MRI.cloneVirtualRegister(RHS.getReg()), - MatchInfo.first); - Observer.changingInstr(MI); - RHS.setReg(Cst->getOperand(0).getReg()); - MI.getOperand(1).setPredicate(MatchInfo.second); - Observer.changedInstr(MI); - return true; -} - -bool matchDupLane(MachineInstr &MI, MachineRegisterInfo &MRI, - std::pair<unsigned, int> &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - Register Src1Reg = MI.getOperand(1).getReg(); - const LLT SrcTy = MRI.getType(Src1Reg); - const LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - - auto LaneIdx = getSplatIndex(MI); - if (!LaneIdx) - return false; - - // The lane idx should be within the first source vector. - if (*LaneIdx >= SrcTy.getNumElements()) - return false; - - if (DstTy != SrcTy) - return false; - - LLT ScalarTy = SrcTy.getElementType(); - unsigned ScalarSize = ScalarTy.getSizeInBits(); - - unsigned Opc = 0; - switch (SrcTy.getNumElements()) { - case 2: - if (ScalarSize == 64) - Opc = AArch64::G_DUPLANE64; - break; - case 4: - if (ScalarSize == 32) - Opc = AArch64::G_DUPLANE32; - break; - case 8: - if (ScalarSize == 16) - Opc = AArch64::G_DUPLANE16; - break; - case 16: - if (ScalarSize == 8) - Opc = AArch64::G_DUPLANE8; - break; - default: - break; - } - if (!Opc) - return false; - - MatchInfo.first = Opc; - MatchInfo.second = *LaneIdx; - return true; -} - -bool applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) { - assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); - B.setInstrAndDebugLoc(MI); - auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second); - B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()}, - {MI.getOperand(1).getReg(), Lane}); - MI.eraseFromParent(); - return true; -} - -#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS -#include "AArch64GenPostLegalizeGILowering.inc" -#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS - -namespace { -#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_H -#include "AArch64GenPostLegalizeGILowering.inc" -#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_H - -class AArch64PostLegalizerLoweringInfo : public CombinerInfo { -public: - AArch64GenPostLegalizerLoweringHelperRuleConfig GeneratedRuleCfg; - - AArch64PostLegalizerLoweringInfo(bool OptSize, bool MinSize) - : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr, /*OptEnabled = */ true, OptSize, - MinSize) { - if (!GeneratedRuleCfg.parseCommandLineOption()) - report_fatal_error("Invalid rule identifier"); - } - - virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, - MachineIRBuilder &B) const override; -}; - -bool 
AArch64PostLegalizerLoweringInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { - CombinerHelper Helper(Observer, B); - AArch64GenPostLegalizerLoweringHelper Generated(GeneratedRuleCfg); - return Generated.tryCombineAll(Observer, MI, B, Helper); -} - -#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_CPP -#include "AArch64GenPostLegalizeGILowering.inc" -#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_CPP - -class AArch64PostLegalizerLowering : public MachineFunctionPass { -public: - static char ID; - - AArch64PostLegalizerLowering(); - - StringRef getPassName() const override { - return "AArch64PostLegalizerLowering"; - } - - bool runOnMachineFunction(MachineFunction &MF) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; -} // end anonymous namespace - -void AArch64PostLegalizerLowering::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetPassConfig>(); - AU.setPreservesCFG(); - getSelectionDAGFallbackAnalysisUsage(AU); - MachineFunctionPass::getAnalysisUsage(AU); -} - -AArch64PostLegalizerLowering::AArch64PostLegalizerLowering() - : MachineFunctionPass(ID) { - initializeAArch64PostLegalizerLoweringPass(*PassRegistry::getPassRegistry()); -} - -bool AArch64PostLegalizerLowering::runOnMachineFunction(MachineFunction &MF) { - if (MF.getProperties().hasProperty( - MachineFunctionProperties::Property::FailedISel)) - return false; - assert(MF.getProperties().hasProperty( - MachineFunctionProperties::Property::Legalized) && - "Expected a legalized function?"); - auto *TPC = &getAnalysis<TargetPassConfig>(); - const Function &F = MF.getFunction(); - AArch64PostLegalizerLoweringInfo PCInfo(F.hasOptSize(), F.hasMinSize()); - Combiner C(PCInfo, TPC); - return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); -} - -char AArch64PostLegalizerLowering::ID = 0; -INITIALIZE_PASS_BEGIN(AArch64PostLegalizerLowering, DEBUG_TYPE, - "Lower AArch64 MachineInstrs after legalization", false, - false) -INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(AArch64PostLegalizerLowering, DEBUG_TYPE, - "Lower AArch64 MachineInstrs after legalization", false, - false) - -namespace llvm { -FunctionPass *createAArch64PostLegalizerLowering() { - return new AArch64PostLegalizerLowering(); -} -} // end namespace llvm +//=== AArch64PostLegalizerLowering.cpp --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Post-legalization lowering for instructions. +/// +/// This is used to offload pattern matching from the selector. +/// +/// For example, this combiner will notice that a G_SHUFFLE_VECTOR is actually +/// a G_ZIP, G_UZP, etc. +/// +/// General optimization combines should be handled by either the +/// AArch64PostLegalizerCombiner or the AArch64PreLegalizerCombiner. 
+/// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "AArch64GlobalISelUtils.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-postlegalizer-lowering" + +using namespace llvm; +using namespace MIPatternMatch; +using namespace AArch64GISelUtils; + +/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR. +/// +/// Used for matching target-supported shuffles before codegen. +struct ShuffleVectorPseudo { + unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1) + Register Dst; ///< Destination register. + SmallVector<SrcOp, 2> SrcOps; ///< Source registers. + ShuffleVectorPseudo(unsigned Opc, Register Dst, + std::initializer_list<SrcOp> SrcOps) + : Opc(Opc), Dst(Dst), SrcOps(SrcOps){}; + ShuffleVectorPseudo() {} +}; + +/// Check if a vector shuffle corresponds to a REV instruction with the +/// specified blocksize. +static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts, + unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + assert(EltSize != 64 && "EltSize cannot be 64 for REV mask."); + + unsigned BlockElts = M[0] + 1; + + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSize; + + if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + // Ignore undef indices. + if (M[i] < 0) + continue; + if (static_cast<unsigned>(M[i]) != + (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts. +/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult. +static bool isTRNMask(ArrayRef<int> M, unsigned NumElts, + unsigned &WhichResult) { + if (NumElts % 2 != 0) + return false; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) || + (M[i + 1] >= 0 && + static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult)) + return false; + } + return true; +} + +/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector +/// sources of the shuffle are different. +static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M, + unsigned NumElts) { + // Look for the first non-undef element. + auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); + if (FirstRealElt == M.end()) + return None; + + // Use APInt to handle overflow when calculating expected element. + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + + // The following shuffle indices must be the successive elements after the + // first real element. 
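+ // For example, with <4 x i32> operands, masks such as <1, 2, 3, 4> and
+ // <-1, 2, 3, 4> satisfy this successive-elements check (illustrative
+ // values only, derived from the predicate below).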
+ if (any_of( + make_range(std::next(FirstRealElt), M.end()), + [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; })) + return None; + + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. E.g. + // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. + // ExpectedElt is the last mask index plus 1. + uint64_t Imm = ExpectedElt.getZExtValue(); + bool ReverseExt = false; + + // There are two difference cases requiring to reverse input vectors. + // For example, for vector <4 x i32> we have the following cases, + // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) + // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) + // For both cases, we finally use mask <5, 6, 7, 0>, which requires + // to reverse two input vectors. + if (Imm < NumElts) + ReverseExt = true; + else + Imm -= NumElts; + return std::make_pair(ReverseExt, Imm); +} + +/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts. +/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult. +static bool isUZPMask(ArrayRef<int> M, unsigned NumElts, + unsigned &WhichResult) { + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + // Skip undef indices. + if (M[i] < 0) + continue; + if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult) + return false; + } + return true; +} + +/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts. +/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult. +static bool isZipMask(ArrayRef<int> M, unsigned NumElts, + unsigned &WhichResult) { + if (NumElts % 2 != 0) + return false; + + // 0 means use ZIP1, 1 means use ZIP2. + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) || + (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts)) + return false; + Idx += 1; + } + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a +/// G_REV instruction. Returns the appropriate G_REV opcode in \p Opc. +static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Dst); + unsigned EltSize = Ty.getScalarSizeInBits(); + + // Element size for a rev cannot be 64. + if (EltSize == 64) + return false; + + unsigned NumElts = Ty.getNumElements(); + + // Try to produce G_REV64 + if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) { + MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src}); + return true; + } + + // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support. + // This should be identical to above, but with a constant 32 and constant + // 16. + return false; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_TRN1 or G_TRN2 instruction. 
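+/// For example, with two <4 x s32> sources, the mask <0, 4, 2, 6> interleaves
+/// the even lanes of both sources and maps to G_TRN1, while <1, 5, 3, 7>
+/// interleaves the odd lanes and maps to G_TRN2 (masks derived from the
+/// isTRNMask predicate above; shown here for illustration only).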
+static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isTRNMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with +/// a G_UZP1 or G_UZP2 instruction. +/// +/// \param [in] MI - The shuffle vector instruction. +/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success. +static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isUZPMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + unsigned WhichResult; + ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask(); + Register Dst = MI.getOperand(0).getReg(); + unsigned NumElts = MRI.getType(Dst).getNumElements(); + if (!isZipMask(ShuffleMask, NumElts, WhichResult)) + return false; + unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); + return true; +} + +/// Helper function for matchDup. +static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + if (Lane != 0) + return false; + + // Try to match a vector splat operation into a dup instruction. + // We're looking for this pattern: + // + // %scalar:gpr(s64) = COPY $x0 + // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF + // %cst0:gpr(s32) = G_CONSTANT i32 0 + // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32) + // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32) + // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>) + // + // ...into: + // %splat = G_DUP %scalar + + // Begin matching the insert. + auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT, + MI.getOperand(1).getReg(), MRI); + if (!InsMI) + return false; + // Match the undef vector operand. + if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), + MRI)) + return false; + + // Match the index constant 0. + if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ZeroInt())) + return false; + + MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), + {InsMI->getOperand(2).getReg()}); + return true; +} + +/// Helper function for matchDup. 
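+/// Handles the case where the shuffle's first source is a G_BUILD_VECTOR, so
+/// the scalar defining the splatted lane can feed a G_DUP directly.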
+static bool matchDupFromBuildVector(int Lane, MachineInstr &MI, + MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(Lane >= 0 && "Expected positive lane?"); + // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the + // lane's definition directly. + auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR, + MI.getOperand(1).getReg(), MRI); + if (!BuildVecMI) + return false; + Register Reg = BuildVecMI->getOperand(Lane + 1).getReg(); + MatchInfo = + ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg}); + return true; +} + +static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + auto MaybeLane = getSplatIndex(MI); + if (!MaybeLane) + return false; + int Lane = *MaybeLane; + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane < 0) + Lane = 0; + if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo)) + return true; + if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo)) + return true; + return false; +} + +static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI, + ShuffleVectorPseudo &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + Register Dst = MI.getOperand(0).getReg(); + auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(), + MRI.getType(Dst).getNumElements()); + if (!ExtInfo) + return false; + bool ReverseExt; + uint64_t Imm; + std::tie(ReverseExt, Imm) = *ExtInfo; + Register V1 = MI.getOperand(1).getReg(); + Register V2 = MI.getOperand(2).getReg(); + if (ReverseExt) + std::swap(V1, V2); + uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8; + Imm *= ExtFactor; + MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm}); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo. +/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR. +static bool applyShuffleVectorPseudo(MachineInstr &MI, + ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps); + MI.eraseFromParent(); + return true; +} + +/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT. +/// Special-cased because the constant operand must be emitted as a G_CONSTANT +/// for the imported tablegen patterns to work. +static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) { + MachineIRBuilder MIRBuilder(MI); + // Tablegen patterns expect an i32 G_CONSTANT as the final op. + auto Cst = + MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm()); + MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, + {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst}); + MI.eraseFromParent(); + return true; +} + +/// isVShiftRImm - Check if this is a valid vector for the immediate +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift. +static bool isVShiftRImm(Register Reg, MachineRegisterInfo &MRI, LLT Ty, + int64_t &Cnt) { + assert(Ty.isVector() && "vector shift count is not a vector type"); + MachineInstr *MI = MRI.getVRegDef(Reg); + auto Cst = getBuildVectorConstantSplat(*MI, MRI); + if (!Cst) + return false; + Cnt = *Cst; + int64_t ElementBits = Ty.getScalarSizeInBits(); + return Cnt >= 1 && Cnt <= ElementBits; +} + +/// Match a vector G_ASHR or G_LSHR with a valid immediate shift. 
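+/// For example, a G_LSHR of a <4 x s32> value by a G_BUILD_VECTOR splat of 3
+/// matches here and is later rewritten to AArch64::G_VLSHR with immediate 3
+/// (see applyVAshrLshrImm below; the example values are illustrative only).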
+static bool matchVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI, + int64_t &Imm) { + assert(MI.getOpcode() == TargetOpcode::G_ASHR || + MI.getOpcode() == TargetOpcode::G_LSHR); + LLT Ty = MRI.getType(MI.getOperand(1).getReg()); + if (!Ty.isVector()) + return false; + return isVShiftRImm(MI.getOperand(2).getReg(), MRI, Ty, Imm); +} + +static bool applyVAshrLshrImm(MachineInstr &MI, MachineRegisterInfo &MRI, + int64_t &Imm) { + unsigned Opc = MI.getOpcode(); + assert(Opc == TargetOpcode::G_ASHR || Opc == TargetOpcode::G_LSHR); + unsigned NewOpc = + Opc == TargetOpcode::G_ASHR ? AArch64::G_VASHR : AArch64::G_VLSHR; + MachineIRBuilder MIB(MI); + auto ImmDef = MIB.buildConstant(LLT::scalar(32), Imm); + MIB.buildInstr(NewOpc, {MI.getOperand(0)}, {MI.getOperand(1), ImmDef}); + MI.eraseFromParent(); + return true; +} + +/// Determine if it is possible to modify the \p RHS and predicate \p P of a +/// G_ICMP instruction such that the right-hand side is an arithmetic immediate. +/// +/// \returns A pair containing the updated immediate and predicate which may +/// be used to optimize the instruction. +/// +/// \note This assumes that the comparison has been legalized. +Optional<std::pair<uint64_t, CmpInst::Predicate>> +tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, + const MachineRegisterInfo &MRI) { + const auto &Ty = MRI.getType(RHS); + if (Ty.isVector()) + return None; + unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && "Expected 32 or 64 bit compare only?"); + + // If the RHS is not a constant, or the RHS is already a valid arithmetic + // immediate, then there is nothing to change. + auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI); + if (!ValAndVReg) + return None; + uint64_t C = ValAndVReg->Value.getZExtValue(); + if (isLegalArithImmed(C)) + return None; + + // We have a non-arithmetic immediate. Check if adjusting the immediate and + // adjusting the predicate will result in a legal arithmetic immediate. + switch (P) { + default: + return None; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + // Check for + // + // x slt c => x sle c - 1 + // x sge c => x sgt c - 1 + // + // When c is not the smallest possible negative number. + if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) || + (Size == 32 && static_cast<int32_t>(C) == INT32_MIN)) + return None; + P = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT; + C -= 1; + break; + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_UGE: + // Check for + // + // x ult c => x ule c - 1 + // x uge c => x ugt c - 1 + // + // When c is not zero. + if (C == 0) + return None; + P = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; + C -= 1; + break; + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_SGT: + // Check for + // + // x sle c => x slt c + 1 + // x sgt c => s sge c + 1 + // + // When c is not the largest possible signed integer. + if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) || + (Size == 64 && static_cast<int64_t>(C) == INT64_MAX)) + return None; + P = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE; + C += 1; + break; + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_UGT: + // Check for + // + // x ule c => x ult c + 1 + // x ugt c => s uge c + 1 + // + // When c is not the largest possible unsigned integer. + if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) || + (Size == 64 && C == UINT64_MAX)) + return None; + P = (P == CmpInst::ICMP_ULE) ? 
CmpInst::ICMP_ULT : CmpInst::ICMP_UGE; + C += 1; + break; + } + + // Check if the new constant is valid, and return the updated constant and + // predicate if it is. + if (Size == 32) + C = static_cast<uint32_t>(C); + if (!isLegalArithImmed(C)) + return None; + return {{C, P}}; +} + +/// Determine whether or not it is possible to update the RHS and predicate of +/// a G_ICMP instruction such that the RHS will be selected as an arithmetic +/// immediate. +/// +/// \p MI - The G_ICMP instruction +/// \p MatchInfo - The new RHS immediate and predicate on success +/// +/// See tryAdjustICmpImmAndPred for valid transformations. +bool matchAdjustICmpImmAndPred( + MachineInstr &MI, const MachineRegisterInfo &MRI, + std::pair<uint64_t, CmpInst::Predicate> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_ICMP); + Register RHS = MI.getOperand(3).getReg(); + auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + if (auto MaybeNewImmAndPred = tryAdjustICmpImmAndPred(RHS, Pred, MRI)) { + MatchInfo = *MaybeNewImmAndPred; + return true; + } + return false; +} + +bool applyAdjustICmpImmAndPred( + MachineInstr &MI, std::pair<uint64_t, CmpInst::Predicate> &MatchInfo, + MachineIRBuilder &MIB, GISelChangeObserver &Observer) { + MIB.setInstrAndDebugLoc(MI); + MachineOperand &RHS = MI.getOperand(3); + MachineRegisterInfo &MRI = *MIB.getMRI(); + auto Cst = MIB.buildConstant(MRI.cloneVirtualRegister(RHS.getReg()), + MatchInfo.first); + Observer.changingInstr(MI); + RHS.setReg(Cst->getOperand(0).getReg()); + MI.getOperand(1).setPredicate(MatchInfo.second); + Observer.changedInstr(MI); + return true; +} + +bool matchDupLane(MachineInstr &MI, MachineRegisterInfo &MRI, + std::pair<unsigned, int> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + Register Src1Reg = MI.getOperand(1).getReg(); + const LLT SrcTy = MRI.getType(Src1Reg); + const LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + + auto LaneIdx = getSplatIndex(MI); + if (!LaneIdx) + return false; + + // The lane idx should be within the first source vector. 
+ if (*LaneIdx >= SrcTy.getNumElements()) + return false; + + if (DstTy != SrcTy) + return false; + + LLT ScalarTy = SrcTy.getElementType(); + unsigned ScalarSize = ScalarTy.getSizeInBits(); + + unsigned Opc = 0; + switch (SrcTy.getNumElements()) { + case 2: + if (ScalarSize == 64) + Opc = AArch64::G_DUPLANE64; + break; + case 4: + if (ScalarSize == 32) + Opc = AArch64::G_DUPLANE32; + break; + case 8: + if (ScalarSize == 16) + Opc = AArch64::G_DUPLANE16; + break; + case 16: + if (ScalarSize == 8) + Opc = AArch64::G_DUPLANE8; + break; + default: + break; + } + if (!Opc) + return false; + + MatchInfo.first = Opc; + MatchInfo.second = *LaneIdx; + return true; +} + +bool applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, std::pair<unsigned, int> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + B.setInstrAndDebugLoc(MI); + auto Lane = B.buildConstant(LLT::scalar(64), MatchInfo.second); + B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()}, + {MI.getOperand(1).getReg(), Lane}); + MI.eraseFromParent(); + return true; +} + +#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenPostLegalizeGILowering.inc" +#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_H +#include "AArch64GenPostLegalizeGILowering.inc" +#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_H + +class AArch64PostLegalizerLoweringInfo : public CombinerInfo { +public: + AArch64GenPostLegalizerLoweringHelperRuleConfig GeneratedRuleCfg; + + AArch64PostLegalizerLoweringInfo(bool OptSize, bool MinSize) + : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, + /*LegalizerInfo*/ nullptr, /*OptEnabled = */ true, OptSize, + MinSize) { + if (!GeneratedRuleCfg.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AArch64PostLegalizerLoweringInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B); + AArch64GenPostLegalizerLoweringHelper Generated(GeneratedRuleCfg); + return Generated.tryCombineAll(Observer, MI, B, Helper); +} + +#define AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenPostLegalizeGILowering.inc" +#undef AARCH64POSTLEGALIZERLOWERINGHELPER_GENCOMBINERHELPER_CPP + +class AArch64PostLegalizerLowering : public MachineFunctionPass { +public: + static char ID; + + AArch64PostLegalizerLowering(); + + StringRef getPassName() const override { + return "AArch64PostLegalizerLowering"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; +} // end anonymous namespace + +void AArch64PostLegalizerLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PostLegalizerLowering::AArch64PostLegalizerLowering() + : MachineFunctionPass(ID) { + initializeAArch64PostLegalizerLoweringPass(*PassRegistry::getPassRegistry()); +} + +bool AArch64PostLegalizerLowering::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + assert(MF.getProperties().hasProperty( + 
MachineFunctionProperties::Property::Legalized) && + "Expected a legalized function?"); + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + AArch64PostLegalizerLoweringInfo PCInfo(F.hasOptSize(), F.hasMinSize()); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AArch64PostLegalizerLowering::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PostLegalizerLowering, DEBUG_TYPE, + "Lower AArch64 MachineInstrs after legalization", false, + false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AArch64PostLegalizerLowering, DEBUG_TYPE, + "Lower AArch64 MachineInstrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createAArch64PostLegalizerLowering() { + return new AArch64PostLegalizerLowering(); +} +} // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp index 2f882ecb1f..00436b5924 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp @@ -1,187 +1,187 @@ -//=== AArch64PostSelectOptimize.cpp ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass does post-instruction-selection optimizations in the GlobalISel -// pipeline, before the rest of codegen runs. -// -//===----------------------------------------------------------------------===// - -#include "AArch64.h" -#include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Support/Debug.h" - -#define DEBUG_TYPE "aarch64-post-select-optimize" - -using namespace llvm; - -namespace { -class AArch64PostSelectOptimize : public MachineFunctionPass { -public: - static char ID; - - AArch64PostSelectOptimize(); - - StringRef getPassName() const override { - return "AArch64 Post Select Optimizer"; - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - bool optimizeNZCVDefs(MachineBasicBlock &MBB); -}; -} // end anonymous namespace - -void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetPassConfig>(); - AU.setPreservesCFG(); - getSelectionDAGFallbackAnalysisUsage(AU); - MachineFunctionPass::getAnalysisUsage(AU); -} - -AArch64PostSelectOptimize::AArch64PostSelectOptimize() - : MachineFunctionPass(ID) { - initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry()); -} - -unsigned getNonFlagSettingVariant(unsigned Opc) { - switch (Opc) { - default: - return 0; - case AArch64::SUBSXrr: - return AArch64::SUBXrr; - case AArch64::SUBSWrr: - return AArch64::SUBWrr; - case AArch64::SUBSXrs: - return AArch64::SUBXrs; - case AArch64::SUBSXri: - return AArch64::SUBXri; - case AArch64::SUBSWri: - return AArch64::SUBWri; - } -} - -bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) { - // Consider the following code: - // FCMPSrr %0, %1, 
implicit-def $nzcv - // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv - // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv - // FCMPSrr %0, %1, implicit-def $nzcv - // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv - // This kind of code where we have 2 FCMPs each feeding a CSEL can happen - // when we have a single IR fcmp being used by two selects. During selection, - // to ensure that there can be no clobbering of nzcv between the fcmp and the - // csel, we have to generate an fcmp immediately before each csel is - // selected. - // However, often we can essentially CSE these together later in MachineCSE. - // This doesn't work though if there are unrelated flag-setting instructions - // in between the two FCMPs. In this case, the SUBS defines NZCV - // but it doesn't have any users, being overwritten by the second FCMP. - // - // Our solution here is to try to convert flag setting operations between - // a interval of identical FCMPs, so that CSE will be able to eliminate one. - bool Changed = false; - const auto *TII = MBB.getParent()->getSubtarget().getInstrInfo(); - - // The first step is to find the first and last FCMPs. If we have found - // at least two, then set the limit of the bottom-up walk to the first FCMP - // found since we're only interested in dealing with instructions between - // them. - MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr; - for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) { - if (MI.getOpcode() == AArch64::FCMPSrr || - MI.getOpcode() == AArch64::FCMPDrr) { - if (!FirstCmp) - FirstCmp = &MI; - else - LastCmp = &MI; - } - } - - // In addition to converting flag-setting ops in fcmp ranges into non-flag - // setting ops, across the whole basic block we also detect when nzcv - // implicit-defs are dead, and mark them as dead. Peephole optimizations need - // this information later. - - LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo()); - LRU.addLiveOuts(MBB); - bool NZCVDead = LRU.available(AArch64::NZCV); - bool InsideCmpRange = false; - for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { - LRU.stepBackward(II); - - if (LastCmp) { // There's a range present in this block. - // If we're inside an fcmp range, look for begin instruction. - if (InsideCmpRange && &II == FirstCmp) - InsideCmpRange = false; - else if (&II == LastCmp) - InsideCmpRange = true; - } - - // Did this instruction define NZCV? - bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV); - if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) { - // If we have a def and NZCV is dead, then we may convert this op. - unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode()); - int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV); - if (DeadNZCVIdx != -1) { - // If we're inside an fcmp range, then convert flag setting ops. - if (InsideCmpRange && NewOpc) { - LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting " - "op in fcmp range: " - << II); - II.setDesc(TII->get(NewOpc)); - II.RemoveOperand(DeadNZCVIdx); - Changed |= true; - } else { - // Otherwise, we just set the nzcv imp-def operand to be dead, so the - // peephole optimizations can optimize them further. 
- II.getOperand(DeadNZCVIdx).setIsDead(); - } - } - } - - NZCVDead = NZCVDeadAtCurrInstr; - } - return Changed; -} - -bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) { - if (MF.getProperties().hasProperty( - MachineFunctionProperties::Property::FailedISel)) - return false; - assert(MF.getProperties().hasProperty( - MachineFunctionProperties::Property::Selected) && - "Expected a selected MF"); - - bool Changed = false; - for (auto &BB : MF) - Changed |= optimizeNZCVDefs(BB); - return true; -} - -char AArch64PostSelectOptimize::ID = 0; -INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE, - "Optimize AArch64 selected instructions", - false, false) -INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE, - "Optimize AArch64 selected instructions", false, - false) - -namespace llvm { -FunctionPass *createAArch64PostSelectOptimize() { - return new AArch64PostSelectOptimize(); -} -} // end namespace llvm +//=== AArch64PostSelectOptimize.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does post-instruction-selection optimizations in the GlobalISel +// pipeline, before the rest of codegen runs. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aarch64-post-select-optimize" + +using namespace llvm; + +namespace { +class AArch64PostSelectOptimize : public MachineFunctionPass { +public: + static char ID; + + AArch64PostSelectOptimize(); + + StringRef getPassName() const override { + return "AArch64 Post Select Optimizer"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool optimizeNZCVDefs(MachineBasicBlock &MBB); +}; +} // end anonymous namespace + +void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + MachineFunctionPass::getAnalysisUsage(AU); +} + +AArch64PostSelectOptimize::AArch64PostSelectOptimize() + : MachineFunctionPass(ID) { + initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry()); +} + +unsigned getNonFlagSettingVariant(unsigned Opc) { + switch (Opc) { + default: + return 0; + case AArch64::SUBSXrr: + return AArch64::SUBXrr; + case AArch64::SUBSWrr: + return AArch64::SUBWrr; + case AArch64::SUBSXrs: + return AArch64::SUBXrs; + case AArch64::SUBSXri: + return AArch64::SUBXri; + case AArch64::SUBSWri: + return AArch64::SUBWri; + } +} + +bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) { + // Consider the following code: + // FCMPSrr %0, %1, implicit-def $nzcv + // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv + // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv + // FCMPSrr %0, %1, implicit-def $nzcv + // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv + // This kind of code where we have 2 FCMPs each feeding a CSEL 
can happen + // when we have a single IR fcmp being used by two selects. During selection, + // to ensure that there can be no clobbering of nzcv between the fcmp and the + // csel, we have to generate an fcmp immediately before each csel is + // selected. + // However, often we can essentially CSE these together later in MachineCSE. + // This doesn't work though if there are unrelated flag-setting instructions + // in between the two FCMPs. In this case, the SUBS defines NZCV + // but it doesn't have any users, being overwritten by the second FCMP. + // + // Our solution here is to try to convert flag setting operations between + // a interval of identical FCMPs, so that CSE will be able to eliminate one. + bool Changed = false; + const auto *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + + // The first step is to find the first and last FCMPs. If we have found + // at least two, then set the limit of the bottom-up walk to the first FCMP + // found since we're only interested in dealing with instructions between + // them. + MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr; + for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) { + if (MI.getOpcode() == AArch64::FCMPSrr || + MI.getOpcode() == AArch64::FCMPDrr) { + if (!FirstCmp) + FirstCmp = &MI; + else + LastCmp = &MI; + } + } + + // In addition to converting flag-setting ops in fcmp ranges into non-flag + // setting ops, across the whole basic block we also detect when nzcv + // implicit-defs are dead, and mark them as dead. Peephole optimizations need + // this information later. + + LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo()); + LRU.addLiveOuts(MBB); + bool NZCVDead = LRU.available(AArch64::NZCV); + bool InsideCmpRange = false; + for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) { + LRU.stepBackward(II); + + if (LastCmp) { // There's a range present in this block. + // If we're inside an fcmp range, look for begin instruction. + if (InsideCmpRange && &II == FirstCmp) + InsideCmpRange = false; + else if (&II == LastCmp) + InsideCmpRange = true; + } + + // Did this instruction define NZCV? + bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV); + if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) { + // If we have a def and NZCV is dead, then we may convert this op. + unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode()); + int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV); + if (DeadNZCVIdx != -1) { + // If we're inside an fcmp range, then convert flag setting ops. + if (InsideCmpRange && NewOpc) { + LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting " + "op in fcmp range: " + << II); + II.setDesc(TII->get(NewOpc)); + II.RemoveOperand(DeadNZCVIdx); + Changed |= true; + } else { + // Otherwise, we just set the nzcv imp-def operand to be dead, so the + // peephole optimizations can optimize them further. 
+ II.getOperand(DeadNZCVIdx).setIsDead(); + } + } + } + + NZCVDead = NZCVDeadAtCurrInstr; + } + return Changed; +} + +bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + assert(MF.getProperties().hasProperty( + MachineFunctionProperties::Property::Selected) && + "Expected a selected MF"); + + bool Changed = false; + for (auto &BB : MF) + Changed |= optimizeNZCVDefs(BB); + return true; +} + +char AArch64PostSelectOptimize::ID = 0; +INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE, + "Optimize AArch64 selected instructions", + false, false) +INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE, + "Optimize AArch64 selected instructions", false, + false) + +namespace llvm { +FunctionPass *createAArch64PostSelectOptimize() { + return new AArch64PostSelectOptimize(); +} +} // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 5f9b64e274..2686f6dc46 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -104,16 +104,16 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, return Helper.tryCombineConcatVectors(MI); case TargetOpcode::G_SHUFFLE_VECTOR: return Helper.tryCombineShuffleVector(MI); - case TargetOpcode::G_MEMCPY: - case TargetOpcode::G_MEMMOVE: - case TargetOpcode::G_MEMSET: { - // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other - // heuristics decide. - unsigned MaxLen = EnableOpt ? 0 : 32; - // Try to inline memcpy type calls if optimizations are enabled. - return !EnableMinSize ? Helper.tryCombineMemCpyFamily(MI, MaxLen) : false; - } + case TargetOpcode::G_MEMCPY: + case TargetOpcode::G_MEMMOVE: + case TargetOpcode::G_MEMSET: { + // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other + // heuristics decide. + unsigned MaxLen = EnableOpt ? 0 : 32; + // Try to inline memcpy type calls if optimizations are enabled. + return !EnableMinSize ? 
Helper.tryCombineMemCpyFamily(MI, MaxLen) : false; } + } return false; } @@ -188,7 +188,7 @@ INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, namespace llvm { -FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone) { +FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone) { return new AArch64PreLegalizerCombiner(IsOptNone); } } // end namespace llvm diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index c76c43389b..e26fe60d93 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -13,7 +13,7 @@ #include "AArch64RegisterBankInfo.h" #include "AArch64InstrInfo.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -466,10 +466,10 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping( getValueMapping(RBIdx, Size), NumOperands); } -bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - unsigned Depth) const { +bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth) const { unsigned Op = MI.getOpcode(); // Do we have an explicit floating point instruction? @@ -481,30 +481,30 @@ bool AArch64RegisterBankInfo::hasFPConstraints(const MachineInstr &MI, if (Op != TargetOpcode::COPY && !MI.isPHI()) return false; - // Check if we already know the register bank. - auto *RB = getRegBank(MI.getOperand(0).getReg(), MRI, TRI); - if (RB == &AArch64::FPRRegBank) - return true; - if (RB == &AArch64::GPRRegBank) - return false; - - // We don't know anything. - // - // If we have a phi, we may be able to infer that it will be assigned a FPR - // based off of its inputs. - if (!MI.isPHI() || Depth > MaxFPRSearchDepth) - return false; - - return any_of(MI.explicit_uses(), [&](const MachineOperand &Op) { - return Op.isReg() && - onlyDefinesFP(*MRI.getVRegDef(Op.getReg()), MRI, TRI, Depth + 1); - }); + // Check if we already know the register bank. + auto *RB = getRegBank(MI.getOperand(0).getReg(), MRI, TRI); + if (RB == &AArch64::FPRRegBank) + return true; + if (RB == &AArch64::GPRRegBank) + return false; + + // We don't know anything. + // + // If we have a phi, we may be able to infer that it will be assigned a FPR + // based off of its inputs. 
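+  // For example, a PHI is treated as FPR-constrained here if any of its
+  // incoming values is already known to produce a floating-point result
+  // (onlyDefinesFP), searching recursively up to MaxFPRSearchDepth
+  // (illustrative restatement of the check below).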
+ if (!MI.isPHI() || Depth > MaxFPRSearchDepth) + return false; + + return any_of(MI.explicit_uses(), [&](const MachineOperand &Op) { + return Op.isReg() && + onlyDefinesFP(*MRI.getVRegDef(Op.getReg()), MRI, TRI, Depth + 1); + }); } bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - unsigned Depth) const { + const TargetRegisterInfo &TRI, + unsigned Depth) const { switch (MI.getOpcode()) { case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: @@ -513,13 +513,13 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, default: break; } - return hasFPConstraints(MI, MRI, TRI, Depth); + return hasFPConstraints(MI, MRI, TRI, Depth); } -bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - unsigned Depth) const { +bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth) const { switch (MI.getOpcode()) { case AArch64::G_DUP: case TargetOpcode::G_SITOFP: @@ -530,7 +530,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, default: break; } - return hasFPConstraints(MI, MRI, TRI, Depth); + return hasFPConstraints(MI, MRI, TRI, Depth); } const RegisterBankInfo::InstructionMapping & @@ -680,18 +680,18 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case TargetOpcode::G_SITOFP: - case TargetOpcode::G_UITOFP: { + case TargetOpcode::G_UITOFP: { if (MRI.getType(MI.getOperand(0).getReg()).isVector()) break; - // Integer to FP conversions don't necessarily happen between GPR -> FPR - // regbanks. They can also be done within an FPR register. - Register SrcReg = MI.getOperand(1).getReg(); - if (getRegBank(SrcReg, MRI, TRI) == &AArch64::FPRRegBank) - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; - else - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; + // Integer to FP conversions don't necessarily happen between GPR -> FPR + // regbanks. They can also be done within an FPR register. + Register SrcReg = MI.getOperand(1).getReg(); + if (getRegBank(SrcReg, MRI, TRI) == &AArch64::FPRRegBank) + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + else + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; break; - } + } case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) @@ -729,8 +729,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // assume this was a floating point load in the IR. // If it was not, we would have had a bitcast before // reaching that instruction. - // Int->FP conversion operations are also captured in onlyDefinesFP(). - if (onlyUsesFP(UseMI, MRI, TRI) || onlyDefinesFP(UseMI, MRI, TRI)) { + // Int->FP conversion operations are also captured in onlyDefinesFP(). + if (onlyUsesFP(UseMI, MRI, TRI) || onlyDefinesFP(UseMI, MRI, TRI)) { OpRegBankIdx[0] = PMI_FirstFPR; break; } @@ -853,7 +853,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } break; } - case TargetOpcode::G_BUILD_VECTOR: { + case TargetOpcode::G_BUILD_VECTOR: { // If the first source operand belongs to a FPR register bank, then make // sure that we preserve that. if (OpRegBankIdx[1] != PMI_FirstGPR) @@ -864,17 +864,17 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Get the instruction that defined the source operand reg, and check if // it's a floating point operation. 
Or, if it's a type like s16 which - // doesn't have a exact size gpr register class. The exception is if the - // build_vector has all constant operands, which may be better to leave as - // gpr without copies, so it can be matched in imported patterns. + // doesn't have a exact size gpr register class. The exception is if the + // build_vector has all constant operands, which may be better to leave as + // gpr without copies, so it can be matched in imported patterns. MachineInstr *DefMI = MRI.getVRegDef(VReg); unsigned DefOpc = DefMI->getOpcode(); const LLT SrcTy = MRI.getType(VReg); - if (all_of(MI.operands(), [&](const MachineOperand &Op) { - return Op.isDef() || MRI.getVRegDef(Op.getReg())->getOpcode() == - TargetOpcode::G_CONSTANT; - })) - break; + if (all_of(MI.operands(), [&](const MachineOperand &Op) { + return Op.isDef() || MRI.getVRegDef(Op.getReg())->getOpcode() == + TargetOpcode::G_CONSTANT; + })) + break; if (isPreISelGenericFloatingPointOpcode(DefOpc) || SrcTy.getSizeInBits() < 32) { // Have a floating point op. @@ -885,30 +885,30 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } break; } - case TargetOpcode::G_VECREDUCE_FADD: - case TargetOpcode::G_VECREDUCE_FMUL: - case TargetOpcode::G_VECREDUCE_FMAX: - case TargetOpcode::G_VECREDUCE_FMIN: - case TargetOpcode::G_VECREDUCE_ADD: - case TargetOpcode::G_VECREDUCE_MUL: - case TargetOpcode::G_VECREDUCE_AND: - case TargetOpcode::G_VECREDUCE_OR: - case TargetOpcode::G_VECREDUCE_XOR: - case TargetOpcode::G_VECREDUCE_SMAX: - case TargetOpcode::G_VECREDUCE_SMIN: - case TargetOpcode::G_VECREDUCE_UMAX: - case TargetOpcode::G_VECREDUCE_UMIN: - // Reductions produce a scalar value from a vector, the scalar should be on - // FPR bank. - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; - break; - case TargetOpcode::G_VECREDUCE_SEQ_FADD: - case TargetOpcode::G_VECREDUCE_SEQ_FMUL: - // These reductions also take a scalar accumulator input. - // Assign them FPR for now. - OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR, PMI_FirstFPR}; - break; - } + case TargetOpcode::G_VECREDUCE_FADD: + case TargetOpcode::G_VECREDUCE_FMUL: + case TargetOpcode::G_VECREDUCE_FMAX: + case TargetOpcode::G_VECREDUCE_FMIN: + case TargetOpcode::G_VECREDUCE_ADD: + case TargetOpcode::G_VECREDUCE_MUL: + case TargetOpcode::G_VECREDUCE_AND: + case TargetOpcode::G_VECREDUCE_OR: + case TargetOpcode::G_VECREDUCE_XOR: + case TargetOpcode::G_VECREDUCE_SMAX: + case TargetOpcode::G_VECREDUCE_SMIN: + case TargetOpcode::G_VECREDUCE_UMAX: + case TargetOpcode::G_VECREDUCE_UMIN: + // Reductions produce a scalar value from a vector, the scalar should be on + // FPR bank. + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR}; + break; + case TargetOpcode::G_VECREDUCE_SEQ_FADD: + case TargetOpcode::G_VECREDUCE_SEQ_FMUL: + // These reductions also take a scalar accumulator input. + // Assign them FPR for now. + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR, PMI_FirstFPR}; + break; + } // Finally construct the computed mapping. 
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h index 019017bc3e..c8cfe53299 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h @@ -114,20 +114,20 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { const InstructionMapping & getSameKindOfOperandsMapping(const MachineInstr &MI) const; - /// Maximum recursion depth for hasFPConstraints. - const unsigned MaxFPRSearchDepth = 2; - - /// \returns true if \p MI only uses and defines FPRs. + /// Maximum recursion depth for hasFPConstraints. + const unsigned MaxFPRSearchDepth = 2; + + /// \returns true if \p MI only uses and defines FPRs. bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; - /// \returns true if \p MI only uses FPRs. + /// \returns true if \p MI only uses FPRs. bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; - /// \returns true if \p MI only defines FPRs. + /// \returns true if \p MI only defines FPRs. bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; public: AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 2cbe8315bc..77b7c09946 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -763,8 +763,8 @@ static inline bool isSVECpyImm(int64_t Imm) { bool IsImm8 = int8_t(Imm) == Imm; bool IsImm16 = int16_t(Imm & ~0xff) == Imm; - if (std::is_same<int8_t, std::make_signed_t<T>>::value || - std::is_same<int8_t, T>::value) + if (std::is_same<int8_t, std::make_signed_t<T>>::value || + std::is_same<int8_t, T>::value) return IsImm8 || uint8_t(Imm) == Imm; if (std::is_same<int16_t, std::make_signed_t<T>>::value) @@ -776,8 +776,8 @@ static inline bool isSVECpyImm(int64_t Imm) { /// Returns true if Imm is valid for ADD/SUB. 
template <typename T> static inline bool isSVEAddSubImm(int64_t Imm) { - bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value || - std::is_same<int8_t, T>::value; + bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value || + std::is_same<int8_t, T>::value; return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm); } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 75a9f2f5c8..33448cef46 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -758,7 +758,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, assert(TheTriple.isOSBinFormatELF() && "Invalid target"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - bool IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32; + bool IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32; return new ELFAArch64AsmBackend(T, TheTriple, OSABI, /*IsLittleEndian=*/true, IsILP32); } @@ -771,7 +771,7 @@ MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - bool IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32; + bool IsILP32 = STI.getTargetTriple().getEnvironment() == Triple::GNUILP32; return new ELFAArch64AsmBackend(T, TheTriple, OSABI, /*IsLittleEndian=*/false, IsILP32); } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index fcf67bd2f7..6c98ac4737 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -43,7 +43,7 @@ protected: } // end anonymous namespace AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) - : MCELFObjectTargetWriter(/*Is64Bit*/ !IsILP32, OSABI, ELF::EM_AARCH64, + : MCELFObjectTargetWriter(/*Is64Bit*/ !IsILP32, OSABI, ELF::EM_AARCH64, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} @@ -322,11 +322,11 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(LDST64_ABS_LO12_NC); if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) { - AArch64MCExpr::VariantKind AddressLoc = - AArch64MCExpr::getAddressFrag(RefKind); + AArch64MCExpr::VariantKind AddressLoc = + AArch64MCExpr::getAddressFrag(RefKind); if (!IsILP32) { - if (AddressLoc == AArch64MCExpr::VK_LO15) - return ELF::R_AARCH64_LD64_GOTPAGE_LO15; + if (AddressLoc == AArch64MCExpr::VK_LO15) + return ELF::R_AARCH64_LD64_GOTPAGE_LO15; return ELF::R_AARCH64_LD64_GOT_LO12_NC; } else { Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index ec97e1c8b7..2135cf605b 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -51,61 +51,61 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer { OS << "\t.variant_pcs " << Symbol->getName() << "\n"; } - 
void EmitARM64WinCFIAllocStack(unsigned Size) override { - OS << "\t.seh_stackalloc " << Size << "\n"; - } - void EmitARM64WinCFISaveR19R20X(int Offset) override { - OS << "\t.seh_save_r19r20_x " << Offset << "\n"; - } - void EmitARM64WinCFISaveFPLR(int Offset) override { - OS << "\t.seh_save_fplr " << Offset << "\n"; - } - void EmitARM64WinCFISaveFPLRX(int Offset) override { - OS << "\t.seh_save_fplr_x " << Offset << "\n"; - } - void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override { - OS << "\t.seh_save_reg x" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override { - OS << "\t.seh_save_reg_x x" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override { - OS << "\t.seh_save_regp x" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override { - OS << "\t.seh_save_regp_x x" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override { - OS << "\t.seh_save_lrpair x" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override { - OS << "\t.seh_save_freg d" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override { - OS << "\t.seh_save_freg_x d" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override { - OS << "\t.seh_save_fregp d" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override { - OS << "\t.seh_save_fregp_x d" << Reg << ", " << Offset << "\n"; - } - void EmitARM64WinCFISetFP() override { OS << "\t.seh_set_fp\n"; } - void EmitARM64WinCFIAddFP(unsigned Size) override { - OS << "\t.seh_add_fp " << Size << "\n"; - } - void EmitARM64WinCFINop() override { OS << "\t.seh_nop\n"; } - void EmitARM64WinCFISaveNext() override { OS << "\t.seh_save_next\n"; } - void EmitARM64WinCFIPrologEnd() override { OS << "\t.seh_endprologue\n"; } - void EmitARM64WinCFIEpilogStart() override { OS << "\t.seh_startepilogue\n"; } - void EmitARM64WinCFIEpilogEnd() override { OS << "\t.seh_endepilogue\n"; } - void EmitARM64WinCFITrapFrame() override { OS << "\t.seh_trap_frame\n"; } - void EmitARM64WinCFIMachineFrame() override { OS << "\t.seh_pushframe\n"; } - void EmitARM64WinCFIContext() override { OS << "\t.seh_context\n"; } - void EmitARM64WinCFIClearUnwoundToCall() override { - OS << "\t.seh_clear_unwound_to_call\n"; - } - + void EmitARM64WinCFIAllocStack(unsigned Size) override { + OS << "\t.seh_stackalloc " << Size << "\n"; + } + void EmitARM64WinCFISaveR19R20X(int Offset) override { + OS << "\t.seh_save_r19r20_x " << Offset << "\n"; + } + void EmitARM64WinCFISaveFPLR(int Offset) override { + OS << "\t.seh_save_fplr " << Offset << "\n"; + } + void EmitARM64WinCFISaveFPLRX(int Offset) override { + OS << "\t.seh_save_fplr_x " << Offset << "\n"; + } + void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override { + OS << "\t.seh_save_reg x" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override { + OS << "\t.seh_save_reg_x x" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override { + OS << "\t.seh_save_regp x" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override { + OS << "\t.seh_save_regp_x x" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveLRPair(unsigned Reg, 
int Offset) override { + OS << "\t.seh_save_lrpair x" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override { + OS << "\t.seh_save_freg d" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override { + OS << "\t.seh_save_freg_x d" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override { + OS << "\t.seh_save_fregp d" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override { + OS << "\t.seh_save_fregp_x d" << Reg << ", " << Offset << "\n"; + } + void EmitARM64WinCFISetFP() override { OS << "\t.seh_set_fp\n"; } + void EmitARM64WinCFIAddFP(unsigned Size) override { + OS << "\t.seh_add_fp " << Size << "\n"; + } + void EmitARM64WinCFINop() override { OS << "\t.seh_nop\n"; } + void EmitARM64WinCFISaveNext() override { OS << "\t.seh_save_next\n"; } + void EmitARM64WinCFIPrologEnd() override { OS << "\t.seh_endprologue\n"; } + void EmitARM64WinCFIEpilogStart() override { OS << "\t.seh_startepilogue\n"; } + void EmitARM64WinCFIEpilogEnd() override { OS << "\t.seh_endepilogue\n"; } + void EmitARM64WinCFITrapFrame() override { OS << "\t.seh_trap_frame\n"; } + void EmitARM64WinCFIMachineFrame() override { OS << "\t.seh_pushframe\n"; } + void EmitARM64WinCFIContext() override { OS << "\t.seh_context\n"; } + void EmitARM64WinCFIClearUnwoundToCall() override { + OS << "\t.seh_clear_unwound_to_call\n"; + } + public: AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); }; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 340120d2b9..4aeb45ac21 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -849,7 +849,7 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, } break; } - } else if (CnVal == 8 || CnVal == 9) { + } else if (CnVal == 8 || CnVal == 9) { // TLBI aliases const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding); if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits())) @@ -1377,8 +1377,8 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address, } } -void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, uint64_t Address, - unsigned OpNum, +void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, uint64_t Address, + unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNum); @@ -1386,11 +1386,11 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, uint64_t Address, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. 
if (Op.isImm()) { - const int64_t Offset = Op.getImm() * 4096; - if (PrintBranchImmAsAddress) - O << formatHex((Address & -4096) + Offset); - else - O << "#" << Offset; + const int64_t Offset = Op.getImm() * 4096; + if (PrintBranchImmAsAddress) + O << formatHex((Address & -4096) + Offset); + else + O << "#" << Offset; return; } @@ -1421,22 +1421,22 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, O << "#" << Val; } -void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - assert(MI->getOpcode() == AArch64::DSBnXS); - - StringRef Name; - auto DB = AArch64DBnXS::lookupDBnXSByEncoding(Val); - Name = DB ? DB->Name : ""; - - if (!Name.empty()) - O << Name; - else - O << "#" << Val; -} - +void AArch64InstPrinter::printBarriernXSOption(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + assert(MI->getOpcode() == AArch64::DSBnXS); + + StringRef Name; + auto DB = AArch64DBnXS::lookupDBnXSByEncoding(Val); + Name = DB ? DB->Name : ""; + + if (!Name.empty()) + O << Name; + else + O << "#" << Val; +} + void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1644,10 +1644,10 @@ void AArch64InstPrinter::printGPR64as32(const MCInst *MI, unsigned OpNum, unsigned Reg = MI->getOperand(OpNum).getReg(); O << getRegisterName(getWRegFromXReg(Reg)); } - -void AArch64InstPrinter::printGPR64x8(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - O << getRegisterName(MRI.getSubReg(Reg, AArch64::x8sub_0)); -} + +void AArch64InstPrinter::printGPR64x8(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + O << getRegisterName(MRI.getSubReg(Reg, AArch64::x8sub_0)); +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index 4be885e667..b1952ebd27 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -30,7 +30,7 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; // Autogenerated by tblgen. 
- std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; virtual void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O); virtual bool printAliasInstr(const MCInst *MI, uint64_t Address, @@ -156,12 +156,12 @@ protected: void printVectorIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, + void printAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printBarrierOption(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printBarriernXSOption(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); + void printBarriernXSOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, @@ -190,8 +190,8 @@ protected: const MCSubtargetInfo &STI, raw_ostream &O); void printGPR64as32(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printGPR64x8(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); + void printGPR64x8(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); template <int Width> void printZPRasFPR(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); @@ -208,7 +208,7 @@ public: void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &O) override; - std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O) override; bool printAliasInstr(const MCInst *MI, uint64_t Address, diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 68c721cb0d..257ecd33d2 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -73,7 +73,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { // targeting ELF. AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant; - CodePointerSize = T.getEnvironment() == Triple::GNUILP32 ? 4 : 8; + CodePointerSize = T.getEnvironment() == Triple::GNUILP32 ? 4 : 8; // ".comm align is in bytes but .align is pow-2." 
AlignmentIsInBytes = false; @@ -111,7 +111,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { SupportsDebugInformation = true; CodePointerSize = 8; - CommentString = "//"; + CommentString = "//"; ExceptionsType = ExceptionHandling::WinEH; WinEHEncodingType = WinEH::EncodingType::Itanium; } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 844bd6bbad..dd975cd363 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -70,7 +70,7 @@ StringRef AArch64MCExpr::getVariantKindName() const { case VK_ABS_PAGE_NC: return ":pg_hi21_nc:"; case VK_GOT: return ":got:"; case VK_GOT_PAGE: return ":got:"; - case VK_GOT_PAGE_LO15: return ":gotpage_lo15:"; + case VK_GOT_PAGE_LO15: return ":gotpage_lo15:"; case VK_GOT_LO12: return ":got_lo12:"; case VK_GOTTPREL: return ":gottprel:"; case VK_GOTTPREL_PAGE: return ":gottprel:"; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index d3e834a140..6e191cd455 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -46,7 +46,7 @@ public: VK_G1 = 0x050, VK_G2 = 0x060, VK_G3 = 0x070, - VK_LO15 = 0x080, + VK_LO15 = 0x080, VK_AddressFragBits = 0x0f0, // Whether the final relocation is a checked one (where a linker should @@ -83,7 +83,7 @@ public: VK_PREL_G0_NC = VK_PREL | VK_G0 | VK_NC, VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, VK_GOT_PAGE = VK_GOT | VK_PAGE, - VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC, + VK_GOT_PAGE_LO15 = VK_GOT | VK_LO15 | VK_NC, VK_DTPREL_G2 = VK_DTPREL | VK_G2, VK_DTPREL_G1 = VK_DTPREL | VK_G1, VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 3c2df1621e..98dcd9a96a 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -50,14 +50,14 @@ static MCInstrInfo *createAArch64MCInstrInfo() { static MCSubtargetInfo * createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - if (CPU.empty()) { + if (CPU.empty()) { CPU = "generic"; - if (TT.isArm64e()) - CPU = "apple-a12"; - } - - return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); + if (TT.isArm64e()) + CPU = "apple-a12"; + } + + return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); } void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 012661edbb..f2384aa588 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -373,11 +373,11 @@ void AArch64MachObjectWriter::recordRelocation( Type == MachO::ARM64_RELOC_PAGE21 || Type == MachO::ARM64_RELOC_PAGEOFF12) && Value) { - if (!isInt<24>(Value)) { - Asm.getContext().reportError(Fixup.getLoc(), - "addend too big for relocation"); - return; - } + if (!isInt<24>(Value)) { + 
Asm.getContext().reportError(Fixup.getLoc(), + "addend too big for relocation"); + return; + } MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index f32a8f15b8..8f3e876061 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -11,23 +11,23 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetStreamer.h" -#include "AArch64MCAsmInfo.h" -#include "AArch64Subtarget.h" -#include "llvm/BinaryFormat/ELF.h" +#include "AArch64MCAsmInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/ConstantPools.h" -#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; -static cl::opt<bool> MarkBTIProperty( - "aarch64-mark-bti-property", cl::Hidden, - cl::desc("Add .note.gnu.property with BTI to assembly files"), - cl::init(false)); - +static cl::opt<bool> MarkBTIProperty( + "aarch64-mark-bti-property", cl::Hidden, + cl::desc("Add .note.gnu.property with BTI to assembly files"), + cl::init(false)); + // // AArch64TargetStreamer Implemenation // @@ -48,51 +48,51 @@ void AArch64TargetStreamer::emitCurrentConstantPool() { ConstantPools->emitForCurrentSection(Streamer); } -// finish() - write out any non-empty assembler constant pools and -// write out note.gnu.properties if need. -void AArch64TargetStreamer::finish() { - ConstantPools->emitAll(Streamer); - - if (MarkBTIProperty) - emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI); -} - -void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { - if (Flags == 0) - return; - - MCStreamer &OutStreamer = getStreamer(); - MCContext &Context = OutStreamer.getContext(); - // Emit a .note.gnu.property section with the flags. - MCSectionELF *Nt = Context.getELFSection(".note.gnu.property", ELF::SHT_NOTE, - ELF::SHF_ALLOC); - if (Nt->isRegistered()) { - SMLoc Loc; - Context.reportWarning( - Loc, - "The .note.gnu.property is not emitted because it is already present."); - return; - } - MCSection *Cur = OutStreamer.getCurrentSectionOnly(); - OutStreamer.SwitchSection(Nt); - - // Emit the note header. - OutStreamer.emitValueToAlignment(Align(8).value()); - OutStreamer.emitIntValue(4, 4); // data size for "GNU\0" - OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size - OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); - OutStreamer.emitBytes(StringRef("GNU", 4)); // note name - - // Emit the PAC/BTI properties. - OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); - OutStreamer.emitIntValue(4, 4); // data size - OutStreamer.emitIntValue(Flags, 4); // data - OutStreamer.emitIntValue(0, 4); // pad - - OutStreamer.endSection(Nt); - OutStreamer.SwitchSection(Cur); -} - +// finish() - write out any non-empty assembler constant pools and +// write out note.gnu.properties if need. 
+void AArch64TargetStreamer::finish() { + ConstantPools->emitAll(Streamer); + + if (MarkBTIProperty) + emitNoteSection(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI); +} + +void AArch64TargetStreamer::emitNoteSection(unsigned Flags) { + if (Flags == 0) + return; + + MCStreamer &OutStreamer = getStreamer(); + MCContext &Context = OutStreamer.getContext(); + // Emit a .note.gnu.property section with the flags. + MCSectionELF *Nt = Context.getELFSection(".note.gnu.property", ELF::SHT_NOTE, + ELF::SHF_ALLOC); + if (Nt->isRegistered()) { + SMLoc Loc; + Context.reportWarning( + Loc, + "The .note.gnu.property is not emitted because it is already present."); + return; + } + MCSection *Cur = OutStreamer.getCurrentSectionOnly(); + OutStreamer.SwitchSection(Nt); + + // Emit the note header. + OutStreamer.emitValueToAlignment(Align(8).value()); + OutStreamer.emitIntValue(4, 4); // data size for "GNU\0" + OutStreamer.emitIntValue(4 * 4, 4); // Elf_Prop size + OutStreamer.emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); + OutStreamer.emitBytes(StringRef("GNU", 4)); // note name + + // Emit the PAC/BTI properties. + OutStreamer.emitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); + OutStreamer.emitIntValue(4, 4); // data size + OutStreamer.emitIntValue(Flags, 4); // data + OutStreamer.emitIntValue(0, 4); // pad + + OutStreamer.endSection(Nt); + OutStreamer.SwitchSection(Cur); +} + void AArch64TargetStreamer::emitInst(uint32_t Inst) { char Buffer[4]; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index 73dc1e5d4d..5212d70a57 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -33,9 +33,9 @@ public: /// Emit contents of constant pool for the current section. void emitCurrentConstantPool(); - /// Callback used to implement the .note.gnu.property section. - void emitNoteSection(unsigned Flags); - + /// Callback used to implement the .note.gnu.property section. + void emitNoteSection(unsigned Flags); + /// Callback used to implement the .inst directive. 
virtual void emitInst(uint32_t Inst); @@ -43,14 +43,14 @@ public: virtual void emitDirectiveVariantPCS(MCSymbol *Symbol) {}; virtual void EmitARM64WinCFIAllocStack(unsigned Size) {} - virtual void EmitARM64WinCFISaveR19R20X(int Offset) {} + virtual void EmitARM64WinCFISaveR19R20X(int Offset) {} virtual void EmitARM64WinCFISaveFPLR(int Offset) {} virtual void EmitARM64WinCFISaveFPLRX(int Offset) {} virtual void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) {} virtual void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) {} virtual void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) {} virtual void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) {} - virtual void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) {} + virtual void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) {} virtual void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) {} virtual void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) {} virtual void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) {} @@ -58,14 +58,14 @@ public: virtual void EmitARM64WinCFISetFP() {} virtual void EmitARM64WinCFIAddFP(unsigned Size) {} virtual void EmitARM64WinCFINop() {} - virtual void EmitARM64WinCFISaveNext() {} + virtual void EmitARM64WinCFISaveNext() {} virtual void EmitARM64WinCFIPrologEnd() {} virtual void EmitARM64WinCFIEpilogStart() {} virtual void EmitARM64WinCFIEpilogEnd() {} - virtual void EmitARM64WinCFITrapFrame() {} - virtual void EmitARM64WinCFIMachineFrame() {} - virtual void EmitARM64WinCFIContext() {} - virtual void EmitARM64WinCFIClearUnwoundToCall() {} + virtual void EmitARM64WinCFITrapFrame() {} + virtual void EmitARM64WinCFIMachineFrame() {} + virtual void EmitARM64WinCFIContext() {} + virtual void EmitARM64WinCFIClearUnwoundToCall() {} private: std::unique_ptr<AssemblerConstantPools> ConstantPools; @@ -96,14 +96,14 @@ public: // The unwind codes on ARM64 Windows are documented at // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling void EmitARM64WinCFIAllocStack(unsigned Size) override; - void EmitARM64WinCFISaveR19R20X(int Offset) override; + void EmitARM64WinCFISaveR19R20X(int Offset) override; void EmitARM64WinCFISaveFPLR(int Offset) override; void EmitARM64WinCFISaveFPLRX(int Offset) override; void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override; void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override; void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override; void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override; - void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override; + void EmitARM64WinCFISaveLRPair(unsigned Reg, int Offset) override; void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override; void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override; void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override; @@ -111,15 +111,15 @@ public: void EmitARM64WinCFISetFP() override; void EmitARM64WinCFIAddFP(unsigned Size) override; void EmitARM64WinCFINop() override; - void EmitARM64WinCFISaveNext() override; + void EmitARM64WinCFISaveNext() override; void EmitARM64WinCFIPrologEnd() override; void EmitARM64WinCFIEpilogStart() override; void EmitARM64WinCFIEpilogEnd() override; - void EmitARM64WinCFITrapFrame() override; - void EmitARM64WinCFIMachineFrame() override; - void EmitARM64WinCFIContext() override; - void EmitARM64WinCFIClearUnwoundToCall() override; - + void EmitARM64WinCFITrapFrame() override; + void EmitARM64WinCFIMachineFrame() override; + void EmitARM64WinCFIContext() override; + void 
EmitARM64WinCFIClearUnwoundToCall() override; + private: void EmitARM64WinUnwindCode(unsigned UnwindCode, int Reg, int Offset); }; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp index 1c50706a26..603446f40d 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -28,7 +28,7 @@ public: void EmitWinEHHandlerData(SMLoc Loc) override; void EmitWindowsUnwindTables() override; - void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; + void EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) override; void finishImpl() override; }; @@ -37,14 +37,14 @@ void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! - EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo(), - /* HandlerData = */ true); -} - -void AArch64WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { - EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); + EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo(), + /* HandlerData = */ true); } +void AArch64WinCOFFStreamer::EmitWindowsUnwindTables(WinEH::FrameInfo *Frame) { + EHStreamer.EmitUnwindInfo(*this, Frame, /* HandlerData = */ false); +} + void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() { if (!getNumWinFrameInfos()) return; @@ -91,10 +91,10 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAllocStack(unsigned Size) { EmitARM64WinUnwindCode(Op, -1, Size); } -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveR19R20X(int Offset) { - EmitARM64WinUnwindCode(Win64EH::UOP_SaveR19R20X, -1, Offset); -} - +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveR19R20X(int Offset) { + EmitARM64WinUnwindCode(Win64EH::UOP_SaveR19R20X, -1, Offset); +} + void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLR(int Offset) { EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLR, -1, Offset); } @@ -125,11 +125,11 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegPX(unsigned Reg, EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegPX, Reg, Offset); } -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveLRPair(unsigned Reg, - int Offset) { - EmitARM64WinUnwindCode(Win64EH::UOP_SaveLRPair, Reg, Offset); -} - +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveLRPair(unsigned Reg, + int Offset) { + EmitARM64WinUnwindCode(Win64EH::UOP_SaveLRPair, Reg, Offset); +} + void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) { assert(Offset >= 0 && Offset <= 504 && @@ -165,10 +165,10 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFINop() { EmitARM64WinUnwindCode(Win64EH::UOP_Nop, -1, 0); } -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveNext() { - EmitARM64WinUnwindCode(Win64EH::UOP_SaveNext, -1, 0); -} - +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveNext() { + EmitARM64WinUnwindCode(Win64EH::UOP_SaveNext, -1, 0); +} + // The functions below handle opcodes that can end up in either a prolog or // an epilog, but not both. 
void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() { @@ -207,22 +207,22 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() { CurrentEpilog = nullptr; } -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFITrapFrame() { - EmitARM64WinUnwindCode(Win64EH::UOP_TrapFrame, -1, 0); -} - -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIMachineFrame() { - EmitARM64WinUnwindCode(Win64EH::UOP_PushMachFrame, -1, 0); -} - -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIContext() { - EmitARM64WinUnwindCode(Win64EH::UOP_Context, -1, 0); -} - -void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIClearUnwoundToCall() { - EmitARM64WinUnwindCode(Win64EH::UOP_ClearUnwoundToCall, -1, 0); -} - +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFITrapFrame() { + EmitARM64WinUnwindCode(Win64EH::UOP_TrapFrame, -1, 0); +} + +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIMachineFrame() { + EmitARM64WinUnwindCode(Win64EH::UOP_PushMachFrame, -1, 0); +} + +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIContext() { + EmitARM64WinUnwindCode(Win64EH::UOP_Context, -1, 0); +} + +void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIClearUnwoundToCall() { + EmitARM64WinUnwindCode(Win64EH::UOP_ClearUnwoundToCall, -1, 0); +} + MCWinCOFFStreamer *createAArch64WinCOFFStreamer( MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, diff --git a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/ya.make b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/ya.make index 18b5c7460f..9a6f23a3c8 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/ya.make +++ b/contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc/ya.make @@ -12,19 +12,19 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/BinaryFormat - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target/AArch64/TargetInfo - contrib/libs/llvm12/lib/Target/AArch64/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/BinaryFormat + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target/AArch64/TargetInfo + contrib/libs/llvm12/lib/Target/AArch64/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/AArch64/SVEInstrFormats.td b/contrib/libs/llvm12/lib/Target/AArch64/SVEInstrFormats.td index 4eecf72862..0c31ac1f9a 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/SVEInstrFormats.td +++ b/contrib/libs/llvm12/lib/Target/AArch64/SVEInstrFormats.td @@ -206,20 +206,20 @@ def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; -def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>; -def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>; -def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>; -def SVEArithUImm64Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i64>", []>; +def SVEArithUImm8Pat : ComplexPattern<i32, 1, 
"SelectSVEArithImm<MVT::i8>", []>; +def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>; +def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>; +def SVEArithUImm64Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i64>", []>; def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>; -def SVEShiftImmL8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 7>", []>; -def SVEShiftImmL16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 15>", []>; -def SVEShiftImmL32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 31>", []>; -def SVEShiftImmL64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 63>", []>; -def SVEShiftImmR8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 8, true>", []>; -def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []>; -def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>; -def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>; +def SVEShiftImmL8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 7>", []>; +def SVEShiftImmL16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 15>", []>; +def SVEShiftImmL32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 31>", []>; +def SVEShiftImmL64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 63>", []>; +def SVEShiftImmR8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 8, true>", []>; +def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []>; +def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>; +def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>; class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass { let Name = "SVEExactFPImmOperand" # Suffix; @@ -280,8 +280,8 @@ class sve_int_ptrue<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; let Defs = !if(!eq (opc{0}, 1), [NZCV], []); - let ElementSize = pprty.ElementSize; - let isReMaterializable = 1; + let ElementSize = pprty.ElementSize; + let isReMaterializable = 1; } multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> { @@ -317,18 +317,18 @@ class SVE_1_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1)), (inst $Op1)>; -class SVE_1_Op_Passthru_Pat<ValueType vtd, SDPatternOperator op, ValueType pg, - ValueType vts, Instruction inst> -: Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)), - (inst $Op3, $Op1, $Op2)>; - -// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the -// type of rounding. This is matched by timm0_1 in pattern below and ignored. -class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg, - ValueType vts, Instruction inst> -: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)), - (inst $Op3, $Op1, $Op2)>; - +class SVE_1_Op_Passthru_Pat<ValueType vtd, SDPatternOperator op, ValueType pg, + ValueType vts, Instruction inst> +: Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; + +// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the +// type of rounding. This is matched by timm0_1 in pattern below and ignored. 
+class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op, ValueType pg, + ValueType vts, Instruction inst> +: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; + class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst> : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))), @@ -354,11 +354,11 @@ class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1, vt2:$Op2)), (inst $Op1, $Op2)>; -class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op, - ValueType pt, ValueType vt1, ValueType vt2, - Instruction inst> -: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)), - (inst $Op1, $Op2)>; +class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op, + ValueType pt, ValueType vt1, ValueType vt2, + Instruction inst> +: Pat<(vtd (op (pt (AArch64ptrue 31)), vt1:$Op1, vt2:$Op2)), + (inst $Op1, $Op2)>; class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, ValueType vt2, ValueType vt3, Instruction inst> @@ -418,23 +418,23 @@ class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1, vt2:$Op2)), (inst (ptrue 31), $Op1, $Op2)>; -class SVE_InReg_Extend<ValueType vt, SDPatternOperator op, ValueType pt, - ValueType inreg_vt, Instruction inst> -: Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, vt:$PassThru)), - (inst $PassThru, $Pg, $Src)>; - -class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op, - ValueType pt, ValueType it, - ComplexPattern cast, Instruction inst> -: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), - (inst $Pg, $Rn, i32:$imm)>; - -class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op, - ValueType pt, ValueType it, - ComplexPattern cast, Instruction inst> -: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), - (inst $Rn, i32:$imm)>; - +class SVE_InReg_Extend<ValueType vt, SDPatternOperator op, ValueType pt, + ValueType inreg_vt, Instruction inst> +: Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, vt:$PassThru)), + (inst $PassThru, $Pg, $Src)>; + +class SVE_Shift_DupImm_Pred_Pat<ValueType vt, SDPatternOperator op, + ValueType pt, ValueType it, + ComplexPattern cast, Instruction inst> +: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), + (inst $Pg, $Rn, i32:$imm)>; + +class SVE_Shift_DupImm_All_Active_Pat<ValueType vt, SDPatternOperator op, + ValueType pt, ValueType it, + ComplexPattern cast, Instruction inst> +: Pat<(vt (op (pt (AArch64ptrue 31)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), + (inst $Rn, i32:$imm)>; + // // Pseudo -> Instruction mappings // @@ -511,8 +511,8 @@ class sve_int_pfalse<bits<6> opc, string asm> let Inst{9} = opc{0}; let Inst{8-4} = 0b00000; let Inst{3-0} = Pd; - - let isReMaterializable = 1; + + let isReMaterializable = 1; } class sve_int_ptest<bits<6> opc, string asm> @@ -533,7 +533,7 @@ class sve_int_ptest<bits<6> opc, string asm> let Inst{4-0} = 0b00000; let Defs = [NZCV]; - let isCompare = 1; + let isCompare = 1; } class sve_int_pfirst_next<bits<2> sz8_64, bits<5> opc, string asm, @@ -1014,8 +1014,8 @@ multiclass sve_int_perm_dup_i<string asm> { (!cast<Instruction>(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>; } -class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string asm, ZPRRegOp zprty, - RegisterOperand VecList> +class sve_int_perm_tbl<bits<2> sz8_64, bits<2> opc, string 
asm, ZPRRegOp zprty, + RegisterOperand VecList> : I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", "", @@ -1057,8 +1057,8 @@ multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> { def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>; - - def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>; + + def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>; } multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> { @@ -1101,11 +1101,11 @@ multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> { (nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0, nxv2f64:$Op2, zsub1), nxv2i64:$Op3))>; - - def : Pat<(nxv8bf16 (op nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)), - (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, - nxv8bf16:$Op2, zsub1), - nxv8i16:$Op3))>; + + def : Pat<(nxv8bf16 (op nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)), + (nxv8bf16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, + nxv8bf16:$Op2, zsub1), + nxv8i16:$Op3))>; } class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty> @@ -1141,8 +1141,8 @@ multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>; - - def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>; + + def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8i16, !cast<Instruction>(NAME # _H)>; } class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty> @@ -1173,8 +1173,8 @@ multiclass sve_int_perm_reverse_z<string asm, SDPatternOperator op> { def : SVE_1_Op_Pat<nxv8f16, op, nxv8f16, !cast<Instruction>(NAME # _H)>; def : SVE_1_Op_Pat<nxv4f32, op, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_1_Op_Pat<nxv2f64, op, nxv2f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_1_Op_Pat<nxv8bf16, op, nxv8bf16, !cast<Instruction>(NAME # _H)>; + + def : SVE_1_Op_Pat<nxv8bf16, op, nxv8bf16, !cast<Instruction>(NAME # _H)>; } class sve_int_perm_reverse_p<bits<2> sz8_64, string asm, PPRRegOp pprty> @@ -1287,8 +1287,8 @@ multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> { def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, f16, !cast<Instruction>(NAME # _H)>; def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, f32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, bf16, !cast<Instruction>(NAME # _H)>; + + def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, bf16, !cast<Instruction>(NAME # _H)>; } //===----------------------------------------------------------------------===// @@ -1375,8 +1375,8 @@ multiclass sve_int_sel_vvv<string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>; def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; - def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; - + def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, 
nxv8bf16, !cast<Instruction>(NAME # _H)>; + def : InstAlias<"mov $Zd, $Pg/m, $Zn", (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Zn", @@ -1713,8 +1713,8 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op, - SDPatternOperator predicated_op = null_frag> { +multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op, + SDPatternOperator predicated_op = null_frag> { def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; @@ -1723,9 +1723,9 @@ multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op, def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; - def : SVE_2_Op_Pred_All_Active<nxv8f16, predicated_op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pred_All_Active<nxv4f32, predicated_op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pred_All_Active<nxv2f64, predicated_op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pred_All_Active<nxv8f16, predicated_op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active<nxv4f32, predicated_op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pred_All_Active<nxv2f64, predicated_op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; } multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> { @@ -2117,8 +2117,8 @@ class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm, - SDPatternOperator op> { +multiclass sve2_fp_pairwise_pred<bits<3> opc, string asm, + SDPatternOperator op> { def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>; def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>; def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>; @@ -2270,11 +2270,11 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm, def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; def : SVE_2_Op_Pat<nxv4f16, op, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2f16, op, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _D)>; - def : SVE_2_Op_Pat<nxv2f32, op, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv2f16, op, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv2f32, op, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>; def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; + + def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; } //===----------------------------------------------------------------------===// @@ -2282,7 +2282,7 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm, //===----------------------------------------------------------------------===// class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype, - RegisterOperand o_zprtype, ElementSizeEnum Sz> + RegisterOperand o_zprtype, ElementSizeEnum 
Sz> : I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn), asm, "\t$Zd, $Pg/m, $Zn", "", @@ -2301,64 +2301,64 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype, let Constraints = "$Zd = $_Zd"; let DestructiveInstType = DestructiveOther; - let ElementSize = Sz; + let ElementSize = Sz; } multiclass sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype, RegisterOperand o_zprtype, - SDPatternOperator int_op, - SDPatternOperator ir_op, ValueType vt1, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>; - // convert vt1 to a packed type for the intrinsic patterns - defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16, - !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16, - !eq(!cast<string>(vt1), "nxv2f32"): nxv4f32, - 1 : vt1); - - // convert vt3 to a packed type for the intrinsic patterns - defvar packedvt3 = !cond(!eq(!cast<string>(vt3), "nxv2f16"): nxv8f16, - !eq(!cast<string>(vt3), "nxv4f16"): nxv8f16, - !eq(!cast<string>(vt3), "nxv2f32"): nxv4f32, - 1 : vt3); - - def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, packedvt3, !cast<Instruction>(NAME)>; - - def : SVE_1_Op_Passthru_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>; -} - -multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm, - RegisterOperand i_zprtype, - RegisterOperand o_zprtype, - SDPatternOperator int_op, - SDPatternOperator ir_op, ValueType vt1, - ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { - def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>; - - // convert vt1 to a packed type for the intrinsic patterns - defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16, - !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16, - !eq(!cast<string>(vt1), "nxv2f32"): nxv4f32, - 1 : vt1); - - def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, vt3, !cast<Instruction>(NAME)>; - - def : SVE_1_Op_Passthru_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>; -} - + // convert vt1 to a packed type for the intrinsic patterns + defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16, + !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16, + !eq(!cast<string>(vt1), "nxv2f32"): nxv4f32, + 1 : vt1); + + // convert vt3 to a packed type for the intrinsic patterns + defvar packedvt3 = !cond(!eq(!cast<string>(vt3), "nxv2f16"): nxv8f16, + !eq(!cast<string>(vt3), "nxv4f16"): nxv8f16, + !eq(!cast<string>(vt3), "nxv2f32"): nxv4f32, + 1 : vt3); + + def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, packedvt3, !cast<Instruction>(NAME)>; + + def : SVE_1_Op_Passthru_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>; +} + +multiclass sve_fp_2op_p_zdr<bits<7> opc, string asm, + RegisterOperand i_zprtype, + RegisterOperand o_zprtype, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, + ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { + def NAME : sve_fp_2op_p_zd<opc, asm, i_zprtype, o_zprtype, Sz>; + + // convert vt1 to a packed type for the intrinsic patterns + defvar packedvt1 = !cond(!eq(!cast<string>(vt1), "nxv2f16"): nxv8f16, + !eq(!cast<string>(vt1), "nxv4f16"): nxv8f16, + !eq(!cast<string>(vt1), "nxv2f32"): nxv4f32, + 1 : vt1); + + def : SVE_3_Op_Pat<packedvt1, int_op, packedvt1, vt2, vt3, !cast<Instruction>(NAME)>; + + def : SVE_1_Op_Passthru_Round_Pat<vt1, ir_op, vt2, vt3, !cast<Instruction>(NAME)>; +} + multiclass sve_fp_2op_p_zd_HSD<bits<5> opc, string asm, SDPatternOperator op> { def 
_H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>; - def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; } multiclass sve2_fp_flogb<string asm, SDPatternOperator op> { @@ -2466,19 +2466,19 @@ multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps, def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, string Ps, - SDPatternOperator op, - DestructiveInstTypeEnum flags> { - let DestructiveInstType = flags in { - def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>, - SVEPseudo2Instr<Ps # _B, 1>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>, - SVEPseudo2Instr<Ps # _H, 1>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>, - SVEPseudo2Instr<Ps # _S, 1>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>, - SVEPseudo2Instr<Ps # _D, 1>; - } +multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>, + SVEPseudo2Instr<Ps # _B, 1>; + def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>, + SVEPseudo2Instr<Ps # _H, 1>; + def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>; + def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>, + SVEPseudo2Instr<Ps # _D, 1>; + } def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; @@ -2486,19 +2486,19 @@ multiclass sve_int_bin_pred_arit_1<bits<3> opc, string asm, string Ps, def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, string Ps, - SDPatternOperator op, - DestructiveInstTypeEnum flags> { - let DestructiveInstType = flags in { - def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>, - SVEPseudo2Instr<Ps # _B, 1>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>, - SVEPseudo2Instr<Ps # _H, 1>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>, - SVEPseudo2Instr<Ps # _S, 1>; - def _D : 
sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>, - SVEPseudo2Instr<Ps # _D, 1>; - } +multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>, + SVEPseudo2Instr<Ps # _B, 1>; + def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>, + SVEPseudo2Instr<Ps # _H, 1>; + def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>, + SVEPseudo2Instr<Ps # _S, 1>; + def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>, + SVEPseudo2Instr<Ps # _D, 1>; + } def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; @@ -2588,8 +2588,8 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op, - SDPatternOperator outerop, SDPatternOperator mulop> { +multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op, + SDPatternOperator outerop, SDPatternOperator mulop> { def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>; def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>; def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>; @@ -2599,15 +2599,15 @@ multiclass sve_int_mlas_vvv_pred<bits<1> opc, string asm, SDPatternOperator op, def : SVE_4_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_4_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_4_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; - - def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)), - (!cast<Instruction>(NAME # _B) $pred, $Op1, $Op2, $Op3)>; - def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)), - (!cast<Instruction>(NAME # _H) $pred, $Op1, $Op2, $Op3)>; - def : Pat<(outerop nxv4i32:$Op1, (mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)), - (!cast<Instruction>(NAME # _S) $pred, $Op1, $Op2, $Op3)>; - def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)), - (!cast<Instruction>(NAME # _D) $pred, $Op1, $Op2, $Op3)>; + + def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)), + (!cast<Instruction>(NAME # _B) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)), + (!cast<Instruction>(NAME # _H) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv4i32:$Op1, (mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)), + (!cast<Instruction>(NAME # _S) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)), + (!cast<Instruction>(NAME # _D) $pred, $Op1, $Op2, $Op3)>; } //===----------------------------------------------------------------------===// @@ -2711,8 +2711,8 @@ multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm, // SVE2 Integer Multiply-Add Long - Indexed Group //===----------------------------------------------------------------------===// -multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, - SDPatternOperator op> { +multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, + SDPatternOperator op> { def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, 
opc{0} }, asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; @@ -2962,8 +2962,8 @@ class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op, - SDPatternOperator op_pred = null_frag> { +multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op, + SDPatternOperator op_pred = null_frag> { def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; def _H : sve2_int_mul<0b01, opc, asm, ZPR16>; def _S : sve2_int_mul<0b10, opc, asm, ZPR32>; @@ -2973,11 +2973,11 @@ multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op, def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; - - def : SVE_2_Op_Pred_All_Active<nxv16i8, op_pred, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pred_All_Active<nxv8i16, op_pred, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pred_All_Active<nxv4i32, op_pred, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pred_All_Active<nxv2i64, op_pred, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; + + def : SVE_2_Op_Pred_All_Active<nxv16i8, op_pred, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pred_All_Active<nxv8i16, op_pred, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active<nxv4i32, op_pred, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pred_All_Active<nxv2i64, op_pred, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> { @@ -3531,8 +3531,8 @@ multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm, def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>; } -multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, - SDPatternOperator op> { +multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, + SDPatternOperator op> { def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, ZPR32, ZPR32>; def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, @@ -3576,7 +3576,7 @@ multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm, let Inst{19} = imm{3}; } def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64, - tvecshiftR32> { + tvecshiftR32> { let Inst{20-19} = imm{4-3}; } def : SVE_2_Op_Imm_Pat<nxv16i8, op, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; @@ -3616,7 +3616,7 @@ multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm, let Inst{19} = imm{3}; } def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64, - tvecshiftR32> { + tvecshiftR32> { let Inst{20-19} = imm{4-3}; } def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv8i16, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; @@ -3777,10 +3777,10 @@ multiclass sve_int_un_pred_arit_0<bits<3> opc, string asm, def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; - def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, 
nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm, @@ -3789,9 +3789,9 @@ multiclass sve_int_un_pred_arit_0_h<bits<3> opc, string asm, def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; - def : SVE_InReg_Extend<nxv8i16, op, nxv8i1, nxv8i8, !cast<Instruction>(NAME # _H)>; - def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i8, !cast<Instruction>(NAME # _S)>; - def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i8, !cast<Instruction>(NAME # _D)>; + def : SVE_InReg_Extend<nxv8i16, op, nxv8i1, nxv8i8, !cast<Instruction>(NAME # _H)>; + def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i8, !cast<Instruction>(NAME # _S)>; + def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i8, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm, @@ -3799,15 +3799,15 @@ multiclass sve_int_un_pred_arit_0_w<bits<3> opc, string asm, def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; - def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i16, !cast<Instruction>(NAME # _S)>; - def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i16, !cast<Instruction>(NAME # _D)>; + def : SVE_InReg_Extend<nxv4i32, op, nxv4i1, nxv4i16, !cast<Instruction>(NAME # _S)>; + def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i16, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_0_d<bits<3> opc, string asm, SDPatternOperator op> { def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; - def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i32, !cast<Instruction>(NAME # _D)>; + def : SVE_InReg_Extend<nxv2i64, op, nxv2i1, nxv2i32, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm, @@ -3817,23 +3817,23 @@ multiclass sve_int_un_pred_arit_1<bits<3> opc, string asm, def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; - def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm, SDPatternOperator op> { +multiclass sve_int_un_pred_arit_1_fp<bits<3> opc, string asm, SDPatternOperator op> { def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : 
sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; - def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4002,10 +4002,10 @@ multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperato def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImm8Pat, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImm16Pat, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImm32Pat, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImm64Pat, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> { @@ -4014,10 +4014,10 @@ multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # 
_S)>; + def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4130,7 +4130,7 @@ multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } - + def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>; def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>; @@ -4289,8 +4289,8 @@ class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm, let Inst{3-0} = Pd; let Defs = [NZCV]; - let ElementSize = pprty.ElementSize; - let isPTestLike = 1; + let ElementSize = pprty.ElementSize; + let isPTestLike = 1; } multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt, @@ -4363,7 +4363,7 @@ class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let Defs = [NZCV]; let ElementSize = pprty.ElementSize; - let isPTestLike = 1; + let isPTestLike = 1; } multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc, @@ -4423,8 +4423,8 @@ class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; let Defs = [NZCV]; - let ElementSize = pprty.ElementSize; - let isPTestLike = 1; + let ElementSize = pprty.ElementSize; + let isPTestLike = 1; } multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc, @@ -4469,7 +4469,7 @@ class sve_int_cterm<bit sz, bit opc, string asm, RegisterClass rt> } class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm, - RegisterClass gprty, PPRRegOp pprty> + RegisterClass gprty, PPRRegOp pprty> : I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm), asm, "\t$Pd, $Rn, $Rm", "", []>, Sched<[]> { @@ -4487,32 +4487,32 @@ class sve_int_while_rr<bits<2> sz8_64, bits<4> opc, string asm, let Inst{3-0} = Pd; let Defs = [NZCV]; - let ElementSize = pprty.ElementSize; - let isWhile = 1; + let ElementSize = pprty.ElementSize; + let isWhile = 1; } multiclass sve_int_while4_rr<bits<3> opc, string asm, SDPatternOperator op> { - def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>; - def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>; - def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>; - def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>; + def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>; + def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>; + def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>; + def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>; def : SVE_2_Op_Pat<nxv16i1, op, i32, i32, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv8i1, op, i32, i32, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4i1, op, i32, i32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i1, op, i32, i32, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv8i1, op, i32, i32, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i1, op, i32, i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i1, op, i32, i32, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_while8_rr<bits<3> opc, string asm, SDPatternOperator op> { - def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>; - def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>; - def _S : sve_int_while_rr<0b10, { 1, 
opc }, asm, GPR64, PPR32>; - def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>; + def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>; + def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>; + def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>; + def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>; def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>; } class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm, @@ -4533,8 +4533,8 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm, let Inst{3-0} = Pd; let Defs = [NZCV]; - let ElementSize = pprty.ElementSize; - let isWhile = 1; + let ElementSize = pprty.ElementSize; + let isWhile = 1; } multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> { @@ -4577,10 +4577,10 @@ multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> { def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>; def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_2_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>; def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; } @@ -4616,10 +4616,10 @@ multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> { def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>; def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; } @@ -4840,11 +4840,11 @@ multiclass sve_int_index_rr<string asm, SDPatternOperator op> { def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>; } - + 
//===----------------------------------------------------------------------===// // SVE Bitwise Shift - Predicated Group //===----------------------------------------------------------------------===// - + class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm, ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm), @@ -4869,19 +4869,19 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string Ps, - SDPatternOperator op = null_frag> { - def _B : SVEPseudo2Instr<Ps # _B, 1>, +multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string Ps, + SDPatternOperator op = null_frag> { + def _B : SVEPseudo2Instr<Ps # _B, 1>, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : SVEPseudo2Instr<Ps # _H, 1>, + def _H : SVEPseudo2Instr<Ps # _H, 1>, sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{8} = imm{3}; } - def _S : SVEPseudo2Instr<Ps # _S, 1>, + def _S : SVEPseudo2Instr<Ps # _S, 1>, sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{9-8} = imm{4-3}; } - def _D : SVEPseudo2Instr<Ps # _D, 1>, + def _D : SVEPseudo2Instr<Ps # _D, 1>, sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; @@ -4893,16 +4893,16 @@ multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string Ps, def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>; } -// As above but shift amount takes the form of a "vector immediate". -multiclass sve_int_bin_pred_shift_imm_left_dup<bits<4> opc, string asm, - string Ps, SDPatternOperator op> -: sve_int_bin_pred_shift_imm_left<opc, asm, Ps, null_frag> { - def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmL8, !cast<Instruction>(NAME # _B)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmL16, !cast<Instruction>(NAME # _H)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmL32, !cast<Instruction>(NAME # _S)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmL64, !cast<Instruction>(NAME # _D)>; -} - +// As above but shift amount takes the form of a "vector immediate". +multiclass sve_int_bin_pred_shift_imm_left_dup<bits<4> opc, string asm, + string Ps, SDPatternOperator op> +: sve_int_bin_pred_shift_imm_left<opc, asm, Ps, null_frag> { + def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmL8, !cast<Instruction>(NAME # _B)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmL16, !cast<Instruction>(NAME # _H)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmL32, !cast<Instruction>(NAME # _S)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmL64, !cast<Instruction>(NAME # _D)>; +} + multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> { def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>; def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>; @@ -4939,16 +4939,16 @@ multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps, def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>; } -// As above but shift amount takes the form of a "vector immediate". 
-multiclass sve_int_bin_pred_shift_imm_right_dup<bits<4> opc, string asm, - string Ps, SDPatternOperator op> -: sve_int_bin_pred_shift_imm_right<opc, asm, Ps, null_frag> { - def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmR8, !cast<Instruction>(NAME # _B)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmR16, !cast<Instruction>(NAME # _H)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmR32, !cast<Instruction>(NAME # _S)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmR64, !cast<Instruction>(NAME # _D)>; -} - +// As above but shift amount takes the form of a "vector immediate". +multiclass sve_int_bin_pred_shift_imm_right_dup<bits<4> opc, string asm, + string Ps, SDPatternOperator op> +: sve_int_bin_pred_shift_imm_right<opc, asm, Ps, null_frag> { + def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmR8, !cast<Instruction>(NAME # _B)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmR16, !cast<Instruction>(NAME # _H)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmR32, !cast<Instruction>(NAME # _S)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmR64, !cast<Instruction>(NAME # _D)>; +} + multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> { def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>; def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>; @@ -5089,10 +5089,10 @@ multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm, let Inst{20-19} = imm{4-3}; } - def : SVE_Shift_DupImm_All_Active_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmL8, !cast<Instruction>(NAME # _B)>; - def : SVE_Shift_DupImm_All_Active_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmL16, !cast<Instruction>(NAME # _H)>; - def : SVE_Shift_DupImm_All_Active_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmL32, !cast<Instruction>(NAME # _S)>; - def : SVE_Shift_DupImm_All_Active_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmL64, !cast<Instruction>(NAME # _D)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmL8, !cast<Instruction>(NAME # _B)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmL16, !cast<Instruction>(NAME # _H)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmL32, !cast<Instruction>(NAME # _S)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmL64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm, @@ -5109,12 +5109,12 @@ multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm, let Inst{20-19} = imm{4-3}; } - def : SVE_Shift_DupImm_All_Active_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmR8, !cast<Instruction>(NAME # _B)>; - def : SVE_Shift_DupImm_All_Active_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmR16, !cast<Instruction>(NAME # _H)>; - def : SVE_Shift_DupImm_All_Active_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmR32, !cast<Instruction>(NAME # _S)>; - def : SVE_Shift_DupImm_All_Active_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmR64, !cast<Instruction>(NAME # _D)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv16i8, op, nxv16i1, i32, SVEShiftImmR8, !cast<Instruction>(NAME # _B)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv8i16, op, nxv8i1, i32, SVEShiftImmR16, !cast<Instruction>(NAME # _H)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv4i32, op, nxv4i1, i32, SVEShiftImmR32, 
!cast<Instruction>(NAME # _S)>; + def : SVE_Shift_DupImm_All_Active_Pat<nxv2i64, op, nxv2i1, i64, SVEShiftImmR64, !cast<Instruction>(NAME # _D)>; } - + //===----------------------------------------------------------------------===// // SVE Memory - Store Group //===----------------------------------------------------------------------===// @@ -5623,7 +5623,7 @@ class sve_int_perm_bin_perm_pp<bits<3> opc, bits<2> sz8_64, string asm, PPRRegOp pprty> : I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm), asm, "\t$Pd, $Pn, $Pm", - "", []>, Sched<[]> { + "", []>, Sched<[]> { bits<4> Pd; bits<4> Pm; bits<4> Pn; @@ -5689,7 +5689,7 @@ class sve_int_rdffr_pred<bit s, string asm> let Inst{4} = 0; let Inst{3-0} = Pd; - let Defs = !if(s, [NZCV], []); + let Defs = !if(s, [NZCV], []); let Uses = [FFR]; } @@ -5816,11 +5816,11 @@ multiclass sve_int_perm_clast_vz<bit ab, string asm, SDPatternOperator op> { def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>; def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>; - def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>; - def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_3_Op_Pat<bf16, op, nxv8i1, bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>; + def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>; + def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>; + + def : SVE_3_Op_Pat<bf16, op, nxv8i1, bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; } class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm, @@ -5860,8 +5860,8 @@ multiclass sve_int_perm_clast_zz<bit ab, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; + + def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; } class sve_int_perm_last_r<bits<2> sz8_64, bit ab, string asm, @@ -5924,8 +5924,8 @@ multiclass sve_int_perm_last_v<bit ab, string asm, SDPatternOperator op> { def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<f32, op, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>; def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_2_Op_Pat<bf16, op, nxv8i1, nxv8bf16, !cast<Instruction>(NAME # _H)>; + + def : SVE_2_Op_Pat<bf16, op, nxv8i1, nxv8bf16, !cast<Instruction>(NAME # _H)>; } class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty> @@ -5962,8 +5962,8 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> { def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>; def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>; def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>; - - def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; + + def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>; } class 
sve2_int_perm_splice_cons<bits<2> sz8_64, string asm, @@ -6019,20 +6019,20 @@ multiclass sve_int_perm_rev_rbit<string asm, SDPatternOperator op> { def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>; - def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_perm_rev_revb<string asm, SDPatternOperator op> { +multiclass sve_int_perm_rev_revb<string asm, SDPatternOperator op> { def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>; def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>; - def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_perm_rev_revh<string asm, SDPatternOperator op> { @@ -6139,9 +6139,9 @@ multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> { (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>; def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)), (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>; - - def : Pat<(nxv8bf16 (op nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)), - (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>; + + def : Pat<(nxv8bf16 (op nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)), + (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>; } class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty> @@ -6194,8 +6194,8 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm, let Inst{4-0} = Zt; let mayLoad = 1; - let Uses = !if(nf, [FFR], []); - let Defs = !if(nf, [FFR], []); + let Uses = !if(nf, [FFR], []); + let Defs = !if(nf, [FFR], []); } multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm, @@ -6397,8 +6397,8 @@ class sve_mem_cld_ss_base<bits<4> dtype, bit ff, dag iops, string asm, let Inst{4-0} = Zt; let mayLoad = 1; - let Uses = !if(ff, [FFR], []); - let Defs = !if(ff, [FFR], []); + let Uses = !if(ff, [FFR], []); + let Defs = !if(ff, [FFR], []); } multiclass sve_mem_cld_ss<bits<4> dtype, string asm, RegisterOperand listty, @@ -7227,8 +7227,8 @@ multiclass sve_int_bin_cons_misc_0_c_fexpa<string asm, SDPatternOperator op> { //===----------------------------------------------------------------------===// class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm, - ZPRRegOp zprty, FPRasZPROperand dstOpType> -: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), + ZPRRegOp zprty, 
FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), asm, "\t$Vd, $Pg, $Zn", "", []>, Sched<[]> { @@ -7246,54 +7246,54 @@ class sve_int_reduce<bits<2> sz8_32, bits<2> fmt, bits<3> opc, string asm, let Inst{4-0} = Vd; } -multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm, - SDPatternOperator op> { - def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>; - def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>; - def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>; +multiclass sve_int_reduce_0_saddv<bits<3> opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>; + def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>; + def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; } -multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm, - SDPatternOperator op> { - def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>; - def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>; - def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>; - def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64asZPR>; +multiclass sve_int_reduce_0_uaddv<bits<3> opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>; + def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>; + def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>; + def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_reduce_1<bits<3> opc, string asm, - SDPatternOperator op> { - def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8asZPR>; - def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16asZPR>; - def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32asZPR>; - def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64asZPR>; +multiclass sve_int_reduce_1<bits<3> opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8asZPR>; + def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_2_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, 
!cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } -multiclass sve_int_reduce_2<bits<3> opc, string asm, - SDPatternOperator op> { - def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8asZPR>; - def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16asZPR>; - def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32asZPR>; - def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64asZPR>; +multiclass sve_int_reduce_2<bits<3> opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8asZPR>; + def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64asZPR>; - def : SVE_2_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_2_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>; + def : SVE_2_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>; + def : SVE_2_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_2_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve_int_movprfx_pred<bits<2> sz8_32, bits<3> opc, string asm, @@ -7398,7 +7398,7 @@ class sve_int_brkn<bit S, string asm> let Inst{3-0} = Pdm; let Constraints = "$Pdm = $_Pdm"; - let Defs = !if(S, [NZCV], []); + let Defs = !if(S, [NZCV], []); } multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> { @@ -7900,8 +7900,8 @@ multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty, def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]", (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; - def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))), - (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>; + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))), + (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>; } //===----------------------------------------------------------------------===// @@ -7935,7 +7935,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>; def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>; def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>; - def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; + def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>; } /// Addressing modes @@ -7954,10 +7954,10 @@ multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> { def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>; def : SVE_3_Op_Pat<nxv8f16, 
op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>; - def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Pseudo>(NAME # _UNDEF_H)>; - def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Pseudo>(NAME # _UNDEF_H)>; + def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Pseudo>(NAME # _UNDEF_H)>; + def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Pseudo>(NAME # _UNDEF_H)>; def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>; - def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Pseudo>(NAME # _UNDEF_S)>; + def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Pseudo>(NAME # _UNDEF_S)>; def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>; } @@ -7982,19 +7982,19 @@ multiclass sve_int_bin_pred_sd<SDPatternOperator op> { def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>; def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>; } - -// Predicated pseudo integer two operand instructions. Second operand is an -// immediate specified by imm_[bhsd]. -multiclass sve_int_shift_pred_bhsd<SDPatternOperator op, - ComplexPattern imm_b, ComplexPattern imm_h, - ComplexPattern imm_s, ComplexPattern imm_d> { - def _UNDEF_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, Operand<i32>, FalseLanesUndef>; - def _UNDEF_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, Operand<i32>, FalseLanesUndef>; - def _UNDEF_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, Operand<i32>, FalseLanesUndef>; - def _UNDEF_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, Operand<i32>, FalseLanesUndef>; - - def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, imm_b, !cast<Instruction>(NAME # _UNDEF_B)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, imm_h, !cast<Instruction>(NAME # _UNDEF_H)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, imm_s, !cast<Instruction>(NAME # _UNDEF_S)>; - def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, imm_d, !cast<Instruction>(NAME # _UNDEF_D)>; -} + +// Predicated pseudo integer two operand instructions. Second operand is an +// immediate specified by imm_[bhsd]. 
+multiclass sve_int_shift_pred_bhsd<SDPatternOperator op, + ComplexPattern imm_b, ComplexPattern imm_h, + ComplexPattern imm_s, ComplexPattern imm_d> { + def _UNDEF_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, Operand<i32>, FalseLanesUndef>; + def _UNDEF_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, Operand<i32>, FalseLanesUndef>; + def _UNDEF_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, Operand<i32>, FalseLanesUndef>; + def _UNDEF_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, Operand<i32>, FalseLanesUndef>; + + def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, imm_b, !cast<Instruction>(NAME # _UNDEF_B)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, imm_h, !cast<Instruction>(NAME # _UNDEF_H)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, imm_s, !cast<Instruction>(NAME # _UNDEF_S)>; + def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, imm_d, !cast<Instruction>(NAME # _UNDEF_D)>; +} diff --git a/contrib/libs/llvm12/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/contrib/libs/llvm12/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 9911f33371..e312d9d28b 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -37,7 +37,7 @@ using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "aarch64-sve-intrinsic-opts" +#define DEBUG_TYPE "aarch64-sve-intrinsic-opts" namespace llvm { void initializeSVEIntrinsicOptsPass(PassRegistry &); @@ -177,50 +177,50 @@ bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) { if (isa<PHINode>(I->getArgOperand(0))) return processPhiNode(I); - SmallVector<Instruction *, 32> CandidatesForRemoval; - Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr; - - const auto *IVTy = cast<VectorType>(I->getType()); - - // Walk the chain of conversions. - while (Cursor) { - // If the type of the cursor has fewer lanes than the final result, zeroing - // must take place, which breaks the equivalence chain. - const auto *CursorVTy = cast<VectorType>(Cursor->getType()); - if (CursorVTy->getElementCount().getKnownMinValue() < - IVTy->getElementCount().getKnownMinValue()) - break; - - // If the cursor has the same type as I, it is a viable replacement. - if (Cursor->getType() == IVTy) - EarliestReplacement = Cursor; - - auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); - - // If this is not an SVE conversion intrinsic, this is the end of the chain. - if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == - Intrinsic::aarch64_sve_convert_to_svbool || - IntrinsicCursor->getIntrinsicID() == - Intrinsic::aarch64_sve_convert_from_svbool)) - break; - - CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); - Cursor = IntrinsicCursor->getOperand(0); - } - - // If no viable replacement in the conversion chain was found, there is - // nothing to do. - if (!EarliestReplacement) + SmallVector<Instruction *, 32> CandidatesForRemoval; + Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr; + + const auto *IVTy = cast<VectorType>(I->getType()); + + // Walk the chain of conversions. + while (Cursor) { + // If the type of the cursor has fewer lanes than the final result, zeroing + // must take place, which breaks the equivalence chain. + const auto *CursorVTy = cast<VectorType>(Cursor->getType()); + if (CursorVTy->getElementCount().getKnownMinValue() < + IVTy->getElementCount().getKnownMinValue()) + break; + + // If the cursor has the same type as I, it is a viable replacement. 
+ if (Cursor->getType() == IVTy) + EarliestReplacement = Cursor; + + auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); + + // If this is not an SVE conversion intrinsic, this is the end of the chain. + if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == + Intrinsic::aarch64_sve_convert_to_svbool || + IntrinsicCursor->getIntrinsicID() == + Intrinsic::aarch64_sve_convert_from_svbool)) + break; + + CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); + Cursor = IntrinsicCursor->getOperand(0); + } + + // If no viable replacement in the conversion chain was found, there is + // nothing to do. + if (!EarliestReplacement) return false; - I->replaceAllUsesWith(EarliestReplacement); + I->replaceAllUsesWith(EarliestReplacement); I->eraseFromParent(); - while (!CandidatesForRemoval.empty()) { - Instruction *Candidate = CandidatesForRemoval.pop_back_val(); - if (Candidate->use_empty()) - Candidate->eraseFromParent(); - } + while (!CandidatesForRemoval.empty()) { + Instruction *Candidate = CandidatesForRemoval.pop_back_val(); + if (Candidate->use_empty()) + Candidate->eraseFromParent(); + } return true; } @@ -276,8 +276,8 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) { case Intrinsic::aarch64_sve_ptest_any: case Intrinsic::aarch64_sve_ptest_first: case Intrinsic::aarch64_sve_ptest_last: - for (User *U : F.users()) - Functions.insert(cast<Instruction>(U)->getFunction()); + for (User *U : F.users()) + Functions.insert(cast<Instruction>(U)->getFunction()); break; default: break; diff --git a/contrib/libs/llvm12/lib/Target/AArch64/TargetInfo/ya.make b/contrib/libs/llvm12/lib/Target/AArch64/TargetInfo/ya.make index bb7d4a2c89..cf2f9565d1 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/TargetInfo/ya.make +++ b/contrib/libs/llvm12/lib/Target/AArch64/TargetInfo/ya.make @@ -12,13 +12,13 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/lib/Support + contrib/libs/llvm12 + contrib/libs/llvm12/lib/Support ) ADDINCL( - contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64/TargetInfo + contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64/TargetInfo ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index ac59d73fd9..8a90a74841 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -26,13 +26,13 @@ namespace llvm { namespace llvm { - namespace AArch64DBnXS { -#define GET_DBNXS_IMPL -#include "AArch64GenSystemOperands.inc" - } -} - -namespace llvm { + namespace AArch64DBnXS { +#define GET_DBNXS_IMPL +#include "AArch64GenSystemOperands.inc" + } +} + +namespace llvm { namespace AArch64DB { #define GET_DB_IMPL #include "AArch64GenSystemOperands.inc" @@ -165,7 +165,7 @@ std::string AArch64SysReg::genericRegisterString(uint32_t Bits) { namespace llvm { namespace AArch64TLBI { -#define GET_TLBITable_IMPL +#define GET_TLBITable_IMPL #include "AArch64GenSystemOperands.inc" } } diff --git a/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 1b13c94389..6d737ac8e1 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/libs/llvm12/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -338,14 +338,14 @@ struct 
SysAliasReg : SysAlias { : SysAlias(N, E, F), NeedsReg(R) {} }; -struct SysAliasImm : SysAlias { - uint16_t ImmValue; - constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I) - : SysAlias(N, E), ImmValue(I) {} - constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I, FeatureBitset F) - : SysAlias(N, E, F), ImmValue(I) {} -}; - +struct SysAliasImm : SysAlias { + uint16_t ImmValue; + constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I) + : SysAlias(N, E), ImmValue(I) {} + constexpr SysAliasImm(const char *N, uint16_t E, uint16_t I, FeatureBitset F) + : SysAlias(N, E, F), ImmValue(I) {} +}; + namespace AArch64AT{ struct AT : SysAlias { using SysAlias::SysAlias; @@ -362,14 +362,14 @@ namespace AArch64DB { #include "AArch64GenSystemOperands.inc" } -namespace AArch64DBnXS { - struct DBnXS : SysAliasImm { - using SysAliasImm::SysAliasImm; - }; - #define GET_DBNXS_DECL - #include "AArch64GenSystemOperands.inc" -} - +namespace AArch64DBnXS { + struct DBnXS : SysAliasImm { + using SysAliasImm::SysAliasImm; + }; + #define GET_DBNXS_DECL + #include "AArch64GenSystemOperands.inc" +} + namespace AArch64DC { struct DC : SysAlias { using SysAlias::SysAlias; @@ -568,7 +568,7 @@ namespace AArch64TLBI { struct TLBI : SysAliasReg { using SysAliasReg::SysAliasReg; }; - #define GET_TLBITable_DECL + #define GET_TLBITable_DECL #include "AArch64GenSystemOperands.inc" } @@ -622,7 +622,7 @@ namespace AArch64II { MO_HI12 = 7, /// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the - /// reference is actually to the ".refptr.FOO" symbol. This is used for + /// reference is actually to the ".refptr.FOO" symbol. This is used for /// stub symbols on windows. MO_COFFSTUB = 0x8, diff --git a/contrib/libs/llvm12/lib/Target/AArch64/Utils/ya.make b/contrib/libs/llvm12/lib/Target/AArch64/Utils/ya.make index 3668c2a650..37d19feb17 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/Utils/ya.make +++ b/contrib/libs/llvm12/lib/Target/AArch64/Utils/ya.make @@ -12,15 +12,15 @@ LICENSE(Apache-2.0 WITH LLVM-exception) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Support + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Support ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64/Utils + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64/Utils ) NO_COMPILER_WARNINGS() diff --git a/contrib/libs/llvm12/lib/Target/AArch64/ya.make b/contrib/libs/llvm12/lib/Target/AArch64/ya.make index 244cbc7f34..0c05f2840f 100644 --- a/contrib/libs/llvm12/lib/Target/AArch64/ya.make +++ b/contrib/libs/llvm12/lib/Target/AArch64/ya.make @@ -15,28 +15,28 @@ LICENSE( LICENSE_TEXTS(.yandex_meta/licenses.list.txt) PEERDIR( - contrib/libs/llvm12 - contrib/libs/llvm12/include - contrib/libs/llvm12/lib/Analysis - contrib/libs/llvm12/lib/CodeGen - contrib/libs/llvm12/lib/CodeGen/AsmPrinter - contrib/libs/llvm12/lib/CodeGen/GlobalISel - contrib/libs/llvm12/lib/CodeGen/SelectionDAG - contrib/libs/llvm12/lib/IR - contrib/libs/llvm12/lib/MC - contrib/libs/llvm12/lib/Support - contrib/libs/llvm12/lib/Target - contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc - contrib/libs/llvm12/lib/Target/AArch64/TargetInfo - contrib/libs/llvm12/lib/Target/AArch64/Utils - contrib/libs/llvm12/lib/Transforms/CFGuard - 
contrib/libs/llvm12/lib/Transforms/Scalar - contrib/libs/llvm12/lib/Transforms/Utils + contrib/libs/llvm12 + contrib/libs/llvm12/include + contrib/libs/llvm12/lib/Analysis + contrib/libs/llvm12/lib/CodeGen + contrib/libs/llvm12/lib/CodeGen/AsmPrinter + contrib/libs/llvm12/lib/CodeGen/GlobalISel + contrib/libs/llvm12/lib/CodeGen/SelectionDAG + contrib/libs/llvm12/lib/IR + contrib/libs/llvm12/lib/MC + contrib/libs/llvm12/lib/Support + contrib/libs/llvm12/lib/Target + contrib/libs/llvm12/lib/Target/AArch64/MCTargetDesc + contrib/libs/llvm12/lib/Target/AArch64/TargetInfo + contrib/libs/llvm12/lib/Target/AArch64/Utils + contrib/libs/llvm12/lib/Transforms/CFGuard + contrib/libs/llvm12/lib/Transforms/Scalar + contrib/libs/llvm12/lib/Transforms/Utils ) ADDINCL( - ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 - contrib/libs/llvm12/lib/Target/AArch64 + ${ARCADIA_BUILD_ROOT}/contrib/libs/llvm12/lib/Target/AArch64 + contrib/libs/llvm12/lib/Target/AArch64 ) NO_COMPILER_WARNINGS() @@ -88,8 +88,8 @@ SRCS( GISel/AArch64InstructionSelector.cpp GISel/AArch64LegalizerInfo.cpp GISel/AArch64PostLegalizerCombiner.cpp - GISel/AArch64PostLegalizerLowering.cpp - GISel/AArch64PostSelectOptimize.cpp + GISel/AArch64PostLegalizerLowering.cpp + GISel/AArch64PostSelectOptimize.cpp GISel/AArch64PreLegalizerCombiner.cpp GISel/AArch64RegisterBankInfo.cpp SVEIntrinsicOpts.cpp |
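
Note on the largest functional hunk above: SVEIntrinsicOpts::optimizeConvertFromSVBool removes redundant chains of aarch64_sve_convert_to_svbool / aarch64_sve_convert_from_svbool reinterprets. It walks up the operand chain, stops as soon as an intermediate predicate has fewer lanes than the final result (that widening/narrowing would zero lanes and break the equivalence), remembers the earliest value that already has the required type, rewrites all uses of the root to it, and then erases the now-dead intermediate conversions. The standalone C++ sketch below is a simplified model of that walk, not code from this commit: MockValue, laneCount and isConvert are hypothetical stand-ins for llvm::Value, scalable-vector element counts and the SVE conversion intrinsics.

#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for llvm::Value / IntrinsicInst (hypothetical type).
struct MockValue {
  std::string name;
  unsigned laneCount;          // known-minimum number of predicate lanes
  bool isConvert = false;      // models convert_to_svbool / convert_from_svbool
  MockValue *operand = nullptr;
};

// Walks the conversion chain rooted at `root` and returns the earliest value
// that already has root's lane count, collecting the intermediate reinterprets
// that become removable once the root is rewritten.
MockValue *findEarliestReplacement(MockValue *root,
                                   std::vector<MockValue *> &removable) {
  const unsigned wantedLanes = root->laneCount;
  MockValue *earliest = nullptr;

  for (MockValue *cursor = root->operand; cursor; cursor = cursor->operand) {
    // A narrower intermediate implies zeroing of the extra lanes, which
    // breaks the equivalence chain.
    if (cursor->laneCount < wantedLanes)
      break;

    // Same type as the root: a viable replacement.
    if (cursor->laneCount == wantedLanes)
      earliest = cursor;

    // Only conversion reinterprets keep the chain going.
    if (!cursor->isConvert)
      break;

    removable.insert(removable.begin(), cursor);
  }
  return earliest;
}

int main() {
  MockValue pred{"%p", 4};                                        // original nxv4i1
  MockValue widen{"convert_to_svbool(%p)", 16, true, &pred};      // widened to nxv16i1
  MockValue narrow{"convert_from_svbool(...)", 4, true, &widen};  // back to nxv4i1

  std::vector<MockValue *> removable;
  if (MockValue *repl = findEarliestReplacement(&narrow, &removable))
    std::cout << "replace '" << narrow.name << "' with '" << repl->name
              << "', dropping " << removable.size() << " reinterpret(s)\n";
  return 0;
}

Running the sketch reports that convert_from_svbool(...) can be replaced by %p directly, with one reinterpret dropped. The real pass additionally handles PHI-node inputs and only erases candidates whose use lists are empty after the rewrite; the sketch keeps just the chain-walking invariant.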