aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
diff options
context:
space:
mode:
authorshadchin <shadchin@yandex-team.ru>2022-02-10 16:44:39 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:44:39 +0300
commite9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch)
tree64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
parent2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff)
downloadydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz
Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td')
-rw-r--r--contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td976
1 files changed, 488 insertions, 488 deletions
diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
index c5e1d32e8d..12296ad092 100644
--- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
+++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
@@ -1,488 +1,488 @@
-//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
-//
-//===----------------------------------------------------------------------===//
-
-def CortexM7Model : SchedMachineModel {
- let IssueWidth = 2; // Dual issue for most instructions.
- let MicroOpBufferSize = 0; // The Cortex-M7 is in-order.
- let LoadLatency = 2; // Best case for load-use case.
- let MispredictPenalty = 4; // Mispredict cost for forward branches is 6,
- // but 4 works better
- let CompleteModel = 0;
-}
-
-//===--------------------------------------------------------------------===//
-// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
-// pipe. The stages relevant to scheduling are as follows:
-//
-// EX1: address generation shifts
-// EX2: fast load data ALUs FP operation
-// EX3: slow load data integer writeback FP operation
-// EX4: store data FP writeback
-//
-// There are shifters in both EX1 and EX2, and some instructions can be
-// flexibly allocated between them. EX2 is used as the "zero" point
-// for scheduling, so simple ALU operations executing in EX2 will have
-// ReadAdvance<0> (the default) for their source operands and Latency = 1.
-
-def M7UnitLoad : ProcResource<2> { let BufferSize = 0; }
-def M7UnitStore : ProcResource<1> { let BufferSize = 0; }
-def M7UnitALU : ProcResource<2>;
-def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; }
-def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; }
-def M7UnitMAC : ProcResource<1> { let BufferSize = 0; }
-def M7UnitBranch : ProcResource<1> { let BufferSize = 0; }
-def M7UnitVFP : ProcResource<1> { let BufferSize = 0; }
-def M7UnitVPort : ProcResource<2> { let BufferSize = 0; }
-def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; }
-
-//===---------------------------------------------------------------------===//
-// Subtarget-specific SchedWrite types with map ProcResources and set latency.
-
-let SchedModel = CortexM7Model in {
-
-def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; }
-
-// Basic ALU with shifts.
-let Latency = 1 in {
- def : WriteRes<WriteALUsi, [M7UnitALU, M7UnitShift1]>;
- def : WriteRes<WriteALUsr, [M7UnitALU, M7UnitShift1]>;
- def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;
-}
-
-// Compares.
-def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
-def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
-
-// Multiplies.
-let Latency = 2 in {
- def : WriteRes<WriteMUL16, [M7UnitMAC]>;
- def : WriteRes<WriteMUL32, [M7UnitMAC]>;
- def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
- def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; }
-}
-
-// Multiply-accumulates.
-let Latency = 2 in {
- def : WriteRes<WriteMAC16, [M7UnitMAC]>;
- def : WriteRes<WriteMAC32, [M7UnitMAC]>;
- def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; }
- def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; }
-}
-
-// Divisions.
-// These cannot be dual-issued with any instructions.
-def : WriteRes<WriteDIV, [M7UnitALU]> {
- let Latency = 7;
- let SingleIssue = 1;
-}
-
-// Loads/Stores.
-def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; }
-def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; }
-def : WriteRes<WriteST, [M7UnitStore]> { let Latency = 2; }
-
-// Branches.
-def : WriteRes<WriteBr, [M7UnitBranch]> { let Latency = 2; }
-def : WriteRes<WriteBrL, [M7UnitBranch]> { let Latency = 2; }
-def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; }
-
-// Noop.
-def : WriteRes<WriteNoop, []> { let Latency = 0; }
-
-//===---------------------------------------------------------------------===//
-// Sched definitions for floating-point instructions
-//
-// Floating point conversions.
-def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
-def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; }
-
-// The FP pipeline has a latency of 3 cycles.
-// ALU operations (32/64-bit). These go down the FP pipeline.
-def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
-def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
- let Latency = 4;
- let BeginGroup = 1;
-}
-
-// Multiplication
-def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
-def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
- let Latency = 7;
- let BeginGroup = 1;
-}
-
-// Multiply-accumulate. FPMAC goes down the FP Pipeline.
-def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
-def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
- let Latency = 11;
- let BeginGroup = 1;
-}
-
-// Division. Effective scheduling latency is 3, though real latency is larger
-def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
-def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
- let Latency = 30;
- let BeginGroup = 1;
-}
-
-// Square-root. Effective scheduling latency is 3; real latency is larger
-def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
-def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
- let Latency = 30;
- let BeginGroup = 1;
-}
-
-def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {}
-
-// Not used for M7, but needing definitions anyway
-def : WriteRes<WriteVLD1, []>;
-def : WriteRes<WriteVLD2, []>;
-def : WriteRes<WriteVLD3, []>;
-def : WriteRes<WriteVLD4, []>;
-def : WriteRes<WriteVST1, []>;
-def : WriteRes<WriteVST2, []>;
-def : WriteRes<WriteVST3, []>;
-def : WriteRes<WriteVST4, []>;
-
-def M7SingleIssue : SchedWriteRes<[]> {
- let SingleIssue = 1;
- let NumMicroOps = 0;
-}
-def M7Slot0Only : SchedWriteRes<[]> {
- let BeginGroup = 1;
- let NumMicroOps = 0;
-}
-
-// What pipeline stage operands need to be ready for depending on
-// where they come from.
-def : ReadAdvance<ReadALUsr, 0>;
-def : ReadAdvance<ReadMUL, 0>;
-def : ReadAdvance<ReadMAC, 1>;
-def : ReadAdvance<ReadALU, 0>;
-def : ReadAdvance<ReadFPMUL, 0>;
-def : ReadAdvance<ReadFPMAC, 3>;
-def M7Read_ISS : SchedReadAdvance<-1>; // operands needed at EX1
-def M7Read_EX2 : SchedReadAdvance<1>; // operands needed at EX3
-def M7Read_EX3 : SchedReadAdvance<2>; // operands needed at EX4
-
-// Non general purpose instructions may not be dual issued. These
-// use both issue units.
-def M7NonGeneralPurpose : SchedWriteRes<[]> {
- // Assume that these will go down the main ALU pipeline.
- // In reality, many look likely to stall the whole pipeline.
- let Latency = 3;
- let SingleIssue = 1;
-}
-
-// List the non general purpose instructions.
-def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT",
- "t2MSR", "t2DMB", "t2DSB", "t2ISB",
- "t2HVC", "t2SMC", "t2UDF", "ERET",
- "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>;
-
-//===---------------------------------------------------------------------===//
-// Sched definitions for load/store
-//
-// Mark whether the loads/stores must be single-issue
-// Address operands are needed earlier
-// Data operands are needed later
-
-def M7BaseUpdate : SchedWriteRes<[]> {
- let Latency = 0; // Update is bypassable out of EX1
- let NumMicroOps = 0;
-}
-def M7LoadLatency1 : SchedWriteRes<[]> {
- let Latency = 1;
- let NumMicroOps = 0;
-}
-def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; }
-
-// Byte and half-word loads should have greater latency than other loads.
-// So should load exclusive.
-
-def : InstRW<[M7SlowLoad],
- (instregex "t2LDR(B|H|SB|SH)pc")>;
-def : InstRW<[M7SlowLoad, M7Read_ISS],
- (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i",
- "tLDR(B|H)i")>;
-def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
- (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>;
-def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
- (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;
-
-// Exclusive loads/stores cannot be dual-issued
-def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
- (instregex "t2LDREX$")>;
-def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
- (instregex "t2LDREX(B|H)")>;
-def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
- (instregex "t2STREX(B|H)?$")>;
-
-// Load/store multiples cannot be dual-issued. Note that default scheduling
-// occurs around read/write times of individual registers in the list; read
-// time for STM cannot be overridden because it is a variadic source operand.
-
-def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
- (instregex "(t|t2)LDM(DB|IA)$")>;
-def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
- (instregex "(t|t2)STM(DB|IA)$")>;
-def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
- (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
-def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
- (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;
-
-// Load/store doubles cannot be dual-issued.
-
-def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
- M7Read_EX2, M7Read_EX2, M7Read_ISS],
- (instregex "t2STRD_(PRE|POST)")>;
-def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
- (instregex "t2STRDi")>;
-def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS],
- (instregex "t2LDRD_(PRE|POST)")>;
-def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
- (instregex "t2LDRDi")>;
-
-// Word load / preload
-def : InstRW<[WriteLd],
- (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
-def : InstRW<[WriteLd, M7Read_ISS],
- (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
-def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
- (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>;
-def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
- (instregex "t2LDR_(POST|PRE)")>;
-
-// Stores
-def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
- (instregex "t2STR(B|H)?_(POST|PRE)")>;
-def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
- (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
-def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
- (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;
-
-// TBB/TBH - single-issue only; takes two cycles to issue
-
-def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
- let NumMicroOps = 2;
- let SingleIssue = 1;
-}
-
-def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>;
-
-// VFP loads and stores
-
-def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
-def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> {
- let Latency = 2;
- let SingleIssue = 1;
-}
-def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
-def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> {
- let SingleIssue = 1;
-}
-
-def : InstRW<[M7LoadSP, M7Read_ISS], (instregex "VLDR(S|H)$")>;
-def : InstRW<[M7LoadDP, M7Read_ISS], (instregex "VLDRD$")>;
-def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>;
-def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>;
-
-// Load/store multiples cannot be dual-issued.
-
-def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
- (instregex "VLDM(S|D|Q)(DB|IA)$")>;
-def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
- (instregex "VSTM(S|D|Q)(DB|IA)$")>;
-def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
- (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
-def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
- (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;
-
-//===---------------------------------------------------------------------===//
-// Sched definitions for ALU
-//
-
-// Shifted ALU operands are read a cycle early.
-def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;
-
-def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
- (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
- "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
- "t2MOVsr(a|l)")>;
-def : InstRW<[WriteALUsi, M7Read_ISS],
- (instregex "t2MVNs")>;
-
-// Treat pure shift operations (except for RRX) as if they used the EX1
-// shifter but have timing as if they used the EX2 shifter as they usually
-// can choose the EX2 shifter when needed. Will miss a few dual-issue cases,
-// but the results prove to be better than trying to get them exact.
-
-def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
-def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;
-
-// Instructions that use the shifter, but have normal timing.
-
-def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>;
-
-// Instructions which are slot zero only but otherwise normal.
-
-def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>;
-
-// MAC operations that don't have SchedRW set.
-
-def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>;
-
-// Divides are special because they stall for their latency, and so look like a
-// single-cycle as far as scheduling opportunities go. By putting WriteALU
-// first, we make the operand latency 1, but keep the instruction latency 7.
-
-def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>;
-
-// DSP extension operations
-
-def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
- let Latency = 1;
- let BeginGroup = 1;
-}
-def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
- let Latency = 2;
- let BeginGroup = 1;
-}
-def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
- let Latency = 1;
- let BeginGroup = 1;
-}
-def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
- let Latency = 0; // Bypassable out of EX1
- let BeginGroup = 1;
-}
-def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
- let Latency = 2;
- let BeginGroup = 1;
-}
-
-def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
- (instregex "t2(S|U)SAT")>;
-def : InstRW<[M7WriteSIMD1, ReadALU],
- (instregex "(t|t2)(S|U)XT(B|H)")>;
-def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
- (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)",
- "t2SEL")>;
-def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
- (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
-def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
- (instregex "t2QD(ADD|SUB)")>;
-def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
- (instregex "t2(RBIT|REV)", "tREV")>;
-def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
- (instregex "t2(SBFX|UBFX)")>;
-def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
- (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
-def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
- (instregex "t2USADA8")>;
-
-// MSR/MRS
-def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>;
-
-//===---------------------------------------------------------------------===//
-// Sched definitions for FP operations
-//
-
-// Effective scheduling latency is really 3 for nearly all FP operations,
-// even if their true latency is higher.
-def M7WriteVFPLatOverride : SchedWriteRes<[]> {
- let Latency = 3;
- let NumMicroOps = 0;
-}
-def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> {
- let Latency = 3;
- let NumMicroOps = 0;
-}
-
-// Instructions which are missing default schedules.
-def : InstRW<[WriteFPALU32],
- (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
- (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;
-
-// VCMP
-def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; }
-def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
- let Latency = 0;
- let BeginGroup = 1;
-}
-def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
-def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;
-
- // VMRS/VMSR
-def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
-def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
-def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
-def : InstRW<[M7VMSR], (instregex "VMSR")>;
-
-// VSEL cannot bypass in its implied $cpsr operand; model as earlier read
-def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
- (instregex "VSEL.*S$")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
- ReadALU, ReadALU, M7Read_ISS],
- (instregex "VSEL.*D$")>;
-
-// VMOV
-def : InstRW<[WriteFPMOV],
- (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
-def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
- (instregex "VMOVD$")>;
-def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
- (instregex "FCONSTD")>;
-def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
- (instregex "VMOV(DRR|RRD|RRS|SRR)")>;
-
-// Larger-latency overrides.
-
-def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
- (instregex "V(MUL|NMUL)D")>;
-def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
- (instregex "V(ADD|SUB)D")>;
-
-// Multiply-accumulate. Chained SP timing is correct; rest need overrides
-// Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
-// making it appear to have 3 cycle latency for scheduling.
-
-def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
- ReadFPMAC, ReadFPMUL, ReadFPMUL],
- (instregex "V(N)?ML(A|S)D$")>;
-
-// Single-precision fused MACs look like latency 5 with advance of 2.
-
-def M7WriteVFPLatOverride5 : SchedWriteRes<[]> {
- let Latency = 5;
- let NumMicroOps = 0;
-}
-def M7ReadFPMAC2 : SchedReadAdvance<2>;
-
-def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
- M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
- (instregex "VF(N)?M(A|S)S$")>;
-
-// Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making
-// it appear to have 3 cycle latency for scheduling.
-
-def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
- ReadFPMAC, ReadFPMUL, ReadFPMUL],
- (instregex "VF(N)?M(A|S)D$")>;
-
-} // SchedModel = CortexM7Model
+//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM7Model : SchedMachineModel {
+ let IssueWidth = 2; // Dual issue for most instructions.
+ let MicroOpBufferSize = 0; // The Cortex-M7 is in-order.
+ let LoadLatency = 2; // Best case for load-use case.
+ let MispredictPenalty = 4; // Mispredict cost for forward branches is 6,
+ // but 4 works better
+ let CompleteModel = 0;
+}
+
+//===--------------------------------------------------------------------===//
+// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
+// pipe. The stages relevant to scheduling are as follows:
+//
+//   EX1: address generation  shifts
+//   EX2: fast load data      ALUs                 FP operation
+//   EX3: slow load data      integer writeback    FP operation
+//   EX4: store data                               FP writeback
+//
+// There are shifters in both EX1 and EX2, and some instructions can be
+// flexibly allocated between them. EX2 is used as the "zero" point
+// for scheduling, so simple ALU operations executing in EX2 will have
+// ReadAdvance<0> (the default) for their source operands and Latency = 1.
+
+def M7UnitLoad : ProcResource<2> { let BufferSize = 0; }
+def M7UnitStore : ProcResource<1> { let BufferSize = 0; }
+def M7UnitALU : ProcResource<2>;
+def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; }
+def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; }
+def M7UnitMAC : ProcResource<1> { let BufferSize = 0; }
+def M7UnitBranch : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVFP : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVPort : ProcResource<2> { let BufferSize = 0; }
+def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; }
+
+//===---------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which map ProcResources and set latency.
+
+let SchedModel = CortexM7Model in {
+
+def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; }
+
+// Basic ALU with shifts.
+let Latency = 1 in {
+ def : WriteRes<WriteALUsi, [M7UnitALU, M7UnitShift1]>;
+ def : WriteRes<WriteALUsr, [M7UnitALU, M7UnitShift1]>;
+ def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;
+}
+
+// Compares.
+def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
+def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
+
+// Multiplies.
+let Latency = 2 in {
+ def : WriteRes<WriteMUL16, [M7UnitMAC]>;
+ def : WriteRes<WriteMUL32, [M7UnitMAC]>;
+ def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
+ def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; }
+}
+
+// Multiply-accumulates.
+// Every write in this group takes the group-level Latency of 2. The high half
+// of a 64-bit result consumes no resources and contributes no micro-ops.
+let Latency = 2 in {
+ def : WriteRes<WriteMAC16, [M7UnitMAC]>;
+ def : WriteRes<WriteMAC32, [M7UnitMAC]>;
+ def : WriteRes<WriteMAC64Lo, [M7UnitMAC]>;
+ def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; }
+}
+
+// Divisions.
+// These cannot be dual-issued with any instructions.
+def : WriteRes<WriteDIV, [M7UnitALU]> {
+ let Latency = 7;
+ let SingleIssue = 1;
+}
+
+// Loads/Stores.
+def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; }
+def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; }
+def : WriteRes<WriteST, [M7UnitStore]> { let Latency = 2; }
+
+// Branches.
+def : WriteRes<WriteBr, [M7UnitBranch]> { let Latency = 2; }
+def : WriteRes<WriteBrL, [M7UnitBranch]> { let Latency = 2; }
+def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; }
+
+// Noop.
+def : WriteRes<WriteNoop, []> { let Latency = 0; }
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for floating-point instructions
+//
+// Floating point conversions.
+def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; }
+
+// The FP pipeline has a latency of 3 cycles.
+// ALU operations (32/64-bit). These go down the FP pipeline.
+def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 4;
+ let BeginGroup = 1;
+}
+
+// Multiplication
+def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 7;
+ let BeginGroup = 1;
+}
+
+// Multiply-accumulate. FPMAC goes down the FP Pipeline.
+def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
+def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 11;
+ let BeginGroup = 1;
+}
+
+// Division. Effective scheduling latency is 3, though real latency is larger
+def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
+def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 30;
+ let BeginGroup = 1;
+}
+
+// Square-root. Effective scheduling latency is 3; real latency is larger
+def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
+def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 30;
+ let BeginGroup = 1;
+}
+
+// Write for operations that occupy an ALU together with the second shifter
+// (M7UnitShift2). No Latency is set here, so the SchedWriteRes default applies.
+def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {}
+
+// Not used for M7, but needing definitions anyway
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
+def M7SingleIssue : SchedWriteRes<[]> {
+ let SingleIssue = 1;
+ let NumMicroOps = 0;
+}
+def M7Slot0Only : SchedWriteRes<[]> {
+ let BeginGroup = 1;
+ let NumMicroOps = 0;
+}
+
+// What pipeline stage operands need to be ready for depending on
+// where they come from.
+def : ReadAdvance<ReadALUsr, 0>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 1>;
+def : ReadAdvance<ReadALU, 0>;
+def : ReadAdvance<ReadFPMUL, 0>;
+def : ReadAdvance<ReadFPMAC, 3>;
+def M7Read_ISS : SchedReadAdvance<-1>; // operands needed at EX1
+def M7Read_EX2 : SchedReadAdvance<1>; // operands needed at EX3
+def M7Read_EX3 : SchedReadAdvance<2>; // operands needed at EX4
+
+// Non general purpose instructions may not be dual issued. These
+// use both issue units.
+def M7NonGeneralPurpose : SchedWriteRes<[]> {
+ // Assume that these will go down the main ALU pipeline.
+ // In reality, many look likely to stall the whole pipeline.
+ let Latency = 3;
+ let SingleIssue = 1;
+}
+
+// List the non general purpose instructions.
+def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT",
+ "t2MSR", "t2DMB", "t2DSB", "t2ISB",
+ "t2HVC", "t2SMC", "t2UDF", "ERET",
+ "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for load/store
+//
+// Mark whether the loads/stores must be single-issue
+// Address operands are needed earlier
+// Data operands are needed later
+
+def M7BaseUpdate : SchedWriteRes<[]> {
+ let Latency = 0; // Update is bypassable out of EX1
+ let NumMicroOps = 0;
+}
+def M7LoadLatency1 : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; }
+
+// Byte and half-word loads should have greater latency than other loads.
+// So should load exclusive.
+
+def : InstRW<[M7SlowLoad],
+ (instregex "t2LDR(B|H|SB|SH)pc")>;
+def : InstRW<[M7SlowLoad, M7Read_ISS],
+ (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i",
+ "tLDR(B|H)i")>;
+def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
+ (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>;
+def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
+ (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;
+
+// Exclusive loads/stores cannot be dual-issued
+def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
+ (instregex "t2LDREX$")>;
+def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
+ (instregex "t2LDREX(B|H)")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STREX(B|H)?$")>;
+
+// Load/store multiples cannot be dual-issued. Note that default scheduling
+// occurs around read/write times of individual registers in the list; read
+// time for STM cannot be overridden because it is a variadic source operand.
+
+def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)LDM(DB|IA)$")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)STM(DB|IA)$")>;
+def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;
+
+// Load/store doubles cannot be dual-issued.
+
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
+ M7Read_EX2, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STRD_(PRE|POST)")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STRDi")>;
+def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS],
+ (instregex "t2LDRD_(PRE|POST)")>;
+def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
+ (instregex "t2LDRDi")>;
+
+// Word load / preload
+def : InstRW<[WriteLd],
+ (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
+def : InstRW<[WriteLd, M7Read_ISS],
+ (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
+// Register-offset word loads and preloads; both source operands use
+// M7Read_ISS. The optional "W" must be upper case to match the PLDW register
+// form (t2PLDWs), in line with the immediate-offset pattern "t2PL[DI](W)?i".
+def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
+ (instregex "t2LDRs", "t2PL[DI](W)?s", "tLDRr")>;
+def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
+ (instregex "t2LDR_(POST|PRE)")>;
+
+// Stores
+def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STR(B|H)?_(POST|PRE)")>;
+def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
+ (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
+def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
+ (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;
+
+// TBB/TBH - single-issue only; takes two cycles to issue
+
+def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
+ let NumMicroOps = 2;
+ let SingleIssue = 1;
+}
+
+def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>;
+
+// VFP loads and stores
+
+def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
+def M7LoadDP : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 2;
+ let SingleIssue = 1;
+}
+def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
+def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> {
+ let SingleIssue = 1;
+}
+
+def : InstRW<[M7LoadSP, M7Read_ISS], (instregex "VLDR(S|H)$")>;
+def : InstRW<[M7LoadDP, M7Read_ISS], (instregex "VLDRD$")>;
+def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>;
+def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>;
+
+// Load/store multiples cannot be dual-issued.
+
+def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "VLDM(S|D|Q)(DB|IA)$")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "VSTM(S|D|Q)(DB|IA)$")>;
+def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
+ (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
+ (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for ALU
+//
+
+// Shifted ALU operands are read a cycle early.
+// The write list limits this -1 advance to values produced by WriteLd or
+// M7LoadLatency1; operands coming from other writes are not adjusted by
+// this read (NOTE(review): per SchedReadAdvance's valid-writes list -- confirm).
+def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;
+
+def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
+ (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
+ "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
+ "t2MOVsr(a|l)")>;
+def : InstRW<[WriteALUsi, M7Read_ISS],
+ (instregex "t2MVNs")>;
+
+// Treat pure shift operations (except for RRX) as if they used the EX1
+// shifter but have timing as if they used the EX2 shifter as they usually
+// can choose the EX2 shifter when needed. Will miss a few dual-issue cases,
+// but the results prove to be better than trying to get them exact.
+
+def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
+def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;
+
+// Instructions that use the shifter, but have normal timing.
+
+def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>;
+
+// Instructions which are slot zero only but otherwise normal.
+
+def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>;
+
+// MAC operations that don't have SchedRW set.
+
+def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>;
+
+// Divides are special because they stall for their latency, and so look like
+// single-cycle instructions as far as scheduling opportunities go. By putting
+// WriteALU first, we make the operand latency 1, but keep the instruction
+// latency 7.
+
+def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>;
+
+// DSP extension operations
+//
+// Every write defined here sets BeginGroup. The "Sh" variants list
+// M7UnitShift1 in addition to M7UnitSIMD and M7UnitALU; the numeric suffix of
+// each name is its Latency value.
+
+def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
+ let Latency = 1;
+ let BeginGroup = 1;
+}
+def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
+ let Latency = 2;
+ let BeginGroup = 1;
+}
+def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
+ let Latency = 1;
+ let BeginGroup = 1;
+}
+def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
+ let Latency = 0; // Bypassable out of EX1
+ let BeginGroup = 1;
+}
+def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
+ let Latency = 2;
+ let BeginGroup = 1;
+}
+
+// Bindings of the writes above to instruction-name patterns. Entries using a
+// shifter write read their operands with M7Read_ISS; the others use ReadALU
+// (t2USADA8 adds a third read, M7Read_EX2).
+def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
+ (instregex "t2(S|U)SAT")>;
+def : InstRW<[M7WriteSIMD1, ReadALU],
+ (instregex "(t|t2)(S|U)XT(B|H)")>;
+def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
+ (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)",
+ "t2SEL")>;
+def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
+ (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
+def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
+ (instregex "t2QD(ADD|SUB)")>;
+def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
+ (instregex "t2(RBIT|REV)", "tREV")>;
+def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
+ (instregex "t2(SBFX|UBFX)")>;
+def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
+ (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
+def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
+ (instregex "t2USADA8")>;
+
+// MSR/MRS
+def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for FP operations
+//
+
+// Effective scheduling latency is really 3 for nearly all FP operations,
+// even if their true latency is higher.
+// This write has an empty resource list and NumMicroOps = 0, and is listed
+// ahead of the regular write in every InstRW entry that uses it.
+def M7WriteVFPLatOverride : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+// Same Latency and NumMicroOps, but lists one M7UnitVPort as a resource; the
+// VMOV entries add it as an extra write.
+def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+
+// Instructions which are missing default schedules.
+// The "S$" patterns get WriteFPALU32 alone; the "D$" patterns get the latency
+// override followed by WriteFPALU64.
+def : InstRW<[WriteFPALU32],
+ (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
+ (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;
+
+// VCMP
+// Both writes have Latency = 0. The D form lists M7UnitVPort twice and sets
+// BeginGroup.
+def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; }
+def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+ let Latency = 0;
+ let BeginGroup = 1;
+}
+def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
+def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;
+
+// VMRS/VMSR
+// Both writes are SingleIssue.
+// NOTE(review): M7VMRS is attached only to the "FMSTAT" pattern here --
+// confirm that no other VMRS opcode needs this schedule.
+def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
+def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
+def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
+def : InstRW<[M7VMSR], (instregex "VMSR")>;
+
+// VSEL cannot bypass in its implied $cpsr operand; model as earlier read
+def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
+ (instregex "VSEL.*S$")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
+ ReadALU, ReadALU, M7Read_ISS],
+ (instregex "VSEL.*D$")>;
+
+// VMOV
+// The H/S forms use WriteFPMOV alone. VMOVD and FCONSTD add
+// M7WriteVFPExtraVPort and M7Slot0Only; the VMOV(DRR|RRD|RRS|SRR) forms add
+// M7WriteVFPExtraVPort and M7SingleIssue.
+def : InstRW<[WriteFPMOV],
+ (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
+ (instregex "VMOVD$")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
+ (instregex "FCONSTD")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
+ (instregex "VMOV(DRR|RRD|RRS|SRR)")>;
+
+// Larger-latency overrides.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
+ (instregex "V(MUL|NMUL)D")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
+ (instregex "V(ADD|SUB)D")>;
+
+// Multiply-accumulate. Chained SP timing is correct; the rest need overrides.
+// Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
+// making it appear to have 3-cycle latency for scheduling.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
+ ReadFPMAC, ReadFPMUL, ReadFPMUL],
+ (instregex "V(N)?ML(A|S)D$")>;
+
+// Single-precision fused MACs look like latency 5 with advance of 2.
+
+def M7WriteVFPLatOverride5 : SchedWriteRes<[]> {
+ let Latency = 5;
+ let NumMicroOps = 0;
+}
+def M7ReadFPMAC2 : SchedReadAdvance<2>;
+
+def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
+ M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
+ (instregex "VF(N)?M(A|S)S$")>;
+
+// Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making
+// it appear to have 3 cycle latency for scheduling.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
+ ReadFPMAC, ReadFPMUL, ReadFPMUL],
+ (instregex "VF(N)?M(A|S)D$")>;
+
+} // SchedModel = CortexM7Model