Restoring authorship annotation for <shadchin@yandex-team.ru>. Commit 2 of 2.

author: shadchin <shadchin@yandex-team.ru> 2022-02-10 16:44:39 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:44:39 +0300
commit: e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch)
tree: 64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
parent: 2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff)
download: ydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz
1 files changed, 488 insertions, 488 deletions
diff --git a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
index c5e1d32e8d..12296ad092 100644
--- a/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
+++ b/contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
@@ -1,488 +1,488 @@
-//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=// 
-// 
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 
-// 
-//===----------------------------------------------------------------------===// 
-// 
-// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor. 
-// 
-//===----------------------------------------------------------------------===// 
- 
-def CortexM7Model : SchedMachineModel { 
-  let IssueWidth = 2;        // Dual issue for most instructions. 
-  let MicroOpBufferSize = 0; // The Cortex-M7 is in-order. 
-  let LoadLatency = 2;       // Best case for load-use case. 
-  let MispredictPenalty = 4; // Mispredict cost for forward branches is 6, 
-                             // but 4 works better 
-  let CompleteModel = 0; 
-} 
- 
-//===--------------------------------------------------------------------===// 
-// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP 
-// pipe. The stages relevant to scheduling are as follows: 
-// 
-//   EX1: address generation  shifts 
-//   EX2: fast load data      ALUs                  FP operation 
-//   EX3: slow load data      integer writeback     FP operation 
-//   EX4: store data                                FP writeback 
-// 
-// There are shifters in both EX1 and EX2, and some instructions can be 
-// flexibly allocated between them.  EX2 is used as the "zero" point 
-// for scheduling, so simple ALU operations executing in EX2 will have 
-// ReadAdvance<0> (the default) for their source operands and Latency = 1. 
- 
-def M7UnitLoad   : ProcResource<2> { let BufferSize = 0; } 
-def M7UnitStore  : ProcResource<1> { let BufferSize = 0; } 
-def M7UnitALU    : ProcResource<2>; 
-def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; } 
-def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; } 
-def M7UnitMAC    : ProcResource<1> { let BufferSize = 0; } 
-def M7UnitBranch : ProcResource<1> { let BufferSize = 0; } 
-def M7UnitVFP    : ProcResource<1> { let BufferSize = 0; } 
-def M7UnitVPort  : ProcResource<2> { let BufferSize = 0; } 
-def M7UnitSIMD   : ProcResource<1> { let BufferSize = 0; } 
- 
-//===---------------------------------------------------------------------===// 
-// Subtarget-specific SchedWrite types with map ProcResources and set latency. 
- 
-let SchedModel = CortexM7Model in { 
- 
-def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; } 
- 
-// Basic ALU with shifts. 
-let Latency = 1 in { 
-  def : WriteRes<WriteALUsi,  [M7UnitALU, M7UnitShift1]>; 
-  def : WriteRes<WriteALUsr,  [M7UnitALU, M7UnitShift1]>; 
-  def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>; 
-} 
- 
-// Compares. 
-def : WriteRes<WriteCMP,   [M7UnitALU]> { let Latency = 1; } 
-def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; } 
-def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; } 
- 
-// Multiplies. 
-let Latency = 2 in { 
-  def : WriteRes<WriteMUL16,   [M7UnitMAC]>; 
-  def : WriteRes<WriteMUL32,   [M7UnitMAC]>; 
-  def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>; 
-  def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; } 
-} 
- 
-// Multiply-accumulates. 
-let Latency = 2 in { 
-  def : WriteRes<WriteMAC16,   [M7UnitMAC]>; 
-  def : WriteRes<WriteMAC32,   [M7UnitMAC]>; 
-  def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; } 
-  def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; } 
-} 
- 
-// Divisions. 
-// These cannot be dual-issued with any instructions. 
-def : WriteRes<WriteDIV, [M7UnitALU]> { 
-  let Latency = 7; 
-  let SingleIssue = 1; 
-} 
- 
-// Loads/Stores. 
-def : WriteRes<WriteLd,    [M7UnitLoad]> { let Latency = 1; } 
-def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; } 
-def : WriteRes<WriteST,    [M7UnitStore]> { let Latency = 2; } 
- 
-// Branches. 
-def : WriteRes<WriteBr,    [M7UnitBranch]> { let Latency = 2; } 
-def : WriteRes<WriteBrL,   [M7UnitBranch]> { let Latency = 2; } 
-def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; } 
- 
-// Noop. 
-def : WriteRes<WriteNoop, []> { let Latency = 0; } 
- 
-//===---------------------------------------------------------------------===// 
-// Sched definitions for floating-point instructions 
-// 
-// Floating point conversions. 
-def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } 
-def : WriteRes<WriteFPMOV, [M7UnitVPort]>            { let Latency = 3; } 
- 
-// The FP pipeline has a latency of 3 cycles. 
-// ALU operations (32/64-bit).  These go down the FP pipeline. 
-def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]>  { let Latency = 3; } 
-def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 4; 
-  let BeginGroup = 1; 
-} 
- 
-// Multiplication 
-def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; } 
-def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 7; 
-  let BeginGroup = 1; 
-} 
- 
-// Multiply-accumulate.  FPMAC goes down the FP Pipeline. 
-def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; } 
-def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 11; 
-  let BeginGroup = 1; 
-} 
- 
-// Division.   Effective scheduling latency is 3, though real latency is larger 
-def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]>  { let Latency = 16; } 
-def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 30; 
-  let BeginGroup = 1; 
-} 
- 
-// Square-root.  Effective scheduling latency is 3; real latency is larger 
-def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; } 
-def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 30; 
-  let BeginGroup = 1; 
-} 
- 
-def M7WriteShift2   : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {} 
- 
-// Not used for M7, but needing definitions anyway 
-def : WriteRes<WriteVLD1, []>; 
-def : WriteRes<WriteVLD2, []>; 
-def : WriteRes<WriteVLD3, []>; 
-def : WriteRes<WriteVLD4, []>; 
-def : WriteRes<WriteVST1, []>; 
-def : WriteRes<WriteVST2, []>; 
-def : WriteRes<WriteVST3, []>; 
-def : WriteRes<WriteVST4, []>; 
- 
-def M7SingleIssue : SchedWriteRes<[]> { 
-  let SingleIssue = 1; 
-  let NumMicroOps = 0; 
-} 
-def M7Slot0Only   : SchedWriteRes<[]> { 
-  let BeginGroup = 1; 
-  let NumMicroOps = 0; 
-} 
- 
-// What pipeline stage operands need to be ready for depending on 
-// where they come from. 
-def : ReadAdvance<ReadALUsr, 0>; 
-def : ReadAdvance<ReadMUL, 0>; 
-def : ReadAdvance<ReadMAC, 1>; 
-def : ReadAdvance<ReadALU, 0>; 
-def : ReadAdvance<ReadFPMUL, 0>; 
-def : ReadAdvance<ReadFPMAC, 3>; 
-def M7Read_ISS : SchedReadAdvance<-1>;     // operands needed at EX1 
-def M7Read_EX2   : SchedReadAdvance<1>;    // operands needed at EX3 
-def M7Read_EX3   : SchedReadAdvance<2>;    // operands needed at EX4 
- 
-// Non general purpose instructions may not be dual issued. These 
-// use both issue units. 
-def M7NonGeneralPurpose : SchedWriteRes<[]> { 
-  // Assume that these will go down the main ALU pipeline. 
-  // In reality, many look likely to stall the whole pipeline. 
-  let Latency = 3; 
-  let SingleIssue = 1; 
-} 
- 
-// List the non general purpose instructions. 
-def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT", 
-                                     "t2MSR", "t2DMB", "t2DSB", "t2ISB", 
-                                     "t2HVC", "t2SMC", "t2UDF", "ERET", 
-                                     "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>; 
- 
-//===---------------------------------------------------------------------===// 
-// Sched definitions for load/store 
-// 
-// Mark whether the loads/stores must be single-issue 
-// Address operands are needed earlier 
-// Data operands are needed later 
- 
-def M7BaseUpdate : SchedWriteRes<[]> { 
-    let Latency = 0; // Update is bypassable out of EX1 
-    let NumMicroOps = 0; 
-} 
-def M7LoadLatency1 : SchedWriteRes<[]> { 
-    let Latency = 1; 
-    let NumMicroOps = 0; 
-} 
-def M7SlowLoad : SchedWriteRes<[M7UnitLoad]>            { let Latency = 2; } 
- 
-// Byte and half-word loads should have greater latency than other loads. 
-// So should load exclusive. 
- 
-def : InstRW<[M7SlowLoad], 
-      (instregex "t2LDR(B|H|SB|SH)pc")>; 
-def : InstRW<[M7SlowLoad, M7Read_ISS], 
-      (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i", 
-                 "tLDR(B|H)i")>; 
-def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS], 
-      (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>; 
-def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS], 
-      (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>; 
- 
-// Exclusive loads/stores cannot be dual-issued 
-def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS], 
-      (instregex "t2LDREX$")>; 
-def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS], 
-      (instregex "t2LDREX(B|H)")>; 
-def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS], 
-      (instregex "t2STREX(B|H)?$")>; 
- 
-// Load/store multiples cannot be dual-issued.  Note that default scheduling 
-// occurs around read/write times of individual registers in the list; read 
-// time for STM cannot be overridden because it is a variadic source operand. 
- 
-def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS], 
-      (instregex "(t|t2)LDM(DB|IA)$")>; 
-def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS], 
-      (instregex "(t|t2)STM(DB|IA)$")>; 
-def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS], 
-      (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>; 
-def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS], 
-      (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>; 
- 
-// Load/store doubles cannot be dual-issued. 
- 
-def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, 
-              M7Read_EX2, M7Read_EX2, M7Read_ISS], 
-      (instregex "t2STRD_(PRE|POST)")>; 
-def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS], 
-      (instregex "t2STRDi")>; 
-def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS], 
-      (instregex "t2LDRD_(PRE|POST)")>; 
-def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS], 
-      (instregex "t2LDRDi")>; 
- 
-// Word load / preload 
-def : InstRW<[WriteLd], 
-      (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>; 
-def : InstRW<[WriteLd, M7Read_ISS], 
-      (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>; 
-def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS], 
-      (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>; 
-def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS], 
-      (instregex "t2LDR_(POST|PRE)")>; 
- 
-// Stores 
-def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS], 
-      (instregex "t2STR(B|H)?_(POST|PRE)")>; 
-def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS], 
-      (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>; 
-def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS], 
-      (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>; 
- 
-// TBB/TBH - single-issue only; takes two cycles to issue 
- 
-def M7TableLoad : SchedWriteRes<[M7UnitLoad]> { 
-  let NumMicroOps = 2; 
-  let SingleIssue = 1; 
-} 
- 
-def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>; 
- 
-// VFP loads and stores 
- 
-def M7LoadSP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; } 
-def M7LoadDP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 2; 
-  let SingleIssue = 1; 
-} 
-def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>; 
-def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]> { 
-  let SingleIssue = 1; 
-} 
- 
-def : InstRW<[M7LoadSP, M7Read_ISS],                 (instregex "VLDR(S|H)$")>; 
-def : InstRW<[M7LoadDP, M7Read_ISS],                 (instregex "VLDRD$")>; 
-def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS],    (instregex "VSTR(S|H)$")>; 
-def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS],    (instregex "VSTRD$")>; 
- 
-// Load/store multiples cannot be dual-issued. 
- 
-def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS], 
-      (instregex "VLDM(S|D|Q)(DB|IA)$")>; 
-def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS], 
-      (instregex "VSTM(S|D|Q)(DB|IA)$")>; 
-def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS], 
-      (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>; 
-def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS], 
-      (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>; 
- 
-//===---------------------------------------------------------------------===// 
-// Sched definitions for ALU 
-// 
- 
-// Shifted ALU operands are read a cycle early. 
-def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>; 
- 
-def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS], 
-             (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$", 
-                        "t2(SUB|CMP|CMNz|TEQ|TST)rs$", 
-                        "t2MOVsr(a|l)")>; 
-def : InstRW<[WriteALUsi, M7Read_ISS], 
-             (instregex "t2MVNs")>; 
- 
-// Treat pure shift operations (except for RRX) as if they used the EX1 
-// shifter but have timing as if they used the EX2 shifter as they usually 
-// can choose the EX2 shifter when needed.  Will miss a few dual-issue cases, 
-// but the results prove to be better than trying to get them exact. 
- 
-def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>; 
-def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>; 
- 
-// Instructions that use the shifter, but have normal timing. 
- 
-def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>; 
- 
-// Instructions which are slot zero only but otherwise normal. 
- 
-def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>; 
- 
-// MAC operations that don't have SchedRW set. 
- 
-def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>; 
- 
-// Divides are special because they stall for their latency, and so look like a 
-// single-cycle as far as scheduling opportunities go.  By putting WriteALU 
-// first, we make the operand latency 1, but keep the instruction latency 7. 
- 
-def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>; 
- 
-// DSP extension operations 
- 
-def M7WriteSIMD1   : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> { 
-  let Latency = 1; 
-  let BeginGroup = 1; 
-} 
-def M7WriteSIMD2   : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> { 
-  let Latency = 2; 
-  let BeginGroup = 1; 
-} 
-def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { 
-  let Latency = 1; 
-  let BeginGroup = 1; 
-} 
-def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { 
-  let Latency = 0;      // Bypassable out of EX1 
-  let BeginGroup = 1; 
-} 
-def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> { 
-  let Latency = 2; 
-  let BeginGroup = 1; 
-} 
- 
-def : InstRW<[M7WriteShSIMD2, M7Read_ISS], 
-             (instregex "t2(S|U)SAT")>; 
-def : InstRW<[M7WriteSIMD1, ReadALU], 
-             (instregex "(t|t2)(S|U)XT(B|H)")>; 
-def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU], 
-             (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)", 
-                        "t2SEL")>; 
-def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU], 
-             (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>; 
-def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS], 
-             (instregex "t2QD(ADD|SUB)")>; 
-def : InstRW<[M7WriteShSIMD0, M7Read_ISS], 
-             (instregex "t2(RBIT|REV)", "tREV")>; 
-def : InstRW<[M7WriteShSIMD1, M7Read_ISS], 
-             (instregex "t2(SBFX|UBFX)")>; 
-def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS], 
-             (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>; 
-def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2], 
-             (instregex "t2USADA8")>; 
- 
-// MSR/MRS 
-def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>; 
- 
-//===---------------------------------------------------------------------===// 
-// Sched definitions for FP operations 
-// 
- 
-// Effective scheduling latency is really 3 for nearly all FP operations, 
-// even if their true latency is higher. 
-def M7WriteVFPLatOverride : SchedWriteRes<[]> { 
-  let Latency = 3; 
-  let NumMicroOps = 0; 
-} 
-def M7WriteVFPExtraVPort  : SchedWriteRes<[M7UnitVPort]> { 
-  let Latency = 3; 
-  let NumMicroOps = 0; 
-} 
- 
-// Instructions which are missing default schedules. 
-def : InstRW<[WriteFPALU32], 
-             (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64], 
-             (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>; 
- 
-// VCMP 
-def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; } 
-def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> { 
-  let Latency = 0; 
-  let BeginGroup = 1; 
-} 
-def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>; 
-def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>; 
- 
-    // VMRS/VMSR 
-def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; } 
-def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; } 
-def : InstRW<[M7VMRS], (instregex "FMSTAT")>; 
-def : InstRW<[M7VMSR], (instregex "VMSR")>; 
- 
-// VSEL cannot bypass in its implied $cpsr operand; model as earlier read 
-def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS], 
-             (instregex "VSEL.*S$")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only, 
-              ReadALU, ReadALU, M7Read_ISS], 
-             (instregex "VSEL.*D$")>; 
- 
-// VMOV 
-def : InstRW<[WriteFPMOV], 
-             (instregex "VMOV(H|S)$", "FCONST(H|S)")>; 
-def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], 
-             (instregex "VMOVD$")>; 
-def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only], 
-             (instregex "FCONSTD")>; 
-def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue], 
-             (instregex "VMOV(DRR|RRD|RRS|SRR)")>; 
- 
-// Larger-latency overrides. 
- 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32],  (instregex "VDIVS")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64],  (instregex "VDIVD")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64], 
-             (instregex "V(MUL|NMUL)D")>; 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64], 
-             (instregex "V(ADD|SUB)D")>; 
- 
-// Multiply-accumulate.  Chained SP timing is correct; rest need overrides 
-// Double-precision chained MAC stalls the pipeline behind it for 3 cycles, 
-// making it appear to have 3 cycle latency for scheduling. 
- 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, 
-              ReadFPMAC, ReadFPMUL, ReadFPMUL], 
-             (instregex "V(N)?ML(A|S)D$")>; 
- 
-// Single-precision fused MACs look like latency 5 with advance of 2. 
- 
-def M7WriteVFPLatOverride5 : SchedWriteRes<[]> { 
-  let Latency = 5; 
-  let NumMicroOps = 0; 
-} 
-def M7ReadFPMAC2   : SchedReadAdvance<2>; 
- 
-def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32, 
-              M7ReadFPMAC2, ReadFPMUL, ReadFPMUL], 
-             (instregex "VF(N)?M(A|S)S$")>; 
- 
-// Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making 
-// it appear to have 3 cycle latency for scheduling. 
- 
-def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64, 
-              ReadFPMAC, ReadFPMUL, ReadFPMUL], 
-             (instregex "VF(N)?M(A|S)D$")>; 
- 
-}  // SchedModel = CortexM7Model 
+//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM7Model : SchedMachineModel {
+  let IssueWidth = 2;        // Dual issue for most instructions.
+  let MicroOpBufferSize = 0; // The Cortex-M7 is in-order.
+  let LoadLatency = 2;       // Best case for load-use case.
+  let MispredictPenalty = 4; // Mispredict cost for forward branches is 6,
+                             // but 4 works better
+  let CompleteModel = 0;
+}
+
+//===--------------------------------------------------------------------===//
+// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
+// pipe. The stages relevant to scheduling are as follows:
+//
+//   EX1: address generation  shifts
+//   EX2: fast load data      ALUs                  FP operation
+//   EX3: slow load data      integer writeback     FP operation
+//   EX4: store data                                FP writeback
+//
+// There are shifters in both EX1 and EX2, and some instructions can be
+// flexibly allocated between them.  EX2 is used as the "zero" point
+// for scheduling, so simple ALU operations executing in EX2 will have
+// ReadAdvance<0> (the default) for their source operands and Latency = 1.
+
+def M7UnitLoad   : ProcResource<2> { let BufferSize = 0; }
+def M7UnitStore  : ProcResource<1> { let BufferSize = 0; }
+def M7UnitALU    : ProcResource<2>;
+def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; }
+def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; }
+def M7UnitMAC    : ProcResource<1> { let BufferSize = 0; }
+def M7UnitBranch : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVFP    : ProcResource<1> { let BufferSize = 0; }
+def M7UnitVPort  : ProcResource<2> { let BufferSize = 0; }
+def M7UnitSIMD   : ProcResource<1> { let BufferSize = 0; }
+
+//===---------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types that map ProcResources and set latency.
+
+let SchedModel = CortexM7Model in {
+
+def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; }
+
+// Basic ALU with shifts.
+let Latency = 1 in {
+  def : WriteRes<WriteALUsi,  [M7UnitALU, M7UnitShift1]>;
+  def : WriteRes<WriteALUsr,  [M7UnitALU, M7UnitShift1]>;
+  def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;
+}
+
+// Compares.
+def : WriteRes<WriteCMP,   [M7UnitALU]> { let Latency = 1; }
+def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
+def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
+
+// Multiplies.
+let Latency = 2 in {
+  def : WriteRes<WriteMUL16,   [M7UnitMAC]>;
+  def : WriteRes<WriteMUL32,   [M7UnitMAC]>;
+  def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
+  def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; }
+}
+
+// Multiply-accumulates.
+let Latency = 2 in {
+  def : WriteRes<WriteMAC16,   [M7UnitMAC]>;
+  def : WriteRes<WriteMAC32,   [M7UnitMAC]>;
+  def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; }
+  def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; }
+}
+
+// Divisions.
+// These cannot be dual-issued with any instructions.
+def : WriteRes<WriteDIV, [M7UnitALU]> {
+  let Latency = 7;
+  let SingleIssue = 1;
+}
+
+// Loads/Stores.
+def : WriteRes<WriteLd,    [M7UnitLoad]> { let Latency = 1; }
+def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; }
+def : WriteRes<WriteST,    [M7UnitStore]> { let Latency = 2; }
+
+// Branches.
+def : WriteRes<WriteBr,    [M7UnitBranch]> { let Latency = 2; }
+def : WriteRes<WriteBrL,   [M7UnitBranch]> { let Latency = 2; }
+def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; }
+
+// Noop.
+def : WriteRes<WriteNoop, []> { let Latency = 0; }
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for floating-point instructions
+//
+// Floating point conversions.
+def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPMOV, [M7UnitVPort]>            { let Latency = 3; }
+
+// The FP pipeline has a latency of 3 cycles.
+// ALU operations (32/64-bit).  These go down the FP pipeline.
+def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]>  { let Latency = 3; }
+def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+  let Latency = 4;
+  let BeginGroup = 1;
+}
+
+// Multiplication
+def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
+def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+  let Latency = 7;
+  let BeginGroup = 1;
+}
+
+// Multiply-accumulate.  FPMAC goes down the FP Pipeline.
+def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
+def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+  let Latency = 11;
+  let BeginGroup = 1;
+}
+
+// Division.   Effective scheduling latency is 3, though real latency is larger
+def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]>  { let Latency = 16; }
+def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+  let Latency = 30;
+  let BeginGroup = 1;
+}
+
+// Square-root.  Effective scheduling latency is 3; real latency is larger
+def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
+def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
+  let Latency = 30;
+  let BeginGroup = 1;
+}
+
+def M7WriteShift2   : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {}
+
+// Not used for M7, but definitions are required anyway.
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
+def M7SingleIssue : SchedWriteRes<[]> {
+  let SingleIssue = 1;
+  let NumMicroOps = 0;
+}
+def M7Slot0Only   : SchedWriteRes<[]> {
+  let BeginGroup = 1;
+  let NumMicroOps = 0;
+}
+
+// The pipeline stage by which each operand must be ready, depending on
+// where the operand comes from.
+def : ReadAdvance<ReadALUsr, 0>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 1>;
+def : ReadAdvance<ReadALU, 0>;
+def : ReadAdvance<ReadFPMUL, 0>;
+def : ReadAdvance<ReadFPMAC, 3>;
+def M7Read_ISS : SchedReadAdvance<-1>;     // operands needed at EX1
+def M7Read_EX2   : SchedReadAdvance<1>;    // operands needed at EX3
+def M7Read_EX3   : SchedReadAdvance<2>;    // operands needed at EX4
+
+// Non general purpose instructions may not be dual issued. These
+// use both issue units.
+def M7NonGeneralPurpose : SchedWriteRes<[]> {
+  // Assume that these will go down the main ALU pipeline.
+  // In reality, many look likely to stall the whole pipeline.
+  let Latency = 3;
+  let SingleIssue = 1;
+}
+
+// List the non general purpose instructions.
+def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT",
+                                     "t2MSR", "t2DMB", "t2DSB", "t2ISB",
+                                     "t2HVC", "t2SMC", "t2UDF", "ERET",
+                                     "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for load/store
+//
+// Mark whether the loads/stores must be single-issue
+// Address operands are needed earlier
+// Data operands are needed later
+
+def M7BaseUpdate : SchedWriteRes<[]> {
+    let Latency = 0; // Update is bypassable out of EX1
+    let NumMicroOps = 0;
+}
+def M7LoadLatency1 : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+}
+def M7SlowLoad : SchedWriteRes<[M7UnitLoad]>            { let Latency = 2; }
+
+// Byte and half-word loads should have greater latency than other loads.
+// So should load exclusive.
+
+def : InstRW<[M7SlowLoad],
+      (instregex "t2LDR(B|H|SB|SH)pc")>;
+def : InstRW<[M7SlowLoad, M7Read_ISS],
+      (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i",
+                 "tLDR(B|H)i")>;
+def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
+      (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>;
+def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
+      (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;
+
+// Exclusive loads/stores cannot be dual-issued
+def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
+      (instregex "t2LDREX$")>;
+def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
+      (instregex "t2LDREX(B|H)")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
+      (instregex "t2STREX(B|H)?$")>;
+
+// Load/store multiples cannot be dual-issued.  Note that default scheduling
+// occurs around read/write times of individual registers in the list; read
+// time for STM cannot be overridden because it is a variadic source operand.
+
+def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
+      (instregex "(t|t2)LDM(DB|IA)$")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
+      (instregex "(t|t2)STM(DB|IA)$")>;
+def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
+      (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
+      (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;
+
+// Load/store doubles cannot be dual-issued.
+
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
+              M7Read_EX2, M7Read_EX2, M7Read_ISS],
+      (instregex "t2STRD_(PRE|POST)")>;
+def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
+      (instregex "t2STRDi")>;
+def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS],
+      (instregex "t2LDRD_(PRE|POST)")>;
+def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
+      (instregex "t2LDRDi")>;
+
+// Word load / preload
+def : InstRW<[WriteLd],
+      (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
+def : InstRW<[WriteLd, M7Read_ISS],
+      (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
+def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
+      (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>;
+def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
+      (instregex "t2LDR_(POST|PRE)")>;
+
+// Stores
+def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
+      (instregex "t2STR(B|H)?_(POST|PRE)")>;
+def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
+      (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
+def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
+      (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;
+
+// TBB/TBH - single-issue only; takes two cycles to issue
+
+def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
+  let NumMicroOps = 2;
+  let SingleIssue = 1;
+}
+
+def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>;
+
+// VFP loads and stores
+
+// The single-precision writes reserve M7UnitVPort once; the double-precision
+// writes reserve it twice and are flagged SingleIssue.
+let Latency = 1 in
+def M7LoadSP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort]>;
+let Latency = 2, SingleIssue = 1 in
+def M7LoadDP  : SchedWriteRes<[M7UnitLoad, M7UnitVPort, M7UnitVPort]>;
+
+def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
+let SingleIssue = 1 in
+def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPort, M7UnitVPort]>;
+
+// Loads: VLDRS and VLDRH share the single-precision write.
+def : InstRW<[M7LoadSP, M7Read_ISS],
+      (instregex "VLDR(S|H)$")>;
+def : InstRW<[M7LoadDP, M7Read_ISS],
+      (instregex "VLDRD$")>;
+
+// Stores: VSTRS and VSTRH share the single-precision write.
+def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS],
+      (instregex "VSTR(S|H)$")>;
+def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS],
+      (instregex "VSTRD$")>;
+
+// Load/store multiples cannot be dual-issued.
+// Every entry in this group therefore carries M7SingleIssue.  The S, D and Q
+// variants of each opcode share one entry.
+
+// VLDM without base writeback.
+def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
+      (instregex "VLDM(S|D|Q)(DB|IA)$")>;
+// VSTM without base writeback.
+def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
+      (instregex "VSTM(S|D|Q)(DB|IA)$")>;
+// Base-updating (_UPD) VLDM; M7BaseUpdate is listed ahead of the load write.
+def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
+      (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
+// Base-updating (_UPD) VSTM; M7BaseUpdate is listed ahead of the store write.
+def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
+      (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for ALU
+//
+
+// Shifted ALU operands are read a cycle early.
+// The -1 advance is restricted to the writes named in the second template
+// argument, i.e. values produced by WriteLd or M7LoadLatency1.
+def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;
+
+// Thumb2 ALU/compare opcodes whose names end in "rs", plus the opcodes
+// beginning with t2MOVsra / t2MOVsrl.
+def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
+             (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
+                        "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
+                        "t2MOVsr(a|l)")>;
+// Opcodes beginning with t2MVNs use the same write but do not list
+// M7Ex1ReadNoFastBypass.
+def : InstRW<[WriteALUsi, M7Read_ISS],
+             (instregex "t2MVNs")>;
+
+// Treat pure shift operations (except for RRX) as if they used the EX1
+// shifter but have timing as if they used the EX2 shifter as they usually
+// can choose the EX2 shifter when needed.  Will miss a few dual-issue cases,
+// but the results prove to be better than trying to get them exact.
+
+// RRX is the exception noted above and gets its own write, M7WriteShift2.
+def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
+// All t- and t2-prefixed LSL/LSR/ASR/ROR opcodes (the pattern has no "$", so
+// any suffix after the mnemonic is accepted).
+def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;
+
+// Instructions that use the shifter, but have normal timing.
+// These are additionally tagged M7Slot0Only.
+
+def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>;
+
+// Instructions which are slot zero only but otherwise normal.
+
+def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>;
+
+// MAC operations that don't have SchedRW set.
+// Covers opcodes beginning with t2SMLAD / t2SMLSD.
+
+def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>;
+
+// Divides are special because they stall for their latency, and so look like a
+// single-cycle as far as scheduling opportunities go.  By putting WriteALU
+// first, we make the operand latency 1, but keep the instruction latency 7.
+// Covers opcodes beginning with t2SDIV / t2UDIV.
+
+def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>;
+
+// DSP extension operations
+
+// All five DSP writes set BeginGroup.  They differ in result latency and in
+// whether M7UnitShift1 is reserved alongside M7UnitSIMD and M7UnitALU.
+let BeginGroup = 1 in {
+  // M7UnitSIMD + M7UnitALU.
+  let Latency = 1 in
+  def M7WriteSIMD1   : SchedWriteRes<[M7UnitSIMD, M7UnitALU]>;
+  let Latency = 2 in
+  def M7WriteSIMD2   : SchedWriteRes<[M7UnitSIMD, M7UnitALU]>;
+
+  // M7UnitSIMD + M7UnitALU + M7UnitShift1.
+  let Latency = 1 in
+  def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;
+  let Latency = 0 in    // Bypassable out of EX1
+  def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;
+  let Latency = 2 in
+  def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;
+}
+
+// Mapping of DSP-extension opcodes onto the M7Write*SIMD* writes.  Patterns
+// without a trailing "$" also accept any suffix after the listed name.
+
+// Opcodes beginning with t2SSAT / t2USAT.
+def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
+             (instregex "t2(S|U)SAT")>;
+// SXTB/SXTH/UXTB/UXTH, t- and t2-prefixed.
+def : InstRW<[M7WriteSIMD1, ReadALU],
+             (instregex "(t|t2)(S|U)XT(B|H)")>;
+// S/SH/U/UH-prefixed parallel add/subtract opcodes, and t2SEL.
+def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
+             (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)",
+                        "t2SEL")>;
+// Q/UQ-prefixed add/subtract opcodes and t2USAD8 use the latency-2 write.
+def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
+             (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
+// t2QDADD / t2QDSUB.
+def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
+             (instregex "t2QD(ADD|SUB)")>;
+// Opcodes beginning with t2RBIT, t2REV or tREV use the latency-0 write.
+def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
+             (instregex "t2(RBIT|REV)", "tREV")>;
+// t2SBFX / t2UBFX.
+def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
+             (instregex "t2(SBFX|UBFX)")>;
+// t2PKHBT / t2PKHTB and opcodes beginning with t2SXTA / t2UXTA.
+def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
+             (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
+// t2USADA8 lists a third read, M7Read_EX2, after the two ReadALU entries.
+def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
+             (instregex "t2USADA8")>;
+
+// MSR/MRS
+// NOTE(review): unlike the surrounding entries these patterns carry no "t"
+// or "t2" prefix; confirm they match the opcode names this model is meant to
+// cover.
+def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>;
+
+//===---------------------------------------------------------------------===//
+// Sched definitions for FP operations
+//
+
+// Effective scheduling latency is really 3 for nearly all FP operations,
+// even if their true latency is higher.
+// Both helper writes carry latency 3 and contribute no micro-ops; the second
+// one additionally reserves M7UnitVPort.
+let Latency = 3, NumMicroOps = 0 in {
+  def M7WriteVFPLatOverride : SchedWriteRes<[]>;
+  def M7WriteVFPExtraVPort  : SchedWriteRes<[M7UnitVPort]>;
+}
+
+// Instructions which are missing default schedules.
+// The S-suffixed opcodes take the 32-bit ALU write as-is; the D-suffixed
+// opcodes put the latency override ahead of the 64-bit ALU write.
+def : InstRW<[WriteFPALU32],
+      (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
+      (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;
+
+// VCMP
+// Both compare writes have latency 0.  The double-precision one reserves
+// M7UnitVPort twice and sets BeginGroup.
+let Latency = 0 in {
+  def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
+  let BeginGroup = 1 in
+  def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]>;
+}
+def : InstRW<[M7WriteVCMPS],
+      (instregex "VCMPS$")>;
+def : InstRW<[M7WriteVCMPD],
+      (instregex "VCMPD$")>;
+
+// VMRS/VMSR
+// Both writes are flagged SingleIssue.
+let SingleIssue = 1 in {
+  def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
+  def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
+}
+def : InstRW<[M7VMRS],
+      (instregex "FMSTAT")>;
+def : InstRW<[M7VMSR],
+      (instregex "VMSR")>;
+
+// VSEL cannot bypass in its implied $cpsr operand; model as earlier read
+// The trailing M7Read_ISS in each list is that earlier read.  Both precisions
+// are tagged M7Slot0Only; the D form also puts the latency override first.
+def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
+             (instregex "VSEL.*S$")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
+              ReadALU, ReadALU, M7Read_ISS],
+             (instregex "VSEL.*D$")>;
+
+// VMOV
+// Half/single-precision register moves and opcodes beginning with FCONSTH /
+// FCONSTS use WriteFPMOV alone.
+def : InstRW<[WriteFPMOV],
+             (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
+// Double-precision move and constant: M7WriteVFPExtraVPort is added and the
+// entry is tagged M7Slot0Only.
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
+             (instregex "VMOVD$")>;
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
+             (instregex "FCONSTD")>;
+// The DRR/RRD/RRS/SRR move variants are tagged M7SingleIssue rather than
+// M7Slot0Only.
+def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
+             (instregex "VMOV(DRR|RRD|RRS|SRR)")>;
+
+// Larger-latency overrides.
+// Each entry puts M7WriteVFPLatOverride ahead of the generic write for the
+// operation.
+
+// Divide and square root, single and double precision.
+def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32],  (instregex "VDIVS")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64],  (instregex "VDIVD")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>;
+def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>;
+// Double-precision multiply (VMULD / VNMULD).
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
+             (instregex "V(MUL|NMUL)D")>;
+// Double-precision add/subtract.
+def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
+             (instregex "V(ADD|SUB)D")>;
+
+// Multiply-accumulate.  Chained SP timing is correct; the rest need overrides.
+// Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
+// making it appear to have 3 cycle latency for scheduling.
+// Matches VMLAD, VMLSD, VNMLAD and VNMLSD.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
+              ReadFPMAC, ReadFPMUL, ReadFPMUL],
+             (instregex "V(N)?ML(A|S)D$")>;
+
+// Single-precision fused MACs look like latency 5 with advance of 2.
+// M7WriteVFPLatOverride5 supplies the latency (and no micro-ops);
+// M7ReadFPMAC2 supplies the advance and takes the place of ReadFPMAC in the
+// read list.
+
+def M7WriteVFPLatOverride5 : SchedWriteRes<[]> {
+  let Latency = 5;
+  let NumMicroOps = 0;
+}
+def M7ReadFPMAC2   : SchedReadAdvance<2>;
+
+// Matches VFMAS, VFMSS, VFNMAS and VFNMSS.
+def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
+              M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
+             (instregex "VF(N)?M(A|S)S$")>;
+
+// Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making
+// it appear to have 3 cycle latency for scheduling.
+// NOTE(review): the chained double-precision comment above says "3 cycles"
+// for the same apparent latency of 3; confirm which stall figure is intended.
+// Matches VFMAD, VFMSD, VFNMAD and VFNMSD.
+
+def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
+              ReadFPMAC, ReadFPMUL, ReadFPMUL],
+             (instregex "VF(N)?M(A|S)D$")>;
+
+}  // SchedModel = CortexM7Model
author	shadchin <shadchin@yandex-team.ru>	2022-02-10 16:44:39 +0300
committer	Daniil Cherednik <dcherednik@yandex-team.ru>	2022-02-10 16:44:39 +0300
commit	e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0 (patch)
tree	64175d5cadab313b3e7039ebaa06c5bc3295e274 /contrib/libs/llvm12/lib/Target/ARM/ARMScheduleM7.td
parent	2598ef1d0aee359b4b6d5fdd1758916d5907d04f (diff)
download	ydb-e9656aae26e0358d5378e5b63dcac5c8dbe0e4d0.tar.gz