contrib/libs/llvm16/lib/Target/ARM/ARMScheduleM4.td


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

//==- ARMScheduleM4.td - Cortex-M4 Scheduling Definitions -*- tablegen -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the SchedRead/Write data for the ARM Cortex-M4 processor.
//
//===----------------------------------------------------------------------===//

def CortexM4Model : SchedMachineModel {
  let IssueWidth        = 1; // Only IT can be dual-issued, so assume single-issue
  let MicroOpBufferSize = 0; // In-order
  let LoadLatency       = 2; // Latency when not pipelined, not pc-relative
  let MispredictPenalty = 2; // Best case branch taken cost
  let PostRAScheduler   = 1;

  let CompleteModel = 0;
  let UnsupportedFeatures = [IsARM, HasNEON, HasDotProd, HasZCZ, HasMVEInt,
          IsNotMClass, HasDPVFP, HasFPARMv8, HasFullFP16, Has8MSecExt, HasV8,
          HasV8_3a, HasTrustZone, HasDFB, IsWindows];
}


// We model the entire cpu as a single pipeline with a BufferSize = 0 since
// Cortex-M4 is in-order.

def M4Unit : ProcResource<1> { let BufferSize = 0; }


let SchedModel = CortexM4Model in {

// Some definitions of latencies we apply to different instructions

class M4UnitL1<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 1; }
class M4UnitL2<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 2; }
class M4UnitL3<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 3; }
class M4UnitL14<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 14; }
def M4UnitL1_wr : SchedWriteRes<[M4Unit]> { let Latency = 1; }
def M4UnitL2_wr : SchedWriteRes<[M4Unit]> { let Latency = 2; }
class M4UnitL1I<dag instr> : InstRW<[M4UnitL1_wr], instr>;
class M4UnitL2I<dag instr> : InstRW<[M4UnitL2_wr], instr>;


// Loads, MAC's and DIV all get a higher latency of 2
def : M4UnitL2<WriteLd>;
def : M4UnitL2<WriteMAC32>;
def : M4UnitL2<WriteMAC64Hi>;
def : M4UnitL2<WriteMAC64Lo>;
def : M4UnitL2<WriteMAC16>;
def : M4UnitL2<WriteDIV>;

def : M4UnitL2I<(instregex "(t|t2)LDM")>;
def : M4UnitL2I<(instregex "(t|t2)LDR")>;


// Stores we use a latency of 1 as they have no outputs

def : M4UnitL1<WriteST>;
def : M4UnitL1I<(instregex "(t|t2)STM")>;


// Everything else has a Latency of 1

def : M4UnitL1<WriteALU>;
def : M4UnitL1<WriteALUsi>;
def : M4UnitL1<WriteALUsr>;
def : M4UnitL1<WriteALUSsr>;
def : M4UnitL1<WriteBr>;
def : M4UnitL1<WriteBrL>;
def : M4UnitL1<WriteBrTbl>;
def : M4UnitL1<WriteCMPsi>;
def : M4UnitL1<WriteCMPsr>;
def : M4UnitL1<WriteCMP>;
def : M4UnitL1<WriteMUL32>;
def : M4UnitL1<WriteMUL64Hi>;
def : M4UnitL1<WriteMUL64Lo>;
def : M4UnitL1<WriteMUL16>;
def : M4UnitL1<WriteNoop>;
def : M4UnitL1<WritePreLd>;
def : M4UnitL1I<(instregex "(t|t2)MOV")>;
def : M4UnitL1I<(instrs COPY)>;
def : M4UnitL1I<(instregex "t2IT", "t2MSR", "t2MRS")>;
def : M4UnitL1I<(instregex "t2CLREX")>;
def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", "t2SML[AS]",
    "t2(S|Q|SH|U|UQ|UH|QD)(ADD|ASX|SAX|SUB)", "t2USADA8", "(t|t2)REV")>;

// These instructions are not of much interest to scheduling as they will not
// be generated or it is not very useful to schedule them. They are here to make
// the model more complete.
def : M4UnitL1I<(instregex "t2CDP", "t2LDC", "t2MCR", "t2MRC", "t2MRRC", "t2STC")>;
def : M4UnitL1I<(instregex "tCPS", "t2ISB", "t2DSB", "t2DMB", "t2?HINT$")>;
def : M4UnitL1I<(instregex "t2?UDF$", "tBKPT", "t2DBG")>;
def : M4UnitL1I<(instregex "t?2?Int_eh_sjlj_", "tADDframe", "t?ADJCALL")>;
def : M4UnitL1I<(instregex "CMP_SWAP", "JUMPTABLE", "MEMCPY")>;
def : M4UnitL1I<(instregex "VSETLNi32", "VGETLNi32")>;

def : ReadAdvance<ReadALU, 0>;
def : ReadAdvance<ReadALUsr, 0>;
def : ReadAdvance<ReadMUL, 0>;
def : ReadAdvance<ReadMAC, 0>;

// Most FP instructions are single-cycle latency, except MAC's, Div's and Sqrt's.
// Loads still take 2 cycles.

def : M4UnitL1<WriteFPCVT>;
def : M4UnitL1<WriteFPMOV>;
def : M4UnitL1<WriteFPALU32>;
def : M4UnitL1<WriteFPALU64>;
def : M4UnitL1<WriteFPMUL32>;
def : M4UnitL1<WriteFPMUL64>;
def : M4UnitL2I<(instregex "VLD")>;
def : M4UnitL1I<(instregex "VST")>;
def : M4UnitL3<WriteFPMAC32>;
def : M4UnitL3<WriteFPMAC64>;
def : M4UnitL14<WriteFPDIV32>;
def : M4UnitL14<WriteFPDIV64>;
def : M4UnitL14<WriteFPSQRT32>;
def : M4UnitL14<WriteFPSQRT64>;
def : M4UnitL1<WriteVLD1>;
def : M4UnitL1<WriteVLD2>;
def : M4UnitL1<WriteVLD3>;
def : M4UnitL1<WriteVLD4>;
def : M4UnitL1<WriteVST1>;
def : M4UnitL1<WriteVST2>;
def : M4UnitL1<WriteVST3>;
def : M4UnitL1<WriteVST4>;
def : M4UnitL1I<(instregex "VMOVS", "FCONSTS", "VCMP", "VNEG", "VABS")>;
def : M4UnitL2I<(instregex "VMOVD")>;
def : M4UnitL1I<(instregex "VMRS", "VMSR", "FMSTAT")>;

def : ReadAdvance<ReadFPMUL, 0>;
def : ReadAdvance<ReadFPMAC, 0>;

}