1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
POUT .req a1
PIN .req a2
PCOEF .req a3
OLDFPSCR .req a4
COUNTER .req ip
IN0 .req s4
IN1 .req s5
IN2 .req s6
IN3 .req s7
IN4 .req s0
IN5 .req s1
IN6 .req s2
IN7 .req s3
COEF0 .req s8 @ coefficient elements
COEF1 .req s9
COEF2 .req s10
COEF3 .req s11
COEF4 .req s12
COEF5 .req s13
COEF6 .req s14
COEF7 .req s15
ACCUM0 .req s16 @ double-buffered multiply-accumulate results
ACCUM4 .req s20
POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
POST1 .req s25
POST2 .req s26
POST3 .req s27
.macro inner_loop decifactor, dir, tail, head
.ifc "\dir","up"
.set X, 0
.set Y, 4
.else
.set X, 4*JMAX*4 - 4
.set Y, -4
.endif
.ifnc "\head",""
vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
.endif
.ifnc "\tail",""
vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
.endif
.ifnc "\head",""
vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
.endif
.ifnc "\head",""
vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
.ifc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
.ifnc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
.endif
.ifnc "\tail",""
vstmia POUT!, {POST0-POST3}
.endif
.ifnc "\head",""
vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
.if \decifactor == 32
vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
.endif
.endif
.endm
.macro dca_lfe_fir decifactor
function ff_dca_lfe_fir\decifactor\()_vfp, export=1
fmrx OLDFPSCR, FPSCR
ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, ip
vldr IN0, [PIN, #-0*4]
vldr IN1, [PIN, #-1*4]
vldr IN2, [PIN, #-2*4]
vldr IN3, [PIN, #-3*4]
.if \decifactor == 32
.set JMAX, 8
vpush {s16-s31}
vldr IN4, [PIN, #-4*4]
vldr IN5, [PIN, #-5*4]
vldr IN6, [PIN, #-6*4]
vldr IN7, [PIN, #-7*4]
.else
.set JMAX, 4
vpush {s16-s27}
.endif
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, up,, head
1: add PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, up, tail, head
bne 1b
inner_loop \decifactor, up, tail
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, down,, head
1: sub PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, down, tail, head
bne 1b
inner_loop \decifactor, down, tail
.if \decifactor == 32
vpop {s16-s31}
.else
vpop {s16-s27}
.endif
fmxr FPSCR, OLDFPSCR
bx lr
endfunc
.endm
dca_lfe_fir 64
.ltorg
dca_lfe_fir 32
.unreq POUT
.unreq PIN
.unreq PCOEF
.unreq OLDFPSCR
.unreq COUNTER
.unreq IN0
.unreq IN1
.unreq IN2
.unreq IN3
.unreq IN4
.unreq IN5
.unreq IN6
.unreq IN7
.unreq COEF0
.unreq COEF1
.unreq COEF2
.unreq COEF3
.unreq COEF4
.unreq COEF5
.unreq COEF6
.unreq COEF7
.unreq ACCUM0
.unreq ACCUM4
.unreq POST0
.unreq POST1
.unreq POST2
.unreq POST3
IN .req a1
SBACT .req a2
OLDFPSCR .req a3
IMDCT .req a4
WINDOW .req v1
OUT .req v2
BUF .req v3
SCALEINT .req v4 @ only used in softfp case
COUNT .req v5
SCALE .req s0
/* Stack layout differs in softfp and hardfp cases:
*
* hardfp
* fp -> 6 arg words saved by caller
* a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* s0 on entry
* sp -> 3 arg words for callee
*
* softfp
* fp -> 7 arg words saved by caller
* a4,v1-v5,fp,lr on entry
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* sp -> 4 arg words for callee
*/
/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
* SynthFilterContext *synth, FFTContext *imdct,
* float (*synth_buf_ptr)[512],
* int *synth_buf_offset, float (*synth_buf2)[32],
* const float (*window)[512], float *samples_out,
* float (*raXin)[32], float scale);
*/
function ff_dca_qmf_32_subbands_vfp, export=1
VFP push {a3-a4,v1-v3,v5,fp,lr}
NOVFP push {a4,v1-v5,fp,lr}
add fp, sp, #8*4
vpush {s16-s23}
@ The buffer pointed at by raXin isn't big enough for us to do a
@ complete matrix transposition as we want to, so allocate an
@ alternative buffer from the stack. Align to 4 words for speed.
sub BUF, sp, #8*32*4
bic BUF, BUF, #15
mov sp, BUF
ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
fmrx OLDFPSCR, FPSCR
fmxr FPSCR, lr
@ COUNT is used to count down 2 things at once:
@ bits 0-4 are the number of word pairs remaining in the output row
@ bits 5-31 are the number of words to copy (with possible negation)
@ from the source matrix before we start zeroing the remainder
mov COUNT, #(-4 << 5) + 16
adds COUNT, COUNT, SBACT, lsl #5
bmi 2f
1:
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
vldr s9, [IN, #(3*8+0)*4]
vldr s11, [IN, #(3*8+1)*4]
vldr s13, [IN, #(3*8+2)*4]
vldr s15, [IN, #(3*8+3)*4]
vldr s17, [IN, #(3*8+4)*4]
vldr s19, [IN, #(3*8+5)*4]
vldr s21, [IN, #(3*8+6)*4]
vldr s23, [IN, #(3*8+7)*4]
vneg.f s9, s9
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vneg.f s17, s17
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vstr d4, [BUF, #(0*32+2)*4]
vstr d5, [BUF, #(1*32+2)*4]
vstr d6, [BUF, #(2*32+2)*4]
vstr d7, [BUF, #(3*32+2)*4]
vstr d8, [BUF, #(4*32+2)*4]
vstr d9, [BUF, #(5*32+2)*4]
vstr d10, [BUF, #(6*32+2)*4]
vstr d11, [BUF, #(7*32+2)*4]
add IN, IN, #4*8*4
add BUF, BUF, #4*4
subs COUNT, COUNT, #(4 << 5) + 2
bpl 1b
2: @ Now deal with trailing < 4 samples
adds COUNT, COUNT, #3 << 5
bmi 4f @ sb_act was a multiple of 4
bics lr, COUNT, #0x1F
bne 3f
@ sb_act was n*4+1
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vneg.f s16, s16
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
b 4f
3: @ sb_act was n*4+2 or n*4+3, so do the first 2
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #(2 << 5) + 1
bics lr, COUNT, #0x1F
bne 4f
@ sb_act was n*4+3
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
4: @ Now fill the remainder with 0
vldr s8, zero
vldr s9, zero
ands COUNT, COUNT, #0x1F
beq 6f
5: vstr d4, [BUF, #(0*32+0)*4]
vstr d4, [BUF, #(1*32+0)*4]
vstr d4, [BUF, #(2*32+0)*4]
vstr d4, [BUF, #(3*32+0)*4]
vstr d4, [BUF, #(4*32+0)*4]
vstr d4, [BUF, #(5*32+0)*4]
vstr d4, [BUF, #(6*32+0)*4]
vstr d4, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
subs COUNT, COUNT, #1
bne 5b
6:
fmxr FPSCR, OLDFPSCR
ldr WINDOW, [fp, #3*4]
ldr OUT, [fp, #4*4]
sub BUF, BUF, #32*4
NOVFP ldr SCALEINT, [fp, #6*4]
mov COUNT, #8
VFP vpush {SCALE}
VFP sub sp, sp, #3*4
NOVFP sub sp, sp, #4*4
7:
VFP ldr a1, [fp, #-7*4] @ imdct
NOVFP ldr a1, [fp, #-8*4]
ldmia fp, {a2-a4}
VFP stmia sp, {WINDOW, OUT, BUF}
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
VFP vldr SCALE, [sp, #3*4]
bl X(ff_synth_filter_float_vfp)
add OUT, OUT, #32*4
add BUF, BUF, #32*4
subs COUNT, COUNT, #1
bne 7b
A sub sp, fp, #(8+8)*4
T sub fp, fp, #(8+8)*4
T mov sp, fp
vpop {s16-s23}
VFP pop {a3-a4,v1-v3,v5,fp,pc}
NOVFP pop {a4,v1-v5,fp,pc}
endfunc
.unreq IN
.unreq SBACT
.unreq OLDFPSCR
.unreq IMDCT
.unreq WINDOW
.unreq OUT
.unreq BUF
.unreq SCALEINT
.unreq COUNT
.unreq SCALE
.align 2
zero: .word 0
|