diff options
author | Janne Grunau <janne-libav@jannau.net> | 2014-04-19 18:17:23 +0200 |
---|---|---|
committer | Janne Grunau <janne-libav@jannau.net> | 2014-04-22 22:01:45 +0200 |
commit | 8f9fe6ae3461ce270bce6b7083fda5ec314cdad4 (patch) | |
tree | 3ffcba81a51f791b6f15db95b98941d8bec1d2b3 /libavcodec/aarch64/mpegaudiodsp_neon.S | |
parent | f4d5a2cc35fcdf06ec031fabe8b0710e995fe924 (diff) | |
download | ffmpeg-8f9fe6ae3461ce270bce6b7083fda5ec314cdad4.tar.gz |
aarch64: NEON fixed/floating point MPADSP apply_window
30%/25% (fixed/float) faster mp3 decoding on Apple's A7. The floating
point decoder is approximately 7% faster.
Diffstat (limited to 'libavcodec/aarch64/mpegaudiodsp_neon.S')
-rw-r--r-- | libavcodec/aarch64/mpegaudiodsp_neon.S | 226 |
1 files changed, 226 insertions, 0 deletions
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S new file mode 100644 index 0000000000..39875fed4a --- /dev/null +++ b/libavcodec/aarch64/mpegaudiodsp_neon.S @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define FRAC_BITS 23 // fractional bits for sb_samples and dct +#define WFRAC_BITS 16 // fractional bits for window +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) + +const tbl_rev128.s align=4 + .byte 12, 13, 14, 15 + .byte 8, 9, 10, 11 + .byte 4, 5, 6, 7 + .byte 0, 1, 2, 3 +endconst + +.macro apply_window type, st +function ff_mpadsp_apply_window_\type\()_neon, export=1 + mov x7, x0 + sxtw x4, w4 // incr + add x8, x0, #512<<2 + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64 + st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64 + movrel x15, tbl_rev128.s + ld1 {v27.4s}, [x15] +.ifc \type, fixed + lsl x4, x4, #1 +.else + lsl x4, x4, #2 +.endif + add x10, x0, #45<<2 + add x0, x0, #16<<2 + add x1, x1, #16<<2 + add x5, x3, x4, lsl #5 + sub x5, x5, x4 // samples2 + neg x13, x4 // -incr + mov x9, #64<<2 +.ifc \type, fixed + ld1r {v16.2s}, [x2] // dither_state + sxtl v16.2d, v16.2s + movi v29.2d, #0 + movi v30.2d, #(1<<OUT_SHIFT)-1 + trn1 v31.2d, v29.2d, v30.2d + trn2 v30.2d, v30.2d, v29.2d + trn1 v16.2d, v16.2d, v29.2d +.else + movi v16.4s, #0 + movi v28.4s, #0 +.endif + mov x14, #4 +1: + mov x8, x0 + sub x7, x1, #3<<2 + sub x6, x1, x14, lsl #4 + add x7, x7, x14, lsl #4 + add x11, x6, #(32)<<2 // w + 32 + add x12, x7, #(32)<<2 // w2 + 32 + mov x15, #8 + movi v17.2d, #0 + movi v18.2d, #0 + movi v19.2d, #0 +2: + subs x15, x15, #1 + ld1 {v0.4s}, [x8], x9 + ld1 {v1.4s}, [x10], x9 + ld1 {v2.4s}, [x6], x9 + ld1 {v3.4s}, [x7], x9 + tbl v6.16b, {v0.16b}, v27.16b + tbl v7.16b, {v1.16b}, v27.16b + ld1 {v4.4s}, [x11], x9 + ld1 {v5.4s}, [x12], x9 + MLA v16, v2, v0 + MLA2 v17, v2, v0 + MLS v18, v3, v6 + MLS2 v19, v3, v6 + MLS v16, v4, v7 + MLS2 v17, v4, v7 + MLS v18, v5, v1 + MLS2 v19, v5, v1 + b.gt 2b + + cmp x14, #4 + sub x10, x10, #64<<5 // 64 * 8 * sizeof(int32_t) + +.ifc \type, fixed + and v28.16b, v16.16b, v30.16b + ext v28.16b, v29.16b, v28.16b, #8 + + b.eq 4f + round_sample v19, 1, 1 +4: + round_sample v16, 1, 0 + shrn v16.2s, v16.2d, #OUT_SHIFT + round_sample v19, 0, 0 + shrn v19.2s, v19.2d, #OUT_SHIFT + round_sample v17, 0, 1 + round_sample v18, 1, 1 + round_sample v17, 1, 0 + shrn2 v16.4s, v17.2d, #OUT_SHIFT + round_sample v18, 0, 0 + shrn2 v19.4s, v18.2d, #OUT_SHIFT + sqxtn v16.4h, v16.4s + sqxtn v18.4h, v19.4s +.else + ext v18.16b, v18.16b, v18.16b, #8 +.endif + + st1 {v16.\st\()}[0], [x3], x4 + b.eq 4f + st1 {v18.\st\()}[1], [x5], x13 +4: + st1 {v16.\st\()}[1], [x3], x4 + st1 {v18.\st\()}[0], [x5], x13 + st1 {v16.\st\()}[2], [x3], x4 + st1 {v18.\st\()}[3], [x5], x13 + st1 {v16.\st\()}[3], [x3], x4 + st1 {v18.\st\()}[2], [x5], x13 + + mov v16.16b, v28.16b + + subs x14, x14, #1 + add x0, x0, #4<<2 + sub x10, x10, #4<<2 + b.gt 1b + +// comuting samples[16] + add x6, x1, #32<<2 + ld1 {v0.2s}, [x6], x9 + ld1 {v1.2s}, [x0], x9 +.rept 3 + ld1 {v2.2s}, [x6], x9 + ld1 {v3.2s}, [x0], x9 + MLS v16, v0, v1 + ld1 {v0.2s}, [x6], x9 + ld1 {v1.2s}, [x0], x9 + MLS v16, v2, v3 +.endr + ld1 {v2.2s}, [x6], x9 + ld1 {v3.2s}, [x0], x9 + MLS v16, v0, v1 + MLS v16, v2, v3 + +.ifc \type, fixed + and v28.16b, v16.16b, v30.16b + shrn v20.2s, v16.2d, #OUT_SHIFT + xtn v28.2s, v28.2d + sqxtn v20.4h, v20.4s + st1 {v28.s}[0], [x2] // save dither_state + st1 {v20.h}[0], [x3] +.else + st1 {v16.s}[0], [x3] +.endif + + ret +endfunc +.purgem round_sample +.purgem MLA +.purgem MLA2 +.purgem MLS +.purgem MLS2 +.endm + + +.macro round_sample r, idx, next + add \r\().2d, \r\().2d, v28.2d +.if \idx == 0 + and v28.16b, \r\().16b, v30.16b +.else // \idx == 1 + and v28.16b, \r\().16b, v31.16b +.endif +.if \idx != \next + .if \next == 0 + ext v28.16b, v28.16b, v29.16b, #8 + .else + ext v28.16b, v29.16b, v28.16b, #8 + .endif +.endif +.endm +.macro MLA d, s1, s2 + smlal \d\().2d, \s1\().2s, \s2\().2s +.endm +.macro MLA2 d, s1, s2 + smlal2 \d\().2d, \s1\().4s, \s2\().4s +.endm +.macro MLS d, s1, s2 + smlsl \d\().2d, \s1\().2s, \s2\().2s +.endm +.macro MLS2 d, s1, s2 + smlsl2 \d\().2d, \s1\().4s, \s2\().4s +.endm +apply_window fixed, h + + +// nothing to do for round_sample and ML{A,S}2 +.macro round_sample r, idx, next +.endm +.macro MLA2 d, s1, s2 +.endm +.macro MLS2 d, s1, s2 +.endm +.macro MLA d, s1, s2 + fmla \d\().4s, \s1\().4s, \s2\().4s +.endm +.macro MLS d, s1, s2 + fmls \d\().4s, \s1\().4s, \s2\().4s +.endm +apply_window float, s |