author | Janne Grunau <janne-libav@jannau.net> | 2017-01-10 00:15:07 +0200
---|---|---
committer | Michael Niedermayer <michael@niedermayer.cc> | 2017-01-14 21:13:06 +0100
commit | 62ea07d797c503bc4b727e56d9c0f914a93c8ef6 (patch) |
tree | cc117829b61ee24d877615cb41df2f7f72ccf856 /libavcodec |
parent | 3ac46a0a62386a52e38c066379ff36b5038dd4d0 (diff) |
download | ffmpeg-62ea07d797c503bc4b727e56d9c0f914a93c8ef6.tar.gz |
aarch64: vp9: use alternative returns in the core loop filter function
Since aarch64 has enough free general purpose registers, use them to
branch to the appropriate storage code. 1-2 cycles faster for the
functions using loop_filter 8/16, ... on a cortex-a53. Mixed results
(up to 2 cycles faster/slower) on a cortex-a57.
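The trick in isolation: the caller computes the alternative 'return' target with adr before the bl, and the callee's early-out path leaves through br instead of returning a status code that the caller would have to re-test. A minimal standalone sketch (the run_filter/filter_core names, the dummy condition in x0, and the use of x16 to preserve the link register are illustrative assumptions, not the FFmpeg code):

```asm
        .text
        .global run_filter

// Callee: on the early-out path, branch straight to the target the
// caller preloaded into x13 instead of signalling via a status register.
filter_core:
        cbz     x0, 0f          // dummy early-out condition (illustrative)
        // ... full filtering work would go here ...
        ret                     // normal return via x30
0:
        br      x13             // 'return' directly to the caller's 6: label

// Caller: precompute the alternative return target, then call.
run_filter:
        mov     x16, x30        // bl clobbers x30, so save the link register
        adr     x13, 6f         // alternative 'return' target
        bl      filter_core
        b       9f              // fell through: normal path, full writeout
6:
        // early-out path: partial writeout only
9:
        ret     x16             // return to run_filter's own caller
```

The per-call cost moves from a cbnz after every return to an adr before the call; the commit message credits this with the 1-2 cycle gain on the in-order cortex-a53.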
This is cherrypicked from libav commit
d7595de0b25e7064fd9e06dea5d0425536cef6dc.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/aarch64/vp9lpf_neon.S | 48 |
1 file changed, 18 insertions, 30 deletions
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e727a4d0de..78aae61e87 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
 .endif
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        cbz             x5,  7f
+        cbnz            x5,  1f
+        br              x14
+1:
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
         orr             x5,  x5,  x6
 .endif
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        cbz             x5,  8f
+        cbnz            x5,  1f
+        br              x15
+1:

         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc

 function vp9_loop_filter_8
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc

 function vp9_loop_filter_8_16b_mix
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc

 function vp9_loop_filter_16
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -589,13 +582,6 @@ endfunc

 function vp9_loop_filter_16_16b
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -614,11 +600,14 @@ endfunc
 .endm

 .macro loop_filter_8
+        // calculate alternative 'return' targets
+        adr             x13, 6f
         bl              vp9_loop_filter_8
-        cbnz            x5,  6f
 .endm

 .macro loop_filter_8_16b_mix mix
+        // calculate alternative 'return' targets
+        adr             x13, 6f
 .if \mix == 48
         mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -627,21 +616,20 @@ endfunc
         mov             x11, #0xffffffffffffffff
 .endif
         bl              vp9_loop_filter_8_16b_mix
-        cbnz            x5,  6f
 .endm

 .macro loop_filter_16
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm

 .macro loop_filter_16_16b
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16_16b
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm
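The 16-pixel filters have two early exits (writeout of the inner 4 or inner 6 pixels), so the caller preloads two targets, x14 and x15, exactly as the loop_filter_16 macros above do. A sketch of the three-way dispatch under the same illustrative assumptions (dummy flags in x0/x1; the real functions additionally save and restore d8-d15):

```asm
        .text
        .global run_filter_16

filter16_core:
        cbnz    x0, 1f          // dummy flag: any pixels need flat8in/flat8out?
        br      x14             // none: 'return' to the inner-4-pixel writeout
1:
        cbnz    x1, 2f          // dummy flag: any pixels need flat8out?
        br      x15             // none: 'return' to the inner-6-pixel writeout
2:
        // ... flat8out work would go here ...
        ret                     // normal return: full writeout

run_filter_16:
        mov     x16, x30        // save the link register across the bl
        adr     x14, 7f         // target: writeout of the inner 4 pixels
        adr     x15, 8f         // target: writeout of the inner 6 pixels
        bl      filter16_core
        b       9f              // fell through: full writeout
7:
        // writeout of the inner 4 pixels only
        b       9f
8:
        // writeout of the inner 6 pixels only
9:
        ret     x16
```

Compared with the removed `cmp x5, 7` / `b.gt 8f` / `b.eq 7f` sequence, the dispatch now costs two adr instructions before the call and a single br inside the callee, with no flag-setting compare on the return path.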