aarch64: vp9: use alternative returns in the core loop filter function

Since aarch64 has enough free general purpose registers, use them to branch to the appropriate storage code. 1-2 cycles faster for the functions using loop_filter 8/16, ... on a cortex-a53. Mixed results (up to 2 cycles faster/slower) on a cortex-a57. This is cherry-picked from libav commit d7595de0b25e7064fd9e06dea5d0425536cef6dc. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
author: Janne Grunau <janne-libav@jannau.net> 2017-01-10 00:15:07 +0200
committer: Michael Niedermayer <michael@niedermayer.cc> 2017-01-14 21:13:06 +0100
commit: 62ea07d797c503bc4b727e56d9c0f914a93c8ef6 (patch)
tree: cc117829b61ee24d877615cb41df2f7f72ccf856
parent: 3ac46a0a62386a52e38c066379ff36b5038dd4d0 (diff)
download: ffmpeg-62ea07d797c503bc4b727e56d9c0f914a93c8ef6.tar.gz
1 files changed, 18 insertions, 30 deletions
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e727a4d0de..78aae61e87 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
 .endif
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        cbz             x5,  7f
+        cbnz            x5,  1f
+        br              x14
+1:
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
         orr             x5,  x5,  x6
 .endif
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        cbz             x5,  8f
+        cbnz            x5,  1f
+        br              x15
 
+1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc
 
 function vp9_loop_filter_8
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc
 
 function vp9_loop_filter_8_16b_mix
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc
 
 function vp9_loop_filter_16
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -589,13 +582,6 @@ endfunc
 
 function vp9_loop_filter_16_16b
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -614,11 +600,14 @@ endfunc
 .endm
 
 .macro loop_filter_8
+        // calculate alternative 'return' targets
+        adr             x13, 6f
         bl              vp9_loop_filter_8
-        cbnz            x5,  6f
 .endm
 
 .macro loop_filter_8_16b_mix mix
+        // calculate alternative 'return' targets
+        adr             x13, 6f
 .if \mix == 48
         mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -627,21 +616,20 @@ endfunc
         mov             x11, #0xffffffffffffffff
 .endif
         bl              vp9_loop_filter_8_16b_mix
-        cbnz            x5,  6f
 .endm
 
 .macro loop_filter_16
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm
 
 .macro loop_filter_16_16b
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16_16b
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm
author	Janne Grunau <janne-libav@jannau.net>	2017-01-10 00:15:07 +0200
committer	Michael Niedermayer <michael@niedermayer.cc>	2017-01-14 21:13:06 +0100
commit	62ea07d797c503bc4b727e56d9c0f914a93c8ef6 (patch)
tree	cc117829b61ee24d877615cb41df2f7f72ccf856
parent	3ac46a0a62386a52e38c066379ff36b5038dd4d0 (diff)
download	ffmpeg-62ea07d797c503bc4b727e56d9c0f914a93c8ef6.tar.gz