diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2020-09-28 13:35:51 +0100 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2021-11-16 13:43:56 +0200 |
commit | 6f04cf54f532ad5695072303d74cddaa591f9aa1 (patch) | |
tree | 56f9a47e38e4758f2f64ae3971704e5c9d288d14 /libavcodec/aarch64/vp9lpf_neon.S | |
parent | 20c66fe2f9ac69dbfa2f3cfc2386aae4ba7adc60 (diff) | |
download | ffmpeg-6f04cf54f532ad5695072303d74cddaa591f9aa1.tar.gz |
aarch64: Use ret x<n> instead of br x<n> where possible
Change AArch64 assembly code to use:
    ret x<n>
instead of:
    br x<n>
"ret x<n>" is already used in a lot of places so this patch makes it
consistent across the code base. This does not change behavior or
performance.
In addition, this change reduces the number of landing pads needed in
a subsequent patch to support the Armv8.5-A Branch Target
Identification (BTI) security feature.
Signed-off-by: Jonathan Wright <jonathan.wright@arm.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/aarch64/vp9lpf_neon.S')
-rw-r--r-- | libavcodec/aarch64/vp9lpf_neon.S | 52 |
1 file changed, 26 insertions, 26 deletions
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 0878763020..694ff8956f 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -399,7 +399,7 @@ .endif // If no pixels needed flat8in nor flat8out, jump to a // writeout of the inner 4 pixels - br x14 + ret x14 1: mov x5, v7.d[0] @@ -411,7 +411,7 @@ cbnz x5, 1f .endif // If no pixels need flat8out, jump to a writeout of the inner 6 pixels - br x15 + ret x15 1: // flat8out @@ -532,32 +532,32 @@ function vp9_loop_filter_4 loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 ret 9: - br x10 + ret x10 endfunc function vp9_loop_filter_4_16b_mix_44 loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31 ret 9: - br x10 + ret x10 endfunc function vp9_loop_filter_8 loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 ret 6: - br x13 + ret x13 9: - br x10 + ret x10 endfunc function vp9_loop_filter_8_16b_mix loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31 ret 6: - br x13 + ret x13 9: - br x10 + ret x10 endfunc function vp9_loop_filter_16 @@ -568,7 +568,7 @@ function vp9_loop_filter_16 ldp d10, d11, [sp], 0x10 ldp d12, d13, [sp], 0x10 ldp d14, d15, [sp], 0x10 - br x10 + ret x10 endfunc function vp9_loop_filter_16_16b @@ -579,7 +579,7 @@ function vp9_loop_filter_16_16b ldp d10, d11, [sp], 0x10 ldp d12, d13, [sp], 0x10 ldp d14, d15, [sp], 0x10 - br x10 + ret x10 endfunc .macro loop_filter_4 @@ -648,7 +648,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1 st1 {v23.8b}, [x9], x1 st1 {v25.8b}, [x0], x1 - br x10 + ret x10 endfunc function ff_vp9_loop_filter_v_44_16_neon, export=1 @@ -672,7 +672,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1 st1 {v23.16b}, [x9], x1 st1 {v25.16b}, [x0], x1 - br x10 + ret x10 endfunc function ff_vp9_loop_filter_h_4_8_neon, export=1 @@ -714,7 +714,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1 st1 {v25.s}[0], [x9], x1 st1 {v25.s}[1], [x0], x1 - br x10 + ret x10 endfunc 
function ff_vp9_loop_filter_h_44_16_neon, export=1 @@ -766,7 +766,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1 st1 {v25.s}[1], [x9], x1 st1 {v25.s}[3], [x0], x1 - br x10 + ret x10 endfunc function ff_vp9_loop_filter_v_8_8_neon, export=1 @@ -793,14 +793,14 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1 st1 {v23.8b}, [x9], x1 st1 {v26.8b}, [x0], x1 - br x10 + ret x10 6: sub x9, x0, x1, lsl #1 st1 {v22.8b}, [x9], x1 st1 {v24.8b}, [x0], x1 st1 {v23.8b}, [x9], x1 st1 {v25.8b}, [x0], x1 - br x10 + ret x10 endfunc .macro mix_v_16 mix @@ -828,14 +828,14 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1 st1 {v23.16b}, [x9], x1 st1 {v26.16b}, [x0], x1 - br x10 + ret x10 6: sub x9, x0, x1, lsl #1 st1 {v22.16b}, [x9], x1 st1 {v24.16b}, [x0], x1 st1 {v23.16b}, [x9], x1 st1 {v25.16b}, [x0], x1 - br x10 + ret x10 endfunc .endm @@ -876,7 +876,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1 st1 {v23.8b}, [x9], x1 st1 {v27.8b}, [x0], x1 - br x10 + ret x10 6: // If we didn't need to do the flat8in part, we use the same writeback // as in loop_filter_h_4_8. 
@@ -891,7 +891,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1 st1 {v24.s}[1], [x0], x1 st1 {v25.s}[0], [x9], x1 st1 {v25.s}[1], [x0], x1 - br x10 + ret x10 endfunc .macro mix_h_16 mix @@ -942,7 +942,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1 st1 {v27.8b}, [x9], x1 st1 {v27.d}[1], [x0], x1 - br x10 + ret x10 6: add x9, x9, #2 add x0, x0, #2 @@ -963,7 +963,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x9], x1 st1 {v25.s}[3], [x0], x1 - br x10 + ret x10 endfunc .endm @@ -1022,7 +1022,7 @@ function ff_vp9_loop_filter_v_16_8_neon, export=1 ldp d10, d11, [sp], 0x10 ldp d12, d13, [sp], 0x10 ldp d14, d15, [sp], 0x10 - br x10 + ret x10 8: add x9, x9, x1, lsl #2 // If we didn't do the flat8out part, the output is left in the @@ -1091,7 +1091,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1 ldp d10, d11, [sp], 0x10 ldp d12, d13, [sp], 0x10 ldp d14, d15, [sp], 0x10 - br x10 + ret x10 8: add x9, x9, x1, lsl #2 st1 {v21.16b}, [x9], x1 @@ -1168,7 +1168,7 @@ function ff_vp9_loop_filter_h_16_8_neon, export=1 ldp d10, d11, [sp], 0x10 ldp d12, d13, [sp], 0x10 ldp d14, d15, [sp], 0x10 - br x10 + ret x10 8: // The same writeback as in loop_filter_h_8_8 sub x9, x0, #4 @@ -1287,7 +1287,7 @@ function ff_vp9_loop_filter_h_16_16_neon, export=1 ldp d10, d11, [sp], 0x10 ldp d12, d13, [sp], 0x10 ldp d14, d15, [sp], 0x10 - br x10 + ret x10 8: sub x9, x0, #4 add x0, x9, x1, lsl #3 |