author     Martin Storsjö <martin@martin.st>    2023-10-17 13:47:27 +0300
committer  Martin Storsjö <martin@martin.st>    2023-10-21 23:25:29 +0300
commit     7f905f3672da4f1fa28d7cccf1fef7f9984e0480 (patch)
tree       a60676b089c38c5bd738ade95fb6892f805835bd
parent     93cda5a9c292e47cf080e6158c5461455d28ccae (diff)
download   ffmpeg-7f905f3672da4f1fa28d7cccf1fef7f9984e0480.tar.gz
aarch64: Make the indentation more consistent
Some functions have slightly different indentation styles; try
to match the surrounding code.
libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally
uses a layered indentation style to visually show how different
unrolled/interleaved phases fit together.
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r--  libavcodec/aarch64/h264dsp_neon.S      |   8
-rw-r--r--  libavcodec/aarch64/h264qpel_neon.S     |  12
-rw-r--r--  libavcodec/aarch64/hevcdsp_idct_neon.S | 256
-rw-r--r--  libavcodec/aarch64/hevcdsp_qpel_neon.S |  24
-rw-r--r--  libavcodec/aarch64/opusdsp_neon.S      |   8
-rw-r--r--  libavcodec/aarch64/vp8dsp_neon.S       | 310
-rw-r--r--  libavutil/aarch64/tx_float_neon.S      |  12
7 files changed, 315 insertions, 315 deletions
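
The patch touches nothing but whitespace. As a rough sketch of the convention it enforces (the pre-patch indentation and exact column widths below are assumptions; this page's rendering collapses the original spacing), the first hunk in libavcodec/aarch64/h264dsp_neon.S re-aligns a macro invocation so its mnemonic and operand columns match the neighbouring loads:

// Illustrative sketch only; the pre-patch column widths are assumed.
// Before: the macro invocation does not line up with the surrounding ld1 lines.
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
    transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
// After: re-indented so mnemonic and operand columns match the surrounding code.
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29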
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 71c2ddfd0c..723b692019 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
[whitespace-only hunks: 8 re-indented lines, the transpose_4x8B/transpose_4x8H macro calls in the chroma loop filters]
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 21906327cd..f4475d96f9 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
[whitespace-only hunks: 12 re-indented lines, the h264_qpel16_hv/h264_qpel8/h264_qpel16 put and avg macro invocations]
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index ba8a1ebaed..3cac6e6db9 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
[whitespace-only hunks: 256 re-indented lines across the IDCT and transform macros and functions (tr_4x4, tr_4x4_8, idct_8x8, tr16_8x4, add_member, tr_16x4, idct_16x16, add_member32, tr4_luma_shift, ff_hevc_transform_luma_4x4_neon_8, idct_dc)]
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 1212eae63d..f3f24ab8b0 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
[whitespace-only hunks: 24 re-indented lines, the qpel_uni_v tail-call branches and the calc_all2 macro]
diff --git a/libavcodec/aarch64/opusdsp_neon.S b/libavcodec/aarch64/opusdsp_neon.S
index 46c2be0874..1c88d7d123 100644
--- a/libavcodec/aarch64/opusdsp_neon.S
+++ b/libavcodec/aarch64/opusdsp_neon.S
[whitespace-only hunks: 8 re-indented lines, the table loads in ff_opus_deemphasis_neon]
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 4bbf16d1a4..e385293ba7 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
[whitespace-only hunks: 310 re-indented lines across the vp8_loop_filter macro and the h/v loop filter functions]
diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S
index e5531dcc7c..9916ad4142 100644
--- a/libavutil/aarch64/tx_float_neon.S
+++ b/libavutil/aarch64/tx_float_neon.S
[whitespace-only hunks: 12 re-indented lines, the address setup in the SR_COMBINE_4 and SR_COMBINE_FULL macros]