diff options
author | Martin Storsjö <martin@martin.st> | 2021-09-03 13:56:05 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2021-10-18 14:27:58 +0300 |
commit | fd3bd5c492834bd100bb2462d1e1dc25a66f28f8 (patch) | |
tree | fefcefe8304a1cf70ce38914e094ee04173470ba /libavcodec | |
parent | 2d5a7f6d002813ee67bffa63d4afcd439dd329a3 (diff) | |
download | ffmpeg-fd3bd5c492834bd100bb2462d1e1dc25a66f28f8.tar.gz |
aarch64: h264qpel: Do vertical filtering without transposing
This gives rather big speedups on these functions (three runtime measurements per function; lower is better):
Before:
put_h264_qpel_8_mc01_8_neon: 241.0 131.5 138.7
put_h264_qpel_8_mc02_8_neon: 214.7 121.2 127.5
put_h264_qpel_8_mc03_8_neon: 242.5 131.2 135.7
put_h264_qpel_8_mc11_8_neon: 421.2 218.7 251.0
put_h264_qpel_8_mc12_8_neon: 878.0 509.5 537.5
put_h264_qpel_8_mc13_8_neon: 423.7 217.0 252.0
put_h264_qpel_8_mc21_8_neon: 858.2 479.5 514.0
put_h264_qpel_8_mc22_8_neon: 649.7 385.2 403.0
put_h264_qpel_8_mc23_8_neon: 860.2 476.5 517.7
put_h264_qpel_8_mc31_8_neon: 437.2 219.5 252.5
put_h264_qpel_8_mc32_8_neon: 892.5 510.5 546.0
put_h264_qpel_8_mc33_8_neon: 438.2 218.5 257.0
put_h264_qpel_16_mc01_8_neon: 944.2 509.7 546.7
put_h264_qpel_16_mc02_8_neon: 878.7 469.5 509.7
put_h264_qpel_16_mc03_8_neon: 945.7 510.7 557.0
put_h264_qpel_16_mc11_8_neon: 1663.2 858.5 979.5
put_h264_qpel_16_mc12_8_neon: 3510.2 2027.7 2112.7
put_h264_qpel_16_mc13_8_neon: 1664.7 857.5 980.5
put_h264_qpel_16_mc21_8_neon: 3366.2 1928.5 2030.5
put_h264_qpel_16_mc22_8_neon: 2584.7 1514.7 1590.2
put_h264_qpel_16_mc23_8_neon: 3367.7 1927.7 2035.0
put_h264_qpel_16_mc31_8_neon: 1716.7 849.7 997.0
put_h264_qpel_16_mc32_8_neon: 3564.0 2044.2 3835.2
put_h264_qpel_16_mc33_8_neon: 1717.7 863.0 989.5
After:
put_h264_qpel_8_mc01_8_neon: 136.0 73.7 76.0
put_h264_qpel_8_mc02_8_neon: 108.7 65.0 64.0
put_h264_qpel_8_mc03_8_neon: 137.5 72.7 73.0
put_h264_qpel_8_mc11_8_neon: 316.2 159.0 188.5
put_h264_qpel_8_mc12_8_neon: 653.0 375.5 384.7
put_h264_qpel_8_mc13_8_neon: 318.7 165.5 189.5
put_h264_qpel_8_mc21_8_neon: 739.2 385.7 432.5
put_h264_qpel_8_mc22_8_neon: 530.7 295.5 309.5
put_h264_qpel_8_mc23_8_neon: 741.2 393.7 421.0
put_h264_qpel_8_mc31_8_neon: 332.2 162.5 190.0
put_h264_qpel_8_mc32_8_neon: 667.5 378.2 390.5
put_h264_qpel_8_mc33_8_neon: 332.7 166.5 195.5
put_h264_qpel_16_mc01_8_neon: 524.2 285.2 294.0
put_h264_qpel_16_mc02_8_neon: 454.7 252.2 250.2
put_h264_qpel_16_mc03_8_neon: 525.7 286.0 283.0
put_h264_qpel_16_mc11_8_neon: 1243.2 630.7 726.7
put_h264_qpel_16_mc12_8_neon: 2610.2 1479.7 1481.2
put_h264_qpel_16_mc13_8_neon: 1250.5 631.7 727.7
put_h264_qpel_16_mc21_8_neon: 2890.2 1571.2 1679.7
put_h264_qpel_16_mc22_8_neon: 2108.7 1177.5 1223.5
put_h264_qpel_16_mc23_8_neon: 2891.7 1578.7 1667.7
put_h264_qpel_16_mc31_8_neon: 1296.7 630.5 752.5
put_h264_qpel_16_mc32_8_neon: 2664.0 1483.2 1503.5
put_h264_qpel_16_mc33_8_neon: 1297.7 632.5 747.2
I.e., overall a 20%-60% reduction in the runtime of these functions.
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/aarch64/h264qpel_neon.S | 111 |
1 files changed, 56 insertions, 55 deletions
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S index 186208b10a..451fd8af24 100644 --- a/libavcodec/aarch64/h264qpel_neon.S +++ b/libavcodec/aarch64/h264qpel_neon.S @@ -58,6 +58,24 @@ .endif .endm +//trashes v0-v4 +.macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1 + uaddl v2.8H, \r2\().8B, \r3\().8B + uaddl v0.8H, \r3\().8B, \r4\().8B + uaddl v4.8H, \r1\().8B, \r4\().8B + uaddl v1.8H, \r2\().8B, \r5\().8B + uaddl \d0\().8H, \r0\().8B, \r5\().8B + uaddl \d1\().8H, \r1\().8B, \r6\().8B + mla \d0\().8H, v2.8H, v6.H[1] + mls \d0\().8H, v4.8H, v6.H[0] + mla \d1\().8H, v0.8H, v6.H[1] + mls \d1\().8H, v1.8H, v6.H[0] + .if \narrow + sqrshrun \d0\().8B, \d0\().8H, #5 + sqrshrun \d1\().8B, \d1\().8H, #5 + .endif +.endm + //trashes v0-v5, v7, v30-v31 .macro lowpass_8H r0, r1 ext v0.16B, \r0\().16B, \r0\().16B, #2 @@ -100,18 +118,13 @@ .endm // trashed v0-v7 -.macro lowpass_8.16 r0, r1, r2 - ext v1.16B, \r0\().16B, \r1\().16B, #4 - ext v0.16B, \r0\().16B, \r1\().16B, #6 - saddl v5.4S, v1.4H, v0.4H - ext v2.16B, \r0\().16B, \r1\().16B, #2 - saddl2 v1.4S, v1.8H, v0.8H - ext v3.16B, \r0\().16B, \r1\().16B, #8 - saddl v6.4S, v2.4H, v3.4H - ext \r1\().16B, \r0\().16B, \r1\().16B, #10 - saddl2 v2.4S, v2.8H, v3.8H - saddl v0.4S, \r0\().4H, \r1\().4H - saddl2 v4.4S, \r0\().8H, \r1\().8H +.macro lowpass_8.16 r0, r1, r2, r3, r4, r5 + saddl v5.4S, \r2\().4H, \r3\().4H + saddl2 v1.4S, \r2\().8H, \r3\().8H + saddl v6.4S, \r1\().4H, \r4\().4H + saddl2 v2.4S, \r1\().8H, \r4\().8H + saddl v0.4S, \r0\().4H, \r5\().4H + saddl2 v4.4S, \r0\().8H, \r5\().8H shl v3.4S, v5.4S, #4 shl v5.4S, v5.4S, #2 @@ -134,7 +147,7 @@ rshrn v5.4H, v5.4S, #10 rshrn2 v5.8H, v1.4S, #10 - sqxtun \r2\().8B, v5.8H + sqxtun \r0\().8B, v5.8H .endm function put_h264_qpel16_h_lowpass_neon_packed @@ -258,27 +271,23 @@ endfunc function \type\()_h264_qpel8_v_lowpass_neon ld1 {v16.8B}, [x1], x3 + ld1 {v17.8B}, [x1], x3 ld1 {v18.8B}, [x1], x3 + ld1 {v19.8B}, [x1], x3 ld1 
{v20.8B}, [x1], x3 + ld1 {v21.8B}, [x1], x3 ld1 {v22.8B}, [x1], x3 + ld1 {v23.8B}, [x1], x3 ld1 {v24.8B}, [x1], x3 + ld1 {v25.8B}, [x1], x3 ld1 {v26.8B}, [x1], x3 - ld1 {v28.8B}, [x1], x3 - ld1 {v30.8B}, [x1], x3 - ld1 {v17.8B}, [x1], x3 - ld1 {v19.8B}, [x1], x3 - ld1 {v21.8B}, [x1], x3 - ld1 {v23.8B}, [x1], x3 - ld1 {v25.8B}, [x1] - - transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1 - transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 - lowpass_8 v16, v17, v18, v19, v16, v17 - lowpass_8 v20, v21, v22, v23, v18, v19 - lowpass_8 v24, v25, v26, v27, v20, v21 - lowpass_8 v28, v29, v30, v31, v22, v23 - transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + ld1 {v27.8B}, [x1], x3 + ld1 {v28.8B}, [x1] + lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17 + lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19 + lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21 + lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23 .ifc \type,avg ld1 {v24.8B}, [x0], x2 ld1 {v25.8B}, [x0], x2 @@ -335,26 +344,23 @@ endfunc function \type\()_h264_qpel8_v_lowpass_l2_neon ld1 {v16.8B}, [x1], x3 + ld1 {v17.8B}, [x1], x3 ld1 {v18.8B}, [x1], x3 + ld1 {v19.8B}, [x1], x3 ld1 {v20.8B}, [x1], x3 + ld1 {v21.8B}, [x1], x3 ld1 {v22.8B}, [x1], x3 + ld1 {v23.8B}, [x1], x3 ld1 {v24.8B}, [x1], x3 + ld1 {v25.8B}, [x1], x3 ld1 {v26.8B}, [x1], x3 - ld1 {v28.8B}, [x1], x3 - ld1 {v30.8B}, [x1], x3 - ld1 {v17.8B}, [x1], x3 - ld1 {v19.8B}, [x1], x3 - ld1 {v21.8B}, [x1], x3 - ld1 {v23.8B}, [x1], x3 - ld1 {v25.8B}, [x1] + ld1 {v27.8B}, [x1], x3 + ld1 {v28.8B}, [x1] - transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1 - transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 - lowpass_8 v16, v17, v18, v19, v16, v17 - lowpass_8 v20, v21, v22, v23, v18, v19 - lowpass_8 v24, v25, v26, v27, v20, v21 - lowpass_8 v28, v29, v30, v31, v22, v23 - transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, 
v17 + lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19 + lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21 + lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23 ld1 {v24.8B}, [x12], x2 ld1 {v25.8B}, [x12], x2 @@ -432,22 +438,17 @@ function put_h264_qpel8_hv_lowpass_neon_top lowpass_8H v26, v27 lowpass_8H v28, v29 - transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 - transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 - - lowpass_8.16 v16, v24, v16 - lowpass_8.16 v17, v25, v17 - - lowpass_8.16 v18, v26, v18 - lowpass_8.16 v19, v27, v19 + lowpass_8.16 v16, v17, v18, v19, v20, v21 + lowpass_8.16 v17, v18, v19, v20, v21, v22 - lowpass_8.16 v20, v28, v20 - lowpass_8.16 v21, v29, v21 + lowpass_8.16 v18, v19, v20, v21, v22, v23 + lowpass_8.16 v19, v20, v21, v22, v23, v24 - lowpass_8.16 v22, v30, v22 - lowpass_8.16 v23, v31, v23 + lowpass_8.16 v20, v21, v22, v23, v24, v25 + lowpass_8.16 v21, v22, v23, v24, v25, v26 - transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + lowpass_8.16 v22, v23, v24, v25, v26, v27 + lowpass_8.16 v23, v24, v25, v26, v27, v28 ret endfunc |