diff options
author | Martin Storsjö <martin@martin.st> | 2017-01-14 20:49:19 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2017-02-24 00:02:36 +0200 |
commit | c582cb8537367721bb399a5d01b652c20142b756 (patch) | |
tree | 5c98407f9482c39205005749da642a9a1723d299 /libavcodec/aarch64 | |
parent | ed6a891c364f8b0850b557d9578b8920cc15a937 (diff) | |
download | ffmpeg-c582cb8537367721bb399a5d01b652c20142b756.tar.gz |
arm/aarch64: vp9lpf: Keep the comparison to E within 8 bits
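The theoretical maximum value of E is 193, so we can just
saturate the addition to 255: a sum that saturates is already
greater than any possible E, so the result of the comparison
against E is unchanged.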
Before:                        Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:        143.0   127.7   114.8    88.0         87.7
vp9_loop_filter_v_8_8_neon:        241.0   197.2   173.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:       497.0   419.5   379.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:      965.2   818.7   731.4   579.0        452.0
After:
vp9_loop_filter_v_4_8_neon:        136.0   125.7   112.6    84.0         83.0
vp9_loop_filter_v_8_8_neon:        234.0   195.5   171.5   136.0        133.7
vp9_loop_filter_v_16_8_neon:       490.0   417.5   377.7   289.0        271.0
vp9_loop_filter_v_16_16_neon:      951.2   814.7   732.3   571.0        446.7
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r-- | libavcodec/aarch64/vp9lpf_neon.S | 40 |
1 files changed, 9 insertions, 31 deletions
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 5fafc7ad5c..48cac4cac6 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -51,13 +51,6 @@
 // see the arm version instead.
 
 
-.macro uabdl_sz dst1, dst2, in1, in2, sz
-        uabdl           \dst1,  \in1\().8b,  \in2\().8b
-.ifc \sz, .16b
-        uabdl2          \dst2,  \in1\().16b, \in2\().16b
-.endif
-.endm
-
 .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
         add             \dst1,  \in1,  \in3
 .ifc \sz, .16b
@@ -86,20 +79,6 @@
 .endif
 .endm
 
-.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
-        cmhs            \dst1,  \in1,  \in3
-.ifc \sz, .16b
-        cmhs            \dst2,  \in2,  \in4
-.endif
-.endm
-
-.macro xtn_sz dst, in1, in2, sz
-        xtn             \dst\().8b,  \in1
-.ifc \sz, .16b
-        xtn2            \dst\().16b, \in2
-.endif
-.endm
-
 .macro usubl_sz dst1, dst2, in1, in2, sz
         usubl           \dst1,  \in1\().8b,  \in2\().8b
 .ifc \sz, .16b
@@ -179,20 +158,20 @@
 // tmpq2 == tmp3 + tmp4, etc.
 .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
 .if \mix == 0
-        dup             v0.8h,  w2        // E
-        dup             v1.8h,  w2        // E
+        dup             v0\sz,  w2        // E
         dup             v2\sz,  w3        // I
         dup             v3\sz,  w4        // H
 .else
-        dup             v0.8h,  w2        // E
+        dup             v0.8b,  w2        // E
         dup             v2.8b,  w3        // I
         dup             v3.8b,  w4        // H
+        lsr             w5,     w2,  #8
         lsr             w6,     w3,  #8
         lsr             w7,     w4,  #8
-        ushr            v1.8h,  v0.8h, #8 // E
+        dup             v1.8b,  w5        // E
         dup             v4.8b,  w6        // I
-        bic             v0.8h,  #255, lsl 8 // E
         dup             v5.8b,  w7        // H
+        trn1            v0.2d,  v0.2d,  v1.2d
         trn1            v2.2d,  v2.2d,  v4.2d
         trn1            v3.2d,  v3.2d,  v5.2d
 .endif
@@ -206,16 +185,15 @@
         umax            v4\sz,  v4\sz,  v5\sz
         umax            v5\sz,  v6\sz,  v7\sz
         umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
-        uabdl_sz        v6.8h,  v7.8h,  v23, v24, \sz // abs(p0 - q0)
+        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
         umax            v4\sz,  v4\sz,  v5\sz
-        add_sz          v6.8h,  v7.8h,  v6.8h,  v7.8h,  v6.8h,  v7.8h, \sz // abs(p0 - q0) * 2
+        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
         uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
         umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
         ushr            v5\sz,  v5\sz,  #1
         cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
-        uaddw_sz        v6.8h,  v7.8h,  v6.8h,  v7.8h,  v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
-        cmhs_sz         v6.8h,  v7.8h,  v0.8h,  v1.8h,  v6.8h,  v7.8h, \sz
-        xtn_sz          v5,     v6.8h,  v7.8h,  \sz
+        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        cmhs            v5\sz,  v0\sz,  v6\sz
         and             v4\sz,  v4\sz,  v5\sz         // fm
 
         // If no pixels need filtering, just exit as soon as possible