aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/arm/vp9lpf_neon.S
diff options
context:
space:
mode:
author Martin Storsjö <martin@martin.st> 2017-01-14 20:49:19 +0200
committer Martin Storsjö <martin@martin.st> 2017-03-11 13:14:50 +0200
commit 3fbbad29847c79f422128ad88f174c53a5f6c449 (patch)
tree 8a216f6a2b93a2a706384540d89d134a14539bc6 /libavcodec/arm/vp9lpf_neon.S
parent dda45c087b2c09ba9e485c51ff9c8f2aaca709a9 (diff)
download ffmpeg-3fbbad29847c79f422128ad88f174c53a5f6c449.tar.gz
arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit
The theoretical maximum value of E is 193, so we can just saturate the
addition to 255.

Before:                        Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:        143.0   127.7   114.8    88.0         87.7
vp9_loop_filter_v_8_8_neon:        241.0   197.2   173.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:       497.0   419.5   379.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:      965.2   818.7   731.4   579.0        452.0
After:
vp9_loop_filter_v_4_8_neon:        136.0   125.7   112.6    84.0         83.0
vp9_loop_filter_v_8_8_neon:        234.0   195.5   171.5   136.0        133.7
vp9_loop_filter_v_16_8_neon:       490.0   417.5   377.7   289.0        271.0
vp9_loop_filter_v_16_16_neon:      951.2   814.7   732.3   571.0        446.7

This is cherrypicked from libav commit
c582cb8537367721bb399a5d01b652c20142b756.

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/arm/vp9lpf_neon.S')
-rw-r--r-- libavcodec/arm/vp9lpf_neon.S 11
1 files changed, 5 insertions, 6 deletions
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index b90c53630a..2d91092ee0 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -51,7 +51,7 @@
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
- vdup.u16 q0, r2 @ E
+ vdup.u8 d0, r2 @ E
vdup.u8 d2, r3 @ I
ldr r3, [sp]
@@ -64,16 +64,15 @@
vmax.u8 d4, d4, d5
vmax.u8 d5, d6, d7
vmax.u8 \tmp1, \tmp1, \tmp2
- vabdl.u8 q3, d23, d24 @ abs(p0 - q0)
+ vabd.u8 d6, d23, d24 @ abs(p0 - q0)
vmax.u8 d4, d4, d5
- vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2
+ vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2
vabd.u8 d5, d22, d25 @ abs(p1 - q1)
vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
vshr.u8 d5, d5, #1
vcle.u8 d4, d4, d2 @ max(abs()) <= I
- vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
- vcle.u16 q3, q3, q0
- vmovn.u16 d5, q3
+ vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ vcle.u8 d5, d6, d0
vand d4, d4, d5 @ fm
vdup.u8 d3, r3 @ H