aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Storsjö <martin@martin.st> 2017-02-23 23:33:58 +0200
committer Martin Storsjö <martin@martin.st> 2017-03-11 13:14:50 +0200
commit f32690a298badbf2df66319e9b38236ad3d3e321 (patch)
tree 63a763f839fd8739fd459c72293b6aee9a0e674d
parent 3fbbad29847c79f422128ad88f174c53a5f6c449 (diff)
download ffmpeg-f32690a298badbf2df66319e9b38236ad3d3e321.tar.gz
aarch64: vp9lpf: Use dup+rev16+uzp1 instead of dup+lsr+dup+trn1
This is one cycle faster in total, and three instructions fewer.

Before:
vp9_loop_filter_mix2_v_44_16_neon: 123.2
After:
vp9_loop_filter_mix2_v_44_16_neon: 122.2

This is cherrypicked from libav commit 3bf9c48320f25f3d5557485b0202f22ae60748b0.

Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r-- libavcodec/aarch64/vp9lpf_neon.S 21
1 files changed, 9 insertions, 12 deletions
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index a9eea7f951..0878763020 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -162,18 +162,15 @@
dup v2\sz, w3 // I
dup v3\sz, w4 // H
.else
- dup v0.8b, w2 // E
- dup v2.8b, w3 // I
- dup v3.8b, w4 // H
- lsr w5, w2, #8
- lsr w6, w3, #8
- lsr w7, w4, #8
- dup v1.8b, w5 // E
- dup v4.8b, w6 // I
- dup v5.8b, w7 // H
- trn1 v0.2d, v0.2d, v1.2d
- trn1 v2.2d, v2.2d, v4.2d
- trn1 v3.2d, v3.2d, v5.2d
+ dup v0.8h, w2 // E
+ dup v2.8h, w3 // I
+ dup v3.8h, w4 // H
+ rev16 v1.16b, v0.16b // E
+ rev16 v4.16b, v2.16b // I
+ rev16 v5.16b, v3.16b // H
+ uzp1 v0.16b, v0.16b, v1.16b
+ uzp1 v2.16b, v2.16b, v4.16b
+ uzp1 v3.16b, v3.16b, v5.16b
.endif
uabd v4\sz, v20\sz, v21\sz // abs(p3 - p2)