about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com> 2019-03-14 16:22:29 -0300
committer: James Almer <jamrial@gmail.com> 2019-03-14 16:22:29 -0300
commit: 5c363d3e595a9e5b7c42897b7aab91b91b154ac1 (patch)
tree: 297d1fdcc5f76f3d1016124b374a10bece268b44
parent: 409e684e79b6ee0c511292326f09b13fe230e58e (diff)
parent: 7e42d5f0ab2aeac811fd01e122627c9198b13f01 (diff)
download: ffmpeg-5c363d3e595a9e5b7c42897b7aab91b91b154ac1.tar.gz
Merge commit '7e42d5f0ab2aeac811fd01e122627c9198b13f01'
* commit '7e42d5f0ab2aeac811fd01e122627c9198b13f01':
  aarch64: vp8: Optimize vp8_idct_add_neon for aarch64

Merged-by: James Almer <jamrial@gmail.com>
-rw-r--r-- libavcodec/aarch64/vp8dsp_neon.S 49
1 file changed, 25 insertions, 24 deletions
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index be4f26c481..4bbf16d1a4 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -125,36 +125,37 @@ function ff_vp8_idct_add_neon, export=1
sub v17.4h, v0.4h, v2.4h
add v18.4h, v20.4h, v23.4h
- ld1 {v24.d}[0], [x0], x2
- zip1 v16.2d, v16.2d, v17.2d
- sub v19.4h, v21.4h, v22.4h
- ld1 {v25.d}[0], [x0], x2
- zip1 v18.2d, v18.2d, v19.2d
- add v0.8h, v16.8h, v18.8h
- ld1 {v25.d}[1], [x0], x2
- sub v1.8h, v16.8h, v18.8h
- ld1 {v24.d}[1], [x0], x2
- srshr v0.8h, v0.8h, #3
- trn1 v24.4s, v24.4s, v25.4s
- srshr v1.8h, v1.8h, #3
+ ld1 {v24.s}[0], [x0], x2
+ sub v19.4h, v21.4h, v22.4h
+ ld1 {v25.s}[0], [x0], x2
+ add v0.4h, v16.4h, v18.4h
+ add v1.4h, v17.4h, v19.4h
+ ld1 {v26.s}[0], [x0], x2
+ sub v3.4h, v16.4h, v18.4h
+ sub v2.4h, v17.4h, v19.4h
+ ld1 {v27.s}[0], [x0], x2
+ srshr v0.4h, v0.4h, #3
+ srshr v1.4h, v1.4h, #3
+ srshr v2.4h, v2.4h, #3
+ srshr v3.4h, v3.4h, #3
+
sub x0, x0, x2, lsl #2
- ext v1.16b, v1.16b, v1.16b, #8
- trn1 v3.2d, v0.2d, v1.2d
- trn2 v0.2d, v0.2d, v1.2d
- trn1 v1.8h, v3.8h, v0.8h
- trn2 v3.8h, v3.8h, v0.8h
- uzp1 v0.4s, v1.4s, v3.4s
- uzp2 v1.4s, v3.4s, v1.4s
+ transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16
uaddw v0.8h, v0.8h, v24.8b
- uaddw2 v1.8h, v1.8h, v24.16b
+ uaddw v1.8h, v1.8h, v25.8b
+ uaddw v2.8h, v2.8h, v26.8b
+ uaddw v3.8h, v3.8h, v27.8b
sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+
st1 {v0.s}[0], [x0], x2
- st1 {v0.s}[1], [x0], x2
- st1 {v0.s}[3], [x0], x2
- st1 {v0.s}[2], [x0], x2
+ st1 {v1.s}[0], [x0], x2
+ st1 {v2.s}[0], [x0], x2
+ st1 {v3.s}[0], [x0], x2
ret
endfunc