diff options
author | Martin Storsjö <martin@martin.st> | 2019-02-01 00:12:46 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2019-02-19 11:46:28 +0200 |
commit | 7e42d5f0ab2aeac811fd01e122627c9198b13f01 (patch) | |
tree | 7a04b33369318adf05a6a859f80519abff58b71d /libavcodec | |
parent | 49f9c4272c4029b57ff300d908ba03c6332fc9c4 (diff) | |
download | ffmpeg-7e42d5f0ab2aeac811fd01e122627c9198b13f01.tar.gz |
aarch64: vp8: Optimize vp8_idct_add_neon for aarch64
The previous version was a pretty exact translation of the arm
version. This version does do some unnecessary arithmetic (it does
more operations on vectors that are only half filled; it does 4
uaddw and 4 sqxtun instead of 2 of each), but it reduces the overhead
of packing data together (which could be done for free in the arm
version).
This gives a decent speedup on Cortex A53, a minor speedup on
A72 and a very minor slowdown on Cortex A73.
Before: Cortex A53 A72 A73
vp8_idct_add_neon: 79.7 67.5 65.0
After:
vp8_idct_add_neon: 67.7 64.8 66.7
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/aarch64/vp8dsp_neon.S | 49 |
1 file changed, 25 insertions, 24 deletions
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index cac4558ab2..47fdc21c67 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -125,36 +125,37 @@ function ff_vp8_idct_add_neon, export=1 sub v17.4h, v0.4h, v2.4h add v18.4h, v20.4h, v23.4h - ld1 {v24.d}[0], [x0], x2 - zip1 v16.2d, v16.2d, v17.2d - sub v19.4h, v21.4h, v22.4h - ld1 {v25.d}[0], [x0], x2 - zip1 v18.2d, v18.2d, v19.2d - add v0.8h, v16.8h, v18.8h - ld1 {v25.d}[1], [x0], x2 - sub v1.8h, v16.8h, v18.8h - ld1 {v24.d}[1], [x0], x2 - srshr v0.8h, v0.8h, #3 - trn1 v24.4s, v24.4s, v25.4s - srshr v1.8h, v1.8h, #3 + ld1 {v24.s}[0], [x0], x2 + sub v19.4h, v21.4h, v22.4h + ld1 {v25.s}[0], [x0], x2 + add v0.4h, v16.4h, v18.4h + add v1.4h, v17.4h, v19.4h + ld1 {v26.s}[0], [x0], x2 + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + ld1 {v27.s}[0], [x0], x2 + srshr v0.4h, v0.4h, #3 + srshr v1.4h, v1.4h, #3 + srshr v2.4h, v2.4h, #3 + srshr v3.4h, v3.4h, #3 + sub x0, x0, x2, lsl #2 - ext v1.16b, v1.16b, v1.16b, #8 - trn1 v3.2d, v0.2d, v1.2d - trn2 v0.2d, v0.2d, v1.2d - trn1 v1.8h, v3.8h, v0.8h - trn2 v3.8h, v3.8h, v0.8h - uzp1 v0.4s, v1.4s, v3.4s - uzp2 v1.4s, v3.4s, v1.4s + transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16 uaddw v0.8h, v0.8h, v24.8b - uaddw2 v1.8h, v1.8h, v24.16b + uaddw v1.8h, v1.8h, v25.8b + uaddw v2.8h, v2.8h, v26.8b + uaddw v3.8h, v3.8h, v27.8b sqxtun v0.8b, v0.8h - sqxtun2 v0.16b, v1.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v0.s}[0], [x0], x2 - st1 {v0.s}[1], [x0], x2 - st1 {v0.s}[3], [x0], x2 - st1 {v0.s}[2], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v2.s}[0], [x0], x2 + st1 {v3.s}[0], [x0], x2 ret endfunc |