diff options
author | Martin Storsjö <martin@martin.st> | 2019-02-01 10:05:56 +0200 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2019-02-19 11:46:08 +0200 |
commit | cc7ba00c35faf0478f1f56215e926f70ccb31282 (patch) | |
tree | 8fe9dd0b63aa9d58eee1c023999fa8e7c9be386d /libavcodec/aarch64/vp8dsp_neon.S | |
parent | 52c9b0a6c0d02cff6caebcf6989e565e05b55200 (diff) | |
download | ffmpeg-cc7ba00c35faf0478f1f56215e926f70ccb31282.tar.gz |
aarch64: vp8: Port missing epel8 functions from arm version
                      Cortex A53     A72     A73
vp8_put_epel8_h4_c:       2594.8  1159.6  1374.8
vp8_put_epel8_h4_neon:     506.4   244.2   314.0
vp8_put_epel8_h6_c:       3445.8  1677.1  1811.3
vp8_put_epel8_h6_neon:     634.4   371.7   433.0
vp8_put_epel8_v4_c:       2614.0  1174.8  1378.0
vp8_put_epel8_v4_neon:     321.0   221.7   235.8
vp8_put_epel8_v6_c:       3635.5  1703.0  2079.2
vp8_put_epel8_v6_neon:     416.9   317.0   295.5
Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/aarch64/vp8dsp_neon.S')
-rw-r--r-- | libavcodec/aarch64/vp8dsp_neon.S | 87 |
1 files changed, 87 insertions, 0 deletions
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 4ea62c0644..c5badc432f 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -957,6 +957,51 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
         ret
 endfunc
 
+function ff_put_vp8_epel8_v6_neon, export=1
+        sub             x2, x2, x3, lsl #1
+
+        movrel          x7, subpel_filters, -16
+        add             x6, x7, w6, uxtw #4
+        ld1             {v0.8h}, [x6]
+1:
+        ld1             {v2.8b}, [x2], x3
+        ld1             {v3.8b}, [x2], x3
+        ld1             {v4.8b}, [x2], x3
+        ld1             {v5.8b}, [x2], x3
+        ld1             {v6.8b}, [x2], x3
+        ld1             {v7.8b}, [x2], x3
+        ld1             {v28.8b}, [x2]
+
+        sub             x2, x2, x3, lsl #2
+
+        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+        st1             {v2.8b}, [x0], x1
+        st1             {v3.8b}, [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+
+        ret
+endfunc
+
+function ff_put_vp8_epel8_h6_neon, export=1
+        sub             x2, x2, #2
+
+        movrel          x7, subpel_filters, -16
+        add             x5, x7, w5, uxtw #4
+        ld1             {v0.8h}, [x5]
+1:
+        ld1             {v2.8b, v3.8b}, [x2], x3
+
+        vp8_epel8_h6    v2, v2, v3
+
+        st1             {v2.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+
+        ret
+endfunc
+
 function ff_put_vp8_epel8_h6v6_neon, export=1
         sub             x2, x2, x3, lsl #1
         sub             x2, x2, #2
@@ -1003,6 +1048,48 @@ function ff_put_vp8_epel8_h6v6_neon, export=1
         ret
 endfunc
 
+function ff_put_vp8_epel8_v4_neon, export=1
+        sub             x2, x2, x3
+
+        movrel          x7, subpel_filters, -16
+        add             x6, x7, w6, uxtw #4
+        ld1             {v0.8h}, [x6]
+1:
+        ld1             {v2.8b}, [x2], x3
+        ld1             {v3.8b}, [x2], x3
+        ld1             {v4.8b}, [x2], x3
+        ld1             {v5.8b}, [x2], x3
+        ld1             {v6.8b}, [x2]
+        sub             x2, x2, x3, lsl #1
+
+        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+        st1             {v2.d}[0], [x0], x1
+        st1             {v2.d}[1], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+
+        ret
+endfunc
+
+function ff_put_vp8_epel8_h4_neon, export=1
+        sub             x2, x2, #1
+
+        movrel          x7, subpel_filters, -16
+        add             x5, x7, w5, uxtw #4
+        ld1             {v0.8h}, [x5]
+1:
+        ld1             {v2.8b,v3.8b}, [x2], x3
+
+        vp8_epel8_h4    v2, v2, v3
+
+        st1             {v2.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+
+        ret
+endfunc
+
 function ff_put_vp8_epel8_h4v6_neon, export=1
         sub             x2, x2, x3, lsl #1
sub x2, x2, #1 |