diff options
author | James Almer <jamrial@gmail.com> | 2019-03-14 16:17:33 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2019-03-14 16:17:33 -0300 |
commit | 2ac399d7faa5ac80088715780769522d1141b549 (patch) | |
tree | 3cd595c375cdade91148eb1f04db7f611bef4520 | |
parent | c6892f59eb0e9f2a9ec1f55b21a5841a60540e1f (diff) | |
parent | 58d154922707bfeb873cb3a7476e0f94b17463dd (diff) | |
download | ffmpeg-2ac399d7faa5ac80088715780769522d1141b549.tar.gz |
Merge commit '58d154922707bfeb873cb3a7476e0f94b17463dd'
* commit '58d154922707bfeb873cb3a7476e0f94b17463dd':
aarch64: vp8: Port epel4 functions from arm version
Merged-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/aarch64/vp8dsp_init_aarch64.c | 10 | ||||
-rw-r--r-- | libavcodec/aarch64/vp8dsp_neon.S | 284 |
2 files changed, 294 insertions, 0 deletions
diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index d41dee0d25..61701fc4c1 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -34,6 +34,7 @@ VP8_LF(neon); VP8_EPEL(16, neon); VP8_EPEL(8, neon); +VP8_EPEL(4, neon); av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) @@ -54,6 +55,15 @@ av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon; dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon; + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon; + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; } av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 1b79db65a1..2c86eef1e5 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -1225,3 +1225,287 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 add sp, sp, #168+16 ret endfunc + +function ff_put_vp8_epel4_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2], x3 + ld1r {v7.2s}, [x2], x3 + ld1r {v28.2s}, [x2] + sub x2, x2, x3, lsl #2 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2], x3 + ld1 {v7.s}[1], [x2], x3 + ld1 {v28.s}[1], [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc + +function ff_put_vp8_epel4_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc + +function ff_put_vp8_epel4_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc + +function ff_put_vp8_epel4_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2] + sub x2, x2, x3, lsl #1 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.s}[0], [x0], x1 + st1 {v2.s}[2], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v2.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc |