diff options
author | Ilia Valiakhmetov <zakne0ne@gmail.com> | 2017-06-05 00:52:27 +0700 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2017-06-06 08:05:03 -0400 |
commit | 73d9a9a6af5d00cfa9b98c7d9fc9abd0c734ba8e (patch) | |
tree | 71bd6f602170a2baf41fd2166b73dadf1ddc4abf /libavcodec | |
parent | dc70ea8c193a08aebb1e0eeb2accc12322497ade (diff) | |
download | ffmpeg-73d9a9a6af5d00cfa9b98c7d9fc9abd0c734ba8e.tar.gz |
libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation
vp9_diag_downleft_32x32_8bpp_c: 580.2
vp9_diag_downleft_32x32_8bpp_sse2: 75.6
vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
vp9_diag_downleft_32x32_8bpp_avx: 72.7
vp9_diag_downleft_32x32_10bpp_c: 1101.2
vp9_diag_downleft_32x32_10bpp_sse2: 145.4
vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
vp9_diag_downleft_32x32_10bpp_avx: 134.8
vp9_diag_downleft_32x32_10bpp_avx2: 94.0
vp9_diag_downleft_32x32_12bpp_c: 1108.5
vp9_diag_downleft_32x32_12bpp_sse2: 145.5
vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
vp9_diag_downleft_32x32_12bpp_avx: 135.2
vp9_diag_downleft_32x32_12bpp_avx2: 94.0
~30% faster than avx implementation
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec')
-rw-r--r-- | libavcodec/x86/vp9dsp_init_16bpp.c | 2 | ||||
-rw-r--r-- | libavcodec/x86/vp9intrapred_16bpp.asm | 63 |
2 files changed, 65 insertions, 0 deletions
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 4576ff1692..d1b8fcdaef 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); + init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 212e4130e8..92333bce20 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a DEFINE_ARGS dst, stride, stride3, cnt mov cntd, 2 lea stride3q, [strideq*3] + .loop: mova [dstq+strideq*0], m0 vpalignr m3, m2, m0, 2 @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a dec cntd jg .loop RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq+mmsize*0+ 0] ; abcdefghijklmnop + mova m1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 + vpbroadcastw xm4, [aq+mmsize*1+30] ; 55555555 + vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx + vpalignr m2, m5, m0, 2 ; bcdefghijklmnopq + vpalignr m3, m5, m0, 4 ; cdefghijklmnopqr + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ + vperm2i128 m5, m1, m4, q0201 ; yz01234555555555 + vpalignr m2, m5, m1, 2 ; rstuvwxyz0123455 + vpalignr m3, m5, m1, 4 ; stuvwxyz01234555 + LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 + vperm2i128 m2, m1, m4, q0201 ; Z......555555555 + vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 + +.loop: + mova [dstq+strideq*0 + 0], m0 + mova [dstq+strideq*0 +32], m1 + vpalignr m3, m5, m0, 2 + vpalignr m4, m2, m1, 2 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 4 + vpalignr m4, m2, m1, 4 + mova [dstq+strideq*2 + 0], m3 + mova [dstq+strideq*2 +32], m4 + vpalignr m3, m5, m0, 6 + vpalignr m4, m2, m1, 6 + mova [dstq+stride3q*1+ 0], m3 + mova [dstq+stride3q*1+32], m4 + lea dstq, [dstq+strideq*4] + vpalignr m3, m5, m0, 8 + vpalignr m4, m2, m1, 8 + mova [dstq+strideq*0 + 0], m3 + mova [dstq+strideq*0 +32], m4 + vpalignr m3, m5, m0, 10 + vpalignr m4, m2, m1, 10 + mova [dstq+strideq*1 + 0], m3 + mova [dstq+strideq*1 +32], m4 + vpalignr m3, m5, m0, 12 + vpalignr m4, m2, m1, 12 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + vpalignr m3, m5, m0, 14 + vpalignr m4, m2, m1, 14 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + vpalignr m3, m5, m0, 16 + vpalignr m4, m2, m1, 16 + vperm2i128 m5, m3, m4, q0201 + vperm2i128 m2, m4, m4, q0101 + mova m0, m3 + mova m1, m4 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET %endif %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function |