aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorIlia <zakne0ne@gmail.com>2017-03-13 05:06:26 +0700
committerRonald S. Bultje <rsbultje@gmail.com>2017-03-20 09:47:43 -0400
commit2f3d10a01ac5f613f80db8542bf3ecda1dd40d79 (patch)
tree02a546d4500fe4cb3cc70cb1197f452263a3bc3d /libavcodec/x86
parent5eb4f95bef2f95c4e776c60613b2825064d02ba9 (diff)
downloadffmpeg-2f3d10a01ac5f613f80db8542bf3ecda1dd40d79.tar.gz
avcodec/vp9: avx2 implementation of ipred_dl_16x16_16
vp9_diag_downleft_16x16_10bpp_c: 263.0 vp9_diag_downleft_16x16_10bpp_sse2: 44.7 vp9_diag_downleft_16x16_10bpp_ssse3: 32.5 vp9_diag_downleft_16x16_10bpp_avx: 31.9 vp9_diag_downleft_16x16_10bpp_avx2: 25.7 vp9_diag_downleft_16x16_12bpp_c: 264.7 vp9_diag_downleft_16x16_12bpp_sse2: 44.4 vp9_diag_downleft_16x16_12bpp_ssse3: 32.0 vp9_diag_downleft_16x16_12bpp_avx: 32.4 vp9_diag_downleft_16x16_12bpp_avx2: 25.5 Benchmarked with 10000 runs Signed-off-by: Ilia <zakne0ne@gmail.com> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/vp9dsp_init_16bpp.c2
-rw-r--r--libavcodec/x86/vp9intrapred_16bpp.asm39
2 files changed, 41 insertions, 0 deletions
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index eb67499c96..4576ff1692 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2);
decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl, 16, 16, avx2);
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
@@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 1, 32, avg, _16, avx2);
init_fpel_func(1, 1, 64, avg, _16, avx2);
init_fpel_func(0, 1, 128, avg, _16, avx2);
+ init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
}
#endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index c0ac16d3eb..212e4130e8 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -847,6 +847,45 @@ DL_FUNCS
INIT_XMM avx
DL_FUNCS
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+ movifnidn aq, amp
+ mova m0, [aq] ; abcdefghijklmnop
+ vpbroadcastw xm1, [aq+30] ; pppppppp
+ vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp
+ vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp
+ vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp
+ LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp
+ vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp
+ DEFINE_ARGS dst, stride, stride3, cnt
+ mov cntd, 2
+ lea stride3q, [strideq*3]
+.loop:
+ mova [dstq+strideq*0], m0
+ vpalignr m3, m2, m0, 2
+ vpalignr m4, m2, m0, 4
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 6
+ vpalignr m4, m2, m0, 8
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], m4
+ vpalignr m3, m2, m0, 10
+ vpalignr m4, m2, m0, 12
+ mova [dstq+strideq*1], m3
+ mova [dstq+strideq*2], m4
+ vpalignr m3, m2, m0, 14
+ mova [dstq+stride3q ], m3
+ lea dstq, [dstq+strideq*4]
+ mova m0, m2
+ vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp
+ dec cntd
+ jg .loop
+ RET
+%endif
+
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
movh m0, [lq] ; wxyz....