aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ilia Valiakhmetov <zakne0ne@gmail.com> 2017-06-10 17:01:12 +0700
committer Ronald S. Bultje <rsbultje@gmail.com> 2017-06-12 12:40:58 -0400
commit 81fc617c125734aa6f3b3d938af75fef6db750e7 (patch)
tree e1061533509e584c8caa9c167430bbfcee481a8e
parent 1a30bf60be9243830b68e8fe2e20539f08a85926 (diff)
download ffmpeg-81fc617c125734aa6f3b3d938af75fef6db750e7.tar.gz
avcodec/vp9: ipred_dr_16x16_16 avx2 implementation
Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
-rw-r--r-- libavcodec/x86/vp9dsp_init_16bpp.c 2
-rw-r--r-- libavcodec/x86/vp9intrapred_16bpp.asm 56
2 files changed, 58 insertions, 0 deletions
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index d1b8fcdaef..8d1aa1368a 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
decl_ipred_fn(dl, 16, 16, avx2);
+decl_ipred_fn(dr, 16, 16, avx2);
decl_ipred_fn(dl, 32, 16, avx2);
#define decl_ipred_dir_funcs(type) \
@@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(1, 1, 64, avg, _16, avx2);
init_fpel_func(0, 1, 128, avg, _16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+ init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
}
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
index 92333bce20..764f7041d3 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1170,6 +1170,62 @@ DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; dr (DIAG_DOWN_RIGHT) 16x16 predictor, one word per pixel. Row 15 is the filtered l edge
+; (m0); every row above it slides one word further along the filtered l|a[-1]|a sequence.
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
+    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
+    movu                    m1, [aq-2]                 ; *abcdefghijklmno
+    mova                    m2, [aq]                   ; abcdefghijklmnop
+    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
+    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
+    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
+    LOWPASS                  1, 2, 5                   ; ABCDEFGHIJKLMNO. (last word is not a valid pixel)
+    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
+    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
+    LOWPASS                  0, 4, 5                   ; LMNOPQRSTUVWXYZ# (= row 15)
+    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH (= row 7)
+    DEFINE_ARGS dst, stride, stride3, stride5, dst3    ; l/a are fully consumed, their registers are reused
+    lea                  dst3q, [dstq+strideq*4]       ; dst3 = dst + 4*stride (lowered to +3 below)
+    lea               stride3q, [strideq*3]
+    lea               stride5q, [stride3q+strideq*2]
+
+    vpalignr                m3, m5, m0, 2              ; MNOPQRSTUVWXYZ#A
+    vpalignr                m4, m1, m5, 2              ; UVWXYZ#ABCDEFGHI
+    mova    [dst3q+stride5q*2], m3                     ; 14 (dst3 is still dst + 4*stride here)
+    mova    [ dstq+stride3q*2], m4                     ; 6
+    vpalignr                m3, m5, m0, 4
+    vpalignr                m4, m1, m5, 4
+    sub                  dst3q, strideq                ; dst3 = dst + 3*stride for every store below
+    mova    [dst3q+stride5q*2], m3                     ; 13
+    mova    [dst3q+strideq*2 ], m4                     ; 5
+    mova    [dst3q+stride3q*4], m0                     ; 15
+    vpalignr                m3, m5, m0, 6
+    vpalignr                m4, m1, m5, 6
+    mova    [ dstq+stride3q*4], m3                     ; 12
+    mova    [dst3q+strideq*1 ], m4                     ; 4
+    vpalignr                m3, m5, m0, 8
+    vpalignr                m4, m1, m5, 8
+    mova    [dst3q+strideq*8 ], m3                     ; 11
+    mova    [dst3q+strideq*0 ], m4                     ; 3
+    vpalignr                m3, m5, m0, 10
+    vpalignr                m4, m1, m5, 10
+    mova    [ dstq+stride5q*2], m3                     ; 10
+    mova    [ dstq+strideq*2 ], m4                     ; 2
+    vpalignr                m3, m5, m0, 12
+    vpalignr                m4, m1, m5, 12
+    mova    [dst3q+stride3q*2], m3                     ; 9
+    mova    [ dstq+strideq*1 ], m4                     ; 1
+    vpalignr                m3, m5, m0, 14
+    vpalignr                m4, m1, m5, 14             ; #ABCDEFGHIJKLMNO
+    mova    [ dstq+strideq*8 ], m3                     ; 8
+    mova    [ dstq+strideq*0 ], m4                     ; 0
+    mova    [dst3q+strideq*4 ], m5                     ; 7 (m1 by itself would be row -1: never stored)
+    RET
+%endif
+
+
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
movifnidn aq, amp