diff options
author | James Almer <jamrial@gmail.com> | 2014-05-19 00:02:21 -0300 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-06-08 02:37:20 +0200 |
commit | fc8db12a73f12377b56d8bf53cd6ce25968094c4 (patch) | |
tree | 8f2cb55d121b951057bffec4e24207fe219dc47b | |
parent | 5183fac92fc5c574a053dd06b84e735a1ec1cfa6 (diff) | |
download | ffmpeg-fc8db12a73f12377b56d8bf53cd6ce25968094c4.tar.gz |
x86/vp9: inital AVX2 intra_pred
tos3k-vp9-b10000.webm on a Core i5-4200U @1.6GHz
1219 decicycles in ff_vp9_ipred_dc_32x32_ssse3, 131070 runs, 2 skips
439 decicycles in ff_vp9_ipred_dc_32x32_avx2, 131070 runs, 2 skips
3570 decicycles in ff_vp9_ipred_dc_top_32x32_ssse3, 4096 runs, 0 skips
2494 decicycles in ff_vp9_ipred_dc_top_32x32_avx2, 4096 runs, 0 skips
1419 decicycles in ff_vp9_ipred_dc_left_32x32_ssse3, 16384 runs, 0 skips
717 decicycles in ff_vp9_ipred_dc_left_32x32_avx2, 16384 runs, 0 skips
2737 decicycles in ff_vp9_ipred_tm_32x32_avx, 1024 runs, 0 skips
2088 decicycles in ff_vp9_ipred_tm_32x32_avx2, 1024 runs, 0 skips
3090 decicycles in ff_vp9_ipred_v_32x32_avx, 512 runs, 0 skips
2226 decicycles in ff_vp9_ipred_v_32x32_avx2, 512 runs, 0 skips
1565 decicycles in ff_vp9_ipred_h_32x32_avx, 1024 runs, 0 skips
922 decicycles in ff_vp9_ipred_h_32x32_avx2, 1024 runs, 0 skips
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/x86/constants.c | 8 | ||||
-rw-r--r-- | libavcodec/x86/constants.h | 4 | ||||
-rw-r--r-- | libavcodec/x86/vp9dsp_init.c | 16 | ||||
-rw-r--r-- | libavcodec/x86/vp9intrapred.asm | 157 | ||||
-rw-r--r-- | libavutil/x86/asm.h | 1 |
5 files changed, 177 insertions, 9 deletions
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 3bba80bd87..7608bb32e1 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -1,5 +1,5 @@ /* - * MMX/SSE constants used across x86 dsp optimizations. + * MMX/SSE/AVX constants used across x86 dsp optimizations. * * This file is part of FFmpeg. * @@ -47,7 +47,9 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x020 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL }; -DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL, 0x0101010101010101ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL, + 0x0303030303030303ULL, 0x0303030303030303ULL }; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 4bf74ff76c..3ebf171adc 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -44,8 +44,8 @@ extern const uint64_t ff_pw_96; extern const uint64_t ff_pw_128; extern const uint64_t ff_pw_255; -extern const xmm_reg ff_pb_1; -extern const xmm_reg ff_pb_3; +extern const ymm_reg ff_pb_1; +extern const ymm_reg ff_pb_3; extern const xmm_reg ff_pb_80; extern const xmm_reg ff_pb_F8; extern const uint64_t ff_pb_FC; diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c index 3fd274d17f..b04e678118 100644 --- a/libavcodec/x86/vp9dsp_init.c +++ b/libavcodec/x86/vp9dsp_init.c @@ -241,6 +241,13 @@ ipred_funcs(hd, ssse3, avx); ipred_funcs(vl, ssse3, avx); ipred_funcs(vr, ssse3, avx); +ipred_func(32, dc, avx2); +ipred_func(32, dc_left, avx2); +ipred_func(32, dc_top, avx2); +ipred_func(32, v, avx2); +ipred_func(32, h, avx2); +ipred_func(32, tm, avx2); + #undef ipred_funcs #undef ipred_func_set #undef ipred_func @@ -388,6 +395,15 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) init_ipred(TX_32X32, 32, avx); } + if (EXTERNAL_AVX2(cpu_flags)) { + dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2; + dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2; + dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2; + dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2; + dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2; + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2; + } + #undef init_fpel #undef init_subpel1 #undef init_subpel2 diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm index 3faf1c564d..2cab05ab2f 100644 --- a/libavcodec/x86/vp9intrapred.asm +++ b/libavcodec/x86/vp9intrapred.asm @@ -29,10 +29,10 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 -pw_m256: times 8 dw -256 -pw_m255: times 8 dw -255 +pw_m256: times 16 dw -256 +pw_m255: times 16 dw -255 pw_512: times 8 dw 512 pw_1024: times 8 dw 1024 pw_2048: times 8 dw 2048 @@ -72,7 +72,7 @@ pb_3to1_5x0: db 3, 2, 1 times 9 db 0 pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -pb_2: times 16 db 2 +pb_2: times 32 db 2 pb_15: times 16 db 15 cextern pb_1 @@ -180,6 +180,40 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a jg .loop RET +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [lq] + mova m1, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + psadbw m1, m2 + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_512] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a) %macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l) @@ -267,6 +301,37 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a dec cntd jg .loop RET + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a + mova m0, [%2q] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + pxor m2, m2 + psadbw m0, m2 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + movhlps xm1, xm0 + paddw xm0, xm1 + pmulhrsw xm0, [pw_1024] + vpbroadcastb m0, xm0 + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif %endmacro DC_1D_FUNCS top, a @@ -327,6 +392,29 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a jg .loop RET +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a + mova m0, [aq] + DEFINE_ARGS dst, stride, stride3, cnt + lea stride3q, [strideq*3] + mov cntd, 4 +.loop: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec cntd + jg .loop + RET +%endif + ; h INIT_XMM ssse3 @@ -417,6 +505,32 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt H_XMM_FUNCS ssse3 H_XMM_FUNCS avx +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt + mova m5, [pb_1] + mova m6, [pb_2] + mova m7, [pb_3] + pxor m4, m4 + lea stride3q, [strideq*3] + mov cntq, 7 +.loop: + movd xm3, [lq+cntq*4] + vinserti128 m3, m3, xm3, 1 + pshufb m0, m3, m7 + pshufb m1, m3, m6 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m2, m3, m5 + pshufb m3, m4 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + dec cntq + jge .loop + RET +%endif + ; tm INIT_MMX ssse3 @@ -554,6 +668,41 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a TM_XMM_FUNCS ssse3 TM_XMM_FUNCS avx +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a + pxor m3, m3 + pinsrw xm2, [aq-1], 0 + vinserti128 m2, m2, xm2, 1 + mova m0, [aq] + DEFINE_ARGS dst, stride, l, cnt + mova m4, [pw_m256] + mova m5, [pw_m255] + pshufb m2, m4 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + psubw m1, m2 + psubw m0, m2 + mov cntq, 15 +.loop: + pinsrw xm7, [lq+cntq*2], 0 + vinserti128 m7, m7, xm7, 1 + pshufb m3, m7, m5 + pshufb m7, m4 + paddw m2, m3, m0 + paddw m3, m1 + paddw m6, m7, m0 + paddw m7, m1 + packuswb m2, m3 + packuswb m6, m7 + mova [dstq+strideq*0], m2 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + dec cntq + jge .loop + RET +%endif + ; dl %macro LOWPASS 4 ; left [dst], center, right, tmp diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h index 2cecc980a5..616ad6c96f 100644 --- a/libavutil/x86/asm.h +++ b/libavutil/x86/asm.h @@ -25,6 +25,7 @@ #include "config.h" typedef struct xmm_reg { uint64_t a, b; } xmm_reg; +typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg; #if ARCH_X86_64 # define OPSIZE "q" |