diff options
author | James Almer <jamrial@gmail.com> | 2014-05-29 00:57:39 -0300 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-05-29 18:40:23 +0200 |
commit | 05de4d30111ef16c9721f1bea6ad14795407c7fc (patch) | |
tree | 37329863deea089c745c5db639fb570f5f64483f | |
parent | 232959f184e106c08ba196f800f15047e74ce4e8 (diff) | |
download | ffmpeg-05de4d30111ef16c9721f1bea6ad14795407c7fc.tar.gz |
x86/dsputilenc: implement XOP version of pix_sum16

SSE2: 137 cycles
XOP: 87 cycles

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- | libavcodec/x86/dsputilenc.asm | 29 | ||||
-rw-r--r-- | libavcodec/x86/dsputilenc_mmx.c | 5 |
2 files changed, 29 insertions, 5 deletions
```diff
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 263516a43c..2cb70f0a37 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -446,13 +446,24 @@ cglobal diff_pixels, 4, 5, 5
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
 ; %1 = number of xmm registers used
 ; %2 = number of loops
-%macro PIX_SUM16 2
-cglobal pix_sum16, 2, 3, %1
+; %3 = number of GPRs used
+%macro PIX_SUM16 4
+cglobal pix_sum16, 2, %3, %1
     movsxdifnidn r1, r1d
     mov r2, %2
+%if cpuflag(xop)
+    lea r3, [r1*3]
+%else
     pxor m5, m5
+%endif
     pxor m4, m4
 .loop:
+%if cpuflag(xop)
+    vphaddubq m0, [r0]
+    vphaddubq m1, [r0+r1]
+    vphaddubq m2, [r0+r1*2]
+    vphaddubq m3, [r0+r3]
+%else
     mova m0, [r0]
 %if mmsize == 8
     mova m1, [r0+8]
@@ -463,6 +474,7 @@ cglobal pix_sum16, 2, 3, %1
     punpcklbw m0, m5
     punpckhbw m3, m1, m5
     punpcklbw m1, m5
+%endif ; cpuflag(xop)
     paddw m1, m0
     paddw m3, m2
     paddw m3, m1
@@ -470,19 +482,26 @@ cglobal pix_sum16, 2, 3, %1
 %if mmsize == 8
     add r0, r1
 %else
-    lea r0, [r0+r1*2]
+    lea r0, [r0+r1*%4]
 %endif
     dec r2
     jne .loop
+%if cpuflag(xop)
+    pshufd m0, m4, q0032
+    paddd m4, m0
+%else
     HADDW m4, m5
+%endif
     movd eax, m4
     RET
 %endmacro
 
 INIT_MMX mmx
-PIX_SUM16 0, 16
+PIX_SUM16 0, 16, 3, 0
 INIT_XMM sse2
-PIX_SUM16 6, 8
+PIX_SUM16 6, 8, 3, 2
+INIT_XMM xop
+PIX_SUM16 5, 4, 4, 4
 
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
 ; %1 = number of xmm registers used
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index efe835fac9..4280f4ba5e 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -39,6 +39,7 @@ void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                          int stride);
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
 int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 int ff_sum_abs_dctelem_mmx(int16_t *block);
@@ -925,5 +926,9 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
 #endif
     }
 
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->pix_sum = ff_pix_sum16_xop;
+    }
+
     ff_dsputil_init_pix_mmx(c, avctx);
 }
```