diff options
author | James Almer <jamrial@gmail.com> | 2014-09-30 22:21:40 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-10-01 13:07:22 -0300 |
commit | acebff8e5dc0789c228b10ffcae2f2eb6c30a91d (patch) | |
tree | 33bebe65a886d5ee908242045e51591d0f052229 | |
parent | f2e53808e329c403d1a409f3c35a93854064b6a2 (diff) | |
download | ffmpeg-acebff8e5dc0789c228b10ffcae2f2eb6c30a91d.tar.gz |
x86/mpegvideoencdsp: improve ff_pix_sum16_sse2
~15% faster.
Also add an mmxext version that takes advantage of the new code, and
build it alongside the mmx version only on x86_32.
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/mpegvideoencdsp.asm | 51 | ||||
-rw-r--r-- | libavcodec/x86/mpegvideoencdsp_init.c | 7 |
2 files changed, 41 insertions, 17 deletions
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 4fe6cfe5a6..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -29,16 +29,16 @@ cextern pw_1
 
 SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-; %1 = number of xmm registers used
-; %2 = number of loops
-; %3 = number of GPRs used
-%macro PIX_SUM16 4
-cglobal pix_sum16, 2, %3, %1
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov r2, %2
-%if cpuflag(xop)
+    mov r2, %1
+%if mmsize == 16
     lea r3, [r1*3]
-%else
+%endif
+%if notcpuflag(xop)
     pxor m5, m5
 %endif
     pxor m4, m4
@@ -52,42 +52,59 @@ cglobal pix_sum16, 2, %3, %1
     mova m0, [r0]
 %if mmsize == 8
     mova m1, [r0+8]
-%else
+%if cpuflag(mmxext)
+    mova m2, [r0+r1]
+    mova m3, [r0+r1+8]
+%endif
+%else ; sse2
     mova m1, [r0+r1]
+    mova m2, [r0+r1*2]
+    mova m3, [r0+r3]
 %endif
+%if cpuflag(mmxext)
+    psadbw m0, m5
+    psadbw m1, m5
+    psadbw m2, m5
+    psadbw m3, m5
+%else ; mmx
     punpckhbw m2, m0, m5
     punpcklbw m0, m5
     punpckhbw m3, m1, m5
     punpcklbw m1, m5
+%endif ; cpuflag(mmxext)
 %endif ; cpuflag(xop)
     paddw m1, m0
     paddw m3, m2
     paddw m3, m1
     paddw m4, m3
-%if mmsize == 8
-    add r0, r1
+%if cpuflag(mmxext)
+    lea r0, [r0+r1*%3]
 %else
-    lea r0, [r0+r1*%4]
+    add r0, r1
 %endif
     dec r2
     jne .loop
-%if cpuflag(xop)
+%if mmsize == 16
     pshufd m0, m4, q0032
     paddd m4, m0
-%else
+%elif notcpuflag(mmxext)
     HADDW m4, m5
 %endif
     movd eax, m4
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
-PIX_SUM16 0, 16, 3, 0
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16 8, 4, 2
+%endif
 INIT_XMM sse2
-PIX_SUM16 6, 8, 3, 2
+PIX_SUM16 4, 4, 4
 %if HAVE_XOP_EXTERNAL
 INIT_XMM xop
-PIX_SUM16 5, 4, 4, 4
+PIX_SUM16 4, 4, 4
 %endif
 
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index d91b902187..2a4db61511 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -24,6 +24,7 @@
 #include "libavcodec/mpegvideoencdsp.h"
 
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
 int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
 int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
@@ -218,11 +219,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->pix_sum = ff_pix_sum16_mmx;
         c->pix_norm1 = ff_pix_norm1_mmx;
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->pix_sum = ff_pix_sum16_mmxext;
+    }
+#endif
+
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->pix_sum = ff_pix_sum16_sse2;
         c->pix_norm1 = ff_pix_norm1_sse2;