aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2014-09-30 22:21:40 -0300
committer James Almer <jamrial@gmail.com> 2014-10-01 13:07:22 -0300
commit acebff8e5dc0789c228b10ffcae2f2eb6c30a91d (patch)
tree 33bebe65a886d5ee908242045e51591d0f052229
parent f2e53808e329c403d1a409f3c35a93854064b6a2 (diff)
download ffmpeg-acebff8e5dc0789c228b10ffcae2f2eb6c30a91d.tar.gz
x86/mpegvideoencdsp: improve ff_pix_sum16_sse2
~15% faster. Also add an mmxext version that takes advantage of the new code, and build it alongside the mmx version only on x86_32.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- libavcodec/x86/mpegvideoencdsp.asm 51
-rw-r--r-- libavcodec/x86/mpegvideoencdsp_init.c 7
2 files changed, 41 insertions, 17 deletions
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 4fe6cfe5a6..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -29,16 +29,16 @@ cextern pw_1
SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-; %1 = number of xmm registers used
-; %2 = number of loops
-; %3 = number of GPRs used
-%macro PIX_SUM16 4
-cglobal pix_sum16, 2, %3, %1
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
movsxdifnidn r1, r1d
- mov r2, %2
-%if cpuflag(xop)
+ mov r2, %1
+%if mmsize == 16
lea r3, [r1*3]
-%else
+%endif
+%if notcpuflag(xop)
pxor m5, m5
%endif
pxor m4, m4
@@ -52,42 +52,59 @@ cglobal pix_sum16, 2, %3, %1
mova m0, [r0]
%if mmsize == 8
mova m1, [r0+8]
-%else
+%if cpuflag(mmxext)
+ mova m2, [r0+r1]
+ mova m3, [r0+r1+8]
+%endif
+%else ; sse2
mova m1, [r0+r1]
+ mova m2, [r0+r1*2]
+ mova m3, [r0+r3]
%endif
+%if cpuflag(mmxext)
+ psadbw m0, m5
+ psadbw m1, m5
+ psadbw m2, m5
+ psadbw m3, m5
+%else ; mmx
punpckhbw m2, m0, m5
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
+%endif ; cpuflag(mmxext)
%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
paddw m4, m3
-%if mmsize == 8
- add r0, r1
+%if cpuflag(mmxext)
+ lea r0, [r0+r1*%3]
%else
- lea r0, [r0+r1*%4]
+ add r0, r1
%endif
dec r2
jne .loop
-%if cpuflag(xop)
+%if mmsize == 16
pshufd m0, m4, q0032
paddd m4, m0
-%else
+%elif notcpuflag(mmxext)
HADDW m4, m5
%endif
movd eax, m4
RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
-PIX_SUM16 0, 16, 3, 0
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16 8, 4, 2
+%endif
INIT_XMM sse2
-PIX_SUM16 6, 8, 3, 2
+PIX_SUM16 4, 4, 4
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
-PIX_SUM16 5, 4, 4, 4
+PIX_SUM16 4, 4, 4
%endif
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index d91b902187..2a4db61511 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -24,6 +24,7 @@
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
@@ -218,11 +219,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
{
int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->pix_sum = ff_pix_sum16_mmx;
c->pix_norm1 = ff_pix_norm1_mmx;
}
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_mmxext;
+ }
+#endif
+
if (EXTERNAL_SSE2(cpu_flags)) {
c->pix_sum = ff_pix_sum16_sse2;
c->pix_norm1 = ff_pix_norm1_sse2;