about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com> 2014-05-29 00:57:39 -0300
committer: Michael Niedermayer <michaelni@gmx.at> 2014-05-29 18:40:23 +0200
commit: 05de4d30111ef16c9721f1bea6ad14795407c7fc (patch)
tree: 37329863deea089c745c5db639fb570f5f64483f
parent: 232959f184e106c08ba196f800f15047e74ce4e8 (diff)
download: ffmpeg-05de4d30111ef16c9721f1bea6ad14795407c7fc.tar.gz
x86/dsputilenc: implement XOP version of pix_sum16
SSE2: 137 cycles
XOP: 87 cycles

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- libavcodec/x86/dsputilenc.asm 29
-rw-r--r-- libavcodec/x86/dsputilenc_mmx.c 5
2 files changed, 29 insertions, 5 deletions
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 263516a43c..2cb70f0a37 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -446,13 +446,24 @@ cglobal diff_pixels, 4, 5, 5
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
-%macro PIX_SUM16 2
-cglobal pix_sum16, 2, 3, %1
+; %3 = number of GPRs used; %4 = line_size multiplier added to pix per loop iteration (unused when mmsize == 8)
+%macro PIX_SUM16 4
+cglobal pix_sum16, 2, %3, %1
movsxdifnidn r1, r1d
mov r2, %2
+%if cpuflag(xop)
+ lea r3, [r1*3]
+%else
pxor m5, m5
+%endif
pxor m4, m4
.loop:
+%if cpuflag(xop)
+ vphaddubq m0, [r0]
+ vphaddubq m1, [r0+r1]
+ vphaddubq m2, [r0+r1*2]
+ vphaddubq m3, [r0+r3]
+%else
mova m0, [r0]
%if mmsize == 8
mova m1, [r0+8]
@@ -463,6 +474,7 @@ cglobal pix_sum16, 2, 3, %1
punpcklbw m0, m5
punpckhbw m3, m1, m5
punpcklbw m1, m5
+%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
@@ -470,19 +482,26 @@ cglobal pix_sum16, 2, 3, %1
%if mmsize == 8
add r0, r1
%else
- lea r0, [r0+r1*2]
+ lea r0, [r0+r1*%4]
%endif
dec r2
jne .loop
+%if cpuflag(xop)
+ pshufd m0, m4, q0032
+ paddd m4, m0
+%else
HADDW m4, m5
+%endif
movd eax, m4
RET
%endmacro
INIT_MMX mmx
-PIX_SUM16 0, 16
+PIX_SUM16 0, 16, 3, 0
INIT_XMM sse2
-PIX_SUM16 6, 8
+PIX_SUM16 6, 8, 3, 2
+INIT_XMM xop
+PIX_SUM16 5, 4, 4, 4
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index efe835fac9..4280f4ba5e 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -39,6 +39,7 @@ void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
int ff_sum_abs_dctelem_mmx(int16_t *block);
@@ -925,5 +926,9 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif
}
+ if (EXTERNAL_XOP(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_xop;
+ }
+
ff_dsputil_init_pix_mmx(c, avctx);
}