aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Christophe Gisquet <christophe.gisquet@gmail.com> 2015-03-10 23:11:54 +0000
committer Michael Niedermayer <michaelni@gmx.at> 2015-03-14 13:36:47 +0100
commit 15ce160183c61fbd98915e07879f1225fcf1c080 (patch)
tree 2ab1d5b2bb2266bec1483475b19d42d8d04e17da
parent decd5193e1d409a4d8dd4c55ca46467d51c86976 (diff)
download ffmpeg-15ce160183c61fbd98915e07879f1225fcf1c080.tar.gz
x86: xvid_idct: SSE2 merged add version
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- libavcodec/x86/xvididct.asm 92
-rw-r--r-- libavcodec/x86/xvididct_init.c 9
2 files changed, 91 insertions, 10 deletions
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 58ffb11806..0220885da6 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -384,6 +384,12 @@ SECTION .text
; Must now load args as gprs are no longer used for masks
; DEST is set to where address of dest was loaded
%if ARCH_X86_32
+ %if %2 == 2 ; Not enough xmms, store
+ movdqa [%1+1*16], TAN3
+ movdqa [%1+2*16], xmm3
+ movdqa [%1+5*16], REG0
+ movdqa [%1+6*16], xmm5
+ %endif
%xdefine DEST r2q ; BLOCK is r0, stride r1
movifnidn DEST, destm
movifnidn strideq, stridem
@@ -397,8 +403,6 @@ SECTION .text
movq [DEST + strideq], TAN3
movhps [DEST + 2*strideq], TAN3
; REG0 and TAN3 are now available (and likely used in second half)
- %else
- %warning Unimplemented
%endif
%endif
%endmacro
@@ -427,7 +431,88 @@ SECTION .text
movq [DEST + 2*strideq], xmm5
movhps [DEST + strideq], xmm5
%elif %2 == 2
-%warning Unimplemented
+ pxor xmm0, xmm0
+ %if ARCH_X86_32
+ ; free: m3 REG0=m4 m5
+ ; input: m1, m7, m2, m6
+ movq xmm3, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ paddsw xmm3, %3
+ paddsw xmm4, [%1 + 1*16]
+ movq %3, [DEST+2*strideq]
+ movq xmm5, [DEST+ r3q]
+ punpcklbw %3, xmm0
+ punpcklbw xmm5, xmm0
+ paddsw %3, [%1 + 2*16]
+ paddsw xmm5, %5
+ packuswb xmm3, xmm4
+ packuswb %3, xmm5
+ movq [DEST+0*strideq], xmm3
+ movhps [DEST+1*strideq], xmm3
+ movq [DEST+2*strideq], %3
+ movhps [DEST+ r3q], %3
+ lea DEST, [DEST+4*strideq]
+ movq xmm3, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq %3, [DEST+2*strideq]
+ movq xmm5, [DEST+ r3q]
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw %3, xmm0
+ punpcklbw xmm5, xmm0
+ paddsw xmm3, %6
+ paddsw xmm4, [%1 + 5*16]
+ paddsw %3, [%1 + 6*16]
+ paddsw xmm5, %4
+ packuswb xmm3, xmm4
+ packuswb %3, xmm5
+ movq [DEST+0*strideq], xmm3
+ movhps [DEST+1*strideq], xmm3
+ movq [DEST+2*strideq], %3
+ movhps [DEST+ r3q], %3
+ %else
+ ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
+ ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+ movq xmm2, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq xmm12, [DEST+2*strideq]
+ movq xmm11, [DEST+ r3q]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm12, xmm0
+ punpcklbw xmm11, xmm0
+ paddsw xmm2, %3
+ paddsw xmm4, TAN3
+ paddsw xmm12, xmm3
+ paddsw xmm11, %5
+ packuswb xmm2, xmm4
+ packuswb xmm12, xmm11
+ movq [DEST+0*strideq], xmm2
+ movhps [DEST+1*strideq], xmm2
+ movq [DEST+2*strideq], xmm12
+ movhps [DEST+ r3q], xmm12
+ lea DEST, [DEST+4*strideq]
+ movq xmm2, [DEST+0*strideq]
+ movq xmm4, [DEST+1*strideq]
+ movq xmm12, [DEST+2*strideq]
+ movq xmm11, [DEST+ r3q]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm12, xmm0
+ punpcklbw xmm11, xmm0
+ paddsw xmm2, %6
+ paddsw xmm4, REG0
+ paddsw xmm12, xmm5
+ paddsw xmm11, %4
+ packuswb xmm2, xmm4
+ packuswb xmm12, xmm11
+ movq [DEST+0*strideq], xmm2
+ movhps [DEST+1*strideq], xmm2
+ movq [DEST+2*strideq], xmm12
+ movhps [DEST+ r3q], xmm12
+ %endif
%endif
%endmacro
@@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
+IDCT_SSE2 2
%if ARCH_X86_32
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index 2530d7aad2..57f6ed6dc3 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -27,12 +27,7 @@
#include "xvididct.h"
void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
-
-static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
- ff_xvid_idct_sse2(block);
- ff_add_pixels_clamped(block, dest, line_size);
-}
+void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);
#if ARCH_X86_32
static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
@@ -88,7 +83,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_xvid_idct_put_sse2;
- c->idct_add = xvid_idct_sse2_add;
+ c->idct_add = ff_xvid_idct_add_sse2;
c->idct = ff_xvid_idct_sse2;
c->perm_type = FF_IDCT_PERM_SSE2;
}