diff options
author | Diego Biurrun <diego@biurrun.de> | 2012-07-16 02:05:16 +0200 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2012-11-28 16:05:44 +0100 |
commit | 9b15c0a9b38e92565bce98996587e4decdc1e64e (patch) | |
tree | 33d203a54ac64b6b169fdb4b43942e0500cbee3a | |
parent | 1f3f896564501c23b44fcf605567c78ce066b539 (diff) | |
download | ffmpeg-9b15c0a9b38e92565bce98996587e4decdc1e64e.tar.gz |
x86: dsputilenc: port to cpuflags
-rw-r--r-- | libavcodec/x86/dsputilenc.asm | 112 |
1 files changed, 55 insertions, 57 deletions
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index c5ba78a4af..104ec585e8 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -99,35 +99,33 @@ SECTION .text paddusw m0, m1 %endmacro -; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to +; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to ; about 100k on extreme inputs. But that's very unlikely to occur in natural video, ; and it's even more unlikely to not have any alternative mvs/modes with lower cost. -%macro HSUM_MMX 3 - mova %2, %1 - psrlq %1, 32 +%macro HSUM 3 +%if cpuflag(sse2) + movhlps %2, %1 paddusw %1, %2 - mova %2, %1 - psrlq %1, 16 + pshuflw %2, %1, 0xE + paddusw %1, %2 + pshuflw %2, %1, 0x1 paddusw %1, %2 movd %3, %1 -%endmacro - -%macro HSUM_MMXEXT 3 +%elif cpuflag(mmxext) pshufw %2, %1, 0xE paddusw %1, %2 pshufw %2, %1, 0x1 paddusw %1, %2 movd %3, %1 -%endmacro - -%macro HSUM_SSE2 3 - movhlps %2, %1 - paddusw %1, %2 - pshuflw %2, %1, 0xE +%elif cpuflag(mmx) + mova %2, %1 + psrlq %1, 32 paddusw %1, %2 - pshuflw %2, %1, 0x1 + mova %2, %1 + psrlq %1, 16 paddusw %1, %2 movd %3, %1 +%endif %endmacro %macro STORE4 5 @@ -144,30 +142,30 @@ SECTION .text mova %5, [%1+mmsize*3] %endmacro -%macro hadamard8_16_wrapper 3 -cglobal hadamard8_diff_%1, 4, 4, %2 +%macro hadamard8_16_wrapper 2 +cglobal hadamard8_diff, 4, 4, %1 %ifndef m8 - %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) + %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) SUB rsp, pad %endif - call hadamard8x8_diff_%1 + call hadamard8x8_diff %+ SUFFIX %ifndef m8 ADD rsp, pad %endif RET -cglobal hadamard8_diff16_%1, 5, 6, %2 +cglobal hadamard8_diff16, 5, 6, %1 %ifndef m8 - %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) + %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) SUB rsp, pad %endif - call hadamard8x8_diff_%1 + call hadamard8x8_diff %+ SUFFIX mov r5d, eax add r1, 8 add r2, 8 - call hadamard8x8_diff_%1 + call hadamard8x8_diff %+ SUFFIX add r5d, eax cmp r4d, 16 @@ -175,12 +173,12 @@ cglobal hadamard8_diff16_%1, 5, 6, %2 lea r1, [r1+r3*8-8] lea r2, [r2+r3*8-8] - call hadamard8x8_diff_%1 + call hadamard8x8_diff %+ SUFFIX add r5d, eax add r1, 8 add r2, 8 - call hadamard8x8_diff_%1 + call hadamard8x8_diff %+ SUFFIX add r5d, eax .done: @@ -191,7 +189,25 @@ cglobal hadamard8_diff16_%1, 5, 6, %2 RET %endmacro -%macro HADAMARD8_DIFF_MMX 1 +%macro HADAMARD8_DIFF 0-1 +%if cpuflag(sse2) +hadamard8x8_diff %+ SUFFIX: + lea r0, [r3*3] + DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize + HADAMARD8 +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] +%endif + HADAMARD8 + ABS_SUM_8x8 rsp+gprsize + HSUM m0, m1, eax + and eax, 0xFFFF + ret + +hadamard8_16_wrapper %1, 3 +%elif cpuflag(mmx) ALIGN 16 ; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, ; int stride, int h) @@ -199,7 +215,7 @@ ALIGN 16 ; note how r1, r2 and r3 are not clobbered in this function, so 16x16 ; can simply call this 2x2x (and that's why we access rsp+gprsize ; everywhere, which is rsp of calling func -hadamard8x8_diff_%1: +hadamard8x8_diff %+ SUFFIX: lea r0, [r3*3] ; first 4x8 pixels @@ -236,53 +252,35 @@ hadamard8x8_diff_%1: and rax, 0xFFFF ret -hadamard8_16_wrapper %1, 0, 14 -%endmacro - -%macro HADAMARD8_DIFF_SSE2 2 -hadamard8x8_diff_%1: - lea r0, [r3*3] - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize - HADAMARD8 -%if ARCH_X86_64 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 -%else - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] +hadamard8_16_wrapper 0, 14 %endif - HADAMARD8 - ABS_SUM_8x8 rsp+gprsize - HSUM_SSE2 m0, m1, eax - and eax, 0xFFFF - ret - -hadamard8_16_wrapper %1, %2, 3 %endmacro -INIT_MMX +INIT_MMX mmx %define ABS1 ABS1_MMX -%define HSUM HSUM_MMX -HADAMARD8_DIFF_MMX mmx +HADAMARD8_DIFF +INIT_MMX mmxext %define ABS1 ABS1_MMXEXT -%define HSUM HSUM_MMXEXT -HADAMARD8_DIFF_MMX mmxext +HADAMARD8_DIFF -INIT_XMM +INIT_XMM sse2 %define ABS2 ABS2_MMXEXT %if ARCH_X86_64 %define ABS_SUM_8x8 ABS_SUM_8x8_64 %else %define ABS_SUM_8x8 ABS_SUM_8x8_32 %endif -HADAMARD8_DIFF_SSE2 sse2, 10 +HADAMARD8_DIFF 10 +INIT_XMM ssse3 %define ABS2 ABS2_SSSE3 %define ABS_SUM_8x8 ABS_SUM_8x8_64 -HADAMARD8_DIFF_SSE2 ssse3, 9 +HADAMARD8_DIFF 9 -INIT_XMM +INIT_XMM sse2 ; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) -cglobal sse16_sse2, 5, 5, 8 +cglobal sse16, 5, 5, 8 shr r4d, 1 pxor m0, m0 ; mm0 = 0 pxor m7, m7 ; mm7 holds the sum |