diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2012-07-26 21:36:03 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2012-07-26 21:37:15 +0200 |
commit | 7333798c85837f1cf175f39bc4acb5664fa6cacc (patch) | |
tree | 60036638a0962b3cb966d62da2eda81f93ac3267 /libavcodec/x86/vp3dsp.asm | |
parent | 307a20cca216356aec30f5bb102c633169cbc0c1 (diff) | |
parent | 44dc9c6af0377faf2a99889d1f949e32a1102e84 (diff) | |
download | ffmpeg-7333798c85837f1cf175f39bc4acb5664fa6cacc.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master:
libopenjpeg: support YUV and deep RGB pixel formats
Fix typo in v410 decoder.
vf_yadif: unset cur_buf on the input link.
vf_overlay: ensure the overlay frame does not get leaked.
vf_overlay: prevent premature freeing of cur_buf
Support urlencoded http authentication credentials
rtmp: Return an error when the client bandwidth is incorrect
rtmp: Return proper error code in handle_server_bw
rtmp: Return proper error code in handle_client_bw
rtmp: Return proper error codes in handle_chunk_size
lavr: x86: add missing vzeroupper in ff_mix_1_to_2_fltp_flt()
vp8: Replace x*155/100 by x*101581>>16.
vp3: don't use calls to inline asm in yasm code.
x86/dsputil: put inline asm under HAVE_INLINE_ASM.
dsputil_mmx: fix incorrect assembly code
rtmp: Factorize the code by adding handle_invoke
rtmp: Factorize the code by adding handle_chunk_size
rtmp: Factorize the code by adding handle_ping
rtmp: Factorize the code by adding handle_client_bw
rtmp: Factorize the code by adding handle_server_bw
Conflicts:
libavcodec/libopenjpegdec.c
libavcodec/x86/dsputil_mmx.c
libavfilter/vf_overlay.c
libavformat/Makefile
libavformat/version.h
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/vp3dsp.asm')
-rw-r--r-- | libavcodec/x86/vp3dsp.asm | 120 |
1 file changed, 79 insertions(+), 41 deletions(-)
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 0e0bd29a99..46bd9d8f86 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -38,13 +38,11 @@ cextern pb_1 cextern pb_3 cextern pb_7 cextern pb_1F +cextern pb_80 cextern pb_81 cextern pw_8 -cextern put_signed_pixels_clamped_mmx -cextern add_pixels_clamped_mmx - SECTION .text ; this is off by one or two for some cases when filter_limit is greater than 63 @@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 %endmacro -%macro vp3_idct_funcs 3 -cglobal vp3_idct_put_%1, 3, %3, %2 +%macro vp3_idct_funcs 1 +cglobal vp3_idct_put_%1, 3, 4, 9 VP3_IDCT_%1 r2 -%if ARCH_X86_64 - mov r3, r2 - mov r2, r1 - mov r1, r0 - mov r0, r3 + + movsxdifnidn r1, r1d + mova m4, [pb_80] + lea r3, [r1*3] +%assign %%i 0 +%rep 16/mmsize + mova m0, [r2+mmsize*0+%%i] + mova m1, [r2+mmsize*2+%%i] + mova m2, [r2+mmsize*4+%%i] + mova m3, [r2+mmsize*6+%%i] + packsswb m0, [r2+mmsize*1+%%i] + packsswb m1, [r2+mmsize*3+%%i] + packsswb m2, [r2+mmsize*5+%%i] + packsswb m3, [r2+mmsize*7+%%i] + paddb m0, m4 + paddb m1, m4 + paddb m2, m4 + paddb m3, m4 + movq [r0 ], m0 +%if mmsize == 8 + movq [r0+r1 ], m1 + movq [r0+r1*2], m2 + movq [r0+r3 ], m3 %else - mov r0m, r2 - mov r1m, r0 - mov r2m, r1 + movhps [r0+r1 ], m0 + movq [r0+r1*2], m1 + movhps [r0+r3 ], m1 %endif -%if WIN64 - call put_signed_pixels_clamped_mmx - RET -%else - jmp put_signed_pixels_clamped_mmx +%if %%i == 0 + lea r0, [r0+r1*4] +%endif +%if mmsize == 16 + movq [r0 ], m2 + movhps [r0+r1 ], m2 + movq [r0+r1*2], m3 + movhps [r0+r3 ], m3 %endif +%assign %%i %%i+64 +%endrep + RET -cglobal vp3_idct_add_%1, 3, %3, %2 +cglobal vp3_idct_add_%1, 3, 4, 9 VP3_IDCT_%1 r2 -%if ARCH_X86_64 - mov r3, r2 - mov r2, r1 - mov r1, r0 - mov r0, r3 -%else - mov r0m, r2 - mov r1m, r0 - mov r2m, r1 + + mov r3, 4 + pxor m4, m4 + movsxdifnidn r1, r1d +.loop: + movq m0, [r0] + movq m1, [r0+r1] +%if mmsize == 8 + mova m2, m0 + mova m3, m1 %endif 
-%if WIN64 - call add_pixels_clamped_mmx - RET -%else - jmp add_pixels_clamped_mmx + punpcklbw m0, m4 + punpcklbw m1, m4 +%if mmsize == 8 + punpckhbw m2, m4 + punpckhbw m3, m4 +%endif + paddsw m0, [r2+ 0] + paddsw m1, [r2+16] +%if mmsize == 8 + paddsw m2, [r2+ 8] + paddsw m3, [r2+24] + packuswb m0, m2 + packuswb m1, m3 +%else ; mmsize == 16 + packuswb m0, m1 %endif + movq [r0 ], m0 +%if mmsize == 8 + movq [r0+r1], m1 +%else ; mmsize == 16 + movhps [r0+r1], m0 +%endif + lea r0, [r0+r1*2] + add r2, 32 + dec r3 + jg .loop + RET %endmacro -%if ARCH_X86_64 -%define REGS 4 -%else -%define REGS 3 -%endif INIT_MMX -vp3_idct_funcs mmx, 0, REGS +vp3_idct_funcs mmx INIT_XMM -vp3_idct_funcs sse2, 9, REGS -%undef REGS +vp3_idct_funcs sse2 %macro DC_ADD 0 movq m2, [r0 ] |