diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-01-03 00:22:11 +0100 |
---|---|---|
committer | Janne Grunau <janne-libav@jannau.net> | 2012-01-16 00:41:51 +0100 |
commit | d78062386e425deafe9a08d109cff70b7a2de22c (patch) | |
tree | 273cea0c591a61c47f60e455d4be02da5135f67d /libavcodec/x86/rv34dsp.asm | |
parent | 3eeb7557637e8e48fbc64e844a94775edb496906 (diff) | |
download | ffmpeg-d78062386e425deafe9a08d109cff70b7a2de22c.tar.gz |
rv34: Intra 16x16 handling
Extract processing of intra 16x16 blocks from intra macroblock
processing.
Also implement a function performing inverse transform and block
reconstruction for DC-only blocks in 1 pass instead of 2.
Diffstat (limited to 'libavcodec/x86/rv34dsp.asm')
-rw-r--r-- | libavcodec/x86/rv34dsp.asm | 83 |
1 files changed, 73 insertions, 10 deletions
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 58f1af0495..c8eeebbfeb 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -35,21 +35,84 @@ SECTION .text
     sar %1, 10
 %endmacro
 
-%macro rv34_idct_dequant4x4_dc 1
-cglobal rv34_idct_dequant4x4_%1_mmx2, 1, 2, 0
+%macro rv34_idct 1
+cglobal rv34_idct_%1_mmx2, 1, 2, 0
     movsx r1, word [r0]
     IDCT_DC r1
-    movd mm0, r1
-    pshufw mm0, mm0, 0
-    movq [r0+ 0], mm0
-    movq [r0+16], mm0
-    movq [r0+32], mm0
-    movq [r0+48], mm0
+    movd m0, r1
+    pshufw m0, m0, 0
+    movq [r0+ 0], m0
+    movq [r0+16], m0
+    movq [r0+32], m0
+    movq [r0+48], m0
     REP_RET
 %endmacro
 
 INIT_MMX
 %define IDCT_DC IDCT_DC_ROUND
-rv34_idct_dequant4x4_dc dc
+rv34_idct dc
 %define IDCT_DC IDCT_DC_NOROUND
-rv34_idct_dequant4x4_dc dc_noround
+rv34_idct dc_noround
+
+; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+cglobal rv34_idct_dc_add_mmx, 3, 3
+    ; calculate DC
+    IDCT_DC_ROUND r2
+    pxor m1, m1
+    movd m0, r2
+    psubw m1, m0
+    packuswb m0, m0
+    packuswb m1, m1
+    punpcklbw m0, m0
+    punpcklbw m1, m1
+    punpcklwd m0, m0
+    punpcklwd m1, m1
+
+    ; add DC
+    lea r2, [r0+r1*2]
+    movh m2, [r0]
+    movh m3, [r0+r1]
+    movh m4, [r2]
+    movh m5, [r2+r1]
+    paddusb m2, m0
+    paddusb m3, m0
+    paddusb m4, m0
+    paddusb m5, m0
+    psubusb m2, m1
+    psubusb m3, m1
+    psubusb m4, m1
+    psubusb m5, m1
+    movh [r0], m2
+    movh [r0+r1], m3
+    movh [r2], m4
+    movh [r2+r1], m5
+    RET
+
+; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+INIT_XMM
+cglobal rv34_idct_dc_add_sse4, 3, 3, 6
+    ; load data
+    IDCT_DC_ROUND r2
+    pxor m1, m1
+
+    ; calculate DC
+    movd m0, r2
+    lea r2, [r0+r1*2]
+    movd m2, [r0]
+    movd m3, [r0+r1]
+    pshuflw m0, m0, 0
+    movd m4, [r2]
+    movd m5, [r2+r1]
+    punpcklqdq m0, m0
+    punpckldq m2, m3
+    punpckldq m4, m5
+    punpcklbw m2, m1
+    punpcklbw m4, m1
+    paddw m2, m0
+    paddw m4, m0
+    packuswb m2, m4
+    movd [r0], m2
+    pextrd [r0+r1], m2, 1
+    pextrd [r2], m2, 2
+    pextrd [r2+r1], m2, 3
+    RET