diff options
author | Christophe GISQUET <christophe.gisquet@gmail.com> | 2012-01-03 00:22:11 +0100 |
---|---|---|
committer | Janne Grunau <janne-libav@jannau.net> | 2012-01-16 00:41:51 +0100 |
commit | d78062386e425deafe9a08d109cff70b7a2de22c (patch) | |
tree | 273cea0c591a61c47f60e455d4be02da5135f67d /libavcodec/x86 | |
parent | 3eeb7557637e8e48fbc64e844a94775edb496906 (diff) | |
download | ffmpeg-d78062386e425deafe9a08d109cff70b7a2de22c.tar.gz |
rv34: Intra 16x16 handling
Extract processing of intra 16x16 blocks from intra macroblock
processing.
Also implement a function performing inverse transform and block
reconstruction for DC-only blocks in 1 pass instead of 2.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/rv34dsp.asm | 83 | ||||
-rw-r--r-- | libavcodec/x86/rv34dsp_init.c | 14 |
2 files changed, 83 insertions, 14 deletions
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 58f1af0495..c8eeebbfeb 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -35,21 +35,84 @@ SECTION .text
     sar %1, 10
 %endmacro
 
-%macro rv34_idct_dequant4x4_dc 1
-cglobal rv34_idct_dequant4x4_%1_mmx2, 1, 2, 0
+%macro rv34_idct 1
+cglobal rv34_idct_%1_mmx2, 1, 2, 0
     movsx   r1, word [r0]
     IDCT_DC r1
-    movd    mm0, r1
-    pshufw  mm0, mm0, 0
-    movq    [r0+ 0], mm0
-    movq    [r0+16], mm0
-    movq    [r0+32], mm0
-    movq    [r0+48], mm0
+    movd    m0, r1
+    pshufw  m0, m0, 0
+    movq    [r0+ 0], m0
+    movq    [r0+16], m0
+    movq    [r0+32], m0
+    movq    [r0+48], m0
     REP_RET
 %endmacro
 
 INIT_MMX
 %define IDCT_DC IDCT_DC_ROUND
-rv34_idct_dequant4x4_dc dc
+rv34_idct dc
 %define IDCT_DC IDCT_DC_NOROUND
-rv34_idct_dequant4x4_dc dc_noround
+rv34_idct dc_noround
+
+; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+cglobal rv34_idct_dc_add_mmx, 3, 3
+    ; calculate DC
+    IDCT_DC_ROUND r2
+    pxor       m1, m1
+    movd       m0, r2
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklwd  m0, m0
+    punpcklwd  m1, m1
+
+    ; add DC
+    lea        r2, [r0+r1*2]
+    movh       m2, [r0]
+    movh       m3, [r0+r1]
+    movh       m4, [r2]
+    movh       m5, [r2+r1]
+    paddusb    m2, m0
+    paddusb    m3, m0
+    paddusb    m4, m0
+    paddusb    m5, m0
+    psubusb    m2, m1
+    psubusb    m3, m1
+    psubusb    m4, m1
+    psubusb    m5, m1
+    movh       [r0], m2
+    movh       [r0+r1], m3
+    movh       [r2], m4
+    movh       [r2+r1], m5
+    RET
+
+; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+INIT_XMM
+cglobal rv34_idct_dc_add_sse4, 3, 3, 6
+    ; load data
+    IDCT_DC_ROUND r2
+    pxor       m1, m1
+
+    ; calculate DC
+    movd       m0, r2
+    lea        r2, [r0+r1*2]
+    movd       m2, [r0]
+    movd       m3, [r0+r1]
+    pshuflw    m0, m0, 0
+    movd       m4, [r2]
+    movd       m5, [r2+r1]
+    punpcklqdq m0, m0
+    punpckldq  m2, m3
+    punpckldq  m4, m5
+    punpcklbw  m2, m1
+    punpcklbw  m4, m1
+    paddw      m2, m0
+    paddw      m4, m0
+    packuswb   m2, m4
+    movd       [r0], m2
+    pextrd     [r0+r1], m2, 1
+    pextrd     [r2], m2, 2
+    pextrd     [r2+r1], m2, 3
+    RET
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 4317e9b23b..c10ae4ee96 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -24,17 +24,23 @@
 #include "libavcodec/dsputil.h"
 #include "libavcodec/rv34dsp.h"
 
-void ff_rv34_idct_dequant4x4_dc_mmx2(DCTELEM *block);
-void ff_rv34_idct_dequant4x4_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+void ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
 
 av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
 {
 #if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
+    if (mm_flags & AV_CPU_FLAG_MMX)
+        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
     if (mm_flags & AV_CPU_FLAG_MMX2) {
-        c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dequant4x4_dc_mmx2;
-        c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dequant4x4_dc_noround_mmx2;
+        c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dc_mmx2;
+        c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dc_noround_mmx2;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE4)
+        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
 #endif
 }