aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: Christophe GISQUET <christophe.gisquet@gmail.com>, 2012-01-03 00:22:11 +0100
committer: Janne Grunau <janne-libav@jannau.net>, 2012-01-16 00:41:51 +0100
commit: d78062386e425deafe9a08d109cff70b7a2de22c (patch)
tree: 273cea0c591a61c47f60e455d4be02da5135f67d /libavcodec/x86
parent: 3eeb7557637e8e48fbc64e844a94775edb496906 (diff)
download: ffmpeg-d78062386e425deafe9a08d109cff70b7a2de22c.tar.gz
rv34: Intra 16x16 handling
Extract processing of intra 16x16 blocks from intra macroblock processing. Also implement a function performing inverse transform and block reconstruction for DC-only blocks in 1 pass instead of 2.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/rv34dsp.asm | 83
-rw-r--r-- libavcodec/x86/rv34dsp_init.c | 14
2 files changed, 83 insertions, 14 deletions
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 58f1af0495..c8eeebbfeb 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -35,21 +35,84 @@ SECTION .text
sar %1, 10
%endmacro
-%macro rv34_idct_dequant4x4_dc 1
-cglobal rv34_idct_dequant4x4_%1_mmx2, 1, 2, 0
+; rv34_idct <suffix>: emits the function rv34_idct_<suffix>_mmx2, which takes
+; one argument (r0) and uses r1 as scratch. The function reads the signed
+; word at [r0], passes it through whichever macro IDCT_DC is %defined to at
+; expansion time, and stores the result as four identical words at each of
+; the byte offsets 0, 16, 32 and 48 from r0.
+%macro rv34_idct 1
+cglobal rv34_idct_%1_mmx2, 1, 2, 0
movsx r1, word [r0]
IDCT_DC r1
- movd mm0, r1
- pshufw mm0, mm0, 0
- movq [r0+ 0], mm0
- movq [r0+16], mm0
- movq [r0+32], mm0
- movq [r0+48], mm0
+ movd m0, r1 ; scaled value into the low lane of m0
+ pshufw m0, m0, 0 ; replicate word 0 into all four words
+ movq [r0+ 0], m0 ; four words at byte offset 0
+ movq [r0+16], m0 ; four words at byte offset 16
+ movq [r0+32], m0 ; four words at byte offset 32
+ movq [r0+48], m0 ; four words at byte offset 48
REP_RET
%endmacro
INIT_MMX
%define IDCT_DC IDCT_DC_ROUND
-rv34_idct_dequant4x4_dc dc
+; expanded with IDCT_DC = IDCT_DC_ROUND -> rv34_idct_dc_mmx2
+rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
-rv34_idct_dequant4x4_dc dc_noround
+; expanded with IDCT_DC = IDCT_DC_NOROUND -> rv34_idct_dc_noround_mmx2
+rv34_idct dc_noround
+
+;-----------------------------------------------------------------------
+; void ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+;
+; Scales dc with IDCT_DC_ROUND and adds the result, with unsigned byte
+; saturation, to four rows of pixels at dst, dst+stride, dst+2*stride
+; and dst+3*stride. Each row access uses movh (presumably a 4-byte movd
+; under INIT_MMX -- confirm against x86inc).
+;
+; In:    r0 = dst, r1 = stride, r2 = dc
+; Clobb: r1 (sign-extended in place), r2, m0-m5
+;-----------------------------------------------------------------------
+cglobal rv34_idct_dc_add_mmx, 3, 3
+    ; stride is declared as a 32-bit int, but r1 is used at full register
+    ; width in the address computations below. Widen it first; the macro
+    ; emits nothing when r1 and r1d name the same register.
+    movsxdifnidn r1, r1d
+
+    ; calculate DC
+    IDCT_DC_ROUND r2
+    pxor       m1, m1
+    movd       m0, r2
+    psubw      m1, m0               ; word 0 of m1 = -dc
+    packuswb   m0, m0               ; byte 0 = dc clamped to 0..255 (0 if dc < 0)
+    packuswb   m1, m1               ; byte 0 = -dc clamped to 0..255 (0 if dc > 0)
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklwd  m0, m0               ; low 4 bytes of m0 = positive part of dc
+    punpcklwd  m1, m1               ; low 4 bytes of m1 = negative part of dc
+
+    ; add DC
+    lea        r2, [r0+r1*2]        ; r2 = address of the third row
+    movh       m2, [r0]
+    movh       m3, [r0+r1]
+    movh       m4, [r2]
+    movh       m5, [r2+r1]
+    ; one of m0/m1 is always zero in the bytes that matter, so the
+    ; saturating add followed by the saturating subtract applies a signed
+    ; offset while clamping each pixel to 0..255
+    paddusb    m2, m0
+    paddusb    m3, m0
+    paddusb    m4, m0
+    paddusb    m5, m0
+    psubusb    m2, m1
+    psubusb    m3, m1
+    psubusb    m4, m1
+    psubusb    m5, m1
+    movh       [r0], m2
+    movh       [r0+r1], m3
+    movh       [r2], m4
+    movh       [r2+r1], m5
+    RET
+
+;-----------------------------------------------------------------------
+; void ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+;
+; Scales dc with IDCT_DC_ROUND and adds the result to four rows of four
+; bytes at dst, dst+stride, dst+2*stride and dst+3*stride, clamping
+; every sum to 0..255.
+;
+; In:    r0 = dst, r1 = stride, r2 = dc
+; Clobb: r1 (sign-extended in place), r2, m0-m5
+;-----------------------------------------------------------------------
+INIT_XMM
+cglobal rv34_idct_dc_add_sse4, 3, 3, 6
+    ; stride is declared as a 32-bit int, but r1 is used at full register
+    ; width in the address computations below. Widen it first; the macro
+    ; emits nothing when r1 and r1d name the same register.
+    movsxdifnidn r1, r1d
+
+    ; calculate DC
+    IDCT_DC_ROUND r2
+    pxor       m1, m1               ; zero, used to widen bytes to words
+
+    ; broadcast DC and load the four rows
+    movd       m0, r2
+    lea        r2, [r0+r1*2]        ; r2 = address of the third row
+    movd       m2, [r0]
+    movd       m3, [r0+r1]
+    pshuflw    m0, m0, 0            ; DC in the four low words
+    movd       m4, [r2]
+    movd       m5, [r2+r1]
+    punpcklqdq m0, m0               ; DC in all eight words
+    punpckldq  m2, m3               ; m2 = rows 0 and 1 (8 bytes)
+    punpckldq  m4, m5               ; m4 = rows 2 and 3 (8 bytes)
+    punpcklbw  m2, m1               ; widen rows 0-1 to words
+    punpcklbw  m4, m1               ; widen rows 2-3 to words
+
+    ; add DC, clamp and store
+    paddw      m2, m0
+    paddw      m4, m0
+    packuswb   m2, m4               ; dwords 0..3 of m2 = rows 0..3, clamped to 0..255
+    movd       [r0], m2
+    pextrd     [r0+r1], m2, 1
+    pextrd     [r2], m2, 2
+    pextrd     [r2+r1], m2, 3
+    RET
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 4317e9b23b..c10ae4ee96 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -24,17 +24,23 @@
#include "libavcodec/dsputil.h"
#include "libavcodec/rv34dsp.h"
-void ff_rv34_idct_dequant4x4_dc_mmx2(DCTELEM *block);
-void ff_rv34_idct_dequant4x4_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+void ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+/* Installs the x86 assembly routines into c according to the flags returned
+ * by av_get_cpu_flags(). The whole body is compiled only when HAVE_YASM is
+ * set. The dsp argument is not referenced here. */
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
{
#if HAVE_YASM
int mm_flags = av_get_cpu_flags();
+ /* baseline DC-add routine; replaced below when SSE4 is also reported */
+ if (mm_flags & AV_CPU_FLAG_MMX)
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (mm_flags & AV_CPU_FLAG_MMX2) {
- c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dequant4x4_dc_mmx2;
- c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dequant4x4_dc_noround_mmx2;
+ /* slot 0 gets the plain variant, slot 1 the "noround" variant */
+ c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dc_mmx2;
+ c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dc_noround_mmx2;
}
+ /* tested last so this assignment wins over the MMX one above */
+ if (mm_flags & AV_CPU_FLAG_SSE4)
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
#endif
}