diff options
author | Jason Garrett-Glaser <darkshikari@gmail.com> | 2011-01-14 21:34:25 +0000 |
---|---|---|
committer | Jason Garrett-Glaser <darkshikari@gmail.com> | 2011-01-14 21:34:25 +0000 |
commit | 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b (patch) | |
tree | 220be84d79d9c771c1afeab43fdd2aaa82fea01d /libavcodec/x86 | |
parent | 6c18f1cda2e2b2471ebf75d30d552cb0cb61b6ad (diff) | |
download | ffmpeg-19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b.tar.gz |
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed.
NOTE: the way the asm code handles large qmuls is a bit suboptimal.
If x264-style dequant were used (separate shift and qmul values), it might
be possible to get some extra speed.
Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c | 1 | ||||
-rw-r--r-- | libavcodec/x86/h264_idct.asm | 154 | ||||
-rw-r--r-- | libavcodec/x86/h264dsp_mmx.c | 4 |
3 files changed, 159 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 909ec414e7..375a4c5e09 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = {0x8000000080000000ULL, 0x8000000080000000ULL}; +DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 9c154f80b3..fdb35003a8 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 %endif cextern pw_32 +cextern pw_1 SECTION .text @@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8 add8_sse2_cycle 2, 0x21 add8_sse2_cycle 3, 0x29 RET + +;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul) + +%macro WALSH4_1D 5 + SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 + SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 + SWAP %1, %4, %3 +%endmacro + +%macro DEQUANT_MMX 3 + mova m7, [pw_1] + mova m4, %1 + punpcklwd %1, m7 + punpckhwd m4, m7 + mova m5, %2 + punpcklwd %2, m7 + punpckhwd m5, m7 + movd m7, t3d + punpckldq m7, m7 + pmaddwd %1, m7 + pmaddwd %2, m7 + pmaddwd m4, m7 + pmaddwd m5, m7 + psrad %1, %3 + psrad %2, %3 + psrad m4, %3 + psrad m5, %3 + packssdw %1, m4 + packssdw %2, m5 +%endmacro + +%macro STORE_WORDS_MMX 5 + movd t0d, %1 + psrlq %1, 32 + movd t1d, %1 + mov [t2+%2*32], t0w + mov [t2+%4*32], t1w + shr t0d, 16 + shr t1d, 16 + mov [t2+%3*32], t0w + mov [t2+%5*32], t1w +%endmacro + +%macro DEQUANT_STORE_MMX 1 + DEQUANT_MMX m0, m1, %1 + STORE_WORDS_MMX m0, 0, 1, 4, 5 + STORE_WORDS_MMX 
m1, 2, 3, 6, 7 + + DEQUANT_MMX m2, m3, %1 + STORE_WORDS_MMX m2, 8, 9, 12, 13 + STORE_WORDS_MMX m3, 10, 11, 14, 15 +%endmacro + +%macro STORE_WORDS_SSE 9 + movd t0d, %1 + psrldq %1, 4 + movd t1d, %1 + psrldq %1, 4 + mov [t2+%2*32], t0w + mov [t2+%4*32], t1w + shr t0d, 16 + shr t1d, 16 + mov [t2+%3*32], t0w + mov [t2+%5*32], t1w + movd t0d, %1 + psrldq %1, 4 + movd t1d, %1 + mov [t2+%6*32], t0w + mov [t2+%8*32], t1w + shr t0d, 16 + shr t1d, 16 + mov [t2+%7*32], t0w + mov [t2+%9*32], t1w +%endmacro + +%macro DEQUANT_STORE_SSE2 1 + movd xmm4, t3d + movq xmm5, [pw_1] + pshufd xmm4, xmm4, 0 + movq2dq xmm0, m0 + movq2dq xmm1, m1 + movq2dq xmm2, m2 + movq2dq xmm3, m3 + punpcklwd xmm0, xmm5 + punpcklwd xmm1, xmm5 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm5 + pmaddwd xmm0, xmm4 + pmaddwd xmm1, xmm4 + pmaddwd xmm2, xmm4 + pmaddwd xmm3, xmm4 + psrad xmm0, %1 + psrad xmm1, %1 + psrad xmm2, %1 + psrad xmm3, %1 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7 + STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15 +%endmacro + +%macro IDCT_DC_DEQUANT 2 +cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2 + movq m3, [r1+24] + movq m2, [r1+16] + movq m1, [r1+ 8] + movq m0, [r1+ 0] + WALSH4_1D 0,1,2,3,4 + TRANSPOSE4x4W 0,1,2,3,4 + WALSH4_1D 0,1,2,3,4 + +; shift, tmp, output, qmul +%ifdef WIN64 + DECLARE_REG_TMP 0,3,1,2 + ; we can't avoid this, because r0 is the shift register (ecx) on win64 + xchg r0, t2 +%elifdef ARCH_X86_64 + DECLARE_REG_TMP 3,1,0,2 +%else + DECLARE_REG_TMP 1,3,0,2 +%endif + + cmp t3d, 32767 + jg .big_qmul + add t3d, 128 << 16 +%ifidn %1,mmx + DEQUANT_STORE_MMX 8 +%else + DEQUANT_STORE_SSE2 8 +%endif + RET +.big_qmul: + bsr t0d, t3d + add t3d, 128 << 16 + mov t1d, 7 + cmp t0d, t1d + cmovg t0d, t1d + inc t1d + shr t3d, t0b + sub t1d, t0d +%ifidn %1,mmx + movd m6, t1d + DEQUANT_STORE_MMX m6 +%else + movd xmm6, t1d + DEQUANT_STORE_SSE2 xmm6 +%endif + RET +%endmacro + +INIT_MMX +IDCT_DC_DEQUANT mmx, 0 +IDCT_DC_DEQUANT sse2, 7 diff 
--git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 401a488cb5..d9e45f8b03 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM int stride, const uint8_t nnzc[6*8]); void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); +void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); /***********************************/ /* deblocking */ @@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; c->h264_idct_add8 = ff_h264_idct_add8_mmx; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; if (mm_flags & AV_CPU_FLAG_MMX2) { c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; @@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) if (mm_flags&AV_CPU_FLAG_SSE2) { c->h264_idct8_add = ff_h264_idct8_add_sse2; c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; |