diff options
author | Jason Garrett-Glaser <darkshikari@gmail.com> | 2011-01-14 21:34:25 +0000 |
---|---|---|
committer | Jason Garrett-Glaser <darkshikari@gmail.com> | 2011-01-14 21:34:25 +0000 |
commit | 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b (patch) | |
tree | 220be84d79d9c771c1afeab43fdd2aaa82fea01d /libavcodec/h264.c | |
parent | 6c18f1cda2e2b2471ebf75d30d552cb0cb61b6ad (diff) | |
download | ffmpeg-19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b.tar.gz |
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed.
NOTE: the way that the asm code handles large qmuls is a bit suboptimal.
If x264-style dequant was used (separate shift and qmul values), it might
be possible to get some extra speed.
Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/h264.c')
-rw-r--r-- | libavcodec/h264.c | 50 |
1 files changed, 8 insertions, 42 deletions
diff --git a/libavcodec/h264.c b/libavcodec/h264.c index b11d947b77..f3470474ea 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ return 0; } -/** - * IDCT transforms the 16 dc values and dequantizes them. - * @param qp quantization parameter - */ -static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){ -#define stride 16 - int i; - int temp[16]; //FIXME check if this is a good idea - static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; - static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; - -//memset(block, 64, 2*256); -//return; - for(i=0; i<4; i++){ - const int offset= y_offset[i]; - const int z0= block[offset+stride*0] + block[offset+stride*4]; - const int z1= block[offset+stride*0] - block[offset+stride*4]; - const int z2= block[offset+stride*1] - block[offset+stride*5]; - const int z3= block[offset+stride*1] + block[offset+stride*5]; - - temp[4*i+0]= z0+z3; - temp[4*i+1]= z1+z2; - temp[4*i+2]= z1-z2; - temp[4*i+3]= z0-z3; - } - - for(i=0; i<4; i++){ - const int offset= x_offset[i]; - const int z0= temp[4*0+i] + temp[4*2+i]; - const int z1= temp[4*0+i] - temp[4*2+i]; - const int z2= temp[4*1+i] - temp[4*3+i]; - const int z3= temp[4*1+i] + temp[4*3+i]; - - block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual - block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); - block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); - block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); - } -} - #if 0 /** * DCT transforms the 16 dc values. @@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); if(is_h264){ if(!transform_bypass) - h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]); + h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]); + else{ + static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16, + 8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16}; + for(i = 0; i < 16; i++) + h->mb[dc_mapping[i]] = h->mb_luma_dc[i]; + } }else - ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); + ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale); } if(h->deblocking_filter) xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); |