diff options
author | Loren Merritt <lorenm@u.washington.edu> | 2006-02-10 06:55:25 +0000 |
---|---|---|
committer | Loren Merritt <lorenm@u.washington.edu> | 2006-02-10 06:55:25 +0000 |
commit | ef9d1d15751c6a2e4c570727c198854ce8b44603 (patch) | |
tree | c2ed9fe8f2bf17d05109a494357d40737dd16146 /libavcodec/h264.c | |
parent | a283db3962c07f9dfab87dc7553b61cbc4e6efb8 (diff) | |
download | ffmpeg-ef9d1d15751c6a2e4c570727c198854ce8b44603.tar.gz |
h264: special-case the DC-only IDCT; ~1% faster overall.
Originally committed as revision 4971 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/h264.c')
-rw-r--r-- | libavcodec/h264.c | 97 |
1 file changed, 59 insertions, 38 deletions
diff --git a/libavcodec/h264.c b/libavcodec/h264.c index cd21c26037..014d635f6b 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -3314,6 +3314,7 @@ static void hl_decode_mb(H264Context *h){ const unsigned int bottom = mb_y & 1; const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); if(!s->decode) return; @@ -3337,9 +3338,16 @@ static void hl_decode_mb(H264Context *h){ // dct_offset = s->linesize * 16; } - idct_add = transform_bypass - ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4 - : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add; + if(transform_bypass){ + idct_dc_add = + idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4; + }else if(IS_8x8DCT(mb_type)){ + idct_dc_add = s->dsp.h264_idct8_dc_add; + idct_add = s->dsp.h264_idct8_add; + }else{ + idct_dc_add = s->dsp.h264_idct_dc_add; + idct_add = s->dsp.h264_idct_add; + } if (IS_INTRA_PCM(mb_type)) { unsigned int x, y; @@ -3389,17 +3397,22 @@ static void hl_decode_mb(H264Context *h){ for(i=0; i<16; i+=4){ uint8_t * const ptr= dest_y + block_offset[i]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; + const int nnz = h->non_zero_count_cache[ scan8[i] ]; h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, (h->topright_samples_available<<(i+1))&0x8000, linesize); - if(h->non_zero_count_cache[ scan8[i] ]) - idct_add(ptr, h->mb + i*16, linesize); + if(nnz){ + if(nnz == 1 && h->mb[i*16]) + idct_dc_add(ptr, h->mb + i*16, linesize); + else + idct_add(ptr, h->mb + i*16, linesize); + } } }else for(i=0; i<16; i++){ uint8_t * const ptr= dest_y + block_offset[i]; uint8_t *topright; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; - int tr; + int nnz, tr; if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ const int topright_avail= (h->topright_samples_available<<i)&0x8000; @@ -3413,10 
+3426,14 @@ static void hl_decode_mb(H264Context *h){ topright= NULL; h->pred4x4[ dir ](ptr, topright, linesize); - if(h->non_zero_count_cache[ scan8[i] ]){ - if(s->codec_id == CODEC_ID_H264) - idct_add(ptr, h->mb + i*16, linesize); - else + nnz = h->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(s->codec_id == CODEC_ID_H264){ + if(nnz == 1 && h->mb[i*16]) + idct_dc_add(ptr, h->mb + i*16, linesize); + else + idct_add(ptr, h->mb + i*16, linesize); + }else svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); } } @@ -3453,11 +3470,23 @@ static void hl_decode_mb(H264Context *h){ if(!IS_INTRA4x4(mb_type)){ if(s->codec_id == CODEC_ID_H264){ - const int di = IS_8x8DCT(mb_type) ? 4 : 1; - for(i=0; i<16; i+=di){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below - uint8_t * const ptr= dest_y + block_offset[i]; - idct_add(ptr, h->mb + i*16, linesize); + if(IS_INTRA16x16(mb_type)){ + for(i=0; i<16; i++){ + if(h->non_zero_count_cache[ scan8[i] ]) + idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); + else if(h->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); + } + }else{ + const int di = IS_8x8DCT(mb_type) ? 4 : 1; + for(i=0; i<16; i+=di){ + int nnz = h->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(nnz==1 && h->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); + else + idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); + } } } }else{ @@ -3471,34 +3500,26 @@ static void hl_decode_mb(H264Context *h){ } if(!(s->flags&CODEC_FLAG_GRAY)){ - idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add; - if(!transform_bypass){ + uint8_t *dest[2] = {dest_cb, dest_cr}; + if(transform_bypass){ + idct_add = idct_dc_add = s->dsp.add_pixels4; + }else{ + idct_add = s->dsp.h264_idct_add; + idct_dc_add = s->dsp.h264_idct_dc_add; chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 
1:4][h->chroma_qp][0]); chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); } if(s->codec_id == CODEC_ID_H264){ - for(i=16; i<16+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cb + block_offset[i]; - idct_add(ptr, h->mb + i*16, uvlinesize); - } - } - for(i=20; i<20+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cr + block_offset[i]; - idct_add(ptr, h->mb + i*16, uvlinesize); - } + for(i=16; i<16+8; i++){ + if(h->non_zero_count_cache[ scan8[i] ]) + idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); + else if(h->mb[i*16]) + idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); } }else{ - for(i=16; i<16+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cb + block_offset[i]; - svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); - } - } - for(i=20; i<20+4; i++){ + for(i=16; i<16+8; i++){ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cr + block_offset[i]; + uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); } } @@ -5131,7 +5152,7 @@ decode_intra_mb: return -1; } nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; - nnz[0] |= nnz[1] | nnz[8] | nnz[9]; + nnz[0] += nnz[1] + nnz[8] + nnz[9]; }else{ for(i4x4=0; i4x4<4; i4x4++){ const int index= i4x4 + 4*i8x8; @@ -5690,7 +5711,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n h->non_zero_count_cache[scan8[16+n]] = coeff_count; else { assert( cat == 5 ); - fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1); + fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); } for( i = coeff_count - 1; i >= 0; i-- ) { |