author      Michael Niedermayer <michaelni@gmx.at>    2010-01-17 20:35:55 +0000
committer   Michael Niedermayer <michaelni@gmx.at>    2010-01-17 20:35:55 +0000
commit      c988f97566cdf536ba0dcbc0d77d885456852060 (patch)
tree        dcaf443e415311e25f4012d0bc504659e51ed48a /libavcodec/h264.h
parent      00c4127ec9ff30caaa4579d2d1ef1557d870a7f1 (diff)
download    ffmpeg-c988f97566cdf536ba0dcbc0d77d885456852060.tar.gz
Rearchitecting the stitched-up goose, part 1

Run the loop filter per row instead of per MB. This should also make it
much easier to switch to per-frame filtering, and to do that filtering in a
separate thread in the future, if some volunteer wants to try.
Overall decoding speedup of 1.7% (single thread on Pentium Dual / cathedral sample).
This change also allows some optimizations to be tried that would not have
been possible before.
Originally committed as revision 21270 to svn://svn.ffmpeg.org/ffmpeg/trunk
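
For illustration, a minimal C sketch of the per-row restructuring the commit message describes; decode_mb() and filter_mb() here are hypothetical stand-ins, not the real FFmpeg entry points:

/* Hypothetical stand-ins for the real per-MB decode and deblock steps. */
static void decode_mb(int mb_x, int mb_y) { (void)mb_x; (void)mb_y; }
static void filter_mb(int mb_x, int mb_y) { (void)mb_x; (void)mb_y; }

/* Old scheme: filter each macroblock immediately after decoding it. */
static void decode_frame_per_mb(int mb_width, int mb_height)
{
    for (int mb_y = 0; mb_y < mb_height; mb_y++)
        for (int mb_x = 0; mb_x < mb_width; mb_x++) {
            decode_mb(mb_x, mb_y);
            filter_mb(mb_x, mb_y);   /* can read state still live in the per-MB caches */
        }
}

/* New scheme (this commit): decode a whole row, then filter it.  Anything the
 * filter needs must now survive past the MB decode, which is why the commit
 * widens non_zero_count to 32 bytes per MB and adds the per-MB list_counts
 * array.  Per-frame filtering, or filtering in a separate thread, follows the
 * same pattern at a larger granularity. */
static void decode_frame_per_row(int mb_width, int mb_height)
{
    for (int mb_y = 0; mb_y < mb_height; mb_y++) {
        for (int mb_x = 0; mb_x < mb_width; mb_x++)
            decode_mb(mb_x, mb_y);
        for (int mb_x = 0; mb_x < mb_width; mb_x++)
            filter_mb(mb_x, mb_y);   /* reads state written back during decode */
    }
}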
Diffstat (limited to 'libavcodec/h264.h')
-rw-r--r--   libavcodec/h264.h   57
1 file changed, 44 insertions, 13 deletions
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index 6d18af431a..bcc3ef537d 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -300,7 +300,7 @@ typedef struct H264Context{
      * is 64 if not available.
      */
     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
-    uint8_t (*non_zero_count)[16];
+    uint8_t (*non_zero_count)[32];
 
     /**
      * Motion vector cache.
@@ -423,6 +423,7 @@ typedef struct H264Context{
      */
     unsigned int ref_count[2];   ///< counts frames or fields, depending on current mb mode
     unsigned int list_count;
+    uint8_t *list_counts;            ///< Array of list_count per MB specifying the slice type
     Picture *short_ref[32];
     Picture *long_ref[32];
     Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture
@@ -736,8 +737,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
         top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
 
     //FIXME deblocking could skip the intra and nnz parts.
-    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
-        return;
+//    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
+//        return;
 
     /* Wow, what a mess, why didn't they simplify the interlacing & intra
      * stuff, I can't imagine that these complex rules are worth it. */
@@ -793,20 +794,33 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
             left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
             left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 
-            if(MB_MBAFF && !IS_INTRA(mb_type)){
+            if(!IS_INTRA(mb_type)){
                 int list;
                 for(list=0; list<h->list_count; list++){
-                    //These values where changed for ease of performing MC, we need to change them back
-                    //FIXME maybe we can make MC and loop filter use the same values or prevent
-                    //the MC code from changing ref_cache and rather use a temporary array.
-                    if(USES_LIST(mb_type,list)){
-                        int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+                    int8_t *ref;
+                    int y, b_xy;
+                    if(!USES_LIST(mb_type, list)){
+                        fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
                         *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
-                        *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
-                        ref += h->b8_stride;
+                        *(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
                         *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
-                        *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+                        *(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101;
+                        continue;
                     }
+
+                    ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+                    ref += h->b8_stride;
+                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
+                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
+
+                    b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
+                    for(y=0; y<4; y++){
+                        *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
+                        *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
+                    }
+
                 }
             }
         }else{
@@ -1196,6 +1210,23 @@ static inline void write_back_non_zero_count(H264Context *h){
     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
+
+    //FIXME sort better how things are stored in non_zero_count
+
+
+    h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
+    h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
+    h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
+    h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
+    h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
+    h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
+    h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
+    h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
+    h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
+
+    h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
+    h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
+
 }
 
 static inline void write_back_motion(H264Context *h, int mb_type){
@@ -1271,7 +1302,7 @@ static void decode_mb_skip(H264Context *h){
     const int mb_xy= h->mb_xy;
     int mb_type=0;
 
-    memset(h->non_zero_count[mb_xy], 0, 16);
+    memset(h->non_zero_count[mb_xy], 0, 32);
     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
 
     if(MB_FIELD)
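
The hunks above share one storage pattern: values that previously lived only in per-MB caches (non-zero counts, list_count) are written back into frame-sized, MB-indexed arrays during decode so the row-based filter pass can read them later. A rough sketch of that pattern with hypothetical types (FrameState, write_back_mb(), filter_mb_row() are illustrative, not FFmpeg structures):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical frame-level state, loosely modelled on the commit:
 * a 32-byte non-zero-count record and one list_count byte per MB. */
typedef struct {
    int       mb_width, mb_height;
    uint8_t (*non_zero_count)[32];   /* widened from [16] in the commit */
    uint8_t  *list_counts;           /* new per-MB array in the commit  */
} FrameState;

static int frame_state_init(FrameState *f, int mb_width, int mb_height)
{
    f->mb_width       = mb_width;
    f->mb_height      = mb_height;
    f->non_zero_count = calloc(mb_width * mb_height, sizeof(*f->non_zero_count));
    f->list_counts    = calloc(mb_width * mb_height, 1);
    return f->non_zero_count && f->list_counts ? 0 : -1;
}

/* During MB decode: persist the values the deblocker will need later. */
static void write_back_mb(FrameState *f, int mb_xy,
                          const uint8_t nnz_cache[32], uint8_t list_count)
{
    memcpy(f->non_zero_count[mb_xy], nnz_cache, 32);
    f->list_counts[mb_xy] = list_count;
}

/* During the per-row filter pass: read them back instead of relying on
 * caches that were only valid while the MB was being decoded. */
static void filter_mb_row(const FrameState *f, int mb_y)
{
    for (int mb_x = 0; mb_x < f->mb_width; mb_x++) {
        int mb_xy = mb_x + mb_y * f->mb_width;
        const uint8_t *nnz       = f->non_zero_count[mb_xy];
        uint8_t        list_count = f->list_counts[mb_xy];
        (void)nnz; (void)list_count;   /* deblocking strengths would be derived here */
    }
}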