aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com>, 2013-02-17 14:52:24 -0800
committer: Michael Niedermayer <michaelni@gmx.at>, 2013-02-18 01:21:23 +0100
commit: c63f9fb37a7b7da03bed6d79115f7f2e36607808 (patch)
tree: f405ac2af54069fd224ea5a06c071a023869efff
parent: 54b2bddd22fb32a67038848b8d2394bee671b143 (diff)
download: ffmpeg-c63f9fb37a7b7da03bed6d79115f7f2e36607808.tar.gz
h264: don't store intra pcm samples in h->mb.
Instead, keep them in the bitstream buffer until we read them verbatim; this saves a memcpy() and a subsequent clearing of the target buffer. decode_cabac+decode_mb for a sample file (CAPM3_Sony_D.jsv) goes from 6121.4 to 6095.5 cycles, i.e. 26 cycles faster.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r--  libavcodec/h264.c              |  2
-rw-r--r--  libavcodec/h264.h              |  1
-rw-r--r--  libavcodec/h264_cabac.c        |  3
-rw-r--r--  libavcodec/h264_cavlc.c        | 11
-rw-r--r--  libavcodec/h264_mb_template.c  | 29
5 files changed, 22 insertions, 24 deletions
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 0644dadbb1..d4e5d25ca9 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -1528,7 +1528,7 @@ static int decode_update_thread_context(AVCodecContext *dst,
for (i = 0; i < MAX_PPS_COUNT; i++)
av_freep(h->pps_buffers + i);
- memcpy(h, h1, offsetof(H264Context, mb));
+ memcpy(h, h1, offsetof(H264Context, intra_pcm_ptr));
memcpy(&h->cabac, &h1->cabac,
sizeof(H264Context) - offsetof(H264Context, cabac));
av_assert0(&h->cabac == &h->mb_padding + 1);
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index 024b24a76d..29965e5c88 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -420,6 +420,7 @@ typedef struct H264Context {
GetBitContext *intra_gb_ptr;
GetBitContext *inter_gb_ptr;
+ const uint8_t *intra_pcm_ptr;
DECLARE_ALIGNED(16, int16_t, mb)[16 * 48 * 2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
DECLARE_ALIGNED(16, int16_t, mb_luma_dc)[3][16 * 2];
int16_t mb_padding[256 * 2]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index f073b7bcf2..7709c17a67 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -2004,7 +2004,8 @@ decode_intra_mb:
// The pixels are stored in the same order as levels in h->mb array.
if ((int) (h->cabac.bytestream_end - ptr) < mb_size)
return -1;
- memcpy(h->mb, ptr, mb_size); ptr+=mb_size;
+ h->intra_pcm_ptr = ptr;
+ ptr += mb_size;
ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index e6f4b25663..b75e653268 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -762,17 +762,12 @@ decode_intra_mb:
h->slice_table[ mb_xy ]= h->slice_num;
if(IS_INTRA_PCM(mb_type)){
- unsigned int x;
const int mb_size = ff_h264_mb_sizes[h->sps.chroma_format_idc] *
- h->sps.bit_depth_luma >> 3;
+ h->sps.bit_depth_luma;
// We assume these blocks are very rare so we do not optimize it.
- align_get_bits(&h->gb);
-
- // The pixels are stored in the same order as levels in h->mb array.
- for(x=0; x < mb_size; x++){
- ((uint8_t*)h->mb)[x]= get_bits(&h->gb, 8);
- }
+ h->intra_pcm_ptr = align_get_bits(&h->gb);
+ skip_bits_long(&h->gb, mb_size);
// In deblocking, the quantizer is 0
h->cur_pic.f.qscale_table[mb_xy] = 0;
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
index 2f4890a98b..b617029a9f 100644
--- a/libavcodec/h264_mb_template.c
+++ b/libavcodec/h264_mb_template.c
@@ -102,7 +102,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
if (PIXEL_SHIFT) {
int j;
GetBitContext gb;
- init_get_bits(&gb, (uint8_t *)h->mb,
+ init_get_bits(&gb, (uint8_t *)h->intra_pcm_ptr,
ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
for (i = 0; i < 16; i++) {
@@ -134,7 +134,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
}
} else {
for (i = 0; i < 16; i++)
- memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
+ memcpy(dest_y + i * linesize, (uint8_t *)h->intra_pcm_ptr + i * 16, 16);
if (SIMPLE || !CONFIG_GRAY || !(h->flags & CODEC_FLAG_GRAY)) {
if (!h->sps.chroma_format_idc) {
for (i = 0; i < 8; i++) {
@@ -142,8 +142,8 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
memset(dest_cr + i*uvlinesize, 1 << (bit_depth - 1), 8);
}
} else {
- uint8_t *src_cb = (uint8_t *)h->mb + 256;
- uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8;
+ uint8_t *src_cb = (uint8_t *)h->intra_pcm_ptr + 256;
+ uint8_t *src_cr = (uint8_t *)h->intra_pcm_ptr + 256 + block_h * 8;
for (i = 0; i < block_h; i++) {
memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
@@ -258,10 +258,10 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
}
}
}
- }
- if (h->cbp || IS_INTRA(mb_type)) {
- h->dsp.clear_blocks(h->mb);
- h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+ if (h->cbp || IS_INTRA(mb_type)) {
+ h->dsp.clear_blocks(h->mb);
+ h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+ }
}
}
@@ -325,7 +325,7 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
if (PIXEL_SHIFT) {
const int bit_depth = h->sps.bit_depth_luma;
GetBitContext gb;
- init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth);
+ init_get_bits(&gb, (uint8_t *)h->intra_pcm_ptr, 768 * bit_depth);
for (p = 0; p < plane_count; p++)
for (i = 0; i < 16; i++) {
@@ -337,7 +337,7 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
for (p = 0; p < plane_count; p++)
for (i = 0; i < 16; i++)
memcpy(dest[p] + i * linesize,
- (uint8_t *)h->mb + p * 256 + i * 16, 16);
+ (uint8_t *)h->intra_pcm_ptr + p * 256 + i * 16, 16);
}
} else {
if (IS_INTRA(mb_type)) {
@@ -365,10 +365,11 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
PIXEL_SHIFT, block_offset, linesize,
dest[p], p);
- }
- if (h->cbp || IS_INTRA(mb_type)) {
- h->dsp.clear_blocks(h->mb);
- h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+
+ if (h->cbp || IS_INTRA(mb_type)) {
+ h->dsp.clear_blocks(h->mb);
+ h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+ }
}
}