[FFmpeg-devel] [PATCH] h264: don't store intra pcm samples in h->mb.

Ronald S. Bultje rsbultje at gmail.com
Sat Feb 16 18:35:12 CET 2013


From: "Ronald S. Bultje" <rsbultje at gmail.com>

Instead, keep them in the bitstream buffer until we read them verbatim.
This saves a memcpy() and a subsequent clearing of the target buffer.
For a sample file (CAPM3_Sony_D.jsv), decode_cabac+decode_mb goes from
6121.4 to 6095.5 cycles, i.e. about 26 cycles faster.
---
 libavcodec/h264.c             |  2 +-
 libavcodec/h264.h             |  1 +
 libavcodec/h264_cabac.c       |  3 ++-
 libavcodec/h264_cavlc.c       | 11 +++--------
 libavcodec/h264_mb_template.c | 29 +++++++++++++++--------------
 5 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 792e3e1..b0dd780 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -1253,7 +1253,7 @@ static int decode_update_thread_context(AVCodecContext *dst,
 
         // copy all fields after MpegEnc, except mb
         memcpy(&h->s + 1, &h1->s + 1,
-               offsetof(H264Context, mb) - sizeof(MpegEncContext));
+               offsetof(H264Context, intra_pcm_ptr) - sizeof(MpegEncContext));
         av_assert0(&h->cabac == &h->mb_padding + 1);
         memcpy(&h->cabac, &h1->cabac,
                sizeof(H264Context) - offsetof(H264Context, cabac));
diff --git a/libavcodec/h264.h b/libavcodec/h264.h
index b0d44cc..cfb8170 100644
--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -390,6 +390,7 @@ typedef struct H264Context {
     GetBitContext *intra_gb_ptr;
     GetBitContext *inter_gb_ptr;
 
+    const uint8_t *intra_pcm_ptr;
     DECLARE_ALIGNED(16, int16_t, mb)[16 * 48 * 2]; ///< as a dct coeffecient is int32_t in high depth, we need to reserve twice the space.
     DECLARE_ALIGNED(16, int16_t, mb_luma_dc)[3][16 * 2];
     int16_t mb_padding[256 * 2];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 54b3775..015f948 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -2009,7 +2009,8 @@ decode_intra_mb:
         // The pixels are stored in the same order as levels in h->mb array.
         if ((int) (h->cabac.bytestream_end - ptr) < mb_size)
             return -1;
-        memcpy(h->mb, ptr, mb_size); ptr+=mb_size;
+        h->intra_pcm_ptr = ptr;
+        ptr += mb_size;
 
         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
 
diff --git a/libavcodec/h264_cavlc.c b/libavcodec/h264_cavlc.c
index 7ca2bb2..8f1dd80 100644
--- a/libavcodec/h264_cavlc.c
+++ b/libavcodec/h264_cavlc.c
@@ -765,17 +765,12 @@ decode_intra_mb:
     h->slice_table[ mb_xy ]= h->slice_num;
 
     if(IS_INTRA_PCM(mb_type)){
-        unsigned int x;
         const int mb_size = ff_h264_mb_sizes[h->sps.chroma_format_idc] *
-                            h->sps.bit_depth_luma >> 3;
+                            h->sps.bit_depth_luma;
 
         // We assume these blocks are very rare so we do not optimize it.
-        align_get_bits(&s->gb);
-
-        // The pixels are stored in the same order as levels in h->mb array.
-        for(x=0; x < mb_size; x++){
-            ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
-        }
+        h->intra_pcm_ptr = align_get_bits(&s->gb);
+        skip_bits_long(&s->gb, mb_size);
 
         // In deblocking, the quantizer is 0
         s->current_picture.f.qscale_table[mb_xy] = 0;
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
index 0f0e451..eac5d1b 100644
--- a/libavcodec/h264_mb_template.c
+++ b/libavcodec/h264_mb_template.c
@@ -103,7 +103,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
         if (PIXEL_SHIFT) {
             int j;
             GetBitContext gb;
-            init_get_bits(&gb, (uint8_t *)h->mb,
+            init_get_bits(&gb, (uint8_t *)h->intra_pcm_ptr,
                           ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
 
             for (i = 0; i < 16; i++) {
@@ -135,7 +135,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
             }
         } else {
             for (i = 0; i < 16; i++)
-                memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
+                memcpy(dest_y + i * linesize, (uint8_t *)h->intra_pcm_ptr + i * 16, 16);
             if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
                 if (!h->sps.chroma_format_idc) {
                     for (i = 0; i < 8; i++) {
@@ -143,8 +143,8 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
                         memset(dest_cr + i*uvlinesize, 1 << (bit_depth - 1), 8);
                     }
                 } else {
-                    uint8_t *src_cb = (uint8_t *)h->mb + 256;
-                    uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8;
+                    uint8_t *src_cb = (uint8_t *)h->intra_pcm_ptr + 256;
+                    uint8_t *src_cr = (uint8_t *)h->intra_pcm_ptr + 256 + block_h * 8;
                     for (i = 0; i < block_h; i++) {
                         memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
                         memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
@@ -259,10 +259,10 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
                 }
             }
         }
-    }
-    if (h->cbp || IS_INTRA(mb_type)) {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+        if (h->cbp || IS_INTRA(mb_type)) {
+            s->dsp.clear_blocks(h->mb);
+            s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+        }
     }
 }
 
@@ -327,7 +327,7 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
         if (PIXEL_SHIFT) {
             const int bit_depth = h->sps.bit_depth_luma;
             GetBitContext gb;
-            init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth);
+            init_get_bits(&gb, (uint8_t *)h->intra_pcm_ptr, 768 * bit_depth);
 
             for (p = 0; p < plane_count; p++)
                 for (i = 0; i < 16; i++) {
@@ -339,7 +339,7 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
             for (p = 0; p < plane_count; p++)
                 for (i = 0; i < 16; i++)
                     memcpy(dest[p] + i * linesize,
-                           (uint8_t *)h->mb + p * 256 + i * 16, 16);
+                           (uint8_t *)h->intra_pcm_ptr + p * 256 + i * 16, 16);
         }
     } else {
         if (IS_INTRA(mb_type)) {
@@ -367,10 +367,11 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
             hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
                                    PIXEL_SHIFT, block_offset, linesize,
                                    dest[p], p);
-    }
-    if (h->cbp || IS_INTRA(mb_type)) {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+
+        if (h->cbp || IS_INTRA(mb_type)) {
+            s->dsp.clear_blocks(h->mb);
+            s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+        }
     }
 }
 
-- 
1.7.11.3



More information about the ffmpeg-devel mailing list