[FFmpeg-cvslog] diracdec: rewrite HQ slice decoding

Rostislav Pehlivanov git at videolan.org
Tue Jul 12 00:45:36 CEST 2016


ffmpeg | branch: master | Rostislav Pehlivanov <rpehlivanov at ob-encoder.com> | Thu Jun 23 18:07:00 2016 +0100| [c43485f70765cb488bfdf95dc783bb9b14eb1179] | committer: Rostislav Pehlivanov

diracdec: rewrite HQ slice decoding

Coefficients are now written to a buffer and then dequantized by the
new SIMD dequantization functions. For the lower bands, which do not have
enough coefficients to fill a register (so the SIMD functions would
overwrite adjacent data), the C version of the dequantization function is used.

The buffer is per-thread and will be realloc'd if anything changes.
This prevents regressions and having to limit slice size.

Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c43485f70765cb488bfdf95dc783bb9b14eb1179
---

 libavcodec/diracdec.c |  126 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 111 insertions(+), 15 deletions(-)

diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
index ad33809..7913656 100644
--- a/libavcodec/diracdec.c
+++ b/libavcodec/diracdec.c
@@ -161,6 +161,10 @@ typedef struct DiracContext {
     unsigned num_x;              /* number of horizontal slices               */
     unsigned num_y;              /* number of vertical slices                 */
 
+    uint8_t *thread_buf;         /* Per-thread buffer for coefficient storage */
+    int threads_num_buf;         /* Current # of buffers allocated            */
+    int thread_buf_size;         /* Each thread has a buffer this size        */
+
     struct {
         unsigned width;
         unsigned height;
@@ -370,6 +374,10 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     s->frame_number = -1;
 
+    s->thread_buf = NULL;
+    s->threads_num_buf = -1;
+    s->thread_buf_size = -1;
+
     ff_diracdsp_init(&s->diracdsp);
     ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
     ff_videodsp_init(&s->vdsp, 8);
@@ -403,6 +411,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
     for (i = 0; i < MAX_FRAMES; i++)
         av_frame_free(&s->all_frames[i].avframe);
 
+    av_freep(&s->thread_buf);
+
     return 0;
 }
 
@@ -760,46 +770,108 @@ static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
     return 0;
 }
 
+typedef struct SliceCoeffs {
+    int left;
+    int top;
+    int tot_h;
+    int tot_v;
+    int tot;
+} SliceCoeffs;
+
+static int subband_coeffs(DiracContext *s, int x, int y, int p,
+                          SliceCoeffs c[MAX_DWT_LEVELS])
+{
+    int level, coef = 0;
+    for (level = 0; level < s->wavelet_depth; level++) {
+        SliceCoeffs *o = &c[level];
+        SubBand *b = &s->plane[p].band[level][3]; /* orientation doesn't matter */
+        o->top   = b->height * y / s->num_y;
+        o->left  = b->width  * x / s->num_x;
+        o->tot_h = ((b->width  * (x + 1)) / s->num_x) - o->left;
+        o->tot_v = ((b->height * (y + 1)) / s->num_y) - o->top;
+        o->tot   = o->tot_h*o->tot_v;
+        coef    += o->tot * (4 - !!level);
+    }
+    return coef;
+}
+
 /**
  * VC-2 Specification ->
  * 13.5.3 hq_slice(sx,sy)
  */
-static int decode_hq_slice(AVCodecContext *avctx, void *arg)
+static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
 {
-    int i, quant, level, orientation, quant_idx;
-    uint8_t quants[MAX_DWT_LEVELS][4];
-    DiracContext *s = avctx->priv_data;
-    DiracSlice *slice = arg;
+    int i, level, orientation, quant_idx;
+    int qfactor[MAX_DWT_LEVELS][4], qoffset[MAX_DWT_LEVELS][4];
     GetBitContext *gb = &slice->gb;
+    SliceCoeffs coeffs_num[MAX_DWT_LEVELS];
 
     skip_bits_long(gb, 8*s->highquality.prefix_bytes);
     quant_idx = get_bits(gb, 8);
 
+    if (quant_idx > DIRAC_MAX_QUANT_INDEX) {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid quantization index - %i\n", quant_idx);
+        return AVERROR_INVALIDDATA;
+    }
+
     /* Slice quantization (slice_quantizers() in the specs) */
     for (level = 0; level < s->wavelet_depth; level++) {
         for (orientation = !!level; orientation < 4; orientation++) {
-            quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
-            quants[level][orientation] = quant;
+            const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
+            qfactor[level][orientation] = ff_dirac_qscale_tab[quant];
+            qoffset[level][orientation] = ff_dirac_qoffset_intra_tab[quant] + 2;
         }
     }
 
     /* Luma + 2 Chroma planes */
     for (i = 0; i < 3; i++) {
-        int64_t length = s->highquality.size_scaler * get_bits(gb, 8);
-        int64_t bits_left = 8 * length;
-        int64_t bits_end = get_bits_count(gb) + bits_left;
+        int c, coef_num, coef_par, off = 0;
+        int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
+        int64_t start = get_bits_count(gb);
+        int64_t bits_end = start + 8*length;
 
         if (bits_end >= INT_MAX) {
             av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
             return AVERROR_INVALIDDATA;
         }
 
+        coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
+
+        if (s->pshift) {
+            int32_t *dst = (int32_t *)tmp_buf;
+            for (c = 0; c < coef_num; c++)
+                dst[c] = dirac_get_se_golomb(gb);
+            coef_par = c;
+        } else {
+            int16_t *dst = (int16_t *)tmp_buf;
+            for (c = 0; c < coef_num; c++)
+                dst[c] = dirac_get_se_golomb(gb);
+            coef_par = c;
+        }
+
+        if (coef_num > coef_par) {
+            const int start_b = coef_par * (4 >> s->pshift);
+            const int end_b   = coef_num * (4 >> s->pshift);
+            memset(&tmp_buf[start_b], 0, end_b - start_b);
+        }
+
         for (level = 0; level < s->wavelet_depth; level++) {
+            const SliceCoeffs *c = &coeffs_num[level];
             for (orientation = !!level; orientation < 4; orientation++) {
-                decode_subband(s, gb, quants[level][orientation], slice->slice_x, slice->slice_y, bits_end,
-                               &s->plane[i].band[level][orientation], NULL);
+                const SubBand *b1 = &s->plane[i].band[level][orientation];
+                uint8_t *buf = b1->ibuf + c->top * b1->stride + (c->left << (s->pshift + 1));
+
+                /* Change to c->tot_h <= 4 for AVX2 dequantization */
+                const int qfunc = s->pshift + 2*(c->tot_h <= 2);
+                s->diracdsp.dequant_subband[qfunc](&tmp_buf[off], buf, b1->stride,
+                                                   qfactor[level][orientation],
+                                                   qoffset[level][orientation],
+                                                   c->tot_v, c->tot_h);
+
+                off += c->tot << (s->pshift + 1);
             }
         }
+
         skip_bits_long(gb, bits_end - get_bits_count(gb));
     }
 
@@ -811,8 +883,9 @@ static int decode_hq_slice_row(AVCodecContext *avctx, void *arg, int jobnr, int
     int i;
     DiracContext *s = avctx->priv_data;
     DiracSlice *slices = ((DiracSlice *)arg) + s->num_x*jobnr;
+    uint8_t *thread_buf = &s->thread_buf[s->thread_buf_size*threadnr];
     for (i = 0; i < s->num_x; i++)
-        decode_hq_slice(avctx, &slices[i]);
+        decode_hq_slice(s, &slices[i], thread_buf);
     return 0;
 }
 
@@ -824,15 +897,32 @@ static int decode_lowdelay(DiracContext *s)
 {
     AVCodecContext *avctx = s->avctx;
     int slice_x, slice_y, bufsize;
-    int64_t bytes = 0;
+    int64_t coef_buf_size, bytes = 0;
     const uint8_t *buf;
     DiracSlice *slices;
+    SliceCoeffs tmp[MAX_DWT_LEVELS];
     int slice_num = 0;
 
     slices = av_mallocz_array(s->num_x, s->num_y * sizeof(DiracSlice));
     if (!slices)
         return AVERROR(ENOMEM);
 
+    /* 8 because that's how much the golomb reader could overread junk data
+     * from another plane/slice at most, and 512 because SIMD */
+    coef_buf_size = subband_coeffs(s, s->num_x - 1, s->num_y - 1, 0, tmp) + 8;
+    coef_buf_size = (coef_buf_size << (1 + s->pshift)) + 512;
+
+    if (s->threads_num_buf != avctx->thread_count ||
+        s->thread_buf_size != coef_buf_size) {
+        s->threads_num_buf  = avctx->thread_count;
+        s->thread_buf_size  = coef_buf_size;
+        s->thread_buf       = av_realloc_f(s->thread_buf, avctx->thread_count, s->thread_buf_size);
+        if (!s->thread_buf) {
+            av_log(s->avctx, AV_LOG_ERROR, "thread buffer allocation failure\n");
+            return AVERROR(ENOMEM);
+        }
+    }
+
     align_get_bits(&s->gb);
     /*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
     buf = s->gb.buffer + get_bits_count(&s->gb)/8;
@@ -848,7 +938,7 @@ static int decode_lowdelay(DiracContext *s)
                     if (bytes <= bufsize/8)
                         bytes += buf[bytes] * s->highquality.size_scaler + 1;
                 }
-                if (bytes >= INT_MAX) {
+                if (bytes >= INT_MAX || bytes*8 > bufsize) {
                     av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
                     av_free(slices);
                     return AVERROR_INVALIDDATA;
@@ -867,6 +957,12 @@ static int decode_lowdelay(DiracContext *s)
                     bufsize = 0;
             }
         }
+
+        if (s->num_x*s->num_y != slice_num) {
+            av_log(s->avctx, AV_LOG_ERROR, "too few slices\n");
+            return AVERROR_INVALIDDATA;
+        }
+
         avctx->execute2(avctx, decode_hq_slice_row, slices, NULL, s->num_y);
     } else {
         for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {



More information about the ffmpeg-cvslog mailing list