[FFmpeg-devel] [PATCH] avcodec/vp9: Add tile threading support

Ronald S. Bultje rsbultje at gmail.com
Wed Aug 23 19:11:17 EEST 2017


Hi,

now for review of the vp9-specific bits.

On Tue, Aug 22, 2017 at 7:59 PM, Ilia Valiakhmetov <zakne0ne at gmail.com>
wrote:

> +                                lflvl_len*sizeof(*s->lflvl) + 16 *
> sizeof(*s->above_mv_ctx)));
>

Please put spaces around the '*' here, i.e. "lflvl_len * sizeof(*s->lflvl)".


> -    // these will be re-allocated a little later
> -    av_freep(&s->b_base);
> -    av_freep(&s->block_base);
>

I don't think you can remove that. I understand that td may or may not be
allocated here, so you probably need an if or so, but it's critically
important that we re-allocate these upon a size change, so this behaviour
probably needs to stay in.
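
Something along these lines would keep the old behaviour while tolerating an
unallocated td (just a rough sketch of what I mean, assuming s->td and the
old tile count are still valid at this point, not a drop-in fix):

    /* free the per-tile buffers here so that a size change forces them to
     * be re-allocated further down, like the old code did for s->b_base
     * and s->block_base; av_freep() also resets the pointers to NULL */
    if (s->td) {
        for (i = 0; i < s->s.h.tiling.tile_cols; i++) {
            av_freep(&s->td[i].b_base);
            av_freep(&s->td[i].block_base);
        }
    }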


>      } else {
> -        s->b_base = av_malloc(sizeof(VP9Block));
> -        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) *
> bytesperpixel * sizeof(int16_t) +
> -                                   16 * 16 + 2 * chroma_eobs);
> -        if (!s->b_base || !s->block_base)
> -            return AVERROR(ENOMEM);
> -        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
> -        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks *
> bytesperpixel;
> -        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks *
> bytesperpixel);
> -        s->uveob_base[0] = s->eob_base + 16 * 16;
> -        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
> +        for (i = 1; i < s->s.h.tiling.tile_cols; i++) {
> +            if (s->td[i].b_base && s->td[i].block_base) {
> +                av_free(s->td[i].b_base);
> +                av_free(s->td[i].block_base);
> +            }
> +        }
> +        for (i = 0; i < s->s.h.tiling.tile_cols; i++) {
> +            s->td[i].b_base = av_malloc(sizeof(VP9Block));
> +            s->td[i].block_base = av_mallocz((64 * 64 + 2 *
> chroma_blocks) * bytesperpixel * sizeof(int16_t) +
> +                                       16 * 16 + 2 * chroma_eobs);
> +            if (!s->td[i].b_base || !s->td[i].block_base)
> +                return AVERROR(ENOMEM);
> +            s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 *
> bytesperpixel;
> +            s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] +
> chroma_blocks * bytesperpixel;
> +            s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] +
> chroma_blocks * bytesperpixel);
> +            s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16;
> +            s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs;
> +        }
>

This makes the non-threaded decoder use a lot more memory. I'm not sure
that's a good idea, and it's certainly not required, since I believe the
non-threaded codepath is shared with frame-mt and therefore only uses
s->td[0]. Can we allocate less memory here if there's no tile-threading at
all?
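
For example something like this (a rough sketch; n_td is a made-up helper
variable, and I'm assuming avctx is in scope here):

    /* n_td: made-up name for the number of tile contexts that actually
     * need their own buffers; frame-mt and the non-threaded path only
     * ever touch s->td[0] */
    int n_td = (avctx->active_thread_type & FF_THREAD_SLICE) ?
               s->s.h.tiling.tile_cols : 1;
    for (i = 0; i < n_td; i++) {
        s->td[i].b_base     = av_malloc(sizeof(VP9Block));
        s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                         16 * 16 + 2 * chroma_eobs);
        if (!s->td[i].b_base || !s->td[i].block_base)
            return AVERROR(ENOMEM);
        s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel;
        s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel;
        s->td[i].eob_base        = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel);
        s->td[i].uveob_base[0]   = s->td[i].eob_base + 16 * 16;
        s->td[i].uveob_base[1]   = s->td[i].uveob_base[0] + chroma_eobs;
    }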


> +static int decode_tiles(AVCodecContext *avctx)
> +{
> +    VP9Context *s = avctx->priv_data;
> +    VP9TileData *td = &s->td[0];
> +    int row, col, tile_row, tile_col;
> +    int bytesperpixel;
> +    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
> +    AVFrame *f;
> +    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
> +
> +    f = s->s.frames[CUR_FRAME].tf.f;
> +    ls_y = f->linesize[0];
> +    ls_uv =f->linesize[1];
> +    bytesperpixel = s->bytesperpixel;
> +
> +    yoff = uvoff = 0;
> +    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
> +        set_tile_offset(&tile_row_start, &tile_row_end,
> +                        tile_row, s->s.h.tiling.log2_tile_rows,
> s->sb_rows);
> +
> +        for (row = tile_row_start; row < tile_row_end;
> +             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v)
> {
> +            VP9Filter *lflvl_ptr = s->lflvl;
> +            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
> +
> +            for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols;
> tile_col++) {
> +                set_tile_offset(&tile_col_start, &tile_col_end,
> +                                tile_col, s->s.h.tiling.log2_tile_cols,
> s->sb_cols);
> +                td->tile_col_start = tile_col_start;
> +                if (s->pass != 2) {
> +                    memset(td->left_partition_ctx, 0, 8);
> +                    memset(td->left_skip_ctx, 0, 8);
> +                    if (s->s.h.keyframe || s->s.h.intraonly) {
> +                        memset(td->left_mode_ctx, DC_PRED, 16);
> +                    } else {
> +                        memset(td->left_mode_ctx, NEARESTMV, 8);
> +                    }
> +                    memset(td->left_y_nnz_ctx, 0, 16);
> +                    memset(td->left_uv_nnz_ctx, 0, 32);
> +                    memset(td->left_segpred_ctx, 0, 8);
> +
> +                    memcpy(&td->c, &s->td[tile_col].c_b[tile_row],
> sizeof(td->c));
> +                }
> +
> +                for (col = tile_col_start;
> +                     col < tile_col_end;
> +                     col += 8, yoff2 += 64 * bytesperpixel,
> +                     uvoff2 += 64 * bytesperpixel >> s->ss_h,
> lflvl_ptr++) {
> +                    // FIXME integrate with lf code (i.e. zero after each
> +                    // use, similar to invtxfm coefficients, or similar)
> +                    if (s->pass != 1) {
> +                        memset(lflvl_ptr->mask, 0,
> sizeof(lflvl_ptr->mask));
> +                    }
> +
> +                    if (s->pass == 2) {
> +                        decode_sb_mem(td, row, col, lflvl_ptr,
> +                                      yoff2, uvoff2, BL_64X64);
> +                    } else {
> +                        decode_sb(td, row, col, lflvl_ptr,
> +                                  yoff2, uvoff2, BL_64X64);
> +                    }
> +                }
> +                if (s->pass != 2)
> +                    memcpy(&s->td[tile_col].c_b[tile_row], &td->c,
> sizeof(td->c));
> +            }
> +
> +            if (s->pass == 1)
> +                continue;
> +
> +            // backup pre-loopfilter reconstruction data for intra
> +            // prediction of next row of sb64s
> +            if (row + 8 < s->rows) {
> +                memcpy(s->intra_pred_data[0],
> +                       f->data[0] + yoff + 63 * ls_y,
> +                       8 * s->cols * bytesperpixel);
> +                memcpy(s->intra_pred_data[1],
> +                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> +                       8 * s->cols * bytesperpixel >> s->ss_h);
> +                memcpy(s->intra_pred_data[2],
> +                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> +                       8 * s->cols * bytesperpixel >> s->ss_h);
> +            }
> +
> +            // loopfilter one row
> +            if (s->s.h.filter.level) {
> +                yoff2 = yoff;
> +                uvoff2 = uvoff;
> +                lflvl_ptr = s->lflvl;
> +                for (col = 0; col < s->cols;
> +                     col += 8, yoff2 += 64 * bytesperpixel,
> +                     uvoff2 += 64 * bytesperpixel >> s->ss_h,
> lflvl_ptr++) {
> +                    ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
> +                                         yoff2, uvoff2);
> +                }
> +            }
> +
> +            // FIXME maybe we can make this more finegrained by running
> the
> +            // loopfilter per-block instead of after each sbrow
> +            // In fact that would also make intra pred left preparation
> easier?
> +            ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >>
> 3, 0);
> +        }
> +    }
> +    return 0;
> +}
> +
> +
> +static av_always_inline
> +int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
> +                              int threadnr)
> +{
> +    VP9Context *s = avctx->priv_data;
> +    VP9TileData *td = &s->td[jobnr];
> +    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
> +    int bytesperpixel = s->bytesperpixel, row, col, tile_row;
> +    unsigned tile_cols_len;
> +    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
> +    VP9Filter *lflvl_ptr_base;
> +    AVFrame *f;
> +
> +    f = s->s.frames[CUR_FRAME].tf.f;
> +    ls_y = f->linesize[0];
> +    ls_uv =f->linesize[1];
> +
> +    set_tile_offset(&tile_col_start, &tile_col_end,
> +                    jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
> +    td->tile_col_start  = tile_col_start;
> +    uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
> +    yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
> +    lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);
> +
> +    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
> +        set_tile_offset(&tile_row_start, &tile_row_end,
> +                        tile_row, s->s.h.tiling.log2_tile_rows,
> s->sb_rows);
> +
> +        memcpy(&td->c, &td->c_b[tile_row], sizeof(td->c));
> +        for (row = tile_row_start; row < tile_row_end;
> +             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v)
> {
> +            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
> +            VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3);
> +
> +            memset(td->left_partition_ctx, 0, 8);
> +            memset(td->left_skip_ctx, 0, 8);
> +            if (s->s.h.keyframe || s->s.h.intraonly) {
> +                memset(td->left_mode_ctx, DC_PRED, 16);
> +            } else {
> +                memset(td->left_mode_ctx, NEARESTMV, 8);
> +            }
> +            memset(td->left_y_nnz_ctx, 0, 16);
> +            memset(td->left_uv_nnz_ctx, 0, 32);
> +            memset(td->left_segpred_ctx, 0, 8);
> +
> +            for (col = tile_col_start;
> +                 col < tile_col_end;
> +                 col += 8, yoff2 += 64 * bytesperpixel,
> +                 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
> +                // FIXME integrate with lf code (i.e. zero after each
> +                // use, similar to invtxfm coefficients, or similar)
> +                memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
> +                decode_sb(td, row, col, lflvl_ptr,
> +                            yoff2, uvoff2, BL_64X64);
> +            }
> +
> +            // backup pre-loopfilter reconstruction data for intra
> +            // prediction of next row of sb64s
> +            tile_cols_len = tile_col_end - tile_col_start;
> +            if (row + 8 < s->rows) {
> +                memcpy(s->intra_pred_data[0] + (tile_col_start * 8 *
> bytesperpixel),
> +                       f->data[0] + yoff + 63 * ls_y,
> +                       8 * tile_cols_len * bytesperpixel);
> +                memcpy(s->intra_pred_data[1] + (tile_col_start * 8 *
> bytesperpixel >> s->ss_h),
> +                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> +                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
> +                memcpy(s->intra_pred_data[2] + (tile_col_start * 8 *
> bytesperpixel >> s->ss_h),
> +                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
> +                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
> +            }
> +
> +            ff_thread_report_progress2(avctx, row >> 3, 0, 1);
> +        }
> +    }
> +    return 0;
> +}
> +
> +static av_always_inline
> +int loopfilter_proc(AVCodecContext *avctx)
> +{
> +    VP9Context *s = avctx->priv_data;
> +    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
> +    VP9Filter *lflvl_ptr;
> +    int bytesperpixel = s->bytesperpixel, col, i;
> +    AVFrame *f;
> +
> +    f = s->s.frames[CUR_FRAME].tf.f;
> +    ls_y = f->linesize[0];
> +    ls_uv =f->linesize[1];
> +
> +    for (i = 0; i < s->sb_rows; i++) {
> +        ff_thread_await_progress3(avctx, i, 0, s->s.h.tiling.tile_cols);
> +
> +        if (s->s.h.filter.level) {
> +            yoff = (ls_y * 64)*i;
> +            uvoff =  (ls_uv * 64 >> s->ss_v)*i;
> +            lflvl_ptr = s->lflvl+s->sb_cols*i;
> +            for (col = 0; col < s->cols;
> +                 col += 8, yoff += 64 * bytesperpixel,
> +                 uvoff += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
> +                ff_vp9_loopfilter_sb(avctx, lflvl_ptr, i << 3, col,
> +                                     yoff, uvoff);
> +            }
> +        }
> +    }
>      return 0;
>  }
>

Hm... there is some duplicate code between decode_tiles() and
decode_tiles_mt() here... I guess it's OK.

> +        if (avctx->active_thread_type == FF_THREAD_SLICE)
> +            for (i = 1; i < s->s.h.tiling.tile_cols; i++)
> +                for (j = 0; j < sizeof(s->td[i].counts) /
> sizeof(unsigned); j++)
> +                    ((unsigned *)&s->td[0].counts)[j] += ((unsigned
> *)&s->td[i].counts)[j];


Add a comment so people understand what this does; it's kind of cryptic.
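
E.g. something along these lines (just a sketch of the kind of comment I'd
like to see; the code itself is your lines repeated for context):

    /* Merge the symbol counts gathered by each tile thread into
     * s->td[0].counts, so that backward probability adaptation sees the
     * statistics of the whole frame. The casts treat the counts struct
     * as a flat array of unsigned counters. */
    if (avctx->active_thread_type == FF_THREAD_SLICE)
        for (i = 1; i < s->s.h.tiling.tile_cols; i++)
            for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
                ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];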


> +typedef struct VP9TileData {
> +    VP9Context *s;
>

Can this be marked const?
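
I.e.

    const VP9Context *s;

so a tile worker can't accidentally write to the shared decoder state
through it.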


> +    VP56RangeCoder c_b[4];
> +    VP56RangeCoder c;
>

Why do you need 'c', and can that be a pointer into one of the c_b entries?
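
I.e. something like (rough, untested sketch):

    VP56RangeCoder c_b[4];
    VP56RangeCoder *c;

    ...
    /* e.g. in decode_tiles(): */
    td->c = &s->td[tile_col].c_b[tile_row];   /* or &td->c_b[tile_row] in the mt path */

which would also let you drop the memcpy()s of td->c.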

Ronald

