[FFmpeg-devel] [PATCH v8 05/15] avcodec/vaapi_encode: move the dpb logic from VAAPI to base layer
Mark Thompson
sw at jkqxz.net
Tue May 14 23:46:44 EEST 2024
On 18/04/2024 09:58, tong1.wu-at-intel.com at ffmpeg.org wrote:
> From: Tong Wu <tong1.wu at intel.com>
>
> Move receive_packet function to base. This requires adding *alloc,
> *issue, *output, *free as hardware callbacks. HWBaseEncodePicture is
> introduced as the base layer structure. The related parameters in
> VAAPIEncodeContext are also extracted to HWBaseEncodeContext. Then DPB
> management logic can be fully extracted to base layer as-is.
>
> Signed-off-by: Tong Wu <tong1.wu at intel.com>
> ---
> libavcodec/Makefile | 2 +-
> libavcodec/hw_base_encode.c | 600 ++++++++++++++++++++++++
> libavcodec/hw_base_encode.h | 123 +++++
> libavcodec/vaapi_encode.c | 793 +++++---------------------------
> libavcodec/vaapi_encode.h | 102 +---
> libavcodec/vaapi_encode_av1.c | 51 +-
> libavcodec/vaapi_encode_h264.c | 176 +++----
> libavcodec/vaapi_encode_h265.c | 121 ++---
> libavcodec/vaapi_encode_mjpeg.c | 7 +-
> libavcodec/vaapi_encode_mpeg2.c | 47 +-
> libavcodec/vaapi_encode_vp8.c | 18 +-
> libavcodec/vaapi_encode_vp9.c | 54 +--
> 12 files changed, 1097 insertions(+), 997 deletions(-)
> create mode 100644 libavcodec/hw_base_encode.c
>
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 7f6de4470e..a2174dcb2f 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -162,7 +162,7 @@ OBJS-$(CONFIG_STARTCODE) += startcode.o
> OBJS-$(CONFIG_TEXTUREDSP) += texturedsp.o
> OBJS-$(CONFIG_TEXTUREDSPENC) += texturedspenc.o
> OBJS-$(CONFIG_TPELDSP) += tpeldsp.o
> -OBJS-$(CONFIG_VAAPI_ENCODE) += vaapi_encode.o
> +OBJS-$(CONFIG_VAAPI_ENCODE) += vaapi_encode.o hw_base_encode.o
> OBJS-$(CONFIG_AV1_AMF_ENCODER) += amfenc_av1.o
> OBJS-$(CONFIG_VC1DSP) += vc1dsp.o
> OBJS-$(CONFIG_VIDEODSP) += videodsp.o
> diff --git a/libavcodec/hw_base_encode.c b/libavcodec/hw_base_encode.c
> new file mode 100644
> index 0000000000..1d9a255f69
> --- /dev/null
> +++ b/libavcodec/hw_base_encode.c
> @@ -0,0 +1,600 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/common.h"
> +#include "libavutil/internal.h"
> +#include "libavutil/log.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "encode.h"
> +#include "avcodec.h"
> +#include "hw_base_encode.h"
> +
> ...
Everything above here looks like a copy of the VAAPI code with the names changed, good if so. (If not then please highlight any differences.)
> +
> +static int hw_base_encode_send_frame(AVCodecContext *avctx, AVFrame *frame)
> +{
> + HWBaseEncodeContext *ctx = avctx->priv_data;
> + HWBaseEncodePicture *pic;
> + int err;
> +
> + if (frame) {
> + av_log(avctx, AV_LOG_DEBUG, "Input frame: %ux%u (%"PRId64").\n",
> + frame->width, frame->height, frame->pts);
> +
> + err = hw_base_encode_check_frame(avctx, frame);
> + if (err < 0)
> + return err;
> +
> + pic = ctx->op->alloc(avctx, frame);
> + if (!pic)
> + return AVERROR(ENOMEM);
Can you push the allocation of this picture out into the base layer? vaapi_encode_alloc() and d3d12va_encode_alloc() are identical except for the types and the input_surface setting.
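Roughly what I have in mind, as a sketch only - "priv_size" and "init" are illustrative names for things which don't exist in this patch:

static HWBaseEncodePicture *hw_base_encode_alloc(AVCodecContext *avctx,
                                                 const AVFrame *frame)
{
    HWBaseEncodeContext *ctx = avctx->priv_data;
    HWBaseEncodePicture *pic;

    // Allocate the full API-specific picture struct here; priv_size
    // would be e.g. sizeof(VAAPIEncodePicture), with the base struct
    // as its first member.
    pic = av_mallocz(ctx->op->priv_size);
    if (!pic)
        return NULL;

    // The API hook then only fills in its own fields (input_surface
    // and friends) instead of allocating the whole thing.
    if (ctx->op->init && ctx->op->init(avctx, pic, frame) < 0) {
        av_free(pic);
        return NULL;
    }

    return pic;
}

hw_base_encode_send_frame() would then call that instead of ctx->op->alloc().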
> +
> + pic->input_image = av_frame_alloc();
> + if (!pic->input_image) {
> + err = AVERROR(ENOMEM);
> + goto fail;
> + }
> +
> + pic->recon_image = av_frame_alloc();
> + if (!pic->recon_image) {
> + err = AVERROR(ENOMEM);
> + goto fail;
> + }
> +
> + if (ctx->input_order == 0 || frame->pict_type == AV_PICTURE_TYPE_I)
> + pic->force_idr = 1;
> +
> + pic->pts = frame->pts;
> + pic->duration = frame->duration;
> +
> + if (avctx->flags & AV_CODEC_FLAG_COPY_OPAQUE) {
> + err = av_buffer_replace(&pic->opaque_ref, frame->opaque_ref);
> + if (err < 0)
> + goto fail;
> +
> + pic->opaque = frame->opaque;
> + }
> +
> + av_frame_move_ref(pic->input_image, frame);
> +
> + if (ctx->input_order == 0)
> + ctx->first_pts = pic->pts;
> + if (ctx->input_order == ctx->decode_delay)
> + ctx->dts_pts_diff = pic->pts - ctx->first_pts;
> + if (ctx->output_delay > 0)
> + ctx->ts_ring[ctx->input_order %
> + (3 * ctx->output_delay + ctx->async_depth)] = pic->pts;
> +
> + pic->display_order = ctx->input_order;
> + ++ctx->input_order;
> +
> + if (ctx->pic_start) {
> + ctx->pic_end->next = pic;
> + ctx->pic_end = pic;
> + } else {
> + ctx->pic_start = pic;
> + ctx->pic_end = pic;
> + }
> +
> + } else {
> + ctx->end_of_stream = 1;
> +
> + // Fix timestamps if we hit end-of-stream before the initial decode
> + // delay has elapsed.
> + if (ctx->input_order < ctx->decode_delay)
> + ctx->dts_pts_diff = ctx->pic_end->pts - ctx->first_pts;
> + }
> +
> + return 0;
> +
> +fail:
> + ctx->op->free(avctx, pic);
Maybe do the same with free, since it currently mixes the two layers? A hw_base_encode_free() in this file calling the API-specific free might be cleaner.
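Something like this, so that ownership doesn't straddle the two layers - a sketch, assuming the op->free hook is reduced to API-only cleanup:

static void hw_base_encode_free(AVCodecContext *avctx,
                                HWBaseEncodePicture *pic)
{
    HWBaseEncodeContext *ctx = avctx->priv_data;

    // API-specific teardown first (parameter buffers, surfaces, ...).
    if (ctx->op->free)
        ctx->op->free(avctx, pic);

    // Then the fields owned by the base layer.
    av_frame_free(&pic->input_image);
    av_frame_free(&pic->recon_image);
    av_buffer_unref(&pic->opaque_ref);
    av_free(pic);
}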
> + return err;
> +}
> +
> +int ff_hw_base_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt)
> +{
> + HWBaseEncodeContext *ctx = avctx->priv_data;
> + HWBaseEncodePicture *pic = NULL;
> + AVFrame *frame = ctx->frame;
> + int err;
> +
> +start:
> + /** if no B frame before repeat P frame, sent repeat P frame out. */
> + if (ctx->tail_pkt->size) {
> + for (HWBaseEncodePicture *tmp = ctx->pic_start; tmp; tmp = tmp->next) {
> + if (tmp->type == PICTURE_TYPE_B && tmp->pts < ctx->tail_pkt->pts)
> + break;
> + else if (!tmp->next) {
> + av_packet_move_ref(pkt, ctx->tail_pkt);
> + goto end;
> + }
> + }
> + }
> +
> + err = ff_encode_get_frame(avctx, frame);
> + if (err < 0 && err != AVERROR_EOF)
> + return err;
> +
> + if (err == AVERROR_EOF)
> + frame = NULL;
> +
> + if (!(ctx->op && ctx->op->alloc && ctx->op->issue &&
> + ctx->op->output && ctx->op->free)) {
> + err = AVERROR(EINVAL);
> + return err;
> + }
This check feels like it should have been done earlier?
Also, it should probably be an assert - EINVAL indicates that the caller got something wrong, whereas this failing would be a bug in the encoder itself.
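E.g. checked once where the encoder is set up, rather than on every receive_packet() call:

    av_assert0(ctx->op && ctx->op->alloc && ctx->op->issue &&
               ctx->op->output && ctx->op->free);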
> +
> + err = hw_base_encode_send_frame(avctx, frame);
> + if (err < 0)
> + return err;
> +
> + if (!ctx->pic_start) {
> + if (ctx->end_of_stream)
> + return AVERROR_EOF;
> + else
> + return AVERROR(EAGAIN);
> + }
> +
> + if (ctx->async_encode) {
Is async_encode false tested after this series? (I see that D3D12 has it set unconditionally.)
> + if (av_fifo_can_write(ctx->encode_fifo)) {
> + err = hw_base_encode_pick_next(avctx, &pic);
> + if (!err) {
> + av_assert0(pic);
> + pic->encode_order = ctx->encode_order +
> + av_fifo_can_read(ctx->encode_fifo);
> + err = ctx->op->issue(avctx, pic);
> + if (err < 0) {
> + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> + return err;
> + }
> + pic->encode_issued = 1;
> + av_fifo_write(ctx->encode_fifo, &pic, 1);
> + }
> + }
> +
> + if (!av_fifo_can_read(ctx->encode_fifo))
> + return err;
> +
> + // More frames can be buffered
> + if (av_fifo_can_write(ctx->encode_fifo) && !ctx->end_of_stream)
> + return AVERROR(EAGAIN);
> +
> + av_fifo_read(ctx->encode_fifo, &pic, 1);
> + ctx->encode_order = pic->encode_order + 1;
> + } else {
> + err = hw_base_encode_pick_next(avctx, &pic);
> + if (err < 0)
> + return err;
> + av_assert0(pic);
> +
> + pic->encode_order = ctx->encode_order++;
> +
> + err = ctx->op->issue(avctx, pic);
> + if (err < 0) {
> + av_log(avctx, AV_LOG_ERROR, "Encode failed: %d.\n", err);
> + return err;
> + }
> +
> + pic->encode_issued = 1;
> + }
> +
> + err = ctx->op->output(avctx, pic, pkt);
> + if (err < 0) {
> + av_log(avctx, AV_LOG_ERROR, "Output failed: %d.\n", err);
> + return err;
> + }
> +
> + ctx->output_order = pic->encode_order;
> + hw_base_encode_clear_old(avctx);
> +
> + /** loop to get an available pkt in encoder flushing. */
> + if (ctx->end_of_stream && !pkt->size)
> + goto start;
> +
> +end:
> + if (pkt->size)
> + av_log(avctx, AV_LOG_DEBUG, "Output packet: pts %"PRId64", dts %"PRId64", "
> + "size %d bytes.\n", pkt->pts, pkt->dts, pkt->size);
> +
> + return 0;
> +}
> diff --git a/libavcodec/hw_base_encode.h b/libavcodec/hw_base_encode.h
> index a578db8c06..b5b676b9a8 100644
> --- a/libavcodec/hw_base_encode.h
> +++ b/libavcodec/hw_base_encode.h
> @@ -19,6 +19,8 @@
> #ifndef AVCODEC_HW_BASE_ENCODE_H
> #define AVCODEC_HW_BASE_ENCODE_H
>
> +#include "libavutil/fifo.h"
> +
> #define MAX_DPB_SIZE 16
> #define MAX_PICTURE_REFERENCES 2
> #define MAX_REORDER_DELAY 16
> @@ -53,13 +55,134 @@ enum {
> FLAG_NON_IDR_KEY_PICTURES = 1 << 5,
> };
>
> +typedef struct HWBaseEncodePicture {
> + struct HWBaseEncodePicture *next;
"next" does not seem like the right name for this field.
> +
> + int64_t display_order;
> + int64_t encode_order;
> + int64_t pts;
> + int64_t duration;
> + int force_idr;
> +
> + void *opaque;
> + AVBufferRef *opaque_ref;
> +
> + int type;
> + int b_depth;
> + int encode_issued;
> + int encode_complete;
> +
> + AVFrame *input_image;
> + AVFrame *recon_image;
> +
> + void *priv_data;
> +
> + // Whether this picture is a reference picture.
> + int is_reference;
> +
> + // The contents of the DPB after this picture has been decoded.
> + // This will contain the picture itself if it is a reference picture,
> + // but not if it isn't.
> + int nb_dpb_pics;
> + struct HWBaseEncodePicture *dpb[MAX_DPB_SIZE];
> + // The reference pictures used in decoding this picture. If they are
> + // used by later pictures they will also appear in the DPB. ref[0][] for
> + // previous reference frames. ref[1][] for future reference frames.
> + int nb_refs[MAX_REFERENCE_LIST_NUM];
> + struct HWBaseEncodePicture *refs[MAX_REFERENCE_LIST_NUM][MAX_PICTURE_REFERENCES];
> + // The previous reference picture in encode order. Must be in at least
> + // one of the reference list and DPB list.
> + struct HWBaseEncodePicture *prev;
> + // Reference count for other pictures referring to this one through
> + // the above pointers, directly from incomplete pictures and indirectly
> + // through completed pictures.
> + int ref_count[2];
> + int ref_removed[2];
> +} HWBaseEncodePicture;
> +
> +typedef struct HWEncodePictureOperation {
> + // Alloc memory for the picture structure.
Maybe something like "Allocate API-specific internals of a new picture structure based on the given frame."?
(Assuming you move the allocation of the structure itself out as suggested above.)
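With the allocation pushed out, the ops table could then end up looking something like this (all names illustrative, matching the sketch above rather than anything in the patch):

typedef struct HWEncodePictureOperation {
    // Byte size of the API-specific picture struct, e.g.
    // sizeof(VAAPIEncodePicture), so the base layer can allocate it.
    size_t priv_size;
    // Fill in the API-specific internals of a newly-allocated picture
    // based on the given frame.
    int (*init)(AVCodecContext *avctx, HWBaseEncodePicture *pic,
                const AVFrame *frame);
    // Issue the picture: send the frame surface to the HW encode API.
    int (*issue)(AVCodecContext *avctx, const HWBaseEncodePicture *pic);
    // Get the output AVPacket.
    int (*output)(AVCodecContext *avctx, const HWBaseEncodePicture *pic,
                  AVPacket *pkt);
    // Free only the API-specific parts of the picture.
    int (*free)(AVCodecContext *avctx, HWBaseEncodePicture *pic);
} HWEncodePictureOperation;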
> + HWBaseEncodePicture * (*alloc)(AVCodecContext *avctx, const AVFrame *frame);
> + // Issue the picture structure, which will send the frame surface to HW Encode API.
> + int (*issue)(AVCodecContext *avctx, const HWBaseEncodePicture *base_pic);
> + // Get the output AVPacket.
> + int (*output)(AVCodecContext *avctx, const HWBaseEncodePicture *base_pic, AVPacket *pkt);
> + // Free the picture structure.
> + int (*free)(AVCodecContext *avctx, HWBaseEncodePicture *base_pic);
> +} HWEncodePictureOperation;
> +
> typedef struct HWBaseEncodeContext {
> const AVClass *class;
>
> + // Hardware-specific hooks.
> + const struct HWEncodePictureOperation *op;
> +
> + // Current encoding window, in display (input) order.
> + HWBaseEncodePicture *pic_start, *pic_end;
> + // The next picture to use as the previous reference picture in
> + // encoding order. Order from small to large in encoding order.
> + HWBaseEncodePicture *next_prev[MAX_PICTURE_REFERENCES];
> + int nb_next_prev;
> +
> + // Next input order index (display order).
> + int64_t input_order;
> + // Number of frames that output is behind input.
> + int64_t output_delay;
> + // Next encode order index.
> + int64_t encode_order;
> + // Number of frames decode output will need to be delayed.
> + int64_t decode_delay;
> + // Next output order index (in encode order).
> + int64_t output_order;
> +
> + // Timestamp handling.
> + int64_t first_pts;
> + int64_t dts_pts_diff;
> + int64_t ts_ring[MAX_REORDER_DELAY * 3 +
> + MAX_ASYNC_DEPTH];
> +
> + // Frame type decision.
> + int gop_size;
> + int closed_gop;
> + int gop_per_idr;
> + int p_per_i;
> + int max_b_depth;
> + int b_per_p;
> + int force_idr;
> + int idr_counter;
> + int gop_counter;
> + int end_of_stream;
> + int p_to_gpb;
> +
> + // Whether the driver supports ROI at all.
> + int roi_allowed;
> +
> + // The encoder does not support cropping information, so warn about
> + // it the first time we encounter any nonzero crop fields.
> + int crop_warned;
> + // If the driver does not support ROI then warn the first time we
> + // encounter a frame with ROI side data.
> + int roi_warned;
> +
> + // The frame to be filled with data.
> + AVFrame *frame;
> +
> + // Whether the HW supports sync buffer function.
> + // If supported, encode_fifo/async_depth will be used together.
> + // Used for output buffer synchronization.
> + int async_encode;
> +
> + // Store buffered pic.
> + AVFifo *encode_fifo;
> // Max number of frame buffered in encoder.
> int async_depth;
> +
> + /** Tail data of a pic, now only used for av1 repeat frame header. */
> + AVPacket *tail_pkt;
> } HWBaseEncodeContext;
>
> +int ff_hw_base_encode_receive_packet(AVCodecContext *avctx, AVPacket *pkt);
> +
> #define HW_BASE_ENCODE_COMMON_OPTIONS \
> { "async_depth", "Maximum processing parallelism. " \
> "Increase this to improve single channel performance.", \
> diff --git a/libavcodec/vaapi_encode.c b/libavcodec/vaapi_encode.c
> index 227cccae64..18966596e1 100644
> --- a/libavcodec/vaapi_encode.c
> +++ b/libavcodec/vaapi_encode.c
> @@ -138,22 +138,26 @@ static int vaapi_encode_make_misc_param_buffer(AVCodecContext *avctx,
> static int vaapi_encode_wait(AVCodecContext *avctx,
> VAAPIEncodePicture *pic)
> {
> +#if VA_CHECK_VERSION(1, 9, 0)
> + HWBaseEncodeContext *base_ctx = avctx->priv_data;
> +#endif
> VAAPIEncodeContext *ctx = avctx->priv_data;
> + HWBaseEncodePicture *base_pic = (HWBaseEncodePicture*)pic;
Use "base_foo = &foo->base_field" instead to avoid overriding the type-checking.
(Also in move places below.)
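i.e. assuming the base struct is embedded as a named member ("base" here, purely for illustration):

    HWBaseEncodePicture *base_pic = &pic->base;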
> VAStatus vas;
>
> - av_assert0(pic->encode_issued);
> + av_assert0(base_pic->encode_issued);
>
> - if (pic->encode_complete) {
> + if (base_pic->encode_complete) {
> // Already waited for this picture.
> return 0;
> }
>
> av_log(avctx, AV_LOG_DEBUG, "Sync to pic %"PRId64"/%"PRId64" "
> - "(input surface %#x).\n", pic->display_order,
> - pic->encode_order, pic->input_surface);
> + "(input surface %#x).\n", base_pic->display_order,
> + base_pic->encode_order, pic->input_surface);
>
> #if VA_CHECK_VERSION(1, 9, 0)
> - if (ctx->has_sync_buffer_func) {
> + if (base_ctx->async_encode) {
> vas = vaSyncBuffer(ctx->hwctx->display,
> pic->output_buffer,
> VA_TIMEOUT_INFINITE);
> @@ -174,9 +178,9 @@ static int vaapi_encode_wait(AVCodecContext *avctx,
> }
>
> // Input is definitely finished with now.
> - av_frame_free(&pic->input_image);
> + av_frame_free(&base_pic->input_image);
>
> - pic->encode_complete = 1;
> + base_pic->encode_complete = 1;
> return 0;
> }
>
> @@ -263,9 +267,11 @@ static int vaapi_encode_make_tile_slice(AVCodecContext *avctx,
> }
>
> static int vaapi_encode_issue(AVCodecContext *avctx,
> - VAAPIEncodePicture *pic)
> + const HWBaseEncodePicture *base_pic)
> {
> - VAAPIEncodeContext *ctx = avctx->priv_data;
> + HWBaseEncodeContext *base_ctx = avctx->priv_data;
> + VAAPIEncodeContext *ctx = avctx->priv_data;
> + VAAPIEncodePicture *pic = (VAAPIEncodePicture*)base_pic;
> VAAPIEncodeSlice *slice;
> VAStatus vas;
> int err, i;
> @@ -274,52 +280,46 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
> av_unused AVFrameSideData *sd;
>
> av_log(avctx, AV_LOG_DEBUG, "Issuing encode for pic %"PRId64"/%"PRId64" "
> - "as type %s.\n", pic->display_order, pic->encode_order,
> - ff_hw_base_encode_get_pictype_name(pic->type));
> - if (pic->nb_refs[0] == 0 && pic->nb_refs[1] == 0) {
> + "as type %s.\n", base_pic->display_order, base_pic->encode_order,
> + ff_hw_base_encode_get_pictype_name(base_pic->type));
> + if (base_pic->nb_refs[0] == 0 && base_pic->nb_refs[1] == 0) {
> av_log(avctx, AV_LOG_DEBUG, "No reference pictures.\n");
> } else {
> av_log(avctx, AV_LOG_DEBUG, "L0 refers to");
> - for (i = 0; i < pic->nb_refs[0]; i++) {
> + for (i = 0; i < base_pic->nb_refs[0]; i++) {
> av_log(avctx, AV_LOG_DEBUG, " %"PRId64"/%"PRId64,
> - pic->refs[0][i]->display_order, pic->refs[0][i]->encode_order);
> + base_pic->refs[0][i]->display_order, base_pic->refs[0][i]->encode_order);
> }
> av_log(avctx, AV_LOG_DEBUG, ".\n");
>
> - if (pic->nb_refs[1]) {
> + if (base_pic->nb_refs[1]) {
> av_log(avctx, AV_LOG_DEBUG, "L1 refers to");
> - for (i = 0; i < pic->nb_refs[1]; i++) {
> + for (i = 0; i < base_pic->nb_refs[1]; i++) {
> av_log(avctx, AV_LOG_DEBUG, " %"PRId64"/%"PRId64,
> - pic->refs[1][i]->display_order, pic->refs[1][i]->encode_order);
> + base_pic->refs[1][i]->display_order, base_pic->refs[1][i]->encode_order);
> }
> av_log(avctx, AV_LOG_DEBUG, ".\n");
> }
> }
>
> - av_assert0(!pic->encode_issued);
> - for (i = 0; i < pic->nb_refs[0]; i++) {
> - av_assert0(pic->refs[0][i]);
> - av_assert0(pic->refs[0][i]->encode_issued);
> + av_assert0(!base_pic->encode_issued);
> + for (i = 0; i < base_pic->nb_refs[0]; i++) {
> + av_assert0(base_pic->refs[0][i]);
> + av_assert0(base_pic->refs[0][i]->encode_issued);
> }
> - for (i = 0; i < pic->nb_refs[1]; i++) {
> - av_assert0(pic->refs[1][i]);
> - av_assert0(pic->refs[1][i]->encode_issued);
> + for (i = 0; i < base_pic->nb_refs[1]; i++) {
> + av_assert0(base_pic->refs[1][i]);
> + av_assert0(base_pic->refs[1][i]->encode_issued);
> }
>
> av_log(avctx, AV_LOG_DEBUG, "Input surface is %#x.\n", pic->input_surface);
>
> - pic->recon_image = av_frame_alloc();
> - if (!pic->recon_image) {
> - err = AVERROR(ENOMEM);
> - goto fail;
> - }
> -
> - err = av_hwframe_get_buffer(ctx->recon_frames_ref, pic->recon_image, 0);
> + err = av_hwframe_get_buffer(ctx->recon_frames_ref, base_pic->recon_image, 0);
> if (err < 0) {
> err = AVERROR(ENOMEM);
> goto fail;
> }
> - pic->recon_surface = (VASurfaceID)(uintptr_t)pic->recon_image->data[3];
> + pic->recon_surface = (VASurfaceID)(uintptr_t)base_pic->recon_image->data[3];
> av_log(avctx, AV_LOG_DEBUG, "Recon surface is %#x.\n", pic->recon_surface);
>
> pic->output_buffer_ref = ff_refstruct_pool_get(ctx->output_buffer_pool);
> @@ -343,7 +343,7 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
>
> pic->nb_param_buffers = 0;
>
> - if (pic->type == PICTURE_TYPE_IDR && ctx->codec->init_sequence_params) {
> + if (base_pic->type == PICTURE_TYPE_IDR && ctx->codec->init_sequence_params) {
> err = vaapi_encode_make_param_buffer(avctx, pic,
> VAEncSequenceParameterBufferType,
> ctx->codec_sequence_params,
> @@ -352,7 +352,7 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
> goto fail;
> }
>
> - if (pic->type == PICTURE_TYPE_IDR) {
> + if (base_pic->type == PICTURE_TYPE_IDR) {
> for (i = 0; i < ctx->nb_global_params; i++) {
> err = vaapi_encode_make_misc_param_buffer(avctx, pic,
> ctx->global_params_type[i],
> @@ -389,7 +389,7 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
> }
> #endif
>
> - if (pic->type == PICTURE_TYPE_IDR) {
> + if (base_pic->type == PICTURE_TYPE_IDR) {
> if (ctx->va_packed_headers & VA_ENC_PACKED_HEADER_SEQUENCE &&
> ctx->codec->write_sequence_header) {
> bit_len = 8 * sizeof(data);
> @@ -529,9 +529,9 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
> }
>
> #if VA_CHECK_VERSION(1, 0, 0)
> - sd = av_frame_get_side_data(pic->input_image,
> + sd = av_frame_get_side_data(base_pic->input_image,
> AV_FRAME_DATA_REGIONS_OF_INTEREST);
> - if (sd && ctx->roi_allowed) {
> + if (sd && base_ctx->roi_allowed) {
> const AVRegionOfInterest *roi;
> uint32_t roi_size;
> VAEncMiscParameterBufferROI param_roi;
> @@ -542,11 +542,11 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
> av_assert0(roi_size && sd->size % roi_size == 0);
> nb_roi = sd->size / roi_size;
> if (nb_roi > ctx->roi_max_regions) {
> - if (!ctx->roi_warned) {
> + if (!base_ctx->roi_warned) {
> av_log(avctx, AV_LOG_WARNING, "More ROIs set than "
> "supported by driver (%d > %d).\n",
> nb_roi, ctx->roi_max_regions);
> - ctx->roi_warned = 1;
> + base_ctx->roi_warned = 1;
> }
> nb_roi = ctx->roi_max_regions;
> }
> @@ -639,8 +639,6 @@ static int vaapi_encode_issue(AVCodecContext *avctx,
> }
> }
>
> - pic->encode_issued = 1;
> -
> return 0;
>
> fail_with_picture:
> @@ -657,14 +655,13 @@ fail_at_end:
> av_freep(&pic->param_buffers);
> av_freep(&pic->slices);
> av_freep(&pic->roi);
> - av_frame_free(&pic->recon_image);
> ff_refstruct_unref(&pic->output_buffer_ref);
> pic->output_buffer = VA_INVALID_ID;
> return err;
> }
All the churn in this function makes me wonder whether it would be better to have VAAPIEncodePicture *vaapi_pic and HWBaseEncodePicture *pic, to avoid some of it? (Perhaps not, given that the function uses both anyway.)
> ...
> diff --git a/libavcodec/vaapi_encode_av1.c b/libavcodec/vaapi_encode_av1.c
> index f4b4586583..393b479f99 100644
> --- a/libavcodec/vaapi_encode_av1.c
> +++ b/libavcodec/vaapi_encode_av1.c
> @@ -357,6 +357,7 @@ static int vaapi_encode_av1_write_sequence_header(AVCodecContext *avctx,
>
> static int vaapi_encode_av1_init_sequence_params(AVCodecContext *avctx)
> {
> + HWBaseEncodeContext *base_ctx = avctx->priv_data;
> VAAPIEncodeContext *ctx = avctx->priv_data;
> VAAPIEncodeAV1Context *priv = avctx->priv_data;
> AV1RawOBU *sh_obu = &priv->sh;
> @@ -438,8 +439,8 @@ static int vaapi_encode_av1_init_sequence_params(AVCodecContext *avctx)
> vseq->seq_level_idx = sh->seq_level_idx[0];
> vseq->seq_tier = sh->seq_tier[0];
> vseq->order_hint_bits_minus_1 = sh->order_hint_bits_minus_1;
> - vseq->intra_period = ctx->gop_size;
> - vseq->ip_period = ctx->b_per_p + 1;
> + vseq->intra_period = base_ctx->gop_size;
> + vseq->ip_period = base_ctx->b_per_p + 1;
>
> vseq->seq_fields.bits.enable_order_hint = sh->enable_order_hint;
>
> @@ -466,12 +467,13 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> {
> VAAPIEncodeContext *ctx = avctx->priv_data;
> VAAPIEncodeAV1Context *priv = avctx->priv_data;
> - VAAPIEncodeAV1Picture *hpic = pic->priv_data;
> + HWBaseEncodePicture *base_pic = (HWBaseEncodePicture *)pic;
base_pic should be const here?
Also consider whether this naming is the right way around.
> + VAAPIEncodeAV1Picture *hpic = base_pic->priv_data;
> AV1RawOBU *fh_obu = &priv->fh;
> AV1RawFrameHeader *fh = &fh_obu->obu.frame.header;
> VAEncPictureParameterBufferAV1 *vpic = pic->codec_picture_params;
> CodedBitstreamFragment *obu = &priv->current_obu;
> - VAAPIEncodePicture *ref;
> + HWBaseEncodePicture *ref;
> VAAPIEncodeAV1Picture *href;
> int slot, i;
> int ret;
> @@ -480,24 +482,24 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
>
> memset(fh_obu, 0, sizeof(*fh_obu));
> pic->nb_slices = priv->tile_groups;
> - pic->non_independent_frame = pic->encode_order < pic->display_order;
> + pic->non_independent_frame = base_pic->encode_order < base_pic->display_order;
> fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> fh_obu->header.obu_has_size_field = 1;
>
> - switch (pic->type) {
> + switch (base_pic->type) {
> case PICTURE_TYPE_IDR:
> - av_assert0(pic->nb_refs[0] == 0 || pic->nb_refs[1]);
> + av_assert0(base_pic->nb_refs[0] == 0 || base_pic->nb_refs[1]);
> fh->frame_type = AV1_FRAME_KEY;
> fh->refresh_frame_flags = 0xFF;
> fh->base_q_idx = priv->q_idx_idr;
> hpic->slot = 0;
> - hpic->last_idr_frame = pic->display_order;
> + hpic->last_idr_frame = base_pic->display_order;
> break;
> case PICTURE_TYPE_P:
> - av_assert0(pic->nb_refs[0]);
> + av_assert0(base_pic->nb_refs[0]);
> fh->frame_type = AV1_FRAME_INTER;
> fh->base_q_idx = priv->q_idx_p;
> - ref = pic->refs[0][pic->nb_refs[0] - 1];
> + ref = base_pic->refs[0][base_pic->nb_refs[0] - 1];
> href = ref->priv_data;
> hpic->slot = !href->slot;
> hpic->last_idr_frame = href->last_idr_frame;
> @@ -512,8 +514,8 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> vpic->ref_frame_ctrl_l0.fields.search_idx0 = AV1_REF_FRAME_LAST;
>
> /** set the 2nd nearest frame in L0 as Golden frame. */
> - if (pic->nb_refs[0] > 1) {
> - ref = pic->refs[0][pic->nb_refs[0] - 2];
> + if (base_pic->nb_refs[0] > 1) {
> + ref = base_pic->refs[0][base_pic->nb_refs[0] - 2];
> href = ref->priv_data;
> fh->ref_frame_idx[3] = href->slot;
> fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
> @@ -521,7 +523,7 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> }
> break;
> case PICTURE_TYPE_B:
> - av_assert0(pic->nb_refs[0] && pic->nb_refs[1]);
> + av_assert0(base_pic->nb_refs[0] && base_pic->nb_refs[1]);
> fh->frame_type = AV1_FRAME_INTER;
> fh->base_q_idx = priv->q_idx_b;
> fh->refresh_frame_flags = 0x0;
> @@ -534,7 +536,7 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> vpic->ref_frame_ctrl_l0.fields.search_idx0 = AV1_REF_FRAME_LAST;
> vpic->ref_frame_ctrl_l1.fields.search_idx0 = AV1_REF_FRAME_BWDREF;
>
> - ref = pic->refs[0][pic->nb_refs[0] - 1];
> + ref = base_pic->refs[0][base_pic->nb_refs[0] - 1];
> href = ref->priv_data;
> hpic->last_idr_frame = href->last_idr_frame;
> fh->primary_ref_frame = href->slot;
> @@ -543,7 +545,7 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> fh->ref_frame_idx[i] = href->slot;
> }
>
> - ref = pic->refs[1][pic->nb_refs[1] - 1];
> + ref = base_pic->refs[1][base_pic->nb_refs[1] - 1];
> href = ref->priv_data;
> fh->ref_order_hint[href->slot] = ref->display_order - href->last_idr_frame;
> for (i = AV1_REF_FRAME_GOLDEN; i < AV1_REFS_PER_FRAME; i++) {
> @@ -554,13 +556,13 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> av_assert0(0 && "invalid picture type");
> }
>
> - fh->show_frame = pic->display_order <= pic->encode_order;
> + fh->show_frame = base_pic->display_order <= base_pic->encode_order;
> fh->showable_frame = fh->frame_type != AV1_FRAME_KEY;
> fh->frame_width_minus_1 = avctx->width - 1;
> fh->frame_height_minus_1 = avctx->height - 1;
> fh->render_width_minus_1 = fh->frame_width_minus_1;
> fh->render_height_minus_1 = fh->frame_height_minus_1;
> - fh->order_hint = pic->display_order - hpic->last_idr_frame;
> + fh->order_hint = base_pic->display_order - hpic->last_idr_frame;
> fh->tile_cols = priv->tile_cols;
> fh->tile_rows = priv->tile_rows;
> fh->tile_cols_log2 = priv->tile_cols_log2;
> @@ -626,13 +628,13 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> vpic->reference_frames[i] = VA_INVALID_SURFACE;
>
> for (i = 0; i < MAX_REFERENCE_LIST_NUM; i++) {
> - for (int j = 0; j < pic->nb_refs[i]; j++) {
> - VAAPIEncodePicture *ref_pic = pic->refs[i][j];
> + for (int j = 0; j < base_pic->nb_refs[i]; j++) {
> + HWBaseEncodePicture *ref_pic = base_pic->refs[i][j];
>
> slot = ((VAAPIEncodeAV1Picture*)ref_pic->priv_data)->slot;
> av_assert0(vpic->reference_frames[slot] == VA_INVALID_SURFACE);
>
> - vpic->reference_frames[slot] = ref_pic->recon_surface;
> + vpic->reference_frames[slot] = ((VAAPIEncodePicture *)ref_pic)->recon_surface;
> }
> }
>
> @@ -653,7 +655,7 @@ static int vaapi_encode_av1_init_picture_params(AVCodecContext *avctx,
> vpic->bit_offset_cdef_params = priv->cdef_start_offset;
> vpic->size_in_bits_cdef_params = priv->cdef_param_size;
> vpic->size_in_bits_frame_hdr_obu = priv->fh_data_len;
> - vpic->byte_offset_frame_hdr_obu_size = (((pic->type == PICTURE_TYPE_IDR) ?
> + vpic->byte_offset_frame_hdr_obu_size = (((base_pic->type == PICTURE_TYPE_IDR) ?
> priv->sh_data_len / 8 : 0) +
> (fh_obu->header.obu_extension_flag ?
> 2 : 1));
> @@ -695,14 +697,15 @@ static int vaapi_encode_av1_write_picture_header(AVCodecContext *avctx,
> CodedBitstreamAV1Context *cbctx = priv->cbc->priv_data;
> AV1RawOBU *fh_obu = &priv->fh;
> AV1RawFrameHeader *rep_fh = &fh_obu->obu.frame_header;
> + HWBaseEncodePicture *base_pic = (HWBaseEncodePicture *)pic;
Also const.
> VAAPIEncodeAV1Picture *href;
> int ret = 0;
>
> pic->tail_size = 0;
> /** Pack repeat frame header. */
> - if (pic->display_order > pic->encode_order) {
> + if (base_pic->display_order > base_pic->encode_order) {
> memset(fh_obu, 0, sizeof(*fh_obu));
> - href = pic->refs[0][pic->nb_refs[0] - 1]->priv_data;
> + href = base_pic->refs[0][base_pic->nb_refs[0] - 1]->priv_data;
> fh_obu->header.obu_type = AV1_OBU_FRAME_HEADER;
> fh_obu->header.obu_has_size_field = 1;
>
> @@ -937,7 +940,7 @@ const FFCodec ff_av1_vaapi_encoder = {
> .p.id = AV_CODEC_ID_AV1,
> .priv_data_size = sizeof(VAAPIEncodeAV1Context),
> .init = &vaapi_encode_av1_init,
> - FF_CODEC_RECEIVE_PACKET_CB(&ff_vaapi_encode_receive_packet),
> + FF_CODEC_RECEIVE_PACKET_CB(&ff_hw_base_encode_receive_packet),
> .close = &vaapi_encode_av1_close,
> .p.priv_class = &vaapi_encode_av1_class,
> .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_HARDWARE |
> ...
I didn't look carefully through the other codecs here.
Thanks,
- Mark