[FFmpeg-devel] [PATCH] avcodec/cuviddec: correctly handle buffer size and status when deinterlacing

Wed Feb 26 21:58:46 EET 2025

On 26.02.2025 03:42, Scott Theisen wrote:
> On 2/25/25 13:43, Timo Rothenpieler wrote:
>> ---
>>   libavcodec/cuviddec.c | 24 +++++++++++++-----------
>>   1 file changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/libavcodec/cuviddec.c b/libavcodec/cuviddec.c
>> index 67076a1752..312742fb8c 100644
>> --- a/libavcodec/cuviddec.c
>> +++ b/libavcodec/cuviddec.c
>> @@ -131,7 +131,7 @@ static int CUDAAPI 
>> cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>>       CUVIDDECODECREATEINFO cuinfo;
>>       int surface_fmt;
>>       int chroma_444;
>> -    int fifo_size_inc;
>> +    int old_nb_surfaces, fifo_size_inc, fifo_size_mul = 1;
>>       int old_width = avctx->width;
>>       int old_height = avctx->height;
>> @@ -349,20 +349,24 @@ static int CUDAAPI 
>> cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>>           return 0;
>>       }
>> -    fifo_size_inc = ctx->nb_surfaces;
>> -    ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, format- 
>> >min_num_decode_surfaces + 3);
>> +    if (ctx->deint_mode_current != cudaVideoDeinterlaceMode_Weave 
>> && !ctx->drop_second_field) {
>> +        avctx->framerate = av_mul_q(avctx->framerate, (AVRational){2, 
>> 1});
>> +        fifo_size_mul = 2;
>> +    }
>> +    old_nb_surfaces = ctx->nb_surfaces;
>> +    ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, format- 
>> >min_num_decode_surfaces + 3);
>>       if (avctx->extra_hw_frames > 0)
>>           ctx->nb_surfaces += avctx->extra_hw_frames;
>> -    fifo_size_inc = ctx->nb_surfaces - fifo_size_inc;
>> +    fifo_size_inc = ctx->nb_surfaces * fifo_size_mul - 
>> av_fifo_can_read(ctx->frame_queue) - av_fifo_can_write(ctx->frame_queue);
>>       if (fifo_size_inc > 0 && av_fifo_grow2(ctx->frame_queue, 
>> fifo_size_inc) < 0) {
>>           av_log(avctx, AV_LOG_ERROR, "Failed to grow frame queue on 
>> video sequence callback\n");
>>           ctx->internal_error = AVERROR(ENOMEM);
>>           return 0;
>>       }
>> -    if (fifo_size_inc > 0 && av_reallocp_array(&ctx->key_frame, ctx- 
>> >nb_surfaces, sizeof(int)) < 0) {
>> +    if (ctx->nb_surfaces > old_nb_surfaces && av_reallocp_array(&ctx- 
>> >key_frame, ctx->nb_surfaces, sizeof(int)) < 0) {
>>           av_log(avctx, AV_LOG_ERROR, "Failed to grow key frame array 
>> on video sequence callback\n");
>>           ctx->internal_error = AVERROR(ENOMEM);
>>           return 0;
>> @@ -374,9 +378,6 @@ static int CUDAAPI 
>> cuvid_handle_video_sequence(void *opaque, CUVIDEOFORMAT* form
>>       cuinfo.bitDepthMinus8 = format->bit_depth_luma_minus8;
>>       cuinfo.DeinterlaceMode = ctx->deint_mode_current;
>> -    if (ctx->deint_mode_current != cudaVideoDeinterlaceMode_Weave 
>> && !ctx->drop_second_field)
>> -        avctx->framerate = av_mul_q(avctx->framerate, (AVRational){2, 
>> 1});
>> -
>>       ctx->internal_error = CHECK_CU(ctx->cvdl- 
>> >cuvidCreateDecoder(&ctx->cudecoder, &cuinfo));
>>       if (ctx->internal_error < 0)
>>           return 0;
>> @@ -448,11 +449,12 @@ static int cuvid_is_buffer_full(AVCodecContext 
>> *avctx)
>>   {
>>       CuvidContext *ctx = avctx->priv_data;
>> -    int delay = ctx->cuparseinfo.ulMaxDisplayDelay;
>> +    int mult = 1;
>>       if (ctx->deint_mode != cudaVideoDeinterlaceMode_Weave && !ctx- 
>> >drop_second_field)
>> -        delay *= 2;
>> +        mult = 2;
>> -    return av_fifo_can_read(ctx->frame_queue) + delay >= ctx- 
>> >nb_surfaces;
>> +    // "- mult + 1" ensures that the buffer is still signalled full 
>> if one half-frame has already been returned when deinterlacing.
>> +    return av_fifo_can_read(ctx->frame_queue) + (ctx- 
>> >cuparseinfo.ulMaxDisplayDelay * mult) >= ctx->nb_surfaces * mult - 
>> mult + 1;
> 
> I think this is clearer:
> return ((av_fifo_can_read(ctx->frame_queue) + mult - 1) / mult) + ctx- 
>  >cuparseinfo.ulMaxDisplayDelay >= ctx->nb_surfaces

Yeah, though I've used a shift instead, purely as a minimal speed 
optimization.

> Integer ceiling division to get the number of referenced surfaces in 
> frame_queue.
> 
> However, when going from mult = 2 to 1, it thinks the buffer is more 
> full than it really is, which probably isn't a problem. Unfortunately, 
> when going from mult = 1 to 2, if there is more than one frame in 
> frame_queue, it will think there are fewer surfaces referenced than there 
> are, which may be a problem.

That's not the case; only new frames added to the queue are doubled.

Patch is applied now, thanks!