[Libav-user] Muxed audio and video streams are not synchronized.

Thu Jun 11 11:32:51 CEST 2015

Dear libav users, 

I'm trying to do the following:

1. Decode a packet from a H264 encoded video (orig_packet),
2. Scale the resulting YUV frame to RGB24,
3. Draw a watermark on the resulting frame,
4. Scale it back to YUV420P,
5. Encode yuv frame in a new packet (styl_packet),
6. Copy dts and pts time from original packet to the stylized packet
7. Write this packet in a MPEG4 container

So far, and it's been a painful and long way to reach this point, everything works (almost)
great but I'm facing two issues:

1) Video and audio are not synchronized: the video seems to play slightly slower than it should (see the packet durations and timestamps below).
2) The resulting image quality is poor: macroblock artifacts cover the whole image (even in still parts of the frames).

Any help would be a big relief !
Thank you!

Here is the code for the stylization:
_______________________________
void vs_stylize(VS_VideoContext *in_video_ctx, VS_VideoContext *out_video_ctx)
{
    AVStream *in_stream, *out_stream;
    AVPacket orig_pkt, styl_pkt;

    VS_Picture *yuv_pix=vs_alloc_picture(in_video_ctx, VS_PIX_FMT_YUV420P);
    VS_Picture *rgb_pix=vs_alloc_picture(in_video_ctx, VS_PIX_FMT_RGB24);

    int ret, got_something;
    int idx=0;//saved frame index in name

    av_init_packet(&orig_pkt);
    av_init_packet(&styl_pkt);

    while(1)
    {
        ret=av_read_frame(in_video_ctx->format_ctx, &orig_pkt);
        if(ret<0)
        break;

        in_stream=in_video_ctx->format_ctx->streams[orig_pkt.stream_index];
        out_stream=out_video_ctx->format_ctx->streams[orig_pkt.stream_index];

        log_packet(in_video_ctx->format_ctx, &orig_pkt, "in");

        if(in_stream->codec->codec->type==AVMEDIA_TYPE_AUDIO)
        {
            // simply copy audio packet
            orig_pkt.pts = av_rescale_q_rnd(orig_pkt.pts, in_stream->time_base, out_stream->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
            orig_pkt.dts = av_rescale_q_rnd(orig_pkt.dts, in_stream->time_base, out_stream->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
            orig_pkt.duration = av_rescale_q(orig_pkt.duration, in_stream->time_base, out_stream->time_base);
            orig_pkt.pos = -1;

            ret = av_interleaved_write_frame(out_video_ctx->format_ctx, &orig_pkt);
            if (ret < 0) {
                fprintf(stderr, "Error muxing packet\n");
                break;
            }
        }
        else
        if(in_stream->codec->codec->type==AVMEDIA_TYPE_VIDEO)
        {
            //Decode packet (orig_packet),
            //Scale to RGB24,
            //Put a watermark on the frame,
            //Scale back to YUV420P,
            //Copy dts and pts time from original packet
            //Encode yuv frame in a new packet (styl_packet);
            ret=avcodec_decode_video2(in_stream->codec, yuv_pix->av_frame, &got_something, &orig_pkt);
            if(got_something!=0)
            {
                rgb_pix->av_frame=YUVtoRGB(yuv_pix->av_frame, in_stream->codec);
                waterMark(rgb_pix->av_frame, in_stream->codec);
                yuv_pix->av_frame=RGBtoYUV(rgb_pix->av_frame, in_stream->codec);
                avcodec_encode_video2(out_stream->codec, &styl_pkt, yuv_pix->av_frame, &got_something);
                if(!got_something)
                {
                    INFO(stderr, ":-( Unable to encode yuv frame.\n");
                    exit(0);
                }      
            }

            //copy timestamps
            //Note: dans transcodeing .c they use av_packet_rescale_ts()
            styl_pkt.pts = av_rescale_q_rnd(orig_pkt.pts, in_stream->time_base, out_stream->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
            styl_pkt.dts = av_rescale_q_rnd(orig_pkt.dts, in_stream->time_base, out_stream->time_base, AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
            styl_pkt.duration = av_rescale_q(orig_pkt.duration, in_stream->time_base, out_stream->time_base);
            styl_pkt.pos = -1;
            log_packet(out_video_ctx->format_ctx, &orig_pkt, "ORIG");
            log_packet(out_video_ctx->format_ctx, &styl_pkt, "STYL");
            //I noticed timestamps are totally different even when both codec contexts share the same time base.
            ret = av_interleaved_write_frame(out_video_ctx->format_ctx, &styl_pkt);
            if (ret < 0) {
                fprintf(stderr, "Error muxing packet\n");
                break;
            }
        }
        av_free_packet(&orig_pkt);
        av_free_packet(&styl_pkt);
    }
    av_write_trailer(out_video_ctx->format_ctx);

    vs_free(in_video_ctx);//free and close everything
    vs_free(out_video_ctx);
}

And the output with packet infos:
______________________________

dhcp10:vs_lib-5 Fix$ ./execute small.mp4 out_
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x7f841200a000] overread end of atom 'colr' by 1 bytes
Input #0, mov,mp4,m4a,3gp,3g2,mj2, from 'small.mp4':
  Metadata:
    major_brand     : mp42
    minor_version   : 0
    compatible_brands: mp42isomavc1
    creation_time   : 2010-03-20 21:29:11
    encoder         : HandBrake 0.9.4 2009112300
  Duration: 00:00:05.57, start: 0.000000, bitrate: 551 kb/s
    Stream #0:0(und): Video: h264 (Constrained Baseline) (avc1 / 0x31637661), yuv420p(tv, bt709), 560x320, 465 kb/s, 30 fps, 30 tbr, 90k tbn, 60 tbc (default)
    Metadata:
      creation_time   : 2010-03-20 21:29:11
      encoder         : JVT/AVC Coding
    Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 83 kb/s (default)
    Metadata:
      creation_time   : 2010-03-20 21:29:11
----------------------------------------Codec for output codec context is mpeg4
Successfully opened mpeg4 codec
[mp4 @ 0x7f8412887200] Using AVStream.codec.time_base as a timebase hint to the muxer is deprecated. Set AVStream.time_base instead.
[mp4 @ 0x7f8412887200] Codec for stream 0 does not use global headers but container format requires global headers
[mp4 @ 0x7f8412887200] Using AVStream.codec.time_base as a timebase hint to the muxer is deprecated. Set AVStream.time_base instead.
Output #0, mp4, to 'out_small.mp4':
  Metadata:
    encoder         : Lavf56.15.102
    Stream #0:0: Video: mpeg4 ( [0][0][0] / 0x0020), yuv420p, 560x320, q=2-31, 465 kb/s, 15360 tbn, 60 tbc
    Stream #0:1: Audio: aac (LC) ([64][0][0][0] / 0x0040), 48000 Hz, mono, fltp, 83 kb/s
----------------------------------------in: pts:0 pts_time:0 dts:0 dts_time:0 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:0 pts_time:0 dts:0 dts_time:0 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:0 pts_time:0 dts:0 dts_time:0 duration:512 duration_time:0.0333333 stream_index:0
in: pts:3000 pts_time:0.0333333 dts:3000 dts_time:0.0333333 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:3000 pts_time:0.195312 dts:3000 dts_time:0.195312 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:512 pts_time:0.0333333 dts:512 dts_time:0.0333333 duration:512 duration_time:0.0333333 stream_index:0
in: pts:6000 pts_time:0.0666667 dts:6000 dts_time:0.0666667 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:6000 pts_time:0.390625 dts:6000 dts_time:0.390625 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:1024 pts_time:0.0666667 dts:1024 dts_time:0.0666667 duration:512 duration_time:0.0333333 stream_index:0
in: pts:9000 pts_time:0.1 dts:9000 dts_time:0.1 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:9000 pts_time:0.585938 dts:9000 dts_time:0.585938 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:1536 pts_time:0.1 dts:1536 dts_time:0.1 duration:512 duration_time:0.0333333 stream_index:0
in: pts:0 pts_time:0 dts:0 dts_time:0 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:1024 pts_time:0.0213333 dts:1024 dts_time:0.0213333 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:2048 pts_time:0.0426667 dts:2048 dts_time:0.0426667 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:3072 pts_time:0.064 dts:3072 dts_time:0.064 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:4096 pts_time:0.0853333 dts:4096 dts_time:0.0853333 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:5120 pts_time:0.106667 dts:5120 dts_time:0.106667 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:6144 pts_time:0.128 dts:6144 dts_time:0.128 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:12000 pts_time:0.133333 dts:12000 dts_time:0.133333 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:12000 pts_time:0.78125 dts:12000 dts_time:0.78125 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:2048 pts_time:0.133333 dts:2048 dts_time:0.133333 duration:512 duration_time:0.0333333 stream_index:0
in: pts:15000 pts_time:0.166667 dts:15000 dts_time:0.166667 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:15000 pts_time:0.976562 dts:15000 dts_time:0.976562 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:2560 pts_time:0.166667 dts:2560 dts_time:0.166667 duration:512 duration_time:0.0333333 stream_index:0
in: pts:18000 pts_time:0.2 dts:18000 dts_time:0.2 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:18000 pts_time:1.17188 dts:18000 dts_time:1.17188 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:3072 pts_time:0.2 dts:3072 dts_time:0.2 duration:512 duration_time:0.0333333 stream_index:0
in: pts:21000 pts_time:0.233333 dts:21000 dts_time:0.233333 duration:3000 duration_time:0.0333333 stream_index:0
ORIG: pts:21000 pts_time:1.36719 dts:21000 dts_time:1.36719 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:3584 pts_time:0.233333 dts:3584 dts_time:0.233333 duration:512 duration_time:0.0333333 stream_index:0
in: pts:7168 pts_time:0.149333 dts:7168 dts_time:0.149333 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:8192 pts_time:0.170667 dts:8192 dts_time:0.170667 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:9216 pts_time:0.192 dts:9216 dts_time:0.192 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:10240 pts_time:0.213333 dts:10240 dts_time:0.213333 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:11264 pts_time:0.234667 dts:11264 dts_time:0.234667 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:12288 pts_time:0.256 dts:12288 dts_time:0.256 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:13312 pts_time:0.277333 dts:13312 dts_time:0.277333 duration:1024 duration_time:0.0213333 stream_index:1
in: pts:24000 pts_time:0.266667 dts:24000 dts_time:0.266667 duration:3000 duration_time:0.0333333 stream_index:0
ret 320
<-------------------- Watermarking frame
----------------------------------------
[swscaler @ 0x7f8413186000] Warning: data is not aligned! This can lead to a speedloss
ret 320
ORIG: pts:24000 pts_time:1.5625 dts:24000 dts_time:1.5625 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:4096 pts_time:0.266667 dts:4096 dts_time:0.266667 duration:512 duration_time:0.0333333 stream_index:0
in: pts:27000 pts_time:0.3 dts:27000 dts_time:0.3 duration:3000 duration_time:0.0333333 stream_index:0
ret 320
<-------------------- Watermarking frame
----------------------------------------
ret 320
ORIG: pts:27000 pts_time:1.75781 dts:27000 dts_time:1.75781 duration:3000 duration_time:0.195312 stream_index:0
STYL: pts:4608 pts_time:0.3 dts:4608 dts_time:0.3 duration:512 duration_time:0.0333333 stream_index:0
in: pts:30000 pts_time:0.333333 dts:30000 dts_time:0.333333 duration:3000 duration_time:0.0333333 stream_index:0
ret 320
<-------------------- Watermarking frame
----------------------------------------
.../... The rest is OK until the end of the process. An MPEG-4 movie is generated.

And finally the way I set the AVFormatContext with mpeg4 init:
________________________________________________________
/*
 * Create the output side of the pipeline: a muxer for `filename` (container
 * guessed from the file name), one output stream per input audio/video
 * stream, and an opened MPEG-4 encoder attached to the video stream.
 *
 * in_ctx:   opened input; width, height, bit rate and frame timing of its
 *           video stream (in_ctx->video_stream_idx) configure the encoder.
 * filename: output path; the pointer is stored in the returned context.
 *
 * Returns a malloc'ed VS_VideoContext whose file header has already been
 * written. Any failure is fatal: a message is printed and the process exits
 * with a non-zero status.
 *
 * NOTE(review): only audio and video input streams get an output stream, so
 * output indices equal input indices only when the input has no other stream
 * type in front of them - the rest of the code assumes equal indices.
 */
VS_VideoContext * get_out_video_ctx(VS_VideoContext *in_ctx, char *filename)
{
    AVFormatContext *out_fmt_ctx=NULL;
    AVOutputFormat *o_format=NULL;
    AVCodecContext *o_codec_ctx=NULL;
    AVCodecContext *in_codec_ctx=NULL;
    AVCodec *o_codec=NULL;
    VS_VideoContext *out_video_ctx=NULL;

    int i, ret, ticks;

    av_register_all();
    avcodec_register_all();

    out_video_ctx=malloc(sizeof *out_video_ctx);
    if (!out_video_ctx)
    {
        fprintf(stderr, "Could not allocate output video context\n");
        exit(1);
    }

    //Allocating the output format context (container guessed from filename)
    avformat_alloc_output_context2(&out_fmt_ctx, NULL, NULL, filename);
    if (!out_fmt_ctx)
    {
        fprintf(stderr, "Could not create output AVFormatContext\n");
        exit(1);
    }
    o_format=out_fmt_ctx->oformat;

    //find MPEG4 output codec; nothing below can work without it
    o_codec = avcodec_find_encoder(AV_CODEC_ID_MPEG4);
    if (!o_codec)
    {
        fprintf(stderr, "Could not find the MPEG4 video encoder\n");
        exit(1);
    }
    o_codec_ctx=avcodec_alloc_context3(o_codec);
    if (!o_codec_ctx)
    {
        fprintf(stderr, "Could not allocate output video codec context\n");
        exit(1);
    }

    //codec found, now we param it from the input video stream
    in_codec_ctx=in_ctx->format_ctx->streams[in_ctx->video_stream_idx]->codec;

    o_codec_ctx->codec_id=AV_CODEC_ID_MPEG4;
    if (in_codec_ctx->bit_rate > 0)//keep the encoder default when the input bit rate is unknown
        o_codec_ctx->bit_rate=in_codec_ctx->bit_rate;
    o_codec_ctx->width=in_codec_ctx->width;
    o_codec_ctx->height=in_codec_ctx->height;
    //An encoder time base must be the duration of ONE frame. A decoder's
    //time base can be finer (ticks_per_frame ticks per frame; the run logged
    //in this post shows 60 tbc for 30 fps material), and copying it verbatim
    //makes the rate control spread the bit rate over twice as many frames.
    ticks=in_codec_ctx->ticks_per_frame > 0 ? in_codec_ctx->ticks_per_frame : 1;
    o_codec_ctx->time_base.num=in_codec_ctx->time_base.num*ticks;
    o_codec_ctx->time_base.den=in_codec_ctx->time_base.den;
    o_codec_ctx->gop_size=12;
    o_codec_ctx->pix_fmt=AV_PIX_FMT_YUV420P;
    //Must be set BEFORE the encoder is opened, on the context that actually
    //encodes, so that the headers go to extradata for containers needing it.
    if (o_format->flags & AVFMT_GLOBALHEADER)
        o_codec_ctx->flags |= CODEC_FLAG_GLOBAL_HEADER;

    //Now we open it
    if(avcodec_open2(o_codec_ctx, o_codec, NULL)<0)
    {
        INFO(stderr, "Unable to open codec! [%s]\n", o_codec->name);
        INFO(stderr, "Terminate program.\n");
        exit(1);
    }

    INFO(stdout, "Codec for output codec context is %s\n", o_codec->name);
    INFO(stdout, "Successfully opened %s codec\n", o_codec->name); 

    //One output stream per input audio/video stream:
    //audio is stream-copied, video gets the MPEG4 encoder opened above.
    for (i = 0; i < in_ctx->nb_streams; i++)
    {
        AVStream *in_stream = in_ctx->format_ctx->streams[i];

        if(in_stream->codec->codec_type==AVMEDIA_TYPE_AUDIO)
        {
            AVStream *out_stream = avformat_new_stream(out_fmt_ctx, in_stream->codec->codec);
            if (!out_stream)
            {
                fprintf(stderr, "Failed allocating output stream\n");
                exit(1);
            }
            ret = avcodec_copy_context(out_stream->codec, in_stream->codec);
            if (ret < 0)
            {
                fprintf(stderr, "Failed to copy context from input to output stream codec context\n");
                exit(1);
            }
            out_stream->codec->codec_tag = 0;
            if (o_format->flags & AVFMT_GLOBALHEADER)
                out_stream->codec->flags |= CODEC_FLAG_GLOBAL_HEADER;
            //Time base hint for the muxer (the codec time base is a deprecated hint)
            out_stream->time_base = in_stream->time_base;
        }

        if(in_stream->codec->codec_type==AVMEDIA_TYPE_VIDEO)
        {
            AVStream *out_stream = avformat_new_stream(out_fmt_ctx, o_codec);
            if (!out_stream)
            {
                fprintf(stderr, "Failed allocating output stream\n");
                exit(1);
            }
            //Time base hint for the muxer (the codec time base is a deprecated hint)
            out_stream->time_base = o_codec_ctx->time_base;
        }
    }

    //The encoder is attached by INPUT video index; make sure that index
    //exists on the output side before touching it.
    if (in_ctx->video_stream_idx < 0 || (unsigned)in_ctx->video_stream_idx >= out_fmt_ctx->nb_streams)
    {
        fprintf(stderr, "Output has no stream matching the input video stream\n");
        exit(1);
    }
    //Setting codec context for out stream
    //NOTE(review): the context allocated by avformat_new_stream() for this
    //stream is replaced here and never released.
    out_fmt_ctx->streams[in_ctx->video_stream_idx]->codec=o_codec_ctx;

    //Open file to write if not already opened
    if(!(o_format->flags&AVFMT_NOFILE))
    {
        ret=avio_open(&out_fmt_ctx->pb, filename, AVIO_FLAG_WRITE);
        if(ret<0)
        {
            INFO(stderr, "Unable to open output filename. Abort program.\n");
            vs_free(in_ctx);
            exit(1);
        }
    }

    //write header for output file
    ret=avformat_write_header(out_fmt_ctx, NULL);
    if(ret<0)
    {
        INFO(stderr, "Unable to write header for output file. Abort program.\n");
        vs_free(in_ctx);
        exit(1);
    }

    //This is just a straight encapsulation of some libav stuff
    out_video_ctx->format_ctx=out_fmt_ctx;
    out_video_ctx->filename=filename;
    out_video_ctx->codec_name=(char *) out_fmt_ctx->streams[in_ctx->video_stream_idx]->codec->codec->long_name;
    out_video_ctx->video_stream_idx=in_ctx->video_stream_idx;
    out_video_ctx->audio_stream_idx=in_ctx->audio_stream_idx;
    out_video_ctx->picture_width=in_codec_ctx->width;
    out_video_ctx->picture_height=in_codec_ctx->height;
    out_video_ctx->nb_streams=in_ctx->nb_streams;

    av_dump_format(out_fmt_ctx, 0, filename, 1);
    INFO(stdout, "----------------------------------------");

    return out_video_ctx;
}

Thanks!

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://ffmpeg.org/pipermail/libav-user/attachments/20150611/4d9a857d/attachment.html>