[Libav-user] webm (vorbis) audio to aac conversion issues

Wed Jan 1 12:07:29 CET 2014

Hi,

I have written a small program to convert webm (vorbis) audio to aac
format, using FFmpeg libraries - C++ (on Windows using 32 bit Zeranoe
FFmpeg builds). After writing this program, I find it is sometimes
converting files as per expectation, and at other times, results in larger
duration files, and audio playback is broken/awkward as well.

This code appears to be working fine for mp3, which also uses FLTP format
(same as vorbis), so technically both look similar.

Please see below sample code I am using:

    ////////////////////////////////////////////////
    #include "stdafx.h"

    #include <iostream>
    #include <fstream>

    #include <string>
    #include <vector>
    #include <map>

    #include <deque>
    #include <queue>

    #include <math.h>
    #include <stdlib.h>
    #include <stdio.h>
    #include <conio.h>

    extern "C"
    {
    #include "libavcodec/avcodec.h"
    #include "libavformat/avformat.h"
    #include "libavdevice/avdevice.h"
    #include "libswscale/swscale.h"
    #include "libavutil/dict.h"
    #include "libavutil/error.h"
    #include "libavutil/opt.h"
    #include <libavutil/fifo.h>
    #include <libavutil/imgutils.h>
    #include <libavutil/samplefmt.h>
    #include <libswresample/swresample.h>
    }

    AVFormatContext*    fmt_ctx= NULL;
    int                    audio_stream_index = -1;
    AVCodecContext *    codec_ctx_audio = NULL;
    AVCodec*            codec_audio = NULL;
    AVFrame*            decoded_frame = NULL;
    uint8_t**            audio_dst_data = NULL;
    int                    got_frame = 0;
    int                    audiobufsize = 0;
    AVPacket            input_packet;
    int                    audio_dst_linesize = 0;
    int                    audio_dst_bufsize = 0;
    SwrContext *        swr = NULL;

    AVOutputFormat *    output_format = NULL ;
    AVFormatContext *    output_fmt_ctx= NULL;
    AVStream *            audio_st = NULL;
    AVCodec *            audio_codec = NULL;
    double                audio_pts = 0.0;
    AVFrame *            out_frame = avcodec_alloc_frame();

    int                    audio_input_frame_size = 0;

    uint8_t *            audio_data_buf = NULL;
    uint8_t *            audio_out = NULL;
    int                    audio_bit_rate;
    int                    audio_sample_rate;
    int                    audio_channels;

    int decode_packet();
    int open_audio_input(char* src_filename);
    int decode_frame();

    int open_encoder(char* output_filename);
    AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
        enum AVCodecID codec_id);
    int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);
    void close_audio(AVFormatContext *oc, AVStream *st);
    void write_audio_frame(uint8_t ** audio_src_data, int
audio_src_bufsize);

    int open_audio_input(char* src_filename)
    {
        int i =0;
        /* open input file, and allocate format context */
        if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
        {
            fprintf(stderr, "Could not open source file %s\n",
src_filename);
            exit(1);
        }

        // Retrieve stream information
        if(avformat_find_stream_info(fmt_ctx, NULL)<0)
            return -1; // Couldn't find stream information

        // Dump information about file onto standard error
        av_dump_format(fmt_ctx, 0, src_filename, 0);

        // Find the first video stream
        for(i=0; i<fmt_ctx->nb_streams; i++)
        {
            if(fmt_ctx->streams[i]->codec->codec_type==AVMEDIA_TYPE_AUDIO)
            {
                audio_stream_index=i;
                break;
            }
        }
        if ( audio_stream_index != -1 )
        {
            // Get a pointer to the codec context for the audio stream
            codec_ctx_audio=fmt_ctx->streams[audio_stream_index]->codec;

            // Find the decoder for the video stream
            codec_audio=avcodec_find_decoder(codec_ctx_audio->codec_id);
            if(codec_audio==NULL) {
                fprintf(stderr, "Unsupported audio codec!\n");
                return -1; // Codec not found
            }

            // Open codec
            AVDictionary *codecDictOptions = NULL;
            if(avcodec_open2(codec_ctx_audio, codec_audio,
&codecDictOptions)<0)
                return -1; // Could not open codec

            // Set up SWR context once you've got codec information
            swr = swr_alloc();
            av_opt_set_int(swr, "in_channel_layout",
codec_ctx_audio->channel_layout, 0);
            av_opt_set_int(swr, "out_channel_layout",
codec_ctx_audio->channel_layout,  0);
            av_opt_set_int(swr, "in_sample_rate",
codec_ctx_audio->sample_rate, 0);
            av_opt_set_int(swr, "out_sample_rate",
codec_ctx_audio->sample_rate, 0);
            av_opt_set_sample_fmt(swr, "in_sample_fmt",
codec_ctx_audio->sample_fmt, 0);
            av_opt_set_sample_fmt(swr, "out_sample_fmt",
AV_SAMPLE_FMT_S16,  0);
            swr_init(swr);

            // Allocate audio frame
            if ( decoded_frame == NULL ) decoded_frame =
avcodec_alloc_frame();
            int nb_planes = 0;
            AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];
            nb_planes =
av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ?
codec_ctx_audio->channels : 1;
            int tempSize =  sizeof(uint8_t *) * nb_planes;
            audio_dst_data = (uint8_t**)av_mallocz(tempSize);
            if (!audio_dst_data)
            {
                fprintf(stderr, "Could not allocate audio data buffers\n");
            }
            else
            {
                for ( int i = 0 ; i < nb_planes ; i ++ )
                {
                    audio_dst_data[i] = NULL;
                }
            }
        }
    }

    int decode_frame()
    {
        int rv = 0;
        got_frame = 0;
        if ( fmt_ctx == NULL  )
        {
            return rv;
        }
        int ret = 0;
        audiobufsize = 0;
        rv = av_read_frame(fmt_ctx, &input_packet);
        if ( rv < 0 )
        {
            return rv;
        }
        rv = decode_packet();
        // Free the input_packet that was allocated by av_read_frame
        av_free_packet(&input_packet);
        return rv;
    }

    int decode_packet()
    {
        int rv = 0;
        int ret = 0;

        //audio stream?
        if(input_packet.stream_index == audio_stream_index)
        {
            /* decode audio frame */
            rv = avcodec_decode_audio4(codec_ctx_audio, decoded_frame,
&got_frame, &input_packet);
            if (rv < 0)
            {
                fprintf(stderr, "Error decoding audio frame\n");
                //return ret;
            }
            else
            {
                if (got_frame)
                {
                    if ( audio_dst_data[0] == NULL )
                    {
                         ret = av_samples_alloc(audio_dst_data,
&audio_dst_linesize, decoded_frame->channels,
                            decoded_frame->nb_samples,
(AVSampleFormat)decoded_frame->format, 1);
                        if (ret < 0)
                        {
                            fprintf(stderr, "Could not allocate audio
buffer\n");
                            return AVERROR(ENOMEM);
                        }
                        /* TODO: extend return code of the av_samples_*
functions so that this call is not needed */
                        audio_dst_bufsize =
av_samples_get_buffer_size(NULL, audio_st->codec->channels,
                            decoded_frame->nb_samples,
(AVSampleFormat)decoded_frame->format, 1);

                        //int16_t* outputBuffer = ...;
                        swr_convert(swr, audio_dst_data,
out_frame->nb_samples,
                                    (const uint8_t
**)(decoded_frame->data), decoded_frame->nb_samples);
                        //swr_convert( swr, audio_dst_data,
out_frame->nb_samples, (const uint8_t**) decoded_frame->extended_data,
decoded_frame->nb_samples );
                    }
                    /* copy audio data to destination buffer:
                    * this is required since rawaudio expects non aligned
data */
                    //av_samples_copy(audio_dst_data, decoded_frame->data,
0, 0,
                    //    decoded_frame->nb_samples,
decoded_frame->channels, (AVSampleFormat)decoded_frame->format);
                }
            }
        }
        return rv;
    }

    int open_encoder(char* output_filename )
    {
        int rv = 0;

        /* allocate the output media context */
        AVOutputFormat *opfmt = NULL;

        avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL,
output_filename);
        if (!output_fmt_ctx) {
            printf("Could not deduce output format from file extension:
using MPEG.\n");
            avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg",
output_filename);
        }
        if (!output_fmt_ctx) {
            rv = -1;
        }
        else
        {
            output_format = output_fmt_ctx->oformat;
        }

        /* Add the audio stream using the default format codecs
        * and initialize the codecs. */
        audio_st = NULL;

        if ( output_fmt_ctx )
        {
            if (output_format->audio_codec != AV_CODEC_ID_NONE)
            {
                audio_st = add_audio_stream(output_fmt_ctx, &audio_codec,
output_format->audio_codec);
            }

            /* Now that all the parameters are set, we can open the audio
and
            * video codecs and allocate the necessary encode buffers. */
            if (audio_st)
            {
                rv = open_audio(output_fmt_ctx, audio_codec, audio_st);
                if ( rv < 0 ) return rv;
            }

            av_dump_format(output_fmt_ctx, 0, output_filename, 1);
            /* open the output file, if needed */
            if (!(output_format->flags & AVFMT_NOFILE))
            {
                if (avio_open(&output_fmt_ctx->pb, output_filename,
AVIO_FLAG_WRITE) < 0) {
                    fprintf(stderr, "Could not open '%s'\n",
output_filename);
                    rv = -1;
                }
                else
                {
                    /* Write the stream header, if any. */
                    if (avformat_write_header(output_fmt_ctx, NULL) < 0)
                    {
                        fprintf(stderr, "Error occurred when opening output
file\n");
                        rv = -1;
                    }
                }
            }
        }

        return rv;
    }

    AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,
        enum AVCodecID codec_id)
    {
        AVCodecContext *c;
        AVStream *st;

        /* find the audio encoder */
        *codec = avcodec_find_encoder(codec_id);
        if (!(*codec)) {
            fprintf(stderr, "Could not find codec\n");
            exit(1);
        }

        st = avformat_new_stream(oc, *codec);
        if (!st) {
            fprintf(stderr, "Could not allocate stream\n");
            exit(1);
        }
        st->id = 1;

        c = st->codec;

        /* put sample parameters */
        c->sample_fmt  = AV_SAMPLE_FMT_S16;
        c->bit_rate    = audio_bit_rate;
        c->sample_rate = audio_sample_rate;
        c->channels    = audio_channels;

        // some formats want stream headers to be separate
        if (oc->oformat->flags & AVFMT_GLOBALHEADER)
            c->flags |= CODEC_FLAG_GLOBAL_HEADER;

        return st;
    }

    int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)
    {
        int ret=0;
        AVCodecContext *c;

        st->duration = fmt_ctx->duration;
        c = st->codec;

        /* open it */
        ret = avcodec_open2(c, codec, NULL) ;
        if ( ret < 0)
        {
            fprintf(stderr, "could not open codec\n");
            return -1;
            //exit(1);
        }

        if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)
            audio_input_frame_size = 10000;
        else
            audio_input_frame_size = c->frame_size;
        int tempSize = audio_input_frame_size *
            av_get_bytes_per_sample(c->sample_fmt) *
            c->channels;
        return ret;
    }

    void close_audio(AVFormatContext *oc, AVStream *st)
    {
        avcodec_close(st->codec);
    }

    void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize)
    {
        AVFormatContext *oc = output_fmt_ctx;
        AVStream *st = audio_st;
        if ( oc == NULL || st == NULL ) return;
        AVCodecContext *c;
        AVPacket pkt = { 0 }; // data and size must be 0;
        int got_packet;

        av_init_packet(&pkt);
        c = st->codec;

        out_frame->nb_samples = audio_input_frame_size;
        int buf_size =         audio_src_bufsize *
            av_get_bytes_per_sample(c->sample_fmt) *
            c->channels;
        avcodec_fill_audio_frame(out_frame, c->channels, c->sample_fmt,
            (uint8_t *) *audio_src_data,
            buf_size, 1);
        avcodec_encode_audio2(c, &pkt, out_frame, &got_packet);
        if (!got_packet)
        {
        }
        else
        {
            if (pkt.pts != AV_NOPTS_VALUE)
                pkt.pts =  av_rescale_q(pkt.pts, st->codec->time_base,
st->time_base);
            if (pkt.dts != AV_NOPTS_VALUE)
                pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base,
st->time_base);
            if ( c && c->coded_frame && c->coded_frame->key_frame)
                pkt.flags |= AV_PKT_FLAG_KEY;

             pkt.stream_index = st->index;
            pkt.flags |= AV_PKT_FLAG_KEY;
            /* Write the compressed frame to the media file. */
            if (av_interleaved_write_frame(oc, &pkt) != 0)
            {
                fprintf(stderr, "Error while writing audio frame\n");
                exit(1);
            }
        }
        av_free_packet(&pkt);
    }

    void write_delayed_frames(AVFormatContext *oc, AVStream *st)
    {
        AVCodecContext *c = st->codec;
        int got_output = 0;
        int ret = 0;
        AVPacket pkt;
        pkt.data = NULL;
        pkt.size = 0;
        av_init_packet(&pkt);
        int i = 0;
        for (got_output = 1; got_output; i++)
        {
            ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);
            if (ret < 0)
            {
                fprintf(stderr, "error encoding frame\n");
                exit(1);
            }
            static int64_t tempPts = 0;
            static int64_t tempDts = 0;
            /* If size is zero, it means the image was buffered. */
            if (got_output)
            {
                if (pkt.pts != AV_NOPTS_VALUE)
                    pkt.pts =  av_rescale_q(pkt.pts, st->codec->time_base,
st->time_base);
                if (pkt.dts != AV_NOPTS_VALUE)
                    pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base,
st->time_base);
                if ( c && c->coded_frame && c->coded_frame->key_frame)
                    pkt.flags |= AV_PKT_FLAG_KEY;

                pkt.stream_index = st->index;
                /* Write the compressed frame to the media file. */
                ret = av_interleaved_write_frame(oc, &pkt);
            }
            else
            {
                ret = 0;
            }
            av_free_packet(&pkt);
        }
    }

    int main(int argc, char **argv)
    {
        /* register all formats and codecs */
        av_register_all();
        avcodec_register_all();
        avformat_network_init();
        avdevice_register_all();
        int i =0;
        int ret=0;
        char src_filename[90] = "test_a.webm";
        char dst_filename[90] = "output.aac";
        open_audio_input(src_filename);
        if ( codec_ctx_audio->bit_rate == 0 ) codec_ctx_audio->bit_rate =
112000;
        audio_bit_rate        = codec_ctx_audio->bit_rate;
        audio_sample_rate    = codec_ctx_audio->sample_rate;
        audio_channels        = codec_ctx_audio->channels;
        open_encoder( dst_filename );
        while(1)
        {
            int rv = decode_frame();
            if ( rv < 0 )
            {
                break;
            }

            if (audio_st)
            {
                audio_pts = (double)audio_st->pts.val *
audio_st->time_base.num /
                    audio_st->time_base.den;
            }
            else
            {
                audio_pts = 0.0;
            }
            if ( codec_ctx_audio )
            {
                if ( got_frame)
                {
                    write_audio_frame( audio_dst_data, audio_dst_bufsize );
                }
            }
            if ( audio_dst_data[0] )
            {
                av_freep(&audio_dst_data[0]);
                audio_dst_data[0] = NULL;
            }
            printf("\naudio_pts: %.3f", audio_pts);
        }
        while(1)
        {
            if ( audio_dst_data && audio_dst_data[0] )
            {
                av_freep(&audio_dst_data[0]);
                audio_dst_data[0] = NULL;
            }
            ret = av_samples_alloc(audio_dst_data, NULL,
codec_ctx_audio->channels,
                decoded_frame->nb_samples, AV_SAMPLE_FMT_S16, 0);
            ret = swr_convert(swr, audio_dst_data,
out_frame->nb_samples,NULL, 0);
            if ( ret <= 0 ) break;
            write_audio_frame( audio_dst_data, audio_dst_bufsize );
        }
        write_delayed_frames( output_fmt_ctx, audio_st );
        av_write_trailer(output_fmt_ctx);
        close_audio( output_fmt_ctx, audio_st);
        swr_free(&swr);
        avcodec_free_frame(&out_frame);
        getch();
        return 0;
    }

"test_a.webm" input file results in longer duration (40 second output), and
if I change it to "jet.webm", it is converted fine.

Both input files are approximately 18 second duration.

For reference, these files can be downloaded from links below:

http://www.filedropper.com/testa ,
http://www.filedropper.com/jet

Alternatively, they are zipped and uploaded elsewhere as well:

http://www.files.com/shared/52c3eefe990ea/test_audio_files.zip

Could someone kindly guide on what I am doing wrong here?

Thanks in advance...

p.s. These files are taken/extracted from different online sources/demos;
also posted on SO:
http://stackoverflow.com/questions/20867959/ffmpeg-library-webm-vorbis-audio-to-aac-conversion
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://ffmpeg.org/pipermail/libav-user/attachments/20140101/12f3c3df/attachment.html>