<div dir="ltr"><div><div><div><div><div><div>Hi!<br><br></div>I have made a small application to extract audio from an mp4 file, or simply convert an existing audio file to AAC/mp4 format (either raw AAC, or inside an mp4 container). I have run this application with existing mp4 files as input, and it properly extracts the audio and encodes it to mp4 (audio only: AAC), or even directly to raw AAC format (i.e. test.aac also works). But when I tried running it on mp3 files, the output clip plays faster than it should (a clip of 1:12 plays back for only 1:05, and is also noisy).<br>
<br></div>Here is the code I have written to achieve this:<br><br>////////////////////////////////////////////////<br><br>#include "stdafx.h"<br><br>#include <iostream><br>#include <fstream><br><br>
#include <string><br>#include <vector><br>#include <map><br><br>#include <deque><br>#include <queue><br><br>#include <math.h><br>#include <stdlib.h><br>#include <stdio.h><br>
#include <conio.h><br><br>extern "C"<br>{<br>#include "libavcodec/avcodec.h"<br>#include "libavformat/avformat.h"<br>#include "libavdevice/avdevice.h"<br>#include "libswscale/swscale.h"<br>
#include "libavutil/dict.h"<br>#include "libavutil/error.h"<br>#include "libavutil/opt.h"<br>#include <libavutil/fifo.h><br>#include <libavutil/imgutils.h><br>#include <libavutil/samplefmt.h><br>
}<br><br>AVFormatContext* fmt_ctx= NULL;<br>int audio_stream_index = -1;<br>AVCodecContext *codec_ctx_audio = NULL;<br>AVCodec* codec_audio = NULL;<br>AVFrame* decoded_frame = NULL;<br>uint8_t** audio_dst_data = NULL;<br>
int got_frame = 0;<br>int audiobufsize = 0;<br>AVPacket input_packet;<br>int audio_dst_linesize = 0;<br>int audio_dst_bufsize = 0;<br><br>AVOutputFormat *output_format = NULL ;<br>
AVFormatContext *output_fmt_ctx = NULL;<br>AVStream *audio_st = NULL;<br>AVCodec *audio_codec = NULL;<br>double audio_pts = 0.0;<br><br>int audio_input_frame_size = 0;<br>
<br>uint8_t *audio_data_buf = NULL;<br>uint8_t *audio_out = NULL;<br>int audio_bit_rate;<br>int audio_sample_rate;<br>int audio_channels;<br><br>int decode_packet();<br>int open_audio_input(char* src_filename);<br>
int decode_frame();<br><br>int open_encoder(char* output_filename);<br>AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,<br> enum AVCodecID codec_id);<br>int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st);<br>
void close_audio(AVFormatContext *oc, AVStream *st);<br>void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize);<br><br>int open_audio_input(char* src_filename)<br>{<br> int i =0;<br> /* open input file, and allocate format context */<br>
if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0) <br> {<br> fprintf(stderr, "Could not open source file %s\n", src_filename);<br> exit(1);<br> }<br><br> // Retrieve stream information<br>
if(avformat_find_stream_info(fmt_ctx, NULL)<0)<br> return -1; // Couldn't find stream information<br><br> // Dump information about file onto standard error<br> av_dump_format(fmt_ctx, 0, src_filename, 0);<br>
<br> // Find the first audio stream<br> for(i=0; i<fmt_ctx->nb_streams; i++)<br> {<br> if(fmt_ctx->streams[i]->codec->codec_type==AVMEDIA_TYPE_AUDIO) <br> {<br> audio_stream_index=i;<br>
break;<br> }<br> }<br> if ( audio_stream_index != -1 )<br> {<br> // Get a pointer to the codec context for the audio stream<br> codec_ctx_audio=fmt_ctx->streams[audio_stream_index]->codec;<br>
<br> // Find the decoder for the audio stream<br> codec_audio=avcodec_find_decoder(codec_ctx_audio->codec_id);<br> if(codec_audio==NULL) {<br> fprintf(stderr, "Unsupported audio codec!\n");<br>
return -1; // Codec not found<br> }<br><br> // Open codec<br> AVDictionary *codecDictOptions = NULL;<br> if(avcodec_open2(codec_ctx_audio, codec_audio, &codecDictOptions)<0)<br>
return -1; // Could not open codec<br><br> // Allocate audio frame<br> if ( decoded_frame == NULL ) decoded_frame = avcodec_alloc_frame();<br> int nb_planes = 0;<br> AVStream* audio_stream = fmt_ctx->streams[audio_stream_index];<br>
nb_planes = av_sample_fmt_is_planar(codec_ctx_audio->sample_fmt) ?<br> codec_ctx_audio->channels : 1;<br> int tempSize = sizeof(uint8_t *) * nb_planes;<br> audio_dst_data = (uint8_t**)av_mallocz(tempSize);<br>
if (!audio_dst_data) <br> {<br> fprintf(stderr, "Could not allocate audio data buffers\n");<br> }<br> else<br> {<br> for ( int i = 0 ; i < nb_planes ; i ++ )<br>
{<br> audio_dst_data[i] = NULL;<br> }<br> }<br> }<br>}<br><br><br>int decode_frame()<br>{<br> int rv = 0;<br> got_frame = 0;<br> if ( fmt_ctx == NULL )<br> {<br>
return rv;<br> }<br> int ret = 0;<br> audiobufsize = 0;<br> rv = av_read_frame(fmt_ctx, &input_packet);<br> if ( rv < 0 )<br> {<br> return rv;<br> }<br> rv = decode_packet();<br>
// Free the input_packet that was allocated by av_read_frame<br> av_free_packet(&input_packet);<br> return rv;<br>}<br><br>int decode_packet()<br>{<br> int rv = 0;<br> int ret = 0;<br><br> //audio stream?<br>
if(input_packet.stream_index == audio_stream_index) <br> {<br> /* decode audio frame */<br> rv = avcodec_decode_audio4(codec_ctx_audio, decoded_frame, &got_frame, &input_packet);<br> if (rv < 0) <br>
{<br> fprintf(stderr, "Error decoding audio frame\n");<br> //return ret;<br> }<br> else<br> {<br> if (got_frame) <br> {<br> if ( audio_dst_data[0] == NULL )<br>
{<br> ret = av_samples_alloc(audio_dst_data, &audio_dst_linesize, decoded_frame->channels,<br> decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);<br>
if (ret < 0) <br> {<br> fprintf(stderr, "Could not allocate audio buffer\n");<br> return AVERROR(ENOMEM);<br> }<br>
/* TODO: extend return code of the av_samples_* functions so that this call is not needed */<br> audio_dst_bufsize =<br> av_samples_get_buffer_size(NULL, decoded_frame->channels,<br>
decoded_frame->nb_samples, (AVSampleFormat)decoded_frame->format, 1);<br> }<br> /* copy audio data to destination buffer:<br> * this is required since rawaudio expects non aligned data */<br>
av_samples_copy(audio_dst_data, decoded_frame->data, 0, 0,<br> decoded_frame->nb_samples, decoded_frame->channels, (AVSampleFormat)decoded_frame->format);<br> }<br>
}<br> }<br> return rv;<br>}<br><br><br>int open_encoder(char* output_filename )<br>{<br> int rv = 0;<br><br> /* allocate the output media context */<br> AVOutputFormat *opfmt = NULL;<br><br> avformat_alloc_output_context2(&output_fmt_ctx, opfmt, NULL, output_filename);<br>
if (!output_fmt_ctx) {<br> printf("Could not deduce output format from file extension: using MPEG.\n");<br> avformat_alloc_output_context2(&output_fmt_ctx, NULL, "mpeg", output_filename);<br>
}<br> if (!output_fmt_ctx) {<br> rv = -1;<br> }<br> else<br> {<br> output_format = output_fmt_ctx->oformat;<br> }<br><br> /* Add the audio stream using the default format codecs<br> * and initialize the codecs. */<br>
audio_st = NULL;<br><br> if ( output_fmt_ctx )<br> {<br> if (output_format->audio_codec != AV_CODEC_ID_NONE) <br> {<br> audio_st = add_audio_stream(output_fmt_ctx, &audio_codec, output_format->audio_codec);<br>
}<br><br> /* Now that all the parameters are set, we can open the audio and<br> * video codecs and allocate the necessary encode buffers. */<br> if (audio_st)<br> {<br> rv = open_audio(output_fmt_ctx, audio_codec, audio_st);<br>
if ( rv < 0 ) return rv;<br> }<br><br> av_dump_format(output_fmt_ctx, 0, output_filename, 1);<br> /* open the output file, if needed */<br> if (!(output_format->flags & AVFMT_NOFILE)) <br>
{<br> if (avio_open(&output_fmt_ctx->pb, output_filename, AVIO_FLAG_WRITE) < 0) {<br> fprintf(stderr, "Could not open '%s'\n", output_filename);<br> rv = -1;<br>
}<br> else<br> {<br> /* Write the stream header, if any. */<br> if (avformat_write_header(output_fmt_ctx, NULL) < 0) <br> {<br> fprintf(stderr, "Error occurred when opening output file\n");<br>
rv = -1;<br> }<br> }<br> }<br> }<br> <br> return rv;<br>}<br><br>AVStream *add_audio_stream(AVFormatContext *oc, AVCodec **codec,<br> enum AVCodecID codec_id)<br>
{<br> AVCodecContext *c;<br> AVStream *st;<br><br> /* find the audio encoder */<br> *codec = avcodec_find_encoder(codec_id);<br> if (!(*codec)) {<br> fprintf(stderr, "Could not find codec\n");<br>
exit(1);<br> }<br><br> st = avformat_new_stream(oc, *codec);<br> if (!st) {<br> fprintf(stderr, "Could not allocate stream\n");<br> exit(1);<br> }<br> st->id = 1;<br><br> c = st->codec;<br>
<br> /* put sample parameters */<br> c->sample_fmt = AV_SAMPLE_FMT_S16;<br> c->bit_rate = audio_bit_rate;<br> c->sample_rate = audio_sample_rate;<br> c->channels = audio_channels;<br><br>
// some formats want stream headers to be separate<br> if (oc->oformat->flags & AVFMT_GLOBALHEADER)<br> c->flags |= CODEC_FLAG_GLOBAL_HEADER;<br><br> return st;<br>}<br><br>int open_audio(AVFormatContext *oc, AVCodec *codec, AVStream *st)<br>
{<br> int ret=0;<br> AVCodecContext *c;<br><br> c = st->codec;<br><br> /* open it */<br> if (avcodec_open2(c, codec, NULL) < 0) {<br> fprintf(stderr, "could not open codec\n");<br> return -1;<br>
//exit(1);<br> }<br><br> if (c->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)<br> audio_input_frame_size = 10000;<br> else<br> audio_input_frame_size = c->frame_size;<br>
int tempSize = audio_input_frame_size *<br> av_get_bytes_per_sample(c->sample_fmt) *<br> c->channels;<br> return ret;<br>}<br><br>void close_audio(AVFormatContext *oc, AVStream *st)<br>{<br> avcodec_close(st->codec);<br>
}<br><br>void write_audio_frame(uint8_t ** audio_src_data, int audio_src_bufsize)<br>{<br> AVFormatContext *oc = output_fmt_ctx;<br> AVStream *st = audio_st;<br> if ( oc == NULL || st == NULL ) return;<br> AVCodecContext *c;<br>
AVPacket pkt = { 0 }; // data and size must be 0;<br> AVFrame *frame = avcodec_alloc_frame();<br> int got_packet;<br><br> av_init_packet(&pkt);<br> c = st->codec;<br><br> frame->nb_samples = audio_input_frame_size;<br>
int buf_size = audio_src_bufsize *<br> av_get_bytes_per_sample(c->sample_fmt) *<br> c->channels;<br> avcodec_fill_audio_frame(frame, c->channels, c->sample_fmt,<br> (uint8_t *) *audio_src_data,<br>
buf_size, 1);<br> avcodec_encode_audio2(c, &pkt, frame, &got_packet);<br> if (!got_packet)<br> {<br> avcodec_free_frame(&frame);<br> }<br> else<br> {<br> pkt.stream_index = st->index;<br>
/* Write the compressed frame to the media file. */<br> if (av_interleaved_write_frame(oc, &pkt) != 0) <br> {<br> fprintf(stderr, "Error while writing audio frame\n");<br> exit(1);<br>
}<br> avcodec_free_frame(&frame);<br> }<br> av_free_packet(&pkt);<br>}<br><br><br>void write_delayed_frames(AVFormatContext *oc, AVStream *st)<br>{<br> AVCodecContext *c = st->codec;<br>
int got_output = 0;<br> int ret = 0;<br> AVPacket pkt;<br> pkt.data = NULL;<br> pkt.size = 0;<br> av_init_packet(&pkt);<br> int i = 0;<br> //int got_packet;<br><br> <br> for (got_output = 1; got_output; i++) <br>
{<br> ret = avcodec_encode_audio2(c, &pkt, NULL, &got_output);<br> if (ret < 0) <br> {<br> fprintf(stderr, "error encoding frame\n");<br> exit(1);<br> }<br>
static int64_t tempPts = 0;<br> static int64_t tempDts = 0;<br> /* If size is zero, it means the image was buffered. */<br> if (got_output) <br> {<br> if (pkt.pts != AV_NOPTS_VALUE)<br>
pkt.pts = av_rescale_q(pkt.pts, st->codec->time_base, st->time_base);<br> if (pkt.dts != AV_NOPTS_VALUE)<br> pkt.dts = av_rescale_q(pkt.dts, st->codec->time_base, st->time_base);<br>
if (c->coded_frame->key_frame)<br> pkt.flags |= AV_PKT_FLAG_KEY;<br><br> pkt.stream_index = st->index;<br> /* Write the compressed frame to the media file. */<br> ret = av_interleaved_write_frame(oc, &pkt);<br>
}<br> else <br> {<br> ret = 0;<br> }<br> av_free_packet(&pkt);<br> }<br>}<br><br>int main (int argc, char **argv)<br>{<br> /* register all formats and codecs */<br>
av_register_all();<br> int i =0;<br> char src_filename[90] = "test.mp3";<br> char dst_filename[90] = "test.mp4";<br> open_audio_input(src_filename);<br> audio_bit_rate = codec_ctx_audio->bit_rate;<br>
audio_sample_rate = codec_ctx_audio->sample_rate;<br> audio_channels = codec_ctx_audio->channels;<br> open_encoder( dst_filename );<br> while(1)<br> {<br> int rv = decode_frame();<br>
if ( rv < 0 )<br> {<br> break;<br> }<br><br> if (audio_st)<br> {<br> audio_pts = (double)audio_st->pts.val * audio_st->time_base.num / <br> audio_st->time_base.den;<br>
}<br> else<br> {<br> audio_pts = 0.0;<br> }<br> printf("\naudio_pts: %.3f",audio_pts);<br> if ( codec_ctx_audio )<br> {<br> if ( got_frame) <br>
{<br> write_audio_frame( audio_dst_data, audio_dst_bufsize );<br> }<br> }<br> if ( audio_dst_data[0] )<br> {<br> av_freep(&audio_dst_data[0]);<br> audio_dst_data[0] = NULL;<br>
}<br> }<br> write_delayed_frames( output_fmt_ctx, audio_st );<br> av_write_trailer(output_fmt_ctx);<br> close_audio( output_fmt_ctx, audio_st);<br> return 0;<br>}<br>///////////////////////////////////////////////<br>
<br></div>I have been looking at this problem from many angles for about two days now, but can't seem to figure out what I'm doing wrong.<br>
<br></div>Note also: the printf() statement I've inserted shows audio_pts only up to 64.551 (that's about 1:05, which also shows that the encoder is not reaching the full duration of the input file, 1:12):<br>.......<br>.......<br>
.......<br>audio_pts: 63.808<br>audio_pts: 63.832<br>audio_pts: 63.855<br>audio_pts: 63.878<br>audio_pts: 63.901<br>audio_pts: 63.925<br>audio_pts: 63.948<br>audio_pts: 63.971<br>audio_pts: 63.994<br>audio_pts: 64.017<br>
audio_pts: 64.041<br>audio_pts: 64.064<br>audio_pts: 64.087<br>audio_pts: 64.110<br>audio_pts: 64.134<br>audio_pts: 64.157<br>audio_pts: 64.180<br>audio_pts: 64.203<br>audio_pts: 64.226<br>audio_pts: 64.250<br>audio_pts: 64.273<br>
audio_pts: 64.296<br>audio_pts: 64.319<br>audio_pts: 64.342<br>audio_pts: 64.366<br>audio_pts: 64.389<br>audio_pts: 64.412<br>audio_pts: 64.435<br>audio_pts: 64.459<br>audio_pts: 64.482<br>audio_pts: 64.505<br>audio_pts: 64.528<br>
audio_pts: 64.551<br><br><br></div>Can anyone please point out what I may be doing wrong?<br><br>Thanks in advance for any guidance!<br><br></div>P.S. When run from the command line, e.g. ffmpeg -i test.mp3 test.mp4, the file converts just fine.<br>
</div>