[FFmpeg-user] How to upsample and then encode audio

Большой Человек cheloveck2.0 at gmail.com
Wed Jun 8 15:00:06 EEST 2022


The logic and the steps for transcoding audio with upsampling were correct; the
problem turned out to be outside the scope of this topic. So anyone may use this
thread as a foundation for their own code if needed.
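
For anyone reusing this as a starting point, below is a minimal sketch of the
per-frame flow the quoted message builds up: resample the decoded frame, buffer
it in an AVAudioFifo, drain the FIFO in encoder-sized chunks, derive pts from a
running count of output samples, then encode and mux. It assumes the contexts
(swr_ctx_, audio_fifo_, enc_ctx_, ofmt_ctx_, out_audio_stream_) have been set up
as in the quoted code and targets the FFmpeg 4.x API used there;
process_audio_frame and samples_written are names of my own choosing, and error
handling is trimmed for brevity.

```
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/audio_fifo.h>
#include <libswresample/swresample.h>
}

// Resample one decoded frame, buffer it, and encode/mux full encoder frames.
// samples_written is a running counter of output samples, used as pts
// (the encoder time base is {1, sample_rate}).
int process_audio_frame(SwrContext *swr_ctx_, AVAudioFifo *audio_fifo_,
                        AVCodecContext *enc_ctx_, AVFormatContext *ofmt_ctx_,
                        AVStream *out_audio_stream_, AVFrame *in_frame,
                        int64_t &samples_written)
{
    // 1. Convert the input frame to the encoder's rate/format/layout.
    AVFrame *resampled = av_frame_alloc();
    resampled->format         = enc_ctx_->sample_fmt;
    resampled->channel_layout = enc_ctx_->channel_layout;
    resampled->sample_rate    = enc_ctx_->sample_rate;
    int err = swr_convert_frame(swr_ctx_, resampled, in_frame);
    if (err < 0) {
        av_frame_free(&resampled);
        return err;
    }

    // 2. Buffer the converted samples; av_audio_fifo_write() grows the FIFO as needed.
    av_audio_fifo_write(audio_fifo_,
                        reinterpret_cast<void **>(resampled->extended_data),
                        resampled->nb_samples);
    av_frame_free(&resampled);

    // 3. Drain the FIFO in chunks of exactly enc_ctx_->frame_size samples.
    while (av_audio_fifo_size(audio_fifo_) >= enc_ctx_->frame_size) {
        AVFrame *out = av_frame_alloc();
        out->nb_samples     = enc_ctx_->frame_size;
        out->format         = enc_ctx_->sample_fmt;
        out->channel_layout = enc_ctx_->channel_layout;
        out->sample_rate    = enc_ctx_->sample_rate;
        av_frame_get_buffer(out, 0);
        av_audio_fifo_read(audio_fifo_, reinterpret_cast<void **>(out->data),
                           out->nb_samples);

        // pts in 1/sample_rate units: just count the samples handed to the encoder.
        out->pts = samples_written;
        samples_written += out->nb_samples;

        err = avcodec_send_frame(enc_ctx_, out);
        av_frame_free(&out);
        if (err < 0)
            return err;

        // 4. Fetch every packet the encoder produced and write it to the muxer,
        //    rescaling timestamps from the encoder to the stream time base.
        AVPacket *pkt = av_packet_alloc();
        while ((err = avcodec_receive_packet(enc_ctx_, pkt)) >= 0) {
            av_packet_rescale_ts(pkt, enc_ctx_->time_base, out_audio_stream_->time_base);
            pkt->stream_index = out_audio_stream_->index;
            av_interleaved_write_frame(ofmt_ctx_, pkt);
            av_packet_unref(pkt);
        }
        av_packet_free(&pkt);
        if (err != AVERROR(EAGAIN) && err != AVERROR_EOF)
            return err;
    }
    return 0;
}
```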

On Fri, 3 Jun 2022 at 11:49, Большой Человек <cheloveck2.0 at gmail.com> wrote:

>  Basically, after transcoding pcm_alaw 8 kHz to MP3 44.1 kHz, I can hear only
> a brief, unrecognizable burst of sound in the first 1-2 seconds.
> So something is wrong with the pts/dts, the packed-to-planar conversion, or
> the upsampling.
>
>  My application transcodes an RTSP camera stream to a file, both video and
> audio. Video works fine, and so does audio remuxing. Now I have a pcm_alaw
> 8 kHz audio stream and want to transcode it into an MP4 file along with the video.
>
>   The code is too cumbersome to extract a reproducible part from, so first I
> want to know whether my logic is right. Here is my draft process (assume all
> errors are checked and handled):
>
> create encoder:
> ```
>     codec_ = avcodec_find_encoder(AV_CODEC_ID_MP3);
>
>     enc_ctx_ = avcodec_alloc_context3(codec_);
>
>     enc_ctx_->bit_rate = 64000;
>     enc_ctx_->codec_type = AVMEDIA_TYPE_AUDIO;
>
>     enc_ctx_->sample_fmt   = codec_->sample_fmts ? codec_->sample_fmts[0] : AV_SAMPLE_FMT_S32P;
>
>     // select_sample_rate() and select_channel_layout() are taken from
>     // https://www.ffmpeg.org/doxygen/4.1/encode_audio_8c-example.html
>     enc_ctx_->sample_rate    = select_sample_rate(codec_);
>     enc_ctx_->channel_layout = select_channel_layout(codec_);
>     enc_ctx_->channels       = av_get_channel_layout_nb_channels(enc_ctx_->channel_layout);
>     enc_ctx_->time_base = (AVRational){1, enc_ctx_->sample_rate};
>     enc_ctx_->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
>
>     if (is_global_header) {
>         enc_ctx_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
>     }
>
>     avcodec_open2(enc_ctx_, codec_, nullptr);
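>     // note: after avcodec_open2() the encoder's frame_size is set
>     // (1152 samples per frame for MP3 at 44.1 kHz); the FIFO chunking
>     // further below relies on that value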
> ```
>
> create resampler (in_frame):
> ```
>     audio_fifo_ = av_audio_fifo_alloc(enc_ctx_->sample_fmt, enc_ctx_->channels, 1);
>
>     in_ch_layout_ = in_frame->channel_layout;
>     in_sample_fmt = in_frame->format;
>     in_sample_rate_ = in_frame->sample_rate;
>
>     swr_ctx_ = swr_alloc_set_opts(NULL,                        // allocate a new context
>                              enc_ctx_->channel_layout,         // out_ch_layout
>                              enc_ctx_->sample_fmt,             // out_sample_fmt
>                              enc_ctx_->sample_rate,            // out_sample_rate
>                              in_frame->channel_layout,         // in_ch_layout
>                              (AVSampleFormat)in_frame->format, // in_sample_fmt
>                              in_frame->sample_rate,            // in_sample_rate
>                              0,                                // log_offset
>                              NULL);                            // log_ctx
>
>     swr_init(swr_ctx_);
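>     // note: on FFmpeg 5.1+ the channel_layout/channels fields and
>     // swr_alloc_set_opts() are deprecated in favour of AVChannelLayout
>     // and swr_alloc_set_opts2()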
> ```
>
> resample (in_frame, start_pts, start_dts):
> ```
>     streamserver::AVFrames result;
>
>     auto resampled_frame = av_frame_alloc();
>
>     auto dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx_, in_frame->sample_rate) + in_frame->nb_samples,
>                                          enc_ctx_->sample_rate, in_frame->sample_rate, AV_ROUND_UP);
>
>     // resampled_frame->nb_samples     = dst_nb_samples;
>     resampled_frame->format         = enc_ctx_->sample_fmt;
>     resampled_frame->channel_layout = enc_ctx_->channel_layout;
>     // resampled_frame->channels       = enc_ctx_->channels;
>     resampled_frame->sample_rate    = enc_ctx_->sample_rate;
>
>     error = swr_convert_frame(swr_ctx_, resampled_frame, in_frame);
>
>     /* Make the FIFO as large as it needs to be to hold both,
>      * the old and the new samples. */
>     if (av_audio_fifo_size(audio_fifo_) < dst_nb_samples) {
>         av_audio_fifo_realloc(audio_fifo_, dst_nb_samples);
>     }
>
>     /* Store the new samples in the FIFO buffer. */
>     auto nb_samples = av_audio_fifo_write(audio_fifo_,
>                                           reinterpret_cast<void **>(resampled_frame->extended_data),
>                                           resampled_frame->nb_samples);
>
>
>     int delay = 0;
>     // trying to split resampled frame to desired chunks
>     while (av_audio_fifo_size(audio_fifo_) > 0) {
>         const int frame_size = FFMIN(av_audio_fifo_size(audio_fifo_), enc_ctx_->frame_size);
>
>         auto out_frame = av_frame_alloc();
>
>
>         out_frame->nb_samples       = frame_size;
>         out_frame->format           = enc_ctx_->sample_fmt;
>         out_frame->channel_layout   = enc_ctx_->channel_layout;
>         out_frame->channels         = enc_ctx_->channels;
>         out_frame->sample_rate      = enc_ctx_->sample_rate;
>
>         av_frame_get_buffer(out_frame, 0);
>
>         av_audio_fifo_read(audio_fifo_, (void **)out_frame->data, frame_size);
>
>         // ***** tried both cases
>         out_frame->pts = in_frame->pts + delay;
>         out_frame->pkt_dts = in_frame->pkt_dts + delay;
>         // swr_next_pts(swr_ctx_, in_frame->pts) + delay;
>         // swr_next_pts(swr_ctx_, in_frame->pkt_dts) + delay;
>
>         result.push_back(out_frame);
>
>         delay += frame_size;
>     }
>
>     return result;
> ```
>
>
> encoding and muxing (in_frame):
> ```
> bool DoesNeedResample(const AVFrame * in_frame) {
>    assert(("DoesNeedResample: in_frame is empty", in_frame));
>    assert(("DoesNeedResample: encoder is not started", is_init_));
>
>    if (in_frame->sample_rate    != enc_ctx_->sample_rate    ||
>        in_frame->channel_layout != enc_ctx_->channel_layout ||
>        in_frame->channels       != enc_ctx_->channels       ||
>        in_frame->format         != enc_ctx_->sample_fmt) {
>        return true;
>    }
>
>    return false;
> }
>
>     av_frame_make_writable(in_frame);
>
>
>     streamserver::AVFrames encoding_frames;
>     if (DoesNeedResample(in_frame)) {
>         encoding_frames = Resample(in_frame,
>             av_rescale_q(in_frame->pts, in_audio_stream_timebase_, out_audio_stream_->time_base),
>             av_rescale_q(in_frame->pkt_dts, in_audio_stream_timebase_, out_audio_stream_->time_base));
>     } else {
>         encoding_frames.push_back(av_frame_clone(in_frame));
>     }
>
>
>     for (auto frame : encoding_frames) {
>         if ((err = avcodec_send_frame(encoder_ctx, frame)) < 0) {
>             AVFrameFree(&frame);
>         }
>
>         while (err >= 0) {
>             pkt_->data = NULL;
>             pkt_->size = 0;
>             av_init_packet(pkt_);
>
>             err = avcodec_receive_packet(encoder_ctx, pkt_);
>             if (err == AVERROR(EAGAIN) || err == AVERROR_EOF) {
>                 break;
>             } else if (err < 0) {
>                 break;
>             }
>
>             pkt_->stream_index = out_audio_stream_->index;
>
>             av_interleaved_write_frame(ofmt_ctx_, pkt_);
>         }
>
>         av_packet_unref(pkt_);
>     }
> ```
>
> The sound in the resulting video is corrupted; see the first paragraph for a description.
>
> In https://www.ffmpeg.org/doxygen/4.1/transcode_aac_8c-example.html
> there are these lines:
> ```
>         /*
>          * Perform a sanity check so that the number of converted samples is
>          * not greater than the number of samples to be converted.
>          * If the sample rates differ, this case has to be handled differently.
>          */
>         av_assert0(output_codec_context->sample_rate == input_codec_context->sample_rate);
> ```
>
> How should such cases be handled? I tried to split the resampled frames via a
> FIFO in the example above!
>
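
Regarding the sanity check quoted above: when the input and output sample rates
differ, both the resampler and the FIFO can still be holding samples when the
input ends, so they have to be drained before the encoder itself is flushed.
Here is a minimal sketch of that final step, again assuming the contexts set up
in the quoted code (FFmpeg 4.x API) and with error handling trimmed; flush_audio
and drain_packets are names I made up, and samples_written is the same running
output-sample counter used for pts in the earlier sketch.

```
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/audio_fifo.h>
#include <libswresample/swresample.h>
}

// Drain the resampler and the FIFO at end of stream, then flush the encoder.
int flush_audio(SwrContext *swr_ctx_, AVAudioFifo *audio_fifo_,
                AVCodecContext *enc_ctx_, AVFormatContext *ofmt_ctx_,
                AVStream *out_audio_stream_, int64_t &samples_written)
{
    // Helper: receive all pending packets and write them to the muxer.
    auto drain_packets = [&]() {
        AVPacket *pkt = av_packet_alloc();
        int err;
        while ((err = avcodec_receive_packet(enc_ctx_, pkt)) >= 0) {
            av_packet_rescale_ts(pkt, enc_ctx_->time_base, out_audio_stream_->time_base);
            pkt->stream_index = out_audio_stream_->index;
            av_interleaved_write_frame(ofmt_ctx_, pkt);
            av_packet_unref(pkt);
        }
        av_packet_free(&pkt);
        return err;  // AVERROR(EAGAIN) or AVERROR_EOF once the encoder wants more input
    };

    // 1. Flush the resampler: converting a NULL input frame returns the samples
    //    it is still buffering (this is where differing rates leave a remainder).
    AVFrame *tail = av_frame_alloc();
    tail->format         = enc_ctx_->sample_fmt;
    tail->channel_layout = enc_ctx_->channel_layout;
    tail->sample_rate    = enc_ctx_->sample_rate;
    if (swr_convert_frame(swr_ctx_, tail, NULL) >= 0 && tail->nb_samples > 0)
        av_audio_fifo_write(audio_fifo_,
                            reinterpret_cast<void **>(tail->extended_data),
                            tail->nb_samples);
    av_frame_free(&tail);

    // 2. Encode whatever is left in the FIFO; only the very last chunk may be
    //    shorter than enc_ctx_->frame_size.
    while (av_audio_fifo_size(audio_fifo_) > 0) {
        AVFrame *out = av_frame_alloc();
        out->nb_samples     = FFMIN(av_audio_fifo_size(audio_fifo_), enc_ctx_->frame_size);
        out->format         = enc_ctx_->sample_fmt;
        out->channel_layout = enc_ctx_->channel_layout;
        out->sample_rate    = enc_ctx_->sample_rate;
        av_frame_get_buffer(out, 0);
        av_audio_fifo_read(audio_fifo_, reinterpret_cast<void **>(out->data),
                           out->nb_samples);
        out->pts = samples_written;        // pts in 1/sample_rate units
        samples_written += out->nb_samples;
        avcodec_send_frame(enc_ctx_, out);
        av_frame_free(&out);
        drain_packets();
    }

    // 3. Put the encoder into draining mode and collect its remaining packets.
    avcodec_send_frame(enc_ctx_, NULL);
    drain_packets();
    return 0;
}
```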


More information about the ffmpeg-user mailing list