[FFmpeg-devel] One pass volume normalization (ebur128)

Sat Jul 13 21:56:05 CEST 2013

On 7/13/13, Nicolas George <nicolas.george at normalesup.org> wrote:
> Le quintidi 25 messidor, an CCXXI, Jan Ehrhardt a écrit :
>> Subject: [FFmpeg-devel] One pass volume normalization (ebur128)
>
> Single-pass volume normalization is not possible, please do not call the
> feature that way.
>
>> I am once again proposing a patch for one pass volume normalization based
>> on ebur128, as I see this still did not make it into FFMpeg 2.0. My
>> patch is heavily based on Clement Boesch's proposal in
>> http://permalink.gmane.org/gmane.comp.video.ffmpeg.devel/159978
>>
>> We have been using this patch now for more than 4 months and 1800+
>> videos of approximately 1 hour have been transcoded with it.
>>
>> Part of our FFMpeg commandline reads as
>> -filter_complex \
>>
>> "[0:v]setpts=PTS-STARTPTS[v0];[0:a]asetpts=PTS-STARTPTS,ebur128=metadata=1,volume=metadata=lavfi.r128.I,ebur128[a0]"
>> \
>>    -map [v0] -map [a0]
>
> r128.I is not a good choice, but there is nothing better yet.
>
>>
>> It uses the already present ebur128 meta injection to adjust the
>> volume on the fly. What would be the objection to move this into the
>> FFMpeg core, so I do not have to patch my FFMpeg every time I compile
>> a new one? I applied the patch below to FFMpeg Release/v.2.0.
>>
>> Jan
>>
>>
>> diff --git a/libavfilter/af_volume.c b/libavfilter/af_volume.c
>> index a2ac1e2..6372bb2 100644
>> --- a/libavfilter/af_volume.c
>> +++ b/libavfilter/af_volume.c
>
> Missing documentation update.
>
>> @@ -51,18 +51,24 @@ static const AVOption volume_options[] = {
>>          { "fixed",  "select 8-bit fixed-point",     0, AV_OPT_TYPE_CONST,
>> { .i64 = PRECISION_FIXED  }, INT_MIN, INT_MAX, A|F, "precision" },
>>          { "float",  "select 32-bit floating-point", 0, AV_OPT_TYPE_CONST,
>> { .i64 = PRECISION_FLOAT  }, INT_MIN, INT_MAX, A|F, "precision" },
>>          { "double", "select 64-bit floating-point", 0, AV_OPT_TYPE_CONST,
>> { .i64 = PRECISION_DOUBLE }, INT_MIN, INT_MAX, A|F, "precision" },
>
>> +    { "metadata", "set the metadata key for loudness normalization",
>> OFFSET(metadata), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = A|F },
>
> Inconsistent indentation.

Indentation is fine.

>
>>      { NULL },
>>  };
>>
>>  AVFILTER_DEFINE_CLASS(volume);
>>
>> +static void set_fixed_volume(VolumeContext *vol, double volume)
>> +{
>> +    vol->volume_i = (int)(volume * 256 + 0.5);
>> +    vol->volume   = vol->volume_i / 256.0;
>> +}
>> +
>>  static av_cold int init(AVFilterContext *ctx)
>>  {
>>      VolumeContext *vol = ctx->priv;
>>
>>      if (vol->precision == PRECISION_FIXED) {
>> -        vol->volume_i = (int)(vol->volume * 256 + 0.5);
>> -        vol->volume   = vol->volume_i / 256.0;
>> +        set_fixed_volume(vol, vol->volume);
>>          av_log(ctx, AV_LOG_VERBOSE, "volume:(%d/256)(%f)(%1.2fdB)
>> precision:fixed\n",
>>                 vol->volume_i, vol->volume,
>> 20.0*log(vol->volume)/M_LN10);
>>      } else {
>> @@ -171,13 +177,13 @@ static av_cold void volume_init(VolumeContext *vol)
>>
>>      switch (av_get_packed_sample_fmt(vol->sample_fmt)) {
>>      case AV_SAMPLE_FMT_U8:
>> -        if (vol->volume_i < 0x1000000)
>> +        if (vol->volume_i < 0x1000000 && !vol->metadata)
>>              vol->scale_samples = scale_samples_u8_small;
>>          else
>>              vol->scale_samples = scale_samples_u8;
>>          break;
>>      case AV_SAMPLE_FMT_S16:
>> -        if (vol->volume_i < 0x10000)
>> +        if (vol->volume_i < 0x10000 && !vol->metadata)
>>              vol->scale_samples = scale_samples_s16_small;
>>          else
>>              vol->scale_samples = scale_samples_s16;
>> @@ -216,11 +222,30 @@ static int config_output(AVFilterLink *outlink)
>>
>>  static int filter_frame(AVFilterLink *inlink, AVFrame *buf)
>>  {
>> -    VolumeContext *vol    = inlink->dst->priv;
>> -    AVFilterLink *outlink = inlink->dst->outputs[0];
>> +    AVFilterContext *ctx  = inlink->dst;
>> +    VolumeContext *vol    = ctx->priv;
>> +    AVFilterLink *outlink = ctx->outputs[0];
>>      int nb_samples        = buf->nb_samples;
>>      AVFrame *out_buf;
>>
>> +    if (vol->metadata) {
>> +        double loudness, new_volume, timestamp, mx;
>> +        AVDictionaryEntry *e;
>> +        mx = 20;
>> +        timestamp = (float)(1.0 * buf->pts / outlink->sample_rate);
>> +        mx = fmin(mx, timestamp);
>> +        e = av_dict_get(buf->metadata, vol->metadata, NULL, 0);
>> +        if (e) {
>> +            loudness = av_strtod(e->value, NULL);
>
>> +            if (loudness > -69) {
>> +                new_volume = fmax(-mx,fmin(mx,(-23 - loudness)));
>> +                av_log(NULL, AV_LOG_VERBOSE, "loudness=%f => %f =>
>> volume=%f\n",
>> +                    loudness, new_volume, pow(10, new_volume / 20));
>> +                set_fixed_volume(vol, pow(10, new_volume / 20));
>> +            }
>
> This paragraph has several problems. First, it is missing spaces around
> words, that is easy to fix.
>
> Second, it has a duplicated mathematical formula, which is pretty much a
> recipe for inconsistency. That is easy to fix too.
>
> Third, it has several hardcoded values, and that is not good design.
>
> It seems to me that using an expression, evaluated each time the metadata
> value changes and with that value available as a variable would be a much
> nicer design.

Fourth, it is using NULL as the log context.

>
>> +        }
>> +    }
>> +
>>      if (vol->volume == 1.0 || vol->volume_i == 256)
>>          return ff_filter_frame(outlink, buf);
>>
>> @@ -269,6 +294,12 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
>> *buf)
>>      return ff_filter_frame(outlink, out_buf);
>>  }
>>
>
>> +static av_cold void uninit(AVFilterContext *ctx)
>> +{
>> +    VolumeContext *vol = ctx->priv;
>> +    av_opt_free(vol);
>> +}
>
> AFAIK, this is unneeded since the "evil plan".
>
>> +
>>  static const AVFilterPad avfilter_af_volume_inputs[] = {
>>      {
>>          .name           = "default",
>> @@ -294,6 +325,7 @@ AVFilter avfilter_af_volume = {
>>      .priv_size      = sizeof(VolumeContext),
>>      .priv_class     = &volume_class,
>>      .init           = init,
>> +    .uninit         = uninit,
>>      .inputs         = avfilter_af_volume_inputs,
>>      .outputs        = avfilter_af_volume_outputs,
>>      .flags          = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,
>> diff --git a/libavfilter/af_volume.h b/libavfilter/af_volume.h
>> index bd7932e..4deca9c 100644
>> --- a/libavfilter/af_volume.h
>> +++ b/libavfilter/af_volume.h
>> @@ -48,6 +48,7 @@ typedef struct VolumeContext {
>>      void (*scale_samples)(uint8_t *dst, const uint8_t *src, int
>> nb_samples,
>>                            int volume);
>>      int samples_align;
>> +    char *metadata;
>>  } VolumeContext;
>>
>>  void ff_volume_init_x86(VolumeContext *vol);
>
>> diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
>> index 88d37e8..f4ce6d9 100644
>> --- a/libavfilter/f_ebur128.c
>> +++ b/libavfilter/f_ebur128.c
>
> Unrelated.
>
>> @@ -410,7 +410,7 @@ static av_cold int init(AVFilterContext *ctx)
>>
>>      if (ebur128->loglevel != AV_LOG_INFO &&
>>          ebur128->loglevel != AV_LOG_VERBOSE) {
>
>> -        if (ebur128->do_video || ebur128->metadata)
>> +        if (ebur128->do_video)
>>              ebur128->loglevel = AV_LOG_VERBOSE;
>>          else
>>              ebur128->loglevel = AV_LOG_INFO;
>> @@ -689,7 +689,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame
>> *insamples)
>>                  SET_META("LRA.high", ebur128->lra_high);
>>              }
>>
>> -            av_log(ctx, ebur128->loglevel, "t: %-10s " LOG_FMT "\n",
>> +            av_log(ctx, ebur128->metadata || !ebur128->do_video ?
>> AV_LOG_VERBOSE : ebur128->loglevel, "t: %-10s " LOG_FMT "\n",
>>                     av_ts2timestr(pts, &outlink->time_base),
>>                     loudness_400, loudness_3000,
>>                     ebur128->integrated_loudness,
>> ebur128->loudness_range);
>> diff --git a/libavfilter/x86/af_volume_init.c
>> b/libavfilter/x86/af_volume_init.c
>> index 81d605f..fab5a03 100644
>> --- a/libavfilter/x86/af_volume_init.c
>> +++ b/libavfilter/x86/af_volume_init.c
>> @@ -39,7 +39,7 @@ av_cold void ff_volume_init_x86(VolumeContext *vol)
>>      enum AVSampleFormat sample_fmt =
>> av_get_packed_sample_fmt(vol->sample_fmt);
>>
>>      if (sample_fmt == AV_SAMPLE_FMT_S16) {
>> -        if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768) {
>> +        if (EXTERNAL_SSE2(mm_flags) && vol->volume_i < 32768 &&
>> !vol->metadata) {
>>              vol->scale_samples = ff_scale_samples_s16_sse2;
>>              vol->samples_align = 8;
>>          }
>
> Regards,
>
> --
>   Nicolas George
>