[FFmpeg-devel] [PATCH 2/3] [GSoC] [AAC] aaccoder: Implement Perceptual Noise Substitution

Mon Apr 13 03:59:55 CEST 2015

Here's an objective comparison of the difference the patch makes:

Original spectrum:
https://0x0.st/T7.png

Encoded without the patchset:
https://0x0.st/Th.png

Encoded with the patchset:
https://0x0.st/TF.png

Difference:
https://0x0.st/TR.png <https://0x0.st/T5.png>
Made by: "$ composite Encoded_clean.png Encoded_noise.png -compose
difference Difference.png"

On 12 April 2015 at 05:50, Rostislav Pehlivanov <atomnuker at gmail.com> wrote:

> This commit enables the use of the pseudo-codebook NOISE_BT for encoding
> noise values for the twoloop coder. It uses the energy values from the
> psychoacoustic model to determine whether it's acceptable to use noise for
> encoding and if so, determine the energy of the noise. The cost system was
> modified to accept the 13th codebook (skipping the nonexistent 12). The
> system was extended such that in the future it should be easy to add
> support for intensity stereo coding, hence the use of arrays for the maps.
>
> The parameters used (such as the factor by which uplims is multiplied when
> comparing and the cost returned by the BT_NOISE case) and the way energy
> values are converted to scalefactor indices have not been extensively
> tested, so safe values which should not break anything were used. They are
> to be tweaked in the future to optimize audio quality if needed.
> ---
>  libavcodec/aaccoder.c | 128
> +++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 86 insertions(+), 42 deletions(-)
>
> diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
> index 64eee32..f7662fd 100644
> --- a/libavcodec/aaccoder.c
> +++ b/libavcodec/aaccoder.c
> @@ -40,6 +40,9 @@
>  #include "aacenc.h"
>  #include "aactab.h"
>
> +/** Total number of usable codebooks **/
> +#define CB_TOT 13
> +
>  /** bits needed to code codebook run value for long windows */
>  static const uint8_t run_value_bits_long[64] = {
>       5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
> @@ -57,6 +60,10 @@ static const uint8_t * const run_value_bits[2] = {
>      run_value_bits_long, run_value_bits_short
>  };
>
> +/** Map to convert values from BandCodingPath index to a codebook index
> **/
> +static const uint8_t aac_cb_out_map[CB_TOT]  =
> {0,1,2,3,4,5,6,7,8,9,10,11,13};
> +/** Inverse map to convert from codebooks to BandCodingPath indices **/
> +static const uint8_t aac_cb_in_map[CB_TOT+1] =
> {0,1,2,3,4,5,6,7,8,9,10,11,0,12};
>
>  /**
>   * Quantize one coefficient.
> @@ -108,7 +115,7 @@ static av_always_inline float
> quantize_and_encode_band_cost_template(
>                                  const float *scaled, int size, int
> scale_idx,
>                                  int cb, const float lambda, const float
> uplim,
>                                  int *bits, int BT_ZERO, int BT_UNSIGNED,
> -                                int BT_PAIR, int BT_ESC)
> +                                int BT_PAIR, int BT_ESC, int BT_NOISE)
>  {
>      const int q_idx = POW_SF2_ZERO - scale_idx + SCALE_ONE_POS -
> SCALE_DIV_512;
>      const float Q   = ff_aac_pow2sf_tab [q_idx];
> @@ -119,8 +126,6 @@ static av_always_inline float
> quantize_and_encode_band_cost_template(
>      float cost = 0;
>      const int dim = BT_PAIR ? 2 : 4;
>      int resbits = 0;
> -    const int range  = aac_cb_range[cb];
> -    const int maxval = aac_cb_maxval[cb];
>      int off;
>
>      if (BT_ZERO) {
> @@ -130,15 +135,22 @@ static av_always_inline float
> quantize_and_encode_band_cost_template(
>              *bits = 0;
>          return cost * lambda;
>      }
> +    if (BT_NOISE) {
> +        for (i = 0; i < size; i++)
> +            cost += in[i]*in[i];
> +        if (bits)
> +            *bits = 0;
> +        return cost * lambda;
> +    }
>      if (!scaled) {
>          abs_pow34_v(s->scoefs, in, size);
>          scaled = s->scoefs;
>      }
> -    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED,
> maxval);
> +    quantize_bands(s->qcoefs, in, scaled, size, Q34, !BT_UNSIGNED,
> aac_cb_maxval[cb]);
>      if (BT_UNSIGNED) {
>          off = 0;
>      } else {
> -        off = maxval;
> +        off = aac_cb_maxval[cb];
>      }
>      for (i = 0; i < size; i += dim) {
>          const float *vec;
> @@ -147,7 +159,7 @@ static av_always_inline float
> quantize_and_encode_band_cost_template(
>          int curbits;
>          float rd = 0.0f;
>          for (j = 0; j < dim; j++) {
> -            curidx *= range;
> +            curidx *= aac_cb_range[cb];
>              curidx += quants[j] + off;
>          }
>          curbits =  ff_aac_spectral_bits[cb-1][curidx];
> @@ -207,8 +219,8 @@ static av_always_inline float
> quantize_and_encode_band_cost_template(
>      return cost;
>  }
>
> -#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED,
> BT_PAIR, BT_ESC) \
> -static float quantize_and_encode_band_cost_ ## NAME(
>                   \
> +#define QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NAME, BT_ZERO, BT_UNSIGNED,
> BT_PAIR, BT_ESC, BT_NOISE) \
> +static float quantize_and_encode_band_cost_ ## NAME(
>               \
>                                  struct AACEncContext *s,
>               \
>                                  PutBitContext *pb, const float *in,
>                \
>                                  const float *scaled, int size, int
> scale_idx,           \
> @@ -217,15 +229,16 @@ static float quantize_and_encode_band_cost_ ## NAME(
>      return quantize_and_encode_band_cost_template(
>               \
>                                  s, pb, in, scaled, size, scale_idx,
>                \
>                                  BT_ESC ? ESC_BT : cb, lambda, uplim,
> bits,              \
> -                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC);
>                \
> +                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC,
> BT_NOISE);       \
>  }
>
> -QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0)
> -QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0)
> -QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0)
> -QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0)
> -QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0)
> -QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ZERO,  1, 0, 0, 0, 0)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SQUAD, 0, 0, 0, 0, 0)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UQUAD, 0, 1, 0, 0, 0)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(SPAIR, 0, 0, 1, 0, 0)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(UPAIR, 0, 1, 1, 0, 0)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(ESC,   0, 1, 1, 1, 0)
> +QUANTIZE_AND_ENCODE_BAND_COST_FUNC(NOISE, 0, 0, 0, 0, 1)
>
>  static float (*const quantize_and_encode_band_cost_arr[])(
>                                  struct AACEncContext *s,
> @@ -245,6 +258,8 @@ static float (*const
> quantize_and_encode_band_cost_arr[])(
>      quantize_and_encode_band_cost_UPAIR,
>      quantize_and_encode_band_cost_UPAIR,
>      quantize_and_encode_band_cost_ESC,
> +    NULL,
> +    quantize_and_encode_band_cost_NOISE,
>  };
>
>  #define quantize_and_encode_band_cost(                                  \
> @@ -312,7 +327,7 @@ typedef struct BandCodingPath {
>  static void encode_window_bands_info(AACEncContext *s,
> SingleChannelElement *sce,
>                                       int win, int group_len, const float
> lambda)
>  {
> -    BandCodingPath path[120][12];
> +    BandCodingPath path[120][CB_TOT];
>      int w, swb, cb, start, size;
>      int i, j;
>      const int max_sfb  = sce->ics.max_sfb;
> @@ -325,7 +340,7 @@ static void encode_window_bands_info(AACEncContext *s,
> SingleChannelElement *sce
>
>      abs_pow34_v(s->scoefs, sce->coeffs, 1024);
>      start = win*128;
> -    for (cb = 0; cb < 12; cb++) {
> +    for (cb = 0; cb < CB_TOT; cb++) {
>          path[0][cb].cost     = 0.0f;
>          path[0][cb].prev_idx = -1;
>          path[0][cb].run      = 0;
> @@ -333,7 +348,7 @@ static void encode_window_bands_info(AACEncContext *s,
> SingleChannelElement *sce
>      for (swb = 0; swb < max_sfb; swb++) {
>          size = sce->ics.swb_sizes[swb];
>          if (sce->zeroes[win*16 + swb]) {
> -            for (cb = 0; cb < 12; cb++) {
> +            for (cb = 0; cb < CB_TOT; cb++) {
>                  path[swb+1][cb].prev_idx = cb;
>                  path[swb+1][cb].cost     = path[swb][cb].cost;
>                  path[swb+1][cb].run      = path[swb][cb].run + 1;
> @@ -343,14 +358,14 @@ static void encode_window_bands_info(AACEncContext
> *s, SingleChannelElement *sce
>              int mincb = next_mincb;
>              next_minrd = INFINITY;
>              next_mincb = 0;
> -            for (cb = 0; cb < 12; cb++) {
> +            for (cb = 0; cb < CB_TOT; cb++) {
>                  float cost_stay_here, cost_get_here;
>                  float rd = 0.0f;
>                  for (w = 0; w < group_len; w++) {
>                      FFPsyBand *band = &s->psy.ch
> [s->cur_channel].psy_bands[(win+w)*16+swb];
>                      rd += quantize_band_cost(s, sce->coeffs + start +
> w*128,
>                                               s->scoefs + start + w*128,
> size,
> -                                             sce->sf_idx[(win+w)*16+swb],
> cb,
> +                                             sce->sf_idx[(win+w)*16+swb],
> aac_cb_out_map[cb],
>                                               lambda / band->threshold,
> INFINITY, NULL);
>                  }
>                  cost_stay_here = path[swb][cb].cost + rd;
> @@ -379,7 +394,7 @@ static void encode_window_bands_info(AACEncContext *s,
> SingleChannelElement *sce
>      //convert resulting path from backward-linked list
>      stack_len = 0;
>      idx       = 0;
> -    for (cb = 1; cb < 12; cb++)
> +    for (cb = 1; cb < CB_TOT; cb++)
>          if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
>              idx = cb;
>      ppos = max_sfb;
> @@ -394,12 +409,13 @@ static void encode_window_bands_info(AACEncContext
> *s, SingleChannelElement *sce
>      //perform actual band info encoding
>      start = 0;
>      for (i = stack_len - 1; i >= 0; i--) {
> -        put_bits(&s->pb, 4, stackcb[i]);
> +        cb = aac_cb_out_map[stackcb[i]];
> +        put_bits(&s->pb, 4, cb);
>          count = stackrun[i];
> -        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
> +        memset(sce->zeroes + win*16 + start, !cb, count);
>          //XXX: memset when band_type is also uint8_t
>          for (j = 0; j < count; j++) {
> -            sce->band_type[win*16 + start] =  stackcb[i];
> +            sce->band_type[win*16 + start] = cb;
>              start++;
>          }
>          while (count >= run_esc) {
> @@ -413,7 +429,7 @@ static void encode_window_bands_info(AACEncContext *s,
> SingleChannelElement *sce
>  static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement
> *sce,
>                                    int win, int group_len, const float
> lambda)
>  {
> -    BandCodingPath path[120][12];
> +    BandCodingPath path[120][CB_TOT];
>      int w, swb, cb, start, size;
>      int i, j;
>      const int max_sfb  = sce->ics.max_sfb;
> @@ -426,7 +442,7 @@ static void codebook_trellis_rate(AACEncContext *s,
> SingleChannelElement *sce,
>
>      abs_pow34_v(s->scoefs, sce->coeffs, 1024);
>      start = win*128;
> -    for (cb = 0; cb < 12; cb++) {
> +    for (cb = 0; cb < CB_TOT; cb++) {
>          path[0][cb].cost     = run_bits+4;
>          path[0][cb].prev_idx = -1;
>          path[0][cb].run      = 0;
> @@ -450,7 +466,7 @@ static void codebook_trellis_rate(AACEncContext *s,
> SingleChannelElement *sce,
>              }
>              next_minbits = path[swb+1][0].cost;
>              next_mincb = 0;
> -            for (cb = 1; cb < 12; cb++) {
> +            for (cb = 1; cb < CB_TOT; cb++) {
>                  path[swb+1][cb].cost = 61450;
>                  path[swb+1][cb].prev_idx = -1;
>                  path[swb+1][cb].run = 0;
> @@ -459,6 +475,7 @@ static void codebook_trellis_rate(AACEncContext *s,
> SingleChannelElement *sce,
>              float minbits = next_minbits;
>              int mincb = next_mincb;
>              int startcb = sce->band_type[win*16+swb];
> +            startcb = aac_cb_in_map[startcb];
>              next_minbits = INFINITY;
>              next_mincb = 0;
>              for (cb = 0; cb < startcb; cb++) {
> @@ -466,13 +483,20 @@ static void codebook_trellis_rate(AACEncContext *s,
> SingleChannelElement *sce,
>                  path[swb+1][cb].prev_idx = -1;
>                  path[swb+1][cb].run = 0;
>              }
> -            for (cb = startcb; cb < 12; cb++) {
> +            for (cb = startcb; cb < CB_TOT; cb++) {
>                  float cost_stay_here, cost_get_here;
>                  float bits = 0.0f;
> +                if (cb == 12 && sce->band_type[win*16+swb] != NOISE_BT) {
> +                    path[swb+1][cb].cost = 61450;
> +                    path[swb+1][cb].prev_idx = -1;
> +                    path[swb+1][cb].run = 0;
> +                    continue;
> +                }
>                  for (w = 0; w < group_len; w++) {
>                      bits += quantize_band_cost(s, sce->coeffs + start +
> w*128,
>                                                 s->scoefs + start + w*128,
> size,
> -
>  sce->sf_idx[(win+w)*16+swb], cb,
> +
>  sce->sf_idx[(win+w)*16+swb],
> +                                               aac_cb_out_map[cb],
>                                                 0, INFINITY, NULL);
>                  }
>                  cost_stay_here = path[swb][cb].cost + bits;
> @@ -501,7 +525,7 @@ static void codebook_trellis_rate(AACEncContext *s,
> SingleChannelElement *sce,
>      //convert resulting path from backward-linked list
>      stack_len = 0;
>      idx       = 0;
> -    for (cb = 1; cb < 12; cb++)
> +    for (cb = 1; cb < CB_TOT; cb++)
>          if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
>              idx = cb;
>      ppos = max_sfb;
> @@ -517,12 +541,13 @@ static void codebook_trellis_rate(AACEncContext *s,
> SingleChannelElement *sce,
>      //perform actual band info encoding
>      start = 0;
>      for (i = stack_len - 1; i >= 0; i--) {
> -        put_bits(&s->pb, 4, stackcb[i]);
> +        cb = aac_cb_out_map[stackcb[i]];
> +        put_bits(&s->pb, 4, cb);
>          count = stackrun[i];
> -        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
> +        memset(sce->zeroes + win*16 + start, !cb, count);
>          //XXX: memset when band_type is also uint8_t
>          for (j = 0; j < count; j++) {
> -            sce->band_type[win*16 + start] =  stackcb[i];
> +            sce->band_type[win*16 + start] = cb;
>              start++;
>          }
>          while (count >= run_esc) {
> @@ -711,8 +736,9 @@ static void
> search_for_quantizers_twoloop(AVCodecContext *avctx,
>  {
>      int start = 0, i, w, w2, g;
>      int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate /
> avctx->channels * (lambda / 120.f);
> -    float dists[128] = { 0 }, uplims[128];
> +    float dists[128] = { 0 }, uplims[128] = { 0 }, energies[128] = { 0 };
>      float maxvals[128];
> +    float energy_avg = 0;
>      int fflag, minscaler;
>      int its  = 0;
>      int allz = 0;
> @@ -724,32 +750,47 @@ static void
> search_for_quantizers_twoloop(AVCodecContext *avctx,
>      //XXX: some heuristic to determine initial quantizers will reduce
> search time
>      //determine zero bands and upper limits
>      for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
> +        start = 0;
>          for (g = 0;  g < sce->ics.num_swb; g++) {
>              int nz = 0;
> -            float uplim = 0.0f;
> +            float uplim = 0.0f, energy = 0.0f;
> +            float freq =
> (w*16+g)*(avctx->sample_rate/(1024/sce->ics.num_windows)/2);
>              for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
>                  FFPsyBand *band = &s->psy.ch
> [s->cur_channel].psy_bands[(w+w2)*16+g];
>                  uplim += band->threshold;
> -                if (band->energy <= band->threshold || band->threshold ==
> 0.0f) {
> +                energy += band->energy;
> +                if (band->threshold == 0.0f || band->energy <
> band->threshold) {
>                      sce->zeroes[(w+w2)*16+g] = 1;
>                      continue;
>                  }
>                  nz = 1;
>              }
>              uplims[w*16+g] = uplim *512;
> +            energies[w*16+g] = log2f(2*(energy*energy));
> +            energy_avg = (energies[w*16+g] + energy_avg)/2;
> +            if (freq > 4000.0f && energy <= uplim * 1.52f) {
> +                sce->band_type[w*16+g] = NOISE_BT;
> +                nz = 1;
> +            } else { /* Will be determined in the two-loop search */
> +                sce->band_type[w*16+g] = 0;
> +            }
>              sce->zeroes[w*16+g] = !nz;
>              if (nz)
>                  minthr = FFMIN(minthr, uplim);
>              allz |= nz;
> +            start += sce->ics.swb_sizes[g];
>          }
>      }
>      for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
>          for (g = 0;  g < sce->ics.num_swb; g++) {
>              if (sce->zeroes[w*16+g]) {
>                  sce->sf_idx[w*16+g] = SCALE_ONE_POS;
> -                continue;
> +            } else if (sce->band_type[w*16+g] == NOISE_BT) {
> +                float energy_norm = (energies[w*16+g]/energy_avg);
> +                sce->sf_idx[w*16+g] = av_clip((energy_norm*256) - 70,
> -100, 155);
> +            } else {
> +                sce->sf_idx[w*16+g] = SCALE_ONE_POS +
> FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
>              }
> -            sce->sf_idx[w*16+g] = SCALE_ONE_POS +
> FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
>          }
>      }
>
> @@ -785,7 +826,8 @@ static void
> search_for_quantizers_twoloop(AVCodecContext *avctx,
>                      int cb;
>                      float dist = 0.0f;
>
> -                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >=
> 218) {
> +                    if (sce->zeroes[w*16+g] || sce->band_type[w*16+g] >=
> NOISE_BT ||
> +                        sce->sf_idx[w*16+g] >= 218) {
>                          start += sce->ics.swb_sizes[g];
>                          continue;
>                      }
> @@ -814,11 +856,11 @@ static void
> search_for_quantizers_twoloop(AVCodecContext *avctx,
>              }
>              if (tbits > destbits) {
>                  for (i = 0; i < 128; i++)
> -                    if (sce->sf_idx[i] < 218 - qstep)
> +                    if (sce->sf_idx[i] < 218 - qstep && sce->band_type[i]
> < NOISE_BT)
>                          sce->sf_idx[i] += qstep;
>              } else {
>                  for (i = 0; i < 128; i++)
> -                    if (sce->sf_idx[i] > 60 - qstep)
> +                    if (sce->sf_idx[i] > 60 - qstep && sce->band_type[i]
> < NOISE_BT)
>                          sce->sf_idx[i] -= qstep;
>              }
>              qstep >>= 1;
> @@ -831,7 +873,7 @@ static void
> search_for_quantizers_twoloop(AVCodecContext *avctx,
>          for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
> {
>              for (g = 0; g < sce->ics.num_swb; g++) {
>                  int prevsc = sce->sf_idx[w*16+g];
> -                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g]
> > 60) {
> +                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g]
> > 60 && sce->band_type[w*16+g] < NOISE_BT) {
>                      if (find_min_book(maxvals[w*16+g],
> sce->sf_idx[w*16+g]-1))
>                          sce->sf_idx[w*16+g]--;
>                      else //Try to make sure there is some energy in every
> band
> @@ -839,6 +881,8 @@ static void
> search_for_quantizers_twoloop(AVCodecContext *avctx,
>                  }
>                  sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g],
> minscaler, minscaler + SCALE_MAX_DIFF);
>                  sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
> +                if (sce->band_type[w*16+g] >= NOISE_BT)
> +                    continue;
>                  if (sce->sf_idx[w*16+g] != prevsc)
>                      fflag = 1;
>                  sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g],
> sce->sf_idx[w*16+g]);
> --
> 2.1.4
>
>