[FFmpeg-cvslog] AAC encoder: tweak PNS usage to be more aggressive

Fri Sep 25 09:17:19 CEST 2015

ffmpeg | branch: master | Claudio Freire <klaussfreire at gmail.com> | Fri Sep 25 03:56:32 2015 -0300| [9458a62decfcaa1313b1ba69276466de536d0768] | committer: Claudio Freire

AAC encoder: tweak PNS usage to be more aggressive

This patch tweaks search_for_pns to be both more
aggressive and more careful when applying PNS. On
the one hand, it will again try to use PNS on zero
(or effectively zero) bands. For this, both zeroes
and band_type have to be checked (some ZERO bands
aren't marked in zeroes). On the other hand, a more
accurate rate-distortion measure avoids using PNS
where it would cause audible distortion.

Also fixes a small bug in the computation of freq
(the band offset was not taken relative to the start
of its window) that caused PNS usage on low-frequency
bands during 8-short windows. This allows re-enabling
PNS during 8-short.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9458a62decfcaa1313b1ba69276466de536d0768
---

 libavcodec/aaccoder.c |   47 +++++++++++++++++++++--------------------------
 tests/fate/aac.mak    |    2 +-
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/libavcodec/aaccoder.c b/libavcodec/aaccoder.c
index 8d5ea77..4749d8c 100644
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@@ -593,19 +593,18 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
     const float lambda = s->lambda;
     const float freq_mult = avctx->sample_rate/(1024.0f/sce->ics.num_windows)/2.0f;
     const float thr_mult = NOISE_LAMBDA_REPLACE*(100.0f/lambda);
-    const float spread_threshold = NOISE_SPREAD_THRESHOLD*(lambda/100.f);
-
-    if (sce->ics.window_sequence[0] == EIGHT_SHORT_SEQUENCE)
-        return;
+    const float spread_threshold = NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f);
 
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
     for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        int wstart = sce->ics.swb_offset[w*16];
         for (g = 0;  g < sce->ics.num_swb; g++) {
             int noise_sfi;
             float dist1 = 0.0f, dist2 = 0.0f, noise_amp;
-            float pns_energy = 0.0f, energy_ratio, dist_thresh;
+            float pns_energy = 0.0f, pns_tgt_energy, energy_ratio, dist_thresh;
             float sfb_energy = 0.0f, threshold = 0.0f, spread = 0.0f;
             const int start = sce->ics.swb_offset[w*16+g];
-            const float freq = start*freq_mult;
+            const float freq = (start-wstart)*freq_mult;
             const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
             if (freq < NOISE_LOW_LIMIT || avctx->cutoff && freq >= avctx->cutoff)
                 continue;
@@ -617,18 +616,22 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
             }
 
             /* Ramps down at ~8000Hz and loosens the dist threshold */
-            dist_thresh = FFMIN(2.5f*NOISE_LOW_LIMIT/freq, 1.27f);
+            dist_thresh = FFMIN(2.5f*NOISE_LOW_LIMIT/freq, 2.5f);
 
-            if (sce->zeroes[w*16+g] || spread < spread_threshold ||
-                sfb_energy > threshold*thr_mult*freq_boost) {
+            /* zero and energy close to threshold usually means hole avoidance,
+             * we do want to remain avoiding holes with PNS
+             */
+            if (((sce->zeroes[w*16+g] || !sce->band_alt[w*16+g]) && sfb_energy < threshold*sqrtf(1.5f/freq_boost)) || spread < spread_threshold ||
+                (sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost)) {
                 sce->pns_ener[w*16+g] = sfb_energy;
                 continue;
             }
 
-            noise_sfi = av_clip(roundf(log2f(sfb_energy)*2), -100, 155); /* Quantize */
+            pns_tgt_energy = sfb_energy*spread*spread/sce->ics.group_len[w];
+            noise_sfi = av_clip(roundf(log2f(pns_tgt_energy)*2), -100, 155); /* Quantize */
             noise_amp = -ff_aac_pow2sf_tab[noise_sfi + POW_SF2_ZERO];    /* Dequantize */
             for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
-                float band_energy, scale;
+                float band_energy, scale, pns_senergy;
                 const int start_c = sce->ics.swb_offset[(w+w2)*16+g];
                 band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                 for (i = 0; i < sce->ics.swb_sizes[g]; i++)
@@ -636,7 +639,8 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
                 band_energy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
                 scale = noise_amp/sqrtf(band_energy);
                 s->fdsp->vector_fmul_scalar(PNS, PNS, scale, sce->ics.swb_sizes[g]);
-                pns_energy += s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                pns_senergy = s->fdsp->scalarproduct_float(PNS, PNS, sce->ics.swb_sizes[g]);
+                pns_energy += pns_senergy;
                 abs_pow34_v(NOR34, &sce->coeffs[start_c], sce->ics.swb_sizes[g]);
                 abs_pow34_v(PNS34, PNS, sce->ics.swb_sizes[g]);
                 dist1 += quantize_band_cost(s, &sce->coeffs[start_c],
@@ -645,23 +649,14 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
                                             sce->sf_idx[(w+w2)*16+g],
                                             sce->band_alt[(w+w2)*16+g],
                                             lambda/band->threshold, INFINITY, NULL, 0);
-                dist2 += quantize_band_cost(s, PNS,
-                                            PNS34,
-                                            sce->ics.swb_sizes[g],
-                                            noise_sfi,
-                                            NOISE_BT,
-                                            lambda/band->threshold, INFINITY, NULL, 0);
+                /* Estimate rd on average as 9 bits for CB and sf + spread energy * lambda/thr */
+                dist2 += 9+band->energy/(band->spread*band->spread)*lambda/band->threshold;
             }
-            energy_ratio = sfb_energy/pns_energy; /* Compensates for quantization error */
-            sce->pns_ener[w*16+g] = energy_ratio*sfb_energy;
-            if (energy_ratio > 0.85f && energy_ratio < 1.25f && dist1/dist2 > dist_thresh) {
+            energy_ratio = pns_tgt_energy/pns_energy; /* Compensates for quantization error */
+            sce->pns_ener[w*16+g] = energy_ratio*pns_tgt_energy;
+            if (energy_ratio > 0.85f && energy_ratio < 1.25f && (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || dist2*dist_thresh < dist1)) {
                 sce->band_type[w*16+g] = NOISE_BT;
                 sce->zeroes[w*16+g] = 0;
-                if (sce->band_type[w*16+g-1] != NOISE_BT && /* Prevent holes */
-                    sce->band_type[w*16+g-2] == NOISE_BT) {
-                    sce->band_type[w*16+g-1] = NOISE_BT;
-                    sce->zeroes[w*16+g-1] = 0;
-                }
             }
         }
     }
diff --git a/tests/fate/aac.mak b/tests/fate/aac.mak
index 8e9c915..d6a355e 100644
--- a/tests/fate/aac.mak
+++ b/tests/fate/aac.mak
@@ -174,7 +174,7 @@ fate-aac-pns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.w
 fate-aac-pns-encode: CMP_SHIFT = -4096
 fate-aac-pns-encode: CMP_TARGET = 623.77
 fate-aac-pns-encode: SIZE_TOLERANCE = 3560
-fate-aac-pns-encode: FUZZ = 1
+fate-aac-pns-encode: FUZZ = 25
 
 FATE_AAC_ENCODE += fate-aac-tns-encode
 fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k