[FFmpeg-devel] [RFC] AAC Encoder

Sat Aug 16 03:57:56 CEST 2008

On Fri, Aug 15, 2008 at 07:59:52PM +0300, Kostya wrote:
[...]

> /**
>  * Set window sequence and related parameters for channel element.
>  *
>  * @param ctx   model context
>  * @param audio samples for the current frame
>  * @param la    lookahead samples (NULL when unavailable)
>  * @param tag   number of channel element to analyze
>  * @param type  channel element type (e.g. ID_SCE or ID_CPE)
>  * @param cpe   pointer to the current channel element
>  */
> void ff_aac_psy_suggest_window(AACPsyContext *ctx, int16_t *audio, int16_t *la, int tag, int type, ChannelElement *cpe);

I really think this should return the window size as int
0 could mean "dont know try both" possibly, of course it can be done
differently.

[...]
> /**
>  * Quantize one coefficient.
>  * @return absolute value of the quantized coefficient
>  * @see 3GPP TS26.403 5.6.2 "Scalefactor determination"
>  */
> static av_always_inline int quant(float coef, const float Q)
> {
>     return av_clip((int)(pow(fabsf(coef) * Q, 0.75) + 0.4054), 0, 8191);
> }

converting float to int by casting is rather slow on x86
also i do not see why the cliping against 0 is done

and where does the 0.4054 come from? How has this value been selected?

> 
> /**
>  * Convert coefficients to integers.

>  * @return sum of coefficients

this is not true, its the sum of their absolute values

[...]
> 
> /**
>  * constants for 3GPP AAC psychoacoustic model
>  * @{
>  */
> #define PSY_3GPP_C1 3.0f                    // log2(8.0)
> #define PSY_3GPP_C2 1.32192809488736234787f // log2(2.5)
> #define PSY_3GPP_C3 0.55935730170421255071f // 1 - C2/C1
> 
> #define PSY_3GPP_SPREAD_LOW  1.5f // spreading factor for ascending threshold spreading  (15 dB/Bark)
> #define PSY_3GPP_SPREAD_HI   3.0f // spreading factor for descending threshold spreading (30 dB/Bark)
> 

> #define PSY_3GPP_RPEMIN      0.01f
> #define PSY_3GPP_RPELEV      2.0f

RPE ?
please document what that means, iam no psychoacoustic guru
Its easier to review code when one knows what something is.

> /**
>  * @}
>  */
> 
> /**
>  * information for single band used by 3GPP TS26.403-inspired psychoacoustic model
>  */
> typedef struct Psy3gppBand{
>     float energy;    ///< band energy
>     float ffac;      ///< form factor
>     float thr;       ///< energy threshold
>     float pe;        ///< perceptual entropy

>     float a;         ///< constant part in perceptual entropy
>     float b;         ///< variable part in perceptual entropy

no, single letter variable names are not good for structs
iam also not truly happy about pe and nl either but they at least have names
matching the initial letters of what the abbreviate.

>     float nl;        ///< predicted number of lines left after quantization
>     float min_snr;   ///< minimal SNR
>     float thr_quiet; ///< threshold in quiet
> }Psy3gppBand;
> 
> /**
>  * single/pair channel context for psychoacoustic model
>  */
> typedef struct Psy3gppChannel{
>     float       a[2];                       ///< parameter used for perceptual entropy - constant part
>     float       b[2];                       ///< parameter used for perceptual entropy - variable part
>     float       pe[2];                      ///< channel perceptual entropy
>     float       thr[2];                     ///< channel thresholds sum

this feels somewhat duplicated from above, iam not saying i want it to
be changed though, i first need to better understand the code

>     Psy3gppBand band[2][128];               ///< bands information
>     Psy3gppBand prev_band[2][128];          ///< bands information from the previous frame

no next_band? yes it may be a naive question but my naive mind would consider
the next to be usefull as well when the previous is.

> 
>     float       win_nrg[2];                 ///< sliding average of channel energy

nrg is not an acceptable abbreviation for energy

[...]
> /**
>  * Tell encoder which window types to use.
>  * @see 3GPP TS26.403 5.4.1 "Blockswitching"
>  */
> static void psy_3gpp_window(AACPsyContext *apc, int16_t *audio, int16_t *la, int tag, int type, ChannelElement *cpe)
> {
>     int ch;
>     int chans = type == ID_CPE ? 2 : 1;
>     int i, j;
>     int br = apc->avctx->bit_rate / apc->avctx->channels;
>     int attack_ratio = (br <= 16000 + 8000*chans) ? 18 : 10;
>     Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
>     Psy3gppChannel *pch = &pctx->ch[tag];
>     uint8_t grouping[2];
>     enum WindowSequence win[2];
> 

>     if(la && !(apc->flags & PSY_MODEL_NO_SWITCH)){
>         float s[8], v;
>         for(ch = 0; ch < chans; ch++){
>             enum WindowSequence last_window_sequence = cpe->ch[ch].ics.window_sequence[0];
>             int switch_to_eight = 0;
>             float sum = 0.0, sum2 = 0.0;
>             int attack_n = 0;
>             for(i = 0; i < 8; i++){
>                 for(j = 0; j < 128; j++){
>                     v = iir_filter(audio[(i*128+j)*apc->avctx->channels+ch], pch->iir_state[ch]);

this filter can be unrolled by a factor of 2 to avoid some moves in its
state (like the low pass one)

>                     sum += v*v;
>                 }
>                 s[i] = sum;
>                 sum2 += sum;
>             }
>             for(i = 0; i < 8; i++){
>                 if(s[i] > pch->win_nrg[ch] * attack_ratio){
>                     attack_n = i + 1;
>                     switch_to_eight = 1;
>                     break;
>                 }
>             }
>             pch->win_nrg[ch] = pch->win_nrg[ch]*7/8 + sum2/64;
> 
>             switch(last_window_sequence){
>             case ONLY_LONG_SEQUENCE:
>                 win[ch] = switch_to_eight ? LONG_START_SEQUENCE : ONLY_LONG_SEQUENCE;
>                 grouping[ch] = 0;
>                 break;
>             case LONG_START_SEQUENCE:
>                 win[ch] = EIGHT_SHORT_SEQUENCE;
>                 grouping[ch] = pch->next_grouping[ch];
>                 break;
>             case LONG_STOP_SEQUENCE:
>                 win[ch] = ONLY_LONG_SEQUENCE;
>                 grouping[ch] = 0;
>                 break;
>             case EIGHT_SHORT_SEQUENCE:
>                 win[ch] = switch_to_eight ? EIGHT_SHORT_SEQUENCE : LONG_STOP_SEQUENCE;
>                 grouping[ch] = switch_to_eight ? pch->next_grouping[ch] : 0;
>                 break;
>             }
>             pch->next_grouping[ch] = window_grouping[attack_n];
>         }

How much quality is lost by using this compared to RD optimal switching?

[...]

> /**
>  * Modify threshold by adding some value in loudness domain.
>  * @see 3GPP TS26.403 5.6.1.1.1 "Addition of noise with equal loudness"
>  */
> static inline float modify_thr(float thr, float r){
>     float t;
>     t = pow(thr, 0.25) + r;
>     return t*t*t*t;

(t*t)*(t*t)
might help the compiler optimize it

[...]
> /**
>  * Determine scalefactors and prepare coefficients for encoding.
>  * @see 3GPP TS26.403 5.4 "Psychoacoustic model"
>  */
> static void psy_3gpp_process(AACPsyContext *apc, int tag, int type, ChannelElement *cpe)
> {
>     int start;
>     int ch, w, wg, g, g2, i;
>     int prev_scale;
>     Psy3gppContext *pctx = (Psy3gppContext*) apc->model_priv_data;
>     float pe_target;
>     int bits_avail;
>     int chans = type == ID_CPE ? 2 : 1;
>     Psy3gppChannel *pch = &pctx->ch[tag];
> 
>     //calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
>     memset(pch->band, 0, sizeof(pch->band));
>     for(ch = 0; ch < chans; ch++){
>         start = 0;
>         for(w = 0; w < cpe->ch[ch].ics.num_windows; w++){
>             for(g = 0; g < cpe->ch[ch].ics.num_swb; g++){
>                 g2 = w*16 + g;
>                 for(i = 0; i < cpe->ch[ch].ics.swb_sizes[g]; i++)
>                     pch->band[ch][g2].energy +=  cpe->ch[ch].coeffs[start+i] *  cpe->ch[ch].coeffs[start+i];

>                 pch->band[ch][g2].energy /= 262144.0f;

and this factor does what?
besides it should be *= 1.0/...

>                 pch->band[ch][g2].thr = pch->band[ch][g2].energy * 0.001258925f;
>                 start += cpe->ch[ch].ics.swb_sizes[g];
>                 if(pch->band[ch][g2].energy != 0.0){
>                     float ffac = 0.0;
> 
>                     for(i = 0; i < cpe->ch[ch].ics.swb_sizes[g]; i++)
>                         ffac += sqrt(FFABS(cpe->ch[ch].coeffs[start+i]));
>                     pch->band[ch][g2].ffac = ffac / sqrt(512.0);
>                 }
>             }
>         }
>     }
> 
>     //modify thresholds - spread, threshold in quiet - 5.4.3 "Spreaded Energy Calculation"
>     for(ch = 0; ch < chans; ch++){
>         for(w = 0; w < cpe->ch[ch].ics.num_windows; w++){
>             for(g = 1; g < cpe->ch[ch].ics.num_swb; g++){
>                 g2 = w*16 + g;
>                 if(cpe->ch[ch].ics.num_swb == apc->num_bands1024)
>                     pch->band[ch][g2].thr = FFMAX(pch->band[ch][g2].thr, pch->band[ch][g2-1].thr * pctx->s_low_l[g-1]);
>                 else
>                     pch->band[ch][g2].thr = FFMAX(pch->band[ch][g2].thr, pch->band[ch][g2-1].thr * pctx->s_low_s[g-1]);
>             }
>             for(g = cpe->ch[ch].ics.num_swb - 2; g >= 0; g--){
>                 g2 = w*16 + g;
>                 if(cpe->ch[ch].ics.num_swb == apc->num_bands1024)
>                     pch->band[ch][g2].thr = FFMAX(pch->band[ch][g2].thr, pch->band[ch][g2+1].thr * pctx->s_hi_l[g+1]);
>                 else
>                     pch->band[ch][g2].thr = FFMAX(pch->band[ch][g2].thr, pch->band[ch][g2+1].thr * pctx->s_hi_s[g+1]);
>             }
>             for(g = 0; g < cpe->ch[ch].ics.num_swb; g++){
>                 g2 = w*16 + g;
>                 if(cpe->ch[ch].ics.num_swb == apc->num_bands1024)
>                     pch->band[ch][g2].thr_quiet = FFMAX(pch->band[ch][g2].thr, pctx->ath_l[g]);
>                 else
>                     pch->band[ch][g2].thr_quiet = FFMAX(pch->band[ch][g2].thr, pctx->ath_s[g]);
>                 pch->band[ch][g2].thr_quiet = fmaxf(PSY_3GPP_RPEMIN*pch->band[ch][g2].thr_quiet, fminf(pch->band[ch][g2].thr_quiet, PSY_3GPP_RPELEV*pch->prev_band[ch][g2].thr_quiet));
>                 pch->band[ch][g2].thr = FFMAX(pch->band[ch][g2].thr, pch->band[ch][g2].thr_quiet * 0.25);
>             }
>         }
>     }
> 

>     // M/S detection - 5.5.2 "Mid/Side Stereo"
>     if(chans > 1 && cpe->common_window){
>         start = 0;
>         for(w = 0; w < cpe->ch[0].ics.num_windows; w++){
>             for(g = 0; g < cpe->ch[0].ics.num_swb; g++){
>                 double en_m = 0.0, en_s = 0.0, ff_m = 0.0, ff_s = 0.0, l1;
>                 float m, s;
> 
>                 g2 = w*16 + g;
>                 cpe->ms.mask[w][g] = 0;
>                 if(pch->band[0][g2].energy == 0.0 || pch->band[1][g2].energy == 0.0)
>                     continue;

>                 for(i = 0; i < cpe->ch[0].ics.swb_sizes[g]; i++){
>                     m = (cpe->ch[0].coeffs[start+i] + cpe->ch[1].coeffs[start+i]) / 2.0;
>                     s = (cpe->ch[0].coeffs[start+i] - cpe->ch[1].coeffs[start+i]) / 2.0;
>                     en_m += m*m;
>                     en_s += s*s;
>                     ff_m += sqrt(FFABS(m));
>                     ff_s += sqrt(FFABS(s));
>                 }
>                 en_m /= 262144.0;
>                 en_s /= 262144.0;

the /2 can be removed out of the loop

>                 ff_m /= sqrt(512.0);
>                 ff_s /= sqrt(512.0);

>                 l1 = FFMIN(pch->band[0][g2].thr, pch->band[1][g2].thr);
>                 if(en_m == 0.0 || en_s == 0.0 || l1*l1 / (en_m * en_s) >= (pch->band[0][g2].thr * pch->band[1][g2].thr / (pch->band[0][g2].energy * pch->band[1][g2].energy))){

If i assume that the threshold magic does something positve and just
simplify it

t0=  pch->band[0][g2].thr
t1=  pch->band[1][g2].thr
mint= FFMIN(t0,t1)
maxt= FFMAX(t0,t1)
en0= pch->band[0][g2].energy
en1= pch->band[1][g2].energy
if(mint*en0*en1 >= en_m*en_s*maxt){

then one can actually read the code and it makes even some sense

also the ff_m/_s is not used unless the condition is true, and then the whole
code is full of duplicated variants calculating it this should be done in a
simpler and cleaner way and only when it is actually used and once, 
sqrt is not that fast.

[...]
>         //determine scalefactors - 5.6.2 "Scalefactor determination"
>         for(ch = 0; ch < chans; ch++){
>             prev_scale = -1;
>             for(w = 0; w < cpe->ch[ch].ics.num_windows; w++){
>                 for(g = 0; g < cpe->ch[ch].ics.num_swb; g++){
>                     g2 = w*16 + g;

>                     cpe->ch[ch].zeroes[w][g] = pch->band[ch][g2].thr >= pch->band[ch][g2].energy;

how much quality is lost compared to full RD decission ? its just a matter of
checking how many bits this would need which is likely negligible speed wise.
(assuming you can unentangle the threshold check into a distortion
 computation)

>                     if(cpe->ch[ch].zeroes[w][g]) continue;
>                     //spec gives constant for lg() but we scaled it for log2()
>                     cpe->ch[ch].sf_idx[w][g] = (int)(2.66667 * (log2(6.75*pch->band[ch][g2].thr) - log2(pch->band[ch][g2].ffac)));

2.666... * log2(6.75*pch->band[ch][g2].thr / pch->band[ch][g2].ffac)

>                     if(prev_scale != -1)
>                         cpe->ch[ch].sf_idx[w][g] = av_clip(cpe->ch[ch].sf_idx[w][g], prev_scale - SCALE_MAX_DIFF, prev_scale + SCALE_MAX_DIFF);
>                     prev_scale = cpe->ch[ch].sf_idx[w][g];
>                 }
>             }
>         }
>         break;
>     case PSY_MODE_QUALITY:
>         for(ch = 0; ch < chans; ch++){
>             start = 0;
>             for(w = 0; w < cpe->ch[ch].ics.num_windows; w++){
>                 for(g = 0; g < cpe->ch[ch].ics.num_swb; g++){
>                     g2 = w*16 + g;
>                     if(pch->band[ch][g2].thr >= pch->band[ch][g2].energy){
>                         cpe->ch[ch].sf_idx[w][g] = 0;
>                         cpe->ch[ch].zeroes[w][g] = 1;
>                     }else{
>                         cpe->ch[ch].zeroes[w][g] = 0;
>                         cpe->ch[ch].sf_idx[w][g] = (int)(2.66667 * (log2(6.75*pch->band[ch][g2].thr) - log2(pch->band[ch][g2].ffac)));
>                         while(cpe->ch[ch].sf_idx[ch][g] > 3){
>                             float dist = calc_distortion(cpe->ch[ch].coeffs + start, cpe->ch[ch].ics.swb_sizes[g], SCALE_ONE_POS + cpe->ch[ch].sf_idx[ch][g]);
>                             if(dist < pch->band[ch][g2].thr) break;
>                             cpe->ch[ch].sf_idx[ch][g] -= 3;
>                         }
>                     }

looks rather similar to the previous cases

>                     start += cpe->ch[ch].ics.swb_sizes[g];
>                 }
>             }
>         }
>         break;
>     }
> 

>     //limit scalefactors

why? bitstream limits or is this related to psy?

[...]
> static const AACPsyModel psy_models[AAC_NB_PSY_MODELS] =
> {
>     {
>        "Null model",
>         NULL,
>         psy_null_window,
>         psy_null_process,
>         NULL,
>     },
>     {
>        "Null model - short windows",
>         NULL,
>         psy_null8_window,
>         psy_null8_process,
>         NULL,
>     },
>     {
>        "3GPP TS 26.403-inspired model",
>         psy_3gpp_init,
>         psy_3gpp_window,
>         psy_3gpp_process,
>         psy_3gpp_end,
>     },
> };

Iam tempted to suggest that you drop the 2 null models. I see no real
sense in them. But if you like to keep them thats perfectly fine with me.
Adding a 4th model for experimenation would possibly make sense
Also you could keep the 3gpp exactly as the spec says and implement all my
suggestions rather in a 4th model that would make comparissions easier and
would give some protection against mistakes ...

> 
> int av_cold ff_aac_psy_init(AACPsyContext *ctx, AVCodecContext *avctx,
>                             enum AACPsyModelType model, int elements, int flags,
>                             const uint8_t *bands1024, int num_bands1024,
>                             const uint8_t *bands128,  int num_bands128)
> {
>     int i;
> 

>     if(model >= AAC_NB_PSY_MODELS || !psy_models[model].window || !psy_models[model].process){
>          av_log(avctx, AV_LOG_ERROR, "Invalid psy model\n");
>          return -1;
>     }

is there really any sense in checking .window and .process of a const static
array?

> 

> #ifndef CONFIG_HARDCODED_TABLES
>    for (i = 0; i < 316; i++)
>         ff_aac_pow2sf_tab[i] = pow(2, (i - 200)/4.);
> #endif /* CONFIG_HARDCODED_TABLES */

this is likely duplicated

[...]
> void ff_aac_psy_preprocess(AACPsyContext *ctx, int16_t *audio, int16_t *dest, int tag, int type)
> {
>     int chans = type == ID_CPE ? 2 : 1;
>     const int chstride = ctx->avctx->channels;
>     int i, ch;
>     float t[2];
> 
>     if(chans == 1){

>         for(ch = 0; ch < chans; ch++){
>             for(i = 0; i < 1024; i++){
>                 dest[i * chstride + ch] = audio[i * chstride + ch];
>             }
>         }

memcpy

>     }else{
>         for(i = 0; i < 1024; i++){
>             if(ctx->flags & PSY_MODEL_NO_ST_ATT){
>                 for(ch = 0; ch < 2; ch++)
>                     t[ch] = audio[i * chstride + ch];
>             }else{
>                 t[0] = audio[i * chstride + 0] * (0.5 + ctx->stereo_att) + audio[i * chstride + 1] * (0.5 - ctx->stereo_att);
>                 t[1] = audio[i * chstride + 0] * (0.5 - ctx->stereo_att) + audio[i * chstride + 1] * (0.5 + ctx->stereo_att);
>             }
>             if(!(ctx->flags & PSY_MODEL_NO_LOWPASS)){
>                 LPFilterState *is = (LPFilterState*)ctx->lp_state + tag*2;

>                 for(ch = 0; ch < 2; ch++)
>                     t[ch] = ff_lowpass_filter(&ctx->lp_coeffs, is + ch, t[ch]);

i do not think this is compatible with the filter you posted

>             }
>             for(ch = 0; ch < 2; ch++)
>                 dest[i * chstride + ch] = av_clip_int16(t[ch]);

we have optimized code for converting float to int16, please use it

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

It is not what we do, but why we do it that matters.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080816/30ad028b/attachment.pgp>