[FFmpeg-devel] [PATCH] HE-AAC v1 decoder

Måns Rullgård mans
Wed Jan 27 16:10:08 CET 2010


Alex Converse <alex.converse at gmail.com> writes:

> Greetings Sirs,
>
> Enclosed you will find one (1) HE-AAC decoder to make glorious great
> software FFmpeg.

Good stuff :-)

> More seriously, attached is a patch that implements SBR and integrates
> it with AAC to make an HE-AAC v1 decoder. A filterbank compatible with
> ff_float_to_int16_interleave_c is missing but otherwise the code is
> complete.

Does this mean it won't work on systems that don't have an ASM
version?

> Note that the decoder spends 90+% of its time in the filterbank and
> will be extremely slow on any system that does not have a asm
> version of scalarproduct_float().

Good thing it's an easy function to write asm for. ;-)

> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 94c93dd..e5ca336 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -40,7 +40,7 @@ OBJS-$(CONFIG_VAAPI)                   += vaapi.o
>  OBJS-$(CONFIG_VDPAU)                   += vdpau.o
>  
>  # decoders/encoders/hardware accelerators
> -OBJS-$(CONFIG_AAC_DECODER)             += aac.o aactab.o
> +OBJS-$(CONFIG_AAC_DECODER)             += aac.o aactab.o aacsbr.o
>  OBJS-$(CONFIG_AAC_ENCODER)             += aacenc.o aaccoder.o    \
>                                            aacpsy.o aactab.o      \
>                                            psymodel.o iirfilter.o \

OK

> +av_cold void ff_aac_sbr_init(void)
> +{
> +    int n, k;
> +    static const struct {
> +        const void *sbr_codes, *sbr_bits;
> +        const unsigned int table_size, elem_size;
> +    } sbr_tmp[] = {
> +        { t_huffman_env_1_5dB_codes,       t_huffman_env_1_5dB_bits,
> +            sizeof(t_huffman_env_1_5dB_codes),       sizeof(t_huffman_env_1_5dB_codes[0]) },
> +        { f_huffman_env_1_5dB_codes,       f_huffman_env_1_5dB_bits,
> +            sizeof(f_huffman_env_1_5dB_codes),       sizeof(f_huffman_env_1_5dB_codes[0]) },
> +        { t_huffman_env_bal_1_5dB_codes,   t_huffman_env_bal_1_5dB_bits,
> +            sizeof(t_huffman_env_bal_1_5dB_codes),   sizeof(t_huffman_env_bal_1_5dB_codes[0]) },
> +        { f_huffman_env_bal_1_5dB_codes,   f_huffman_env_bal_1_5dB_bits,
> +            sizeof(f_huffman_env_bal_1_5dB_codes),   sizeof(f_huffman_env_bal_1_5dB_codes[0]) },
> +        { t_huffman_env_3_0dB_codes,       t_huffman_env_3_0dB_bits,
> +            sizeof(t_huffman_env_3_0dB_codes),       sizeof(t_huffman_env_3_0dB_codes[0]) },
> +        { f_huffman_env_3_0dB_codes,       f_huffman_env_3_0dB_bits,
> +            sizeof(f_huffman_env_3_0dB_codes),       sizeof(f_huffman_env_3_0dB_codes[0]) },
> +        { t_huffman_env_bal_3_0dB_codes,   t_huffman_env_bal_3_0dB_bits,
> +            sizeof(t_huffman_env_bal_3_0dB_codes),   sizeof(t_huffman_env_bal_3_0dB_codes[0]) },
> +        { f_huffman_env_bal_3_0dB_codes,   f_huffman_env_bal_3_0dB_bits,
> +            sizeof(f_huffman_env_bal_3_0dB_codes),   sizeof(f_huffman_env_bal_3_0dB_codes[0]) },
> +        { t_huffman_noise_3_0dB_codes,     t_huffman_noise_3_0dB_bits,
> +            sizeof(t_huffman_noise_3_0dB_codes),     sizeof(t_huffman_noise_3_0dB_codes[0]) },
> +        { t_huffman_noise_bal_3_0dB_codes, t_huffman_noise_bal_3_0dB_bits,
> +            sizeof(t_huffman_noise_bal_3_0dB_codes), sizeof(t_huffman_noise_bal_3_0dB_codes[0]) },
> +    };

I don't really like the formatting of that table, but it's no big deal.

> +static int array_min_int16(int16_t *array, int nel)
> +{
> +    int i, min = array[0];
> +    for (i = 1; i < nel; i++)
> +        if (array[i] < min)
> +            min = array[i];
> +    return min;
> +}

How long are these arrays typically?  Is this called a lot?  There's a
SIMD possibility here.

> +static int qsort_comparison_function_int16(const void *a, const void *b)
> +{
> +    return *(const int16_t *)a - *(const int16_t *)b;
> +}

A radix sort would probably be faster.

> +static void make_bands(int16_t* bands, int start, int stop, int num_bands)
> +{
> +    int k, previous, present;
> +    float base, prod;
> +
> +    base = powf((float)stop / start, 1.0f / num_bands);
> +    prod = 1.0f;
> +    previous = start;
> +
> +    for (k = 0; k < num_bands-1; k++) {
> +        prod *= base;
> +        present  = lroundf(start * prod);
> +        bands[k] = present - previous;
> +        previous = present;
> +    }
> +    bands[num_bands-1] = stop - previous;
> +}

Cue another round of libm fixups...

> +static inline void remove_table_element(void *table, uint8_t *last_el, int el_size,
> +                                        int el)
> +{
> +    memmove((uint8_t *)table + el_size*el, (uint8_t *)table + el_size*(el + 1), (*last_el - el)*el_size);
> +    (*last_el)--;
> +}

I'd declare a temp variable of type uint8_t * and drop the casts.

> +static inline int in_table(void *table, int last_el, int el_size, void *needle)
> +{
> +    int i;
> +    uint8_t *table_ptr = table; // avoids a warning with void * ptr arith

void * arithmetic is not just a warning, it's impossible.  It boggles
my mind that gcc doesn't throw an error over it.

> +    for (i = 0; i <= last_el; i++, table_ptr += el_size)
> +        if (!memcmp(table_ptr, needle, el_size))
> +            return 1;
> +    return 0;
> +}

Since these functions are only used with 16-bit elements, they could
be made much faster.

> +/// Derived Frequency Band Tables (14496-3 sp04 p197)
> +static int sbr_make_f_derived(AACContext *ac, SpectralBandReplication *sbr)
> +{
> +    int k, temp;
> +
> +    sbr->n[1] = sbr->n_master - sbr->spectrum_params[1].bs_xover_band;
> +    sbr->n[0] = (sbr->n[1] + 1) >> 1;
> +
> +    memcpy(sbr->f_tablehigh, &sbr->f_master[sbr->spectrum_params[1].bs_xover_band],
> +           (sbr->n[1] + 1) * sizeof(sbr->f_master[0]));
> +    sbr->m[1] = sbr->f_tablehigh[sbr->n[1]] - sbr->f_tablehigh[0];
> +    sbr->k[4] = sbr->f_tablehigh[0];
> +
> +    // Requirements (14496-3 sp04 p205)
> +    if (sbr->k[4] + sbr->m[1] > 64) {
> +        av_log(ac->avccontext, AV_LOG_ERROR,
> +               "Stop frequency border too high: %d\n", sbr->k[4] + sbr->m[1]);
> +        return -1;
> +    }
> +    if (sbr->k[4] > 32) {
> +        av_log(ac->avccontext, AV_LOG_ERROR, "Start frequency border too high: %d\n", sbr->k[4]);
> +        return -1;
> +    }
> +
> +    sbr->f_tablelow[0] = sbr->f_tablehigh[0];
> +    temp = sbr->n[1] & 1;
> +    for (k = 1; k <= sbr->n[0]; k++)
> +        sbr->f_tablelow[k] = sbr->f_tablehigh[(k << 1) - temp];
> +
> +    sbr->n_q = FFMAX(1, lroundf(sbr->spectrum_params[1].bs_noise_bands *
> +                                log2f(sbr->k[2] / (float)sbr->k[4]))); // 0 <= bs_noise_bands <= 3
> +    if (sbr->n_q > 5) {
> +        av_log(ac->avccontext, AV_LOG_ERROR, "Too many noise floor scale factors: %d\n", sbr->n_q);
> +        return -1;
> +    }
> +
> +    sbr->f_tablenoise[0] = sbr->f_tablelow[0];
> +    temp = 0;
> +    for (k = 1; k <= sbr->n_q; k++) {
> +        temp += (sbr->n[0] - temp) / (sbr->n_q + 1 - k);
> +        sbr->f_tablenoise[k] = sbr->f_tablelow[temp];
> +    }
> +
> +    sbr_hf_calc_npatches(ac, sbr);

Missing error check.

> +    // Limiter Frequency Band Table (14496-3 sp04 p198)
> +    if (sbr->bs_limiter_bands > 0) {
> +        const float lim_bands_per_octave[3] = {1.2, 2, 3};
> +        int16_t patch_borders[sbr->num_patches + 1];

No variable-length arrays please.  sbr->num_patches is at most 5 so
just use that.

> +        patch_borders[0] = sbr->k[4];
> +        for (k=1; k <= sbr->num_patches; k++)
> +            patch_borders[k] = patch_borders[k-1] + sbr->patch_num_subbands[k-1];
> +
> +        memcpy( sbr->f_tablelim,                  sbr->f_tablelow,
> +               (sbr->n[0]        + 1) * sizeof(sbr->f_tablelow[0]));
> +        memcpy(&sbr->f_tablelim[sbr->n[0] + 1], &patch_borders[1],
> +               (sbr->num_patches - 1) * sizeof(patch_borders[0]));
> +
> +        qsort(sbr->f_tablelim, sbr->num_patches + sbr->n[0],
> +              sizeof(sbr->f_tablelim[0]),
> +              qsort_comparison_function_int16);
> +
> +        k = 1;
> +        sbr->n_lim = sbr->n[0] + sbr->num_patches - 1;
> +        while (k <= sbr->n_lim) {
> +            // if ( nOctaves * limBands >= 0.49) ...
> +            if (log2(sbr->f_tablelim[k] / (float)sbr->f_tablelim[k-1]) *
> +                lim_bands_per_octave[sbr->bs_limiter_bands - 1] >= 0.49) {
> +                k++;
> +                continue;
> +            }
> +            if (sbr->f_tablelim[k] == sbr->f_tablelim[k-1] ||
> +                !in_table(patch_borders, sbr->num_patches, sizeof(patch_borders[0]), &sbr->f_tablelim[k]))
> +                remove_table_element(sbr->f_tablelim, &sbr->n_lim, sizeof(sbr->f_tablelim[0]), k);
> +            else if (!in_table(patch_borders, sbr->num_patches, sizeof(patch_borders[0]), &sbr->f_tablelim[k-1]))
> +                remove_table_element(sbr->f_tablelim, &sbr->n_lim, sizeof(sbr->f_tablelim[0]), k-1);
> +            else
> +                k++;
> +        };

Instead of shifting the entire table down each time something is
removed, it could be faster to mark the removed elements with some
invalid value (if one exists) and compact it separately afterwards.

> +    } else {
> +        sbr->f_tablelim[0] = sbr->f_tablelow[0];
> +        sbr->f_tablelim[1] = sbr->f_tablelow[sbr->n[0]];
> +        sbr->n_lim = 1;
> +    }
> +
> +    sbr->data[0].f_indexnoise = 0;
> +    sbr->data[1].f_indexnoise = 0;
> +
> +    return 0;
> +}
> +
> +static const int8_t ceil_log2[] = {
> +    0, 0, 1, 2, 2, 3, 3,
> +};

This is always indexed as [x + 1].  Drop the +1 and shift the array
down.

> +static int sbr_grid(AACContext *ac, SpectralBandReplication *sbr,
> +                    GetBitContext *gb, SBRData *ch_data)
> +{
> +    int i;
> +
> +    ch_data->bs_freq_res[0] = ch_data->bs_freq_res[ch_data->bs_num_env[1]];
> +    ch_data->bs_num_env[0] = ch_data->bs_num_env[1];
> +    ch_data->bs_amp_res = sbr->bs_amp_res_header;
> +
> +    switch (ch_data->bs_frame_class = get_bits(gb, 2)) {
> +    case FIXFIX:
> +        ch_data->bs_num_env[1] = 1 << get_bits(gb, 2);
> +        if (ch_data->bs_num_env[1] == 1)
> +            ch_data->bs_amp_res = 0;
> +
> +        ch_data->bs_freq_res[1] = get_bits1(gb);
> +        for (i = 1; i < ch_data->bs_num_env[1]; i++)
> +            ch_data->bs_freq_res[i + 1] = ch_data->bs_freq_res[1];
> +        break;
> +    case FIXVAR:
> +        ch_data->bs_var_bord[1] = get_bits(gb, 2);
> +        ch_data->bs_num_rel[1]  = get_bits(gb, 2);
> +        ch_data->bs_num_env[1] = ch_data->bs_num_rel[1] + 1;
> +
> +        for (i = 0; i < ch_data->bs_num_rel[1]; i++)
> +            ch_data->bs_rel_bord[1][i] = (get_bits(gb, 2) << 1) + 2;
> +
> +        ch_data->bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env[1] + 1]);
> +
> +        for (i = 0; i < ch_data->bs_num_env[1]; i++)
> +            ch_data->bs_freq_res[ch_data->bs_num_env[1] - i] = get_bits1(gb);
> +        break;
> +    case VARFIX:
> +        ch_data->bs_var_bord[0] = get_bits(gb, 2);
> +        ch_data->bs_num_rel[0]  = get_bits(gb, 2);
> +        ch_data->bs_num_env[1]  = ch_data->bs_num_rel[0] + 1;
> +
> +        for (i = 0; i < ch_data->bs_num_rel[0]; i++)
> +            ch_data->bs_rel_bord[0][i] = (get_bits(gb, 2) << 1) + 2;
> +
> +        ch_data->bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env[1] + 1]);
> +
> +        for (i = 0; i < ch_data->bs_num_env[1]; i++)
> +            ch_data->bs_freq_res[i + 1] = get_bits1(gb);
> +        break;
> +    case VARVAR:
> +        ch_data->bs_var_bord[0] = get_bits(gb, 2);
> +        ch_data->bs_var_bord[1] = get_bits(gb, 2);
> +        ch_data->bs_num_rel[0]  = get_bits(gb, 2);
> +        ch_data->bs_num_rel[1]  = get_bits(gb, 2);
> +        ch_data->bs_num_env[1]  = ch_data->bs_num_rel[0] + ch_data->bs_num_rel[1] + 1;
> +
> +        for (i = 0; i < ch_data->bs_num_rel[0]; i++)
> +            ch_data->bs_rel_bord[0][i] = (get_bits(gb, 2) << 1) + 2;
> +        for (i = 0; i < ch_data->bs_num_rel[1]; i++)
> +            ch_data->bs_rel_bord[1][i] = (get_bits(gb, 2) << 1) + 2;
> +
> +        ch_data->bs_pointer = get_bits(gb, ceil_log2[ch_data->bs_num_env[1] + 1]);
> +
> +        for (i = 0; i < ch_data->bs_num_env[1]; i++)
> +            ch_data->bs_freq_res[i + 1] = get_bits1(gb);
> +        break;
> +    default:
> +        break;

Why have an empty default label when it can't even happen?

> +    }
> +
> +    if (ch_data->bs_frame_class == FIXFIX && ch_data->bs_num_env[1] > 4) {
> +        av_log(ac->avccontext, AV_LOG_ERROR,
> +               "Invalid bitstream, too many SBR envelopes in FIXFIX type SBR frame: %d\n",
> +               ch_data->bs_num_env[1]);
> +        return -1;
> +    }
> +    if (ch_data->bs_frame_class == VARVAR && ch_data->bs_num_env[1] > 5) {
> +        av_log(ac->avccontext, AV_LOG_ERROR,
> +               "Invalid bitstream, too many SBR envelopes in VARVAR type SBR frame: %d\n",
> +               ch_data->bs_num_env[1]);
> +        return -1;
> +    }
> +
> +    ch_data->bs_num_noise = ch_data->bs_num_env[1] > 1 ? 2 : 1;
> +
> +    return 0;
> +}
> +
> +static void sbr_grid_copy(SBRData *dst, const SBRData *src) {
> +    //These variables are saved from the previous frame rather than copied
> +    dst->bs_freq_res[0] = dst->bs_freq_res[dst->bs_num_env[1]];
> +    dst->bs_num_env[0]  = dst->bs_num_env[1];
> +
> +    //These variables are read from the bitstream and therefore copied

Maybe there's a good reason for that copying, but the comment sure
doesn't make it any clearer.

> +    memcpy(dst->bs_freq_res+1, src->bs_freq_res+1, sizeof(dst->bs_freq_res)-1);
> +    memcpy(dst->bs_num_env+1,  src->bs_num_env+1,  sizeof(dst->bs_num_env)-1);
> +    memcpy(dst->bs_var_bord,   src->bs_var_bord,   sizeof(dst->bs_var_bord));
> +    memcpy(dst->bs_rel_bord,   src->bs_rel_bord,   sizeof(dst->bs_rel_bord));
> +    memcpy(dst->bs_num_rel,    src->bs_num_rel,    sizeof(dst->bs_rel_bord));
> +    dst->bs_amp_res   = src->bs_amp_res;
> +    dst->bs_num_noise = src->bs_num_noise;
> +    dst->bs_pointer   = src->bs_pointer;
> +    dst->bs_frame_class = src->bs_frame_class;
> +}
> +
> +static void sbr_dtdf(SpectralBandReplication *sbr, GetBitContext *gb,
> +                     SBRData *ch_data)
> +{
> +    int i;
> +
> +    for (i = 0; i < ch_data->bs_num_env[1]; i++)
> +        ch_data->bs_df_env[i] = get_bits1(gb);
> +    for (i = 0; i < ch_data->bs_num_noise; i++)
> +        ch_data->bs_df_noise[i] = get_bits1(gb);
> +}
> +
> +static void sbr_invf(SpectralBandReplication *sbr, GetBitContext *gb,
> +                     SBRData *ch_data)
> +{
> +    int i;
> +
> +    memcpy(ch_data->bs_invf_mode[1], ch_data->bs_invf_mode[0], 5 * sizeof(uint8_t));
> +    for (i = 0; i < sbr->n_q; i++)
> +        ch_data->bs_invf_mode[0][i] = get_bits(gb, 2);
> +}
> +
> +static void sbr_envelope(SpectralBandReplication *sbr, GetBitContext *gb,
> +                         SBRData *ch_data, int ch)
> +{
> +    int bits, max_depth;
> +    int i, j;
> +    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
> +    int t_lav, f_lav;
> +
> +    if (sbr->bs_coupling && ch) {
> +        max_depth = 2;
> +        if (ch_data->bs_amp_res) {
> +            bits = 5;
> +            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_3_0DB].table;
> +            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_3_0DB];
> +            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
> +            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
> +        } else {
> +            bits = 6;
> +            t_huff = vlc_sbr[T_HUFFMAN_ENV_BAL_1_5DB].table;
> +            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_BAL_1_5DB];
> +            f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_1_5DB].table;
> +            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_1_5DB];
> +        }
> +    } else {
> +        max_depth = 3;
> +        if (ch_data->bs_amp_res) {
> +            bits = 6;
> +            t_huff = vlc_sbr[T_HUFFMAN_ENV_3_0DB].table;
> +            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_3_0DB];
> +            f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
> +            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
> +        } else {
> +            bits = 7;
> +            t_huff = vlc_sbr[T_HUFFMAN_ENV_1_5DB].table;
> +            t_lav  = vlc_sbr_lav[T_HUFFMAN_ENV_1_5DB];
> +            f_huff = vlc_sbr[F_HUFFMAN_ENV_1_5DB].table;
> +            f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_1_5DB];
> +        }
> +    }
> +
> +    for (i = 0; i < ch_data->bs_num_env[1]; i++) {
> +        if (!ch_data->bs_df_env[i]) {
> +            ch_data->bs_data_env[i][0] = get_bits(gb, bits); // bs_env_start_value_balance
> +            for (j = 1; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
> +                ch_data->bs_data_env[i][j] = get_vlc2(gb, f_huff, 9, max_depth) - f_lav;
> +        } else {
> +            for (j = 0; j < sbr->n[ch_data->bs_freq_res[i + 1]]; j++)
> +                ch_data->bs_data_env[i][j] = get_vlc2(gb, t_huff, 9, max_depth) - t_lav;
> +        }
> +    }
> +}

get_vlc2() is more efficient with a constant max depth.

> +static void sbr_noise(SpectralBandReplication *sbr, GetBitContext *gb,
> +                      SBRData *ch_data, int ch)
> +{
> +    int max_depth;
> +    int i, j;
> +    VLC_TYPE (*t_huff)[2], (*f_huff)[2];
> +    int t_lav, f_lav;
> +
> +    if (sbr->bs_coupling && ch) {
> +        max_depth = 1;
> +        t_huff = vlc_sbr[T_HUFFMAN_NOISE_BAL_3_0DB].table;
> +        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_BAL_3_0DB];
> +        f_huff = vlc_sbr[F_HUFFMAN_ENV_BAL_3_0DB].table;
> +        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_BAL_3_0DB];
> +    } else {
> +        max_depth = 2;
> +        t_huff = vlc_sbr[T_HUFFMAN_NOISE_3_0DB].table;
> +        t_lav  = vlc_sbr_lav[T_HUFFMAN_NOISE_3_0DB];
> +        f_huff = vlc_sbr[F_HUFFMAN_ENV_3_0DB].table;
> +        f_lav  = vlc_sbr_lav[F_HUFFMAN_ENV_3_0DB];
> +    }
> +
> +    for (i = 0; i < ch_data->bs_num_noise; i++) {
> +        if (!ch_data->bs_df_noise[i]) {
> +            ch_data->bs_data_noise[i][0] = get_bits(gb, 5); // bs_noise_start_value_balance or bs_noise_start_value_level
> +            for (j = 1; j < sbr->n_q; j++)
> +                ch_data->bs_data_noise[i][j] = get_vlc2(gb, f_huff, 9, max_depth + 1) - f_lav;
> +        } else {
> +            for (j = 0; j < sbr->n_q; j++)
> +                ch_data->bs_data_noise[i][j] = get_vlc2(gb, t_huff, 9, max_depth) - t_lav;
> +        }
> +    }
> +}

Ditto.

> +static void sbr_sinusoidal_coding(SpectralBandReplication *sbr,
> +                                  GetBitContext *gb, SBRData *ch_data)
> +{
> +    int i;
> +    for (i = 0; i < sbr->n[1]; i++)
> +        ch_data->bs_add_harmonic[i] = get_bits1(gb);
> +}

There are a lot of loops reading one bit at a time into an array.  I
suspect gcc could use some help making those more efficient.

> +/// SBR Envelope and Noise Floor Decoding (14496-3 sp04 p201)
> +static void sbr_env_noise_floors(SpectralBandReplication *sbr, SBRData *ch_data,
> +                                 int ch)
> +{
> +    int delta = (ch == 1 && sbr->bs_coupling == 1) ? 2 : 1;
> +    int i, k, l;
> +    const int temp = sbr->n[1] & 1;
> +    for (l = 0; l < ch_data->bs_num_env[1]; l++) {
> +        if (ch_data->bs_df_env[l]) {
> +            // bs_freq_res[0] == bs_freq_res[bs_num_env[1]] from prev frame
> +            if (ch_data->bs_freq_res[l + 1] == ch_data->bs_freq_res[l]) {
> +                for (k = 0; k < sbr->n[ch_data->bs_freq_res[l + 1]]; k++)
> +                    ch_data->env_facs[l + 1][k] = ch_data->env_facs[l][k] + delta * ch_data->bs_data_env[l][k];
> +            } else if (ch_data->bs_freq_res[l + 1]) {
> +                for (k = 0; k < sbr->n[ch_data->bs_freq_res[l + 1]]; k++) {
> +                    i = (k + temp) >> 1; // find i such that f_tablelow[i] <= f_tablehigh[k] < f_tablelow[i + 1]
> +                    ch_data->env_facs[l + 1][k] = ch_data->env_facs[l][i] + delta * ch_data->bs_data_env[l][k];
> +                }
> +            } else {
> +                for (k = 0; k < sbr->n[ch_data->bs_freq_res[l + 1]]; k++) {
> +                    i = k ? 2*k - temp : 0; // find i such that f_tablehigh[i] == f_tablelow[k]
> +                    ch_data->env_facs[l + 1][k] = ch_data->env_facs[l][i] + delta * ch_data->bs_data_env[l][k];
> +                }
> +            }
> +        } else {
> +            ch_data->env_facs[l + 1][0] = delta * ch_data->bs_data_env[l][0];
> +            for (k = 1; k < sbr->n[ch_data->bs_freq_res[l + 1]]; k++)
> +                ch_data->env_facs[l + 1][k] = ch_data->env_facs[l + 1][k - 1] + delta * ch_data->bs_data_env[l][k];
> +        }
> +    }
> +
> +    for (l = 0; l < ch_data->bs_num_noise; l++) {
> +        if (ch_data->bs_df_noise[l])
> +            for (k = 0; k < sbr->n_q; k++)
> +                ch_data->noise_facs[l + 1][k] = ch_data->noise_facs[l][k] + delta * ch_data->bs_data_noise[l][k];
> +        else {
> +            ch_data->noise_facs[l + 1][0] = delta * ch_data->bs_data_noise[l][0];
> +            for (k = 1; k < sbr->n_q; k++)
> +                ch_data->noise_facs[l + 1][k] = ch_data->noise_facs[l + 1][k - 1] + delta * ch_data->bs_data_noise[l][k];
> +        }
> +    }
> +
> +    //assign 0th elements of (env|noise)_facs from last elements
> +    memcpy(  ch_data->env_facs[0],   ch_data->env_facs[ch_data->bs_num_env[1]],
> +           sizeof(  ch_data->env_facs[0]));
> +    memcpy(ch_data->noise_facs[0], ch_data->noise_facs[ch_data->bs_num_noise ],
> +           sizeof(ch_data->noise_facs[0]));
> +}

This function is impossible to read.

> +/// Dequantisation and stereo decoding (14496-3 sp04 p203)
> +static void sbr_dequant(SpectralBandReplication *sbr, int id_aac)
> +{
> +    int k, l;
> +    int ch;
> +
> +    if (id_aac == TYPE_CPE && sbr->bs_coupling) {
> +        float alpha = sbr->data[0].bs_amp_res ? 1.0f : 0.5f;
> +        float pan_offset = sbr->data[0].bs_amp_res ? 12.0f : 24.0f;
> +        for (l = 1; l <= sbr->data[0].bs_num_env[1]; l++) {
> +            for (k = 0; k < sbr->n[sbr->data[0].bs_freq_res[l]]; k++) {
> +                float temp1 = exp2f(sbr->data[0].env_facs[l][k] * alpha + 7.0f);
> +                float temp2 = (pan_offset - sbr->data[1].env_facs[l][k]) * alpha;
> +                sbr->data[0].env_facs[l][k] = temp1 / (1.0f + exp2f( temp2));
> +                sbr->data[1].env_facs[l][k] = temp1 / (1.0f + exp2f(-temp2));
> +            }
> +        }
> +        for (l = 1; l <= sbr->data[0].bs_num_noise; l++) {
> +            for (k = 0; k < sbr->n_q; k++) {
> +                float temp1 = exp2f(NOISE_FLOOR_OFFSET - sbr->data[0].noise_facs[l][k] + 1);
> +                float temp2 = 12 - sbr->data[1].noise_facs[l][k];
> +                sbr->data[0].noise_facs[l][k] = temp1 / (1.0f + exp2f( temp2));
> +                sbr->data[1].noise_facs[l][k] = temp1 / (1.0f + exp2f(-temp2));
> +            }
> +        }
> +    } else { // SCE or one non-coupled CPE
> +        for (ch = 0; ch < (id_aac == TYPE_CPE ? 2 : 1); ch++) {
> +            float alpha = sbr->data[ch].bs_amp_res ? 1.0f : 0.5f;
> +            for (l = 1; l <= sbr->data[ch].bs_num_env[1]; l++)
> +                for (k = 0; k < sbr->n[sbr->data[ch].bs_freq_res[l]]; k++)
> +                    sbr->data[ch].env_facs[l][k] =
> +                        exp2f(alpha * sbr->data[ch].env_facs[l][k] + 6.0f);
> +            for (l = 1; l <= sbr->data[ch].bs_num_noise; l++)
> +                for (k = 0; k < sbr->n_q; k++)
> +                    sbr->data[ch].noise_facs[l][k] =
> +                        exp2f(NOISE_FLOOR_OFFSET - sbr->data[ch].noise_facs[l][k]);
> +        }
> +    }
> +}

I get the feeling this could be done more efficiently.  If nothing
else, the two alpha cases could be separated to avoid multiplying by
1.0.  GCC is remarkably bad at that.

> +/**
> + * Analysis QMF Bank (14496-3 sp04 p206)
> + *
> + * @param   x       pointer to the beginning of the first sample window
> + * @param   W       array of complex-valued samples split into subbands
> + */
> +static void sbr_qmf_analysis(DSPContext *dsp, const float *in, float *x,
> +                             float *u, float W[2][32][32][2])
> +{
> +    int i, k, l;
> +    memcpy(W[0], W[1], sizeof(W[0]));
> +    memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
> +    memcpy(x+288, in    ,     1024*sizeof(x[0]));
> +    x += 319;
> +    for (l = 0; l < 32; l++) { // numTimeSlots*RATE = 16*2 as 960 sample frames
> +                               // are not supported
> +        float z[320];
> +        for (i = 0; i < 320; i++)
> +            z[i] = x[-i] * sbr_qmf_window_ds[i];

vector_fmul_reverse()

> +        for (i = 0; i < 64; i++)
> +            u[i] = z[i] + z[i + 64] + z[i + 128] + z[i + 192] + z[i + 256];

Worth simding?

> +        for (k = 0; k < 32; k++) {
> +            W[1][k][l][0] = dsp->scalarproduct_float(u, analysis_cos[k], 64);
> +            W[1][k][l][1] = dsp->scalarproduct_float(u, analysis_sin[k], 64);
> +        }
> +        x += 32;
> +    }

It might be worthwhile writing a special SIMD version of this entire
function.  The way this is written has quite a bit of overhead.

> +}
> +
> +/**
> + * Synthesis QMF Bank (14496-3 sp04 p206) and Downsampled Synthesis QMF Bank
> + * (14496-3 sp04 p206)
> + */
> +static void sbr_qmf_synthesis(DSPContext * dsp, float *out, float X[32][64][2],
> +                              float *v, const unsigned int div)
> +{
> +    int l, n;
> +    const float *sbr_qmf_window = div ? sbr_qmf_window_ds : sbr_qmf_window_us;
> +    float (*synthesis_cossin)[2] = div ? synthesis_cossin_ds : synthesis_cossin_us;
> +    for (l = 0; l < 32; l++) {
> +        memmove(&v[128 >> div], v, ((1280 - 128) >> div) * sizeof(float));
> +        for (n = 0; n < 128 >> div; n++) {
> +            v[n] = dsp->scalarproduct_float(X[l][0], (synthesis_cossin+(64>>div)*n)[0], 128 >> div);
> +        }
> +        dsp->vector_fmul_add(out, v                , sbr_qmf_window               , zero64, 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 256 >> div), sbr_qmf_window + (128 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 448 >> div), sbr_qmf_window + (192 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 512 >> div), sbr_qmf_window + (256 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 704 >> div), sbr_qmf_window + (320 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 768 >> div), sbr_qmf_window + (384 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + ( 960 >> div), sbr_qmf_window + (448 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + (1024 >> div), sbr_qmf_window + (512 >> div), out   , 64 >> div);
> +        dsp->vector_fmul_add(out, v + (1216 >> div), sbr_qmf_window + (576 >> div), out   , 64 >> div);
> +        out += 64 >> div;
> +    }
> +}

I wonder if special functions for this would be beneficial.

> +/** High Frequency Generation (14496-3 sp04 p214+) and Inverse Filtering
> + * (14496-3 sp04 p214)
> + */
> +static void sbr_hf_inverse_filter(float (*alpha0)[2], float (*alpha1)[2],
> +                                  float X_low[32][40][2], int k0)
> +{
> +    int i, j, k, n;
> +    for (k = 0; k < k0; k++) {
> +        float phi[3][2][2], dk;
> +
> +        for (i = 0; i < 3; i++) {
> +            for (j = 0; j < 2; j++) {
> +                unsigned int idxtmp1 = ENVELOPE_ADJUSTMENT_OFFSET - i;
> +                unsigned int idxtmp2 = ENVELOPE_ADJUSTMENT_OFFSET - (j + 1);
> +
> +                phi[i][j][0] = 0.0f;
> +                phi[i][j][1] = 0.0f;
> +
> +                if (i <= j + 1)
> +                for (n = 0; n < 16 * 2 + 6; n++) {
> +                    unsigned int idx1 = n + idxtmp1;
> +                    unsigned int idx2 = n + idxtmp2;
> +                    phi[i][j][0] += X_low[k][idx1][0] * X_low[k][idx2][0] +
> +                                    X_low[k][idx1][1] * X_low[k][idx2][1];
> +                    if (i != j + 1) { // imaginary part
> +                        phi[i][j][1] += X_low[k][idx1][1] * X_low[k][idx2][0] -
> +                                        X_low[k][idx1][0] * X_low[k][idx2][1];
> +                    }
> +                }
> +            }
> +        }

This seems to be asking for SIMD.

> +        dk = phi[2][1][0] * phi[1][0][0] -
> +               (phi[1][1][0] * phi[1][1][0] + phi[1][1][1] * phi[1][1][1]) / 1.000001f;
> +
> +        if (!dk) {
> +            alpha1[k][0] = 0;
> +            alpha1[k][1] = 0;
> +        } else {
> +            float temp_real, temp_im;
> +            temp_real = phi[0][0][0] * phi[1][1][0] -
> +                        phi[0][0][1] * phi[1][1][1] -
> +                        phi[0][1][0] * phi[1][0][0];
> +            temp_im   = phi[0][0][0] * phi[1][1][1] +
> +                        phi[0][0][1] * phi[1][1][0] -
> +                        phi[0][1][1] * phi[1][0][0];
> +
> +            alpha1[k][0] = temp_real / dk;
> +            alpha1[k][1] = temp_im   / dk;
> +        }
> +
> +        if (!phi[1][0][0]) {
> +            alpha0[k][0] = 0;
> +            alpha0[k][1] = 0;
> +        } else {
> +            float temp_real, temp_im;
> +            temp_real = phi[0][0][0] + alpha1[k][0] * phi[1][1][0] +
> +                                       alpha1[k][1] * phi[1][1][1];
> +            temp_im   = phi[0][0][1] + alpha1[k][1] * phi[1][1][0] -
> +                                       alpha1[k][0] * phi[1][1][1];
> +
> +            alpha0[k][0] = -temp_real / phi[1][0][0];
> +            alpha0[k][1] = -temp_im   / phi[1][0][0];
> +        }
> +
> +        if (alpha1[k][0] * alpha1[k][0] + alpha1[k][1] * alpha1[k][1] >= 16.0f ||
> +           alpha0[k][0] * alpha0[k][0] + alpha0[k][1] * alpha0[k][1] >= 16.0f) {
> +            alpha1[k][0] = 0;
> +            alpha1[k][1] = 0;
> +            alpha0[k][0] = 0;
> +            alpha0[k][1] = 0;
> +        }

These calculations look a little unstable.

> +    }
> +}
> +
> +static inline int find_freq_subband(uint16_t *table, int nel, int needle)
> +{
> +    int i;
> +    for (i = 0; i < nel; i++) {
> +        if (needle >= table[i] && needle < table[i + 1])

Is there an end marker or similar to prevent the +1 overflowing the
table?  Can the bands overlap?  If not, checking the upper limit is
unnecessary.

> +            return i;
> +    }
> +    return -1;
> +}

[...]

> +/// Assembling HF Signals (14496-3 sp04 p220)
> +static void sbr_hf_assemble(float Y[2][38][64][2], float X_high[64][40][2],
> +                            SpectralBandReplication *sbr, SBRData *ch_data,
> +                            int l_a[2])
> +{
> +    int i, j, l, m;
> +    const int h_SL = sbr->bs_smoothing_mode ? 0 : 4;
> +    const float h_smooth[5] = {

static

> +        0.33333333333333,
> +        0.30150283239582,
> +        0.21816949906249,
> +        0.11516383427084,
> +        0.03183050093751,
> +    };
> +    const int8_t phi[2][4] = {

static

> +        {  1,  0, -1,  0}, // real
> +        {  0,  1,  0, -1}, // imaginary
> +    };
> +    float g_filt[42][48], q_filt[42][48], w_temp[42][48][2];
> +    float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
> +    memcpy(Y[0], Y[1], sizeof(Y[0]));
> +

[...]

> +void ff_sbr_dequant(AACContext *ac, SpectralBandReplication *sbr, int id_aac)
> +{
> +    int ch;
> +
> +    if (sbr->start) {
> +        for (ch = 0; ch < (id_aac == TYPE_CPE ? 2 : 1); ch++) {
> +        sbr_time_freq_grid(ac, sbr, &sbr->data[ch], ch);
> +        sbr_env_noise_floors(sbr, &sbr->data[ch], ch);

Indent.

> +        }
> +        sbr_dequant(sbr, id_aac);
> +    }
> +}

[...]

> +/**
> + * Spectral Band Replication per channel data
> + */
> +typedef struct {
> +    /**
> +     * @defgroup bitstream     Main bitstream data variables
> +     * @{
> +     */
> +    uint8_t            bs_frame_class;
> +    uint8_t            bs_add_harmonic_flag;
> +    uint8_t            bs_num_env[2];
> +    uint8_t            bs_freq_res[7];
> +    uint8_t            bs_var_bord[2];
> +    uint8_t            bs_num_rel[2];
> +    uint8_t            bs_rel_bord[2][3];
> +    uint8_t            bs_pointer;
> +    uint8_t            bs_num_noise;
> +    uint8_t            bs_df_env[5];
> +    uint8_t            bs_df_noise[2];
> +    uint8_t            bs_invf_mode[2][5];
> +    int32_t            bs_data_env[7][32];
> +    int32_t            bs_data_noise[2][5];
> +    uint8_t            bs_add_harmonic[48];
> +    uint8_t            bs_amp_res;
> +    /** @} */
> +
> +    /**
> +     * @defgroup state         State varaibles
> +     * @{
> +     */
> +    DECLARE_ALIGNED(16, float, synthesis_filterbank_samples)[1280];
> +    DECLARE_ALIGNED(16, float, analysis_filterbank_samples) [1312];
> +    int                l_a[2];
> +    float              bw_array[2][5];
> +    float              W[2][32][32][2];
> +    float              Y[2][38][64][2];
> +    float              g_temp[42][48];
> +    float              q_temp[42][48];
> +    uint8_t            s_indexmapped[8][48];
> +    float              env_facs[6][48];
> +    float              noise_facs[3][5];
> +    uint8_t            t_env[8];
> +    uint8_t            t_env_num_env_old;
> +    uint8_t            t_q[3];
> +    uint16_t           f_indexnoise;
> +    uint8_t            f_indexsine;
> +    /** @} */
> +} SBRData;
> +
> +/**
> + * Spectral Band Replication
> + */
> +typedef struct {
> +    uint8_t            start;
> +    int32_t            sample_rate;
> +    uint8_t            bs_amp_res_header;
> +    SpectrumParameters spectrum_params[2];
> +    /**
> +     * @defgroup bs_header_extra_2     variables associated with bs_header_extra_2
> +     * @{
> +     */
> +    uint8_t            bs_limiter_bands;
> +    uint8_t            bs_limiter_gains;
> +    uint8_t            bs_interpol_freq;
> +    uint8_t            bs_smoothing_mode;
> +    /** @} */
> +    uint8_t            bs_coupling;
> +    SBRData            data[2];
> +    uint8_t            reset;
> +    uint8_t            k[5]; ///< k0, k1, k2, kx', and kx respectively
> +    uint8_t            m[2]; ///< M' and M respectively
> +    uint8_t            n_master;
> +    uint8_t            n[2]; ///< n_low and n_high respectively
> +    uint8_t            n_q;
> +    uint8_t            n_lim;
> +    uint16_t           f_master[49];
> +    uint16_t           f_tablelow[25];
> +    uint16_t           f_tablehigh[49];
> +    uint16_t           f_tablenoise[6];
> +    uint16_t           f_tablelim[29];
> +    uint8_t            num_patches;
> +    uint8_t            patch_num_subbands[5];
> +    uint8_t            patch_start_subband[5];
> +    float              X_low[32][40][2];
> +    float              X_high[64][40][2];
> +    DECLARE_ALIGNED(16, float, X)[32][64][2];
> +    float              alpha0[64][2];
> +    float              alpha1[64][2];
> +    float              e_origmapped[7][48];
> +    float              q_mapped[7][48];
> +    uint8_t            s_mapped[7][48];
> +    float              e_curr[7][48];
> +    float              q_m[7][48];
> +    float              s_m[7][48];
> +    float              gain[7][48];
> +    DECLARE_ALIGNED(16, float, ana_filter_scratch)[64];
> +} SpectralBandReplication;

Have you tried to arrange the struct members to minimise padding
(doesn't look like it) and maximise locality?  Usually anything larger
than a cache line can be put towards the end of the struct, allowing
the smaller items to share cache lines.

> +#endif /* AVCODEC_AACSBR_H */

> +static const DECLARE_ALIGNED_16(float, sbr_qmf_window_us)[640] = {
> +     0.0000000000, -0.0005525286, -0.0005617692, -0.0004947518,
> +    -0.0004875227, -0.0004893791, -0.0005040714, -0.0005226564,
> +    -0.0005466565, -0.0005677802, -0.0005870930, -0.0006132747,
> +    -0.0006312493, -0.0006540333, -0.0006777690, -0.0006941614,
> +    -0.0007157736, -0.0007255043, -0.0007440941, -0.0007490598,
> +    -0.0007681371, -0.0007724848, -0.0007834332, -0.0007779869,
> +    -0.0007803664, -0.0007801449, -0.0007757977, -0.0007630793,
> +    -0.0007530001, -0.0007319357, -0.0007215391, -0.0006917937,
> +    -0.0006650415, -0.0006341594, -0.0005946118, -0.0005564576,
> +    -0.0005145572, -0.0004606325, -0.0004095121, -0.0003501175,
> +    -0.0002896981, -0.0002098337, -0.0001446380, -0.0000617334,
> +     0.0000134949,  0.0001094383,  0.0002043017,  0.0002949531,
> +     0.0004026540,  0.0005107388,  0.0006239376,  0.0007458025,
> +     0.0008608443,  0.0009885988,  0.0011250155,  0.0012577884,

Where do these numbers come from?  The way they're written here some
have 10 significant digits, others only 6.  Is that intentional?

Are there any patterns in the tables that could be exploited?

-- 
M?ns Rullg?rd
mans at mansr.com



More information about the ffmpeg-devel mailing list