[FFmpeg-devel] [PATCH] ALS decoder

Fri Aug 21 01:40:56 CEST 2009

Hi,

>> +    uint32_t     header_size;            ///< header size of original audio file in bytes, provided for debugging
>> +    uint32_t     trailer_size;           ///< Trailer size of original audio file in bytes, provided for debugging
>> +    uint32_t     crc;                    ///< 32-bit CCITT-32 CRC checksum
>> +} ALSSpecificConfig;
> 
> Please try not to make *every* line exceed 80 columns.  You could
> easily shorten them here by reducing the amount of whitespace between
> the type and name and before the comment.
> 
>> +    int64_t           *prev_raw_samples; ///< contains unshifted raw samples from the previous block
>> +    int64_t           **raw_samples;     ///< decoded raw samples for each channel
>> +    int64_t           *raw_buffer;       ///< contains all decoded raw samples including carryover samples
>> +} ALSDecContext;
> 
> Ditto.
ok, reduced whitespaces.

> 
>> +/** Computes ceil(log2(x)) using av_log2.
>> + */
>> +static inline int ceil_log2(int x) {
>> +    return x > 0 ? av_log2((x - 1) << 1) : 0;
>> +}
> 
> Little functions like this are likely to be needed again some time.
> They should be placed in a common location.
Moved to libavutil/common.h. Please review my bad doxy wording.

> 
>> +/** Reads an ALSSpecificConfig from a buffer into the output struct.
>> + */
>> +static av_cold int read_specific_config(ALSDecContext *ctx)
>> +{
>> +    GetBitContext gb;
>> +    uint64_t ht_size;
>> +    int i, config_offset;
>> +    MPEG4AudioConfig m4ac;
>> +    ALSSpecificConfig *sconf = &ctx->sconf;
>> +    const uint8_t *buffer    = ctx->avctx->extradata;
>> +    int buffer_size          = ctx->avctx->extradata_size;
>> +
>> +    init_get_bits(&gb, buffer, buffer_size * 8);
>> +
>> +    config_offset = ff_mpeg4audio_get_config(&m4ac, buffer, buffer_size);
>> +
>> +    if (config_offset < 0)
>> +        return -1;
>> +
>> +    skip_bits_long(&gb, config_offset);
>> +    buffer_size -= config_offset >> 3;
>> +
>> +    if (buffer_size < 22)
>> +        return -1;
>> +
>> +    // read the fixed items
>> +    sconf->als_id               = get_bits_long(&gb, 32);
>> +    sconf->samp_freq            = m4ac.sample_rate;
>> +    skip_bits_long(&gb, 32);
> 
> Maybe comment what is being skipped here and elsewhere.
Ok, I thought it is obvious. Done.

> 
>> +    sconf->samples              = get_bits_long(&gb, 32);
>> +    sconf->channels             = m4ac.channels;
>> +    skip_bits(&gb, 16);
>> +    sconf->file_type            = get_bits(&gb, 3);
>> +    sconf->resolution           = get_bits(&gb, 3);
>> +    sconf->floating             = get_bits1(&gb);
>> +    sconf->msb_first            = get_bits1(&gb);
>> +    sconf->frame_length         = get_bits(&gb, 16) + 1;
>> +    sconf->random_access        = get_bits(&gb, 8);
>> +    sconf->ra_flag              = get_bits(&gb, 2);
>> +    sconf->adapt_order          = get_bits1(&gb);
>> +    sconf->coef_table           = get_bits(&gb, 2);
>> +    sconf->long_term_prediction = get_bits1(&gb);
>> +    sconf->max_order            = get_bits(&gb, 10);
>> +    sconf->block_switching      = get_bits(&gb, 2);
>> +    sconf->bgmc_mode            = get_bits1(&gb);
>> +    sconf->sb_part              = get_bits1(&gb);
>> +    sconf->joint_stereo         = get_bits1(&gb);
>> +    sconf->mc_coding            = get_bits1(&gb);
>> +    sconf->chan_config          = get_bits1(&gb);
>> +    sconf->chan_sort            = get_bits1(&gb);
>> +    sconf->crc_enabled          = get_bits1(&gb);
>> +    sconf->rlslms               = get_bits1(&gb);
>> +    skip_bits(&gb, 5);                                      // skip 5 reserved bits
> 
> Please lose some of that whitespace.  Compliments on the alignment
> though.  Diego will love you.
I did this above, but here, not even the 80-characters reason exist.
Why give up readability?

>> +
>> +        for (i = 0; i < sconf->channels; i++) {
>> +            sconf->chan_pos[i] = get_bits(&gb, chan_pos_bits);
>> +        }
> 
> We usually omit {} for single-line for/if/while statements...
I usually do that, too... done.

> 
>> +        align_get_bits(&gb);
>> +        buffer_size -= bytes_needed;
>> +        // TODO: use this to actually do channel sorting
>> +    } else {
>> +        sconf->chan_sort = 0;
>> +    }
> 
> ... except for one-line else clauses after a multiline if.
Yes.

>> +/** Reads and decodes a Rice codeword.
>> + */
>> +static int64_t decode_rice(GetBitContext *gb, unsigned int k)
>> +{
>> +    int64_t value = 0;
>> +    int64_t q = 0;
>> +    int     max = gb->size_in_bits - get_bits_count(gb) - k;
>> +
>> +    if (!k) {
>> +        q = get_unary(gb, 0, max);
>> +        return (q & 1) ? -((q + 1) >> 1) : ((q + 1) >> 1);
>> +    } else if (k == 1) {
>> +        q = get_unary(gb, 0, max);
>> +        return get_bits1(gb) ? q : -(q + 1);
>> +    } else {
>> +        unsigned int r, sub_sign;
>> +
>> +        q         = get_unary(gb, 0, max);
>> +        sub_sign  = get_bits1(gb);
>> +        r         = get_bits_long(gb, k - 1);
>> +
>> +        value = (q << (k - 1)) + r;
>> +
>> +        return sub_sign ? value : -(value + 1);
>> +    }
>> +}
> 
> This function looks like it was designed specifically to make gcc fail:
> 
> 1.  64-bit variables used unnecessarily.
> 2.  GCC hates complex conditionals.
> 3.  Compilers in general are bad at bit-hacks.
> 
> It seems to me that int is enough for 'q' in the first two cases.
> get_unary() returns int, so we start with at most that many bits.  To
> avoid overflow in q+1, do this instead:
> 
>     int r = q & 1;
>     return r ? -((q >> 1) + r) : ((q >> 1) + r);
> 
> Furthermore, gcc does stupid things with that code even with plain int
> variables.  Rewriting in a few more steps helps massively:
> 
>     int r = q & 1;
>     q = (q >> 1) + r;
>     return r ? -q : q;
> 
> This reduces the number of instructions on ARM from 6 to 4 (from 4 to
> 3 with armcc).  On MIPS it goes from 9 with a branch to 5 branch-free.
> 
> Finally, -(v + 1) is equivalent to ~v on two's complement machines,
> which we have previously agreed to assume we have.  Hence we should
> write the return values in the second and third cases like this:
> 
>     return sign ? value : ~value;
> 
> This prevents 32-bit overflow in the k==1 case and gives better code
> with several compilers in the final case, where we must resort to
> 64-bit maths.
Tricky. Works like a charm. Done.

> 
>> +/** Converts PARCOR coefficient k to direct filter coefficient.
>> + */
>> +static void parcor_to_lpc(unsigned int k, int64_t *par, int64_t *cof)
>> +{
>> +    int i;
>> +    int64_t tmp1, tmp2;
>> +
>> +    for (i = 0; i < (k+1) >> 1; i++) {
>> +        tmp1 = cof[    i    ] + ((par[k] * cof[k - i - 1] + (1 << 19)) >> 20);
>> +        tmp2 = cof[k - i - 1] + ((par[k] * cof[    i    ] + (1 << 19)) >> 20);
>> +        cof[k - i - 1] = tmp2;
>> +        cof[    i    ] = tmp1;
>> +    }
>> +
>> +    cof[k] = par[k];
>> +}
> 
> This is again a perfect trap for gcc.  64-bit maths in tight loops is
> something it is *really* bad at.  Is there no way to avoid 64-bit
> elements here?  Is perhaps the range usually smaller, so we could
> choose a 32-bit version most of the time and fall back on 64-bit only
> when required?  Regardless of compiler, 64-bit multiplication is
> expensive on any 32-bit machine.
I think the usual case uses 64 bit here, but I'm not sure. Maybe Justin
can give a comment on this?

>> +/** Reads the block data.
>> + */
>> +static int read_block_data(ALSDecContext *ctx, unsigned int ra_block,
>> +                            int64_t *raw_samples, unsigned int block_length,
>> +                            unsigned int *js_blocks, int64_t *raw_other)
>> +{
>> ...
> The block nesting is very deep here.  Perhaps splitting this large
> function into several smaller ones would help readability.  Even gcc
> should manage to inline them.
I agree, done.

> 
>> +        if (sconf->long_term_prediction) {
>> +            // TODO: LTP mode
>> +        }
>> +
>> +        start = 0;
>> ...
>> +        } else {
>> +            int store_prev_samples = (*js_blocks && raw_other) || shift_lsbs;
>> +
>> +            // store previous smaples in case that they have to be altered
>> +            if (store_prev_samples)
>> +                memcpy(ctx->prev_raw_samples, raw_samples - sconf->max_order,
>> +                       sizeof(int64_t) * sconf->max_order);
>> +
>> +            // reconstruct difference signal for prediction (joint-stereo)
>> +            if (*js_blocks && raw_other) {
>> +                int i;
>> +                if (raw_other > raw_samples) {          // D = R - L
>> +                    for (i = -1; i >= -sconf->max_order; i--)
>> +                        raw_samples[i] = raw_other[i] - raw_samples[i];
>> +                } else {                                // D = R - L
> 
> Are those two comments meant to be the same?
Yes but semantically different. This is meant to be that way.
It shows, which channel is represented by which variable.

>> ...
>> +
>> +    if (shift_lsbs) {
>> +        for (k = 0; k < block_length; k++)
>> +            raw_samples[k] <<= shift_lsbs;
>> +    }
>> +
>> +    return 0;
>> +}
> 
> Many of the loops above look like they could be easily simdified.  I
> don't know how much time is spent in each, so I haven't thought about
> it in detail.  Breaking the function apart would also help in finding
> where most time is spent.
I don't think so, at least in terms of merging several of them.
Otherwise or if you thought about another optimization, please give an
example.

> 
>> +
>> +/** Reads the frame data.
>> + */
>> +static int read_frame_data(ALSDecContext *ctx, unsigned int ra_frame)
>> +{
>> ...
> 
> Again, very deep nesting makes code hard to read.  Consider making
> some blocks separate functions.
Done.

>> ...
> 
> Maybe you already tried to fix it, but there's a scary number of
> memcpy/memmove calls above.
All of them are really necessary.

>> +
>> +    // transform decoded frame into output format
>> +    #define INTERLEAVE_OUTPUT(bps)                                                 \
> 
> That's an awful lot of whitespace.  Please cut it back so the lines
> fit in 80 columns, which will be easily possible when you address my
> next comment.
> 
>> +    {                                                                              \
>> +        int##bps##_t *dest = (int##bps##_t*) data;                                 \
>> +        shift = bps - ctx->avctx->bits_per_raw_sample;                             \
>> +        for (sample = 0; sample < ctx->cur_frame_length; sample++) {               \
>> +            for (c = 0; c < sconf->channels; c++) {                                \
>> +                *(dest++) = (int##bps##_t) (ctx->raw_samples[c][sample] << shift); \
> 
> Useless parens: *dest++ is good.  Useless cast => more useless parens.
Yes, done.

>> +    // allocate previous raw sample buffer
>> +    if (!(ctx->prev_raw_samples = av_malloc(sizeof(int64_t) * sconf->max_order))) {
>> +        av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
>> +        decode_end(avctx);
>> +        return AVERROR(ENOMEM);
>> +    }
>> +
>> +    // allocate raw and carried sample buffer
>> +    if (!(ctx->raw_buffer = av_mallocz(sizeof(int64_t) *
>> +                                       avctx->channels * channel_size))) {
>> +        av_log(avctx, AV_LOG_ERROR, "Allocating buffer memory failed.\n");
>> +        decode_end(avctx);
>> +        return AVERROR(ENOMEM);
>> +    }
>> +
>> +    // allocate raw sample array buffer
>> +    if (!(ctx->raw_samples = av_malloc(sizeof(int64_t*) * avctx->channels))) {
>> +        av_log(avctx, AV_LOG_ERROR, "Allocating buffer array failed.\n");
>> +        decode_end(avctx);
>> +        return AVERROR(ENOMEM);
>> +    }
> 
> This looks very repetitive...
Thus.... it is good/bad/crazy?
The log message depends on the buffer allocated, the size of the buffer
is never the same...

> 
>> +    // allocate raw and carried samples buffers
>> +    ctx->raw_samples[0] = ctx->raw_buffer + sconf->max_order;
>> +    for (c = 1; c < avctx->channels; c++) {
>> ...
>> +
>> Index: libavcodec/als_data.h
>> ===================================================================
>> --- libavcodec/als_data.h	(revision 0)
>> +++ libavcodec/als_data.h	(revision 0)
>> @@ -0,0 +1,147 @@
>> +/*
>> + * ALS header file for common data
>> + * Copyright (c) 2009 Thilo Borgmann <thilo.borgmann _at_ googlemail.com>
>> ...
>> +#ifndef AVCODEC_ALS_DATA_H
>> +#define AVCODEC_ALS_DATA_H
>> +
>> +
>> +#include <stdint.h>
>> +
>> +/** Rice parameters and corresponding index offsets for decoding the
>> + *  indices of scaled PARCOR values. The table choosen is set globally
>> + *  by the encoder and stored in ALSSpecificConfig.
>> + */
>> +int8_t parcor_rice_table[3][20][2] = {
>> +                        {
>> +                        {-52, 4},
> 
> WTF happened to the indentation here?
> 
>> +
>> +/** Scaled PARCOR values used for the first two PARCOR coefficients.
>> + *  To be indexed by the Rice coded indices.
>> + *  Generated by: parcor_scaled_values[i] = 32 + ((i * (i+1)) << 7) - (1 << 20)
>> + */
>> +int32_t parcor_scaled_values[] = {-1048544, -1048288, -1047776, -1047008,
>> +                                  -1045984, -1044704, -1043168, -1041376,
> 
> And here.
Would you like block- or function-like indents?
No kidding - I ask that before it goes back and forth and I don't care
which one...
(Or are there table-like indention rules I've missed?)

Revision 4 attached.

Thanks for your review!

-Thilo
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: als_decoder.rev4.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090821/3d2679cb/attachment.asc>