[FFmpeg-devel] [PATCH 1/4] avcodec: add siren audio decoder

Lynne dev at lynne.ee
Thu May 16 13:51:46 EEST 2019


May 16, 2019, 10:43 AM by onemda at gmail.com:

> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>  libavcodec/Makefile     |   1 +
>  libavcodec/allcodecs.c  |   1 +
>  libavcodec/avcodec.h    |   1 +
>  libavcodec/codec_desc.c |   7 +
>  libavcodec/siren.c      | 724 ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 734 insertions(+)
>  create mode 100644 libavcodec/siren.c
>
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index edccd73037..b2bc61650c 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -578,6 +578,7 @@ OBJS-$(CONFIG_SIPR_DECODER)            += sipr.o acelp_pitch_delay.o \
>  celp_math.o acelp_vectors.o \
>  acelp_filters.o celp_filters.o \
>  sipr16k.o
> +OBJS-$(CONFIG_SIREN_DECODER)           += siren.o
>  OBJS-$(CONFIG_SMACKAUD_DECODER)        += smacker.o
>  OBJS-$(CONFIG_SMACKER_DECODER)         += smacker.o
>  OBJS-$(CONFIG_SMC_DECODER)             += smc.o
> diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> index 6178d31b5c..8c273a490e 100644
> --- a/libavcodec/allcodecs.c
> +++ b/libavcodec/allcodecs.c
> @@ -468,6 +468,7 @@ extern AVCodec ff_sbc_encoder;
>  extern AVCodec ff_sbc_decoder;
>  extern AVCodec ff_shorten_decoder;
>  extern AVCodec ff_sipr_decoder;
> +extern AVCodec ff_siren_decoder;
>  extern AVCodec ff_smackaud_decoder;
>  extern AVCodec ff_sonic_encoder;
>  extern AVCodec ff_sonic_decoder;
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index b749946633..6a382207c3 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -651,6 +651,7 @@ enum AVCodecID {
>  AV_CODEC_ID_SBC,
>  AV_CODEC_ID_ATRAC9,
>  AV_CODEC_ID_HCOM,
> +    AV_CODEC_ID_SIREN,
>  
>  /* subtitle codecs */
>  AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 621b16e160..a3139458f5 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -2978,6 +2978,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
>  .long_name = NULL_IF_CONFIG_SMALL("HCOM Audio"),
>  .props     = AV_CODEC_PROP_LOSSY,
>  },
> +    {
> +        .id        = AV_CODEC_ID_SIREN,
> +        .type      = AVMEDIA_TYPE_AUDIO,
> +        .name      = "siren",
> +        .long_name = NULL_IF_CONFIG_SMALL("Siren"),
> +        .props     = AV_CODEC_PROP_LOSSY,
> +    },
>  
>  /* subtitle codecs */
>  {
> diff --git a/libavcodec/siren.c b/libavcodec/siren.c
> new file mode 100644
> index 0000000000..f9e9897c6b
> --- /dev/null
> +++ b/libavcodec/siren.c
> @@ -0,0 +1,724 @@
> +/*
> + * Siren audio decoder
> + * Copyright (c) 2012 Youness Alaoui <kakaroto at kakaroto.homelinux.net>
> + * Copyright (c) 2018 Paul B Mahol
> + * Copyright (c) 2019 Lynne <dev at lynne.ee>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/tx.h"
> +#include "libavutil/float_dsp.h"
> +
> +#include "avcodec.h"
> +#include "get_bits.h"
> +#include "internal.h"
> +#include "mathops.h"
> +
> +#define STEPSIZE    0.3010299957
>

Just hardcode this in the powf call.



> +
> +static const uint16_t checksum_table[4] = { 0x7F80, 0x7878, 0x6666, 0x5555 };
> +static const uint8_t index_table[8] = {4, 4, 3, 3, 2, 2, 1, 0};
> +static const uint8_t vector_dimension[8] = { 2, 2, 2, 4, 4, 5, 5, 1 };
> +static const uint8_t number_of_vectors[8] = { 10, 10, 10, 5, 5, 4, 4, 20 };
> +static const uint8_t expected_bits_table[8] = { 52, 47, 43, 37, 29, 22, 16, 0 };
> +static const int8_t differential_decoder_tree[27][24][2] = {
> +  {{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, -12}, {-11, -10}, {-8, -9}, {-7, -6}, {-13, 12}, {-5, -4}, {0, 13}, {-3, -14}, {-2, 14}, {-1, 15}, {-15, 16}, {-16, 17}, {-17, 18}, {19, 20}, {21, 22}, {-18, -19}, {-20, -21}, {-22, -23}, {-32, -32}},
>

Tidy this table up? It's 240 chars long.



> +
> +static const float noise_category7 = 0.70711f;
>

Hardcode this as well.



> +
> +typedef struct SirenContext {
> +    GetBitContext gb;
> +
> +    int packet_size;
> +    int number_of_coefs;
> +    int rate_control_bits;
> +    int rate_control_possibilities;
> +    int checksum_bits;
> +    int esf_adjustment;
> +    int number_of_regions;
> +    int scale_factor;
> +    int sample_rate_bits;
> +    int bits_per_frame;
> +    int region_size;
> +
> +    int dw1, dw2, dw3, dw4;
> +
> +    int absolute_region_power_index[28];
> +    float decoder_standard_deviation[28];
> +    int power_categories[28];
> +    int category_balance[28];
> +    float standard_deviation[64];
> +    float deviation_inverse[64];
> +    int input_frame[20];
> +    float backup_frame[320];
> +    float coefs[320];
> +
> +    AVFloatDSPContext *fdsp;
> +    av_tx_fn           tx_fn;
> +    AVTXContext       *tx_ctx;
> +
> +    DECLARE_ALIGNED(32, float, context)[320];
> +    DECLARE_ALIGNED(32, float, temp)[320];
> +    DECLARE_ALIGNED(32, float, tx_in)[320];
> +    DECLARE_ALIGNED(32, float, output_frame)[320];
> +    DECLARE_ALIGNED(32, float, window)[320];
>

You only need context, temp and window.
context doesn't need 320 floats; it only uses 160. Also, you should rename it to
prev_win.



> +} SirenContext;
> +
> +static av_cold int siren_init(AVCodecContext *avctx)
> +{
> +    const float scale = 1.0f;
>
>
This should be: const float scale = 1.0 / 32768;



> +    SirenContext *s = avctx->priv_data;
> +    int i;
> +
> +    avctx->channels       = 1;
> +    avctx->channel_layout = AV_CH_LAYOUT_MONO;
> +    avctx->sample_fmt     = AV_SAMPLE_FMT_S16;
>

Use AV_SAMPLE_FMT_FLT instead.



> +
> +    s->packet_size = 40;
> +    s->number_of_coefs = 320;
> +    s->rate_control_bits = 4;
> +    s->rate_control_possibilities = 16;
> +    s->checksum_bits = 0;
> +    s->esf_adjustment = 7;
> +    s->number_of_regions = 14;
> +    s->scale_factor = 1;
> +    s->bits_per_frame = avctx->sample_rate / 50;
> +    s->region_size = 20;
> +    s->dw1 = s->dw2 = s->dw3 = s->dw4 = 1;
> +
> +    for (i = 0; i < 64; i++) {
> +        float region_power = powf(10, (i - 24) * STEPSIZE);
> +
> +        s->standard_deviation[i] = sqrtf(region_power);
> +        s->deviation_inverse[i] = 1.f / s->standard_deviation[i];
>

Nit: 1.0 instead of 1.f



> +    }
> +
> +    for (i = 0; i < 320; i++) {
> +        float angle = ((i + 0.5f) * M_PI_2) / 320.f;
> +        s->window[i] = sinf(angle);
> +    }
> +
> +    s->fdsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
> +    if (!s->fdsp)
> +        return AVERROR(ENOMEM);
> +
> +    return av_tx_init(&s->tx_ctx, &s->tx_fn, AV_TX_FLOAT_MDCT, 1, 320, &scale, 0);
> +}
> +
> +static int decode_envelope(SirenContext *s, GetBitContext *gb,
> +                           int number_of_regions, float *decoder_standard_deviation,
> +                           int *absolute_region_power_index, int esf_adjustment)
> +{
> +    int i, index;
> +
> +    absolute_region_power_index[0] = get_bits(gb, 5) - esf_adjustment;
> +    decoder_standard_deviation[0] =
> +        s->standard_deviation[absolute_region_power_index[0] + 24];
> +
> +    for (i = 1; i < number_of_regions; i++) {
> +        index = 0;
> +        do {
> +            index = differential_decoder_tree[i - 1][index][get_bits1(gb)];
> +        } while (index > 0);
> +
> +        absolute_region_power_index[i] =
> +            absolute_region_power_index[i - 1] - index - 12;
> +        decoder_standard_deviation[i] =
> +            s->standard_deviation[absolute_region_power_index[i] + 24];
> +    }
> +
> +    return get_bits_count(gb);
> +}
> +
> +static int categorize_regions(int number_of_regions, int number_of_available_bits,
> +                              int *absolute_region_power_index, int *power_categories,
> +                              int *category_balance)
> +{
> +    int region, delta, i, temp;
> +    int expected_number_of_code_bits;
> +    int min, max;
> +    int offset,
> +        num_rate_control_possibilities,
> +        raw_value, raw_max_idx = 0, raw_min_idx = 0;
> +    int max_rate_categories[28];
> +    int min_rate_categories[28];
> +    int temp_category_balances[64];
> +    int *min_rate_ptr = NULL;
> +    int *max_rate_ptr = NULL;
> +
> +    if (number_of_regions == 14) {
> +        num_rate_control_possibilities = 16;
> +    } else {
> +        num_rate_control_possibilities = 32;
> +    }
> +
> +    offset = -32;
> +    for (delta = 32; number_of_regions > 0 && delta > 0; delta /= 2) {
> +        expected_number_of_code_bits = 0;
> +        for (region = 0; region < number_of_regions; region++) {
> +            i = (delta + offset -
> +                 absolute_region_power_index[region]) >> 1;
> +            i = av_clip_uintp2(i, 3);
> +            power_categories[region] = i;
> +            expected_number_of_code_bits += expected_bits_table[i];
> +
> +        }
> +        if (expected_number_of_code_bits >= number_of_available_bits - 32)
> +            offset += delta;
> +    }
> +
> +    expected_number_of_code_bits = 0;
> +    for (region = 0; region < number_of_regions; region++) {
> +        i = (offset - absolute_region_power_index[region]) >> 1;
> +        i = av_clip_uintp2(i, 3);
> +        max_rate_categories[region] = min_rate_categories[region] =
> +            power_categories[region] = i;
> +        expected_number_of_code_bits += expected_bits_table[i];
> +    }
> +
> +    min = max = expected_number_of_code_bits;
> +    min_rate_ptr = max_rate_ptr =
> +        temp_category_balances + num_rate_control_possibilities;
> +    for (i = 0; i < num_rate_control_possibilities - 1; i++) {
> +        if (min + max > number_of_available_bits * 2) {
> +            raw_value = -99;
> +            for (region = number_of_regions - 1; region >= 0; region--) {
> +                if (min_rate_categories[region] < 7) {
> +                    temp =
> +                        offset - absolute_region_power_index[region] -
> +                        2 * min_rate_categories[region];
> +                    if (temp > raw_value) {
> +                        raw_value = temp;
> +                        raw_min_idx = region;
> +                    }
> +                }
> +            }
> +            *min_rate_ptr++ = raw_min_idx;
> +            min +=
> +                expected_bits_table[min_rate_categories[raw_min_idx] + 1] -
> +                expected_bits_table[min_rate_categories[raw_min_idx]];
> +            min_rate_categories[raw_min_idx]++;
> +        } else {
> +            raw_value = 99;
> +            for (region = 0; region < number_of_regions; region++) {
> +                if (max_rate_categories[region] > 0) {
> +                    temp =
> +                        offset - absolute_region_power_index[region] -
> +                        2 * max_rate_categories[region];
> +                    if (temp < raw_value) {
> +                        raw_value = temp;
> +                        raw_max_idx = region;
> +                    }
> +                }
> +            }
> +
> +            *--max_rate_ptr = raw_max_idx;
> +            max += expected_bits_table[max_rate_categories[raw_max_idx] - 1] -
> +                   expected_bits_table[max_rate_categories[raw_max_idx]];
> +            max_rate_categories[raw_max_idx]--;
> +        }
> +    }
> +
> +    for (region = 0; region < number_of_regions; region++)
> +        power_categories[region] = max_rate_categories[region];
> +
> +    for (i = 0; i < num_rate_control_possibilities - 1; i++)
> +        category_balance[i] = *max_rate_ptr++;
> +
> +    return 0;
> +}
> +
> +static int get_dw(SirenContext *s)
> +{
> +    int ret = s->dw1 + s->dw4;
> +
> +    if ((ret & 0x8000) != 0)
> +        ret++;
> +
> +    s->dw1 = s->dw2;
> +    s->dw2 = s->dw3;
> +    s->dw3 = s->dw4;
> +    s->dw4 = ret;
>

You should add a flush function that resets dw1..dw4 to 1 and zeroes prev_win, so that seeking works correctly.



> +
> +    return ret;
> +}
> +
> +static int decode_vector(SirenContext *s, int number_of_regions,
> +                         int number_of_available_bits, float *decoder_standard_deviation,
> +                         int *power_categories, float *coefs, int scale_factor)
> +{
> +    GetBitContext *gb = &s->gb;
> +    float *coefs_ptr;
> +    float decoded_value;
> +    float noise;
> +    const uint16_t *decoder_tree;
> +    int region;
> +    int category;
> +    int i, j;
> +    int index;
> +    int error;
> +    int dw1;
> +    int dw2;
> +
> +    error = 0;
> +    for (region = 0; region < number_of_regions; region++) {
> +        category = power_categories[region];
> +        coefs_ptr = coefs + (region * s->region_size);
> +
> +        if (category < 7) {
> +            decoder_tree = decoder_tables[category];
> +
> +            for (i = 0; i < number_of_vectors[category]; i++) {
> +                index = 0;
> +                do {
> +                    if (get_bits_left(gb) <= 0) {
> +                        error = 1;
> +                        break;
> +                    }
> +
> +                    index = decoder_tree[index + get_bits1(gb)];
> +                } while ((index & 1) == 0);
> +
> +                index >>= 1;
> +
> +                if (error == 0 && get_bits_left(gb) >= 0) {
> +                    for (j = 0; j < vector_dimension[category]; j++) {
> +                        decoded_value = mlt_quant[category][index & ((1 << index_table[category]) - 1)];
> +                        index >>= index_table[category];
> +
> +                        if (decoded_value != 0) {
> +                            if (!get_bits1(gb))
> +                                decoded_value *= -decoder_standard_deviation[region];
> +                            else
> +                                decoded_value *= decoder_standard_deviation[region];
> +                        }
> +
> +                        *coefs_ptr++ = decoded_value * scale_factor;
> +                    }
> +                } else {
> +                    error = 1;
> +                    break;
> +                }
> +            }
> +
> +            if (error == 1) {
> +                for (j = region + 1; j < number_of_regions; j++)
> +                    power_categories[j] = 7;
> +                category = 7;
> +            }
> +        }
> +
> +        coefs_ptr = coefs + (region * s->region_size);
> +
> +        if (category == 5) {
> +            i = 0;
> +            for (j = 0; j < s->region_size; j++) {
> +                if (*coefs_ptr != 0) {
> +                    i++;
> +                    if (fabs(*coefs_ptr) >
> +                        2.0 * decoder_standard_deviation[region]) {
> +                        i += 3;
> +                    }
> +                }
> +                coefs_ptr++;
> +            }
> +
> +            noise =
> +                decoder_standard_deviation[region] * noise_category5[i];
> +        } else if (category == 6) {
> +            i = 0;
> +            for (j = 0; j < s->region_size; j++) {
> +                if (*coefs_ptr++ != 0)
> +                    i++;
> +            }
> +
> +            noise =
> +                decoder_standard_deviation[region] * noise_category6[i];
> +        } else if (category == 7) {
> +            noise = decoder_standard_deviation[region] * noise_category7;
> +        } else {
> +            noise = 0;
> +        }
> +
> +        coefs_ptr = coefs + (region * s->region_size);
> +
> +        if (category == 5 || category == 6 || category == 7) {
> +            dw1 = get_dw(s);
> +            dw2 = get_dw(s);
> +
> +            for (j = 0; j < 10; j++) {
> +                if (category == 7 || *coefs_ptr == 0) {
> +                    if ((dw1 & 1))
> +                        *coefs_ptr = noise;
> +                    else
> +                        *coefs_ptr = -noise;
> +                }
> +                coefs_ptr++;
> +                dw1 >>= 1;
> +
> +                if (category == 7 || *coefs_ptr == 0) {
> +                    if ((dw2 & 1))
> +                        *coefs_ptr = noise;
> +                    else
> +                        *coefs_ptr = -noise;
> +                }
> +                coefs_ptr++;
> +                dw2 >>= 1;
> +            }
> +        }
> +    }
> +
> +    return error == 1 ? -1 : get_bits_left(gb);
> +}
> +
> +static int decode_samples(SirenContext *s, float *coefs, float *old_win, int dct_length, float *samples)
> +{
> +    s->tx_fn(s->tx_ctx, s->temp, coefs, sizeof(float));
> +
> +    s->fdsp->vector_fmul_window(samples, old_win, s->temp,
> +                                s->window, dct_length >> 1);
> +
> +    memcpy(old_win, s->temp + (dct_length >> 1), sizeof(float)*dct_length >> 1);
> +
> +    return 1;
> +}
> +
> +static int siren_decode(AVCodecContext *avctx, void *data,
> +                        int *got_frame, AVPacket *pkt)
> +{
> +    SirenContext *s = avctx->priv_data;
> +    GetBitContext *gb = &s->gb;
> +    AVFrame *frame = data;
> +    int ret, number_of_valid_coefs = 20 * s->number_of_regions;
> +    int frame_error = 0, i, rate_control = 0;
> +    int checksum, calculated_checksum;
> +
> +    if (s->checksum_bits > 0)
> +        memcpy(s->input_frame, pkt->data, FFMIN(s->packet_size, sizeof(s->input_frame)));
> +
> +    if ((ret = init_get_bits8(gb, pkt->data, pkt->size)) < 0)
> +        return ret;
> +
> +    decode_envelope(s, gb, s->number_of_regions,
> +                    s->decoder_standard_deviation,
> +                    s->absolute_region_power_index, s->esf_adjustment);
> +
> +    rate_control = get_bits(gb, s->rate_control_bits);
> +
> +    categorize_regions(s->number_of_regions, get_bits_left(gb),
> +                       s->absolute_region_power_index, s->power_categories,
> +                       s->category_balance);
> +
> +    for (i = 0; i < rate_control; i++) {
> +        s->power_categories[s->category_balance[i]]++;
> +    }
> +
> +    decode_vector(s, s->number_of_regions, get_bits_left(gb),
> +                  s->decoder_standard_deviation, s->power_categories,
> +                  s->coefs, s->scale_factor);
> +
> +    if (get_bits_left(gb) > 0) {
> +        for (i = 0; i < get_bits_left(gb); i++) {
> +            if (!get_bits1(gb))
> +                frame_error = 1;
> +        }
> +    } else if (get_bits_left(gb) < 0 &&
> +               rate_control + 1 < s->rate_control_possibilities) {
> +        frame_error |= 2;
> +    }
> +
> +    for (i = 0; i < s->number_of_regions; i++) {
> +        if (s->absolute_region_power_index[i] > 33 ||
> +            s->absolute_region_power_index[i] < -31)
> +            frame_error |= 4;
> +    }
> +
> +    if (s->checksum_bits > 0) {
>

This is dead code: s->checksum_bits is always 0, as set by the init function.
You should probably remove it, or maybe make it warn if enabled with a flag.
The checksum algorithm is either 16-bit ANSI or CCITT.
If you remove the checksum code, you can also remove s->packet_size,
s->checksum_bits and s->input_frame, and clean up siren_decode().



> +        int bytes_per_frame = s->bits_per_frame >> 4;
> +        int idx = 0, sum = 0;
> +
> +        checksum = s->input_frame[bytes_per_frame - 1] & ((1 << s->checksum_bits) - 1);
> +        s->input_frame[bytes_per_frame - 1] &= ~checksum;
> +        do {
> +            sum ^= (s->input_frame[idx] & 0xFFFF) << (idx % 15);
> +        } while (++idx < bytes_per_frame);
> +
> +        sum = (sum >> 15) ^ (sum & 0x7FFF);
> +        calculated_checksum = 0;
> +        for (i = 0; i < 4; i++) {
> +            int j, temp1 = checksum_table[i] & sum;
> +            for (j = 8; j > 0; j >>= 1) {
> +                int temp2 = temp1 >> j;
> +                temp1 ^= temp2;
> +            }
> +            calculated_checksum <<= 1;
> +            calculated_checksum |= temp1 & 1;
> +        }
> +
> +        if (checksum != calculated_checksum)
> +            frame_error |= 8;
> +    }
> +
> +    if (frame_error != 0) {
> +        for (i = 0; i < number_of_valid_coefs; i++) {
> +            s->coefs[i] = s->backup_frame[i];
> +            s->backup_frame[i] = 0;
> +        }
> +    } else {
> +        for (i = 0; i < number_of_valid_coefs; i++)
> +            s->backup_frame[i] = s->coefs[i];
> +    }
> +
> +    for (i = number_of_valid_coefs; i < s->number_of_coefs; i++)
> +        s->coefs[i] = 0;
> +
> +    *got_frame = decode_samples(s, s->coefs, s->context, s->number_of_coefs, s->output_frame);
> +    if (*got_frame) {
> +        int16_t *dst;
> +
> +        frame->nb_samples = 320;
> +        if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
> +            return ret;
> +        dst = (int16_t *)frame->data[0];
> +
> +        for (i = 0; i < frame->nb_samples; i++) {
> +            dst[i] = av_clip_int16(s->output_frame[i]);
> +        }
> +    }
>

Since you've changed the sample format, you can just decode directly into the frame, like this:
frame->nb_samples = 320;
if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
        return ret;

decode_samples(s, s->coefs, s->prev_win, s->number_of_coefs,
               (float *)frame->data[0]);
*got_frame = 1;

And make decode_samples return void.


More information about the ffmpeg-devel mailing list