[FFmpeg-devel] [PATCH 1/3] sbc: implement SBC codec (low-complexity subband codec)

Rostislav Pehlivanov atomnuker at gmail.com
Mon Nov 6 06:40:56 EET 2017


On 5 November 2017 at 23:35, Aurelien Jacobs <aurel at gnuage.org> wrote:

> This was originally based on libsbc, and was fully integrated into ffmpeg.
> ---
>  doc/general.texi                 |   2 +
>  libavcodec/Makefile              |   4 +
>  libavcodec/allcodecs.c           |   2 +
>  libavcodec/arm/Makefile          |   3 +
>  libavcodec/arm/sbcdsp_armv6.S    | 245 ++++++++++++++
>  libavcodec/arm/sbcdsp_init_arm.c | 105 ++++++
>  libavcodec/arm/sbcdsp_neon.S     | 714 ++++++++++++++++++++++++++++++
> +++++++++
>  libavcodec/avcodec.h             |   2 +
>  libavcodec/codec_desc.c          |  12 +
>  libavcodec/sbc.c                 | 316 +++++++++++++++++
>  libavcodec/sbc.h                 | 121 +++++++
>  libavcodec/sbcdec.c              | 469 +++++++++++++++++++++++++
>  libavcodec/sbcdec_data.c         | 127 +++++++
>  libavcodec/sbcdec_data.h         |  44 +++
>  libavcodec/sbcdsp.c              | 569 +++++++++++++++++++++++++++++++
>  libavcodec/sbcdsp.h              |  86 +++++
>  libavcodec/sbcdsp_data.c         | 335 ++++++++++++++++++
>  libavcodec/sbcdsp_data.h         |  57 ++++
>  libavcodec/sbcenc.c              | 461 +++++++++++++++++++++++++
>  libavcodec/x86/Makefile          |   2 +
>  libavcodec/x86/sbcdsp.asm        | 290 ++++++++++++++++
>  libavcodec/x86/sbcdsp_init.c     |  51 +++
>  22 files changed, 4017 insertions(+)
>  create mode 100644 libavcodec/arm/sbcdsp_armv6.S
>  create mode 100644 libavcodec/arm/sbcdsp_init_arm.c
>  create mode 100644 libavcodec/arm/sbcdsp_neon.S
>  create mode 100644 libavcodec/sbc.c
>  create mode 100644 libavcodec/sbc.h
>  create mode 100644 libavcodec/sbcdec.c
>  create mode 100644 libavcodec/sbcdec_data.c
>  create mode 100644 libavcodec/sbcdec_data.h
>  create mode 100644 libavcodec/sbcdsp.c
>  create mode 100644 libavcodec/sbcdsp.h
>  create mode 100644 libavcodec/sbcdsp_data.c
>  create mode 100644 libavcodec/sbcdsp_data.h
>  create mode 100644 libavcodec/sbcenc.c
>  create mode 100644 libavcodec/x86/sbcdsp.asm
>  create mode 100644 libavcodec/x86/sbcdsp_init.c
>
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index c4134424f0..2d541bf64a 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -632,6 +632,8 @@ enum AVCodecID {
>      AV_CODEC_ID_ATRAC3AL,
>      AV_CODEC_ID_ATRAC3PAL,
>      AV_CODEC_ID_DOLBY_E,
> +    AV_CODEC_ID_SBC,
> +    AV_CODEC_ID_MSBC,
>
>
See below.


>      /* subtitle codecs */
>      AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID
> pointing at the start of subtitle codecs.
> diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
> index 92bf1d2681..8d613507e0 100644
> --- a/libavcodec/codec_desc.c
> +++ b/libavcodec/codec_desc.c
> @@ -2859,6 +2859,18 @@ static const AVCodecDescriptor codec_descriptors[]
> = {
>          .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"),
>          .props     = AV_CODEC_PROP_LOSSY,
>      },
> +    {
> +        .id        = AV_CODEC_ID_SBC,
> +        .type      = AVMEDIA_TYPE_AUDIO,
> +        .name      = "sbc",
> +        .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband
> codec)"),
> +    },
> +    {
> +        .id        = AV_CODEC_ID_MSBC,
> +        .type      = AVMEDIA_TYPE_AUDIO,
> +        .name      = "msbc",
> +        .long_name = NULL_IF_CONFIG_SMALL("mSBC (wideband speech mono
> SBC)"),
> +    },
>

Is there a bitstream difference between the two? I don't think so, so you
should instead define FF_PROFILE_SBC_WB and use a single codec ID.


>
>      /* subtitle codecs */
>      {
> diff --git a/libavcodec/sbc.c b/libavcodec/sbc.c
> new file mode 100644
> index 0000000000..99d02cc56a
> --- /dev/null
> +++ b/libavcodec/sbc.c
> @@ -0,0 +1,316 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017  Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2012-2013  Intel Corporation
> + * Copyright (C) 2008-2010  Nokia Corporation
> + * Copyright (C) 2004-2010  Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005  Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2008  Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * SBC common functions for the encoder and decoder
> + */
> +
> +#include "avcodec.h"
> +#include "sbc.h"
> +
> +/*
> + * Calculates the CRC-8 of the first len bits in data
> + */
> +static const uint8_t crc_table[256] = {
> +    0x00, 0x1D, 0x3A, 0x27, 0x74, 0x69, 0x4E, 0x53,
> +    0xE8, 0xF5, 0xD2, 0xCF, 0x9C, 0x81, 0xA6, 0xBB,
> +    0xCD, 0xD0, 0xF7, 0xEA, 0xB9, 0xA4, 0x83, 0x9E,
> +    0x25, 0x38, 0x1F, 0x02, 0x51, 0x4C, 0x6B, 0x76,
> +    0x87, 0x9A, 0xBD, 0xA0, 0xF3, 0xEE, 0xC9, 0xD4,
> +    0x6F, 0x72, 0x55, 0x48, 0x1B, 0x06, 0x21, 0x3C,
> +    0x4A, 0x57, 0x70, 0x6D, 0x3E, 0x23, 0x04, 0x19,
> +    0xA2, 0xBF, 0x98, 0x85, 0xD6, 0xCB, 0xEC, 0xF1,
> +    0x13, 0x0E, 0x29, 0x34, 0x67, 0x7A, 0x5D, 0x40,
> +    0xFB, 0xE6, 0xC1, 0xDC, 0x8F, 0x92, 0xB5, 0xA8,
> +    0xDE, 0xC3, 0xE4, 0xF9, 0xAA, 0xB7, 0x90, 0x8D,
> +    0x36, 0x2B, 0x0C, 0x11, 0x42, 0x5F, 0x78, 0x65,
> +    0x94, 0x89, 0xAE, 0xB3, 0xE0, 0xFD, 0xDA, 0xC7,
> +    0x7C, 0x61, 0x46, 0x5B, 0x08, 0x15, 0x32, 0x2F,
> +    0x59, 0x44, 0x63, 0x7E, 0x2D, 0x30, 0x17, 0x0A,
> +    0xB1, 0xAC, 0x8B, 0x96, 0xC5, 0xD8, 0xFF, 0xE2,
> +    0x26, 0x3B, 0x1C, 0x01, 0x52, 0x4F, 0x68, 0x75,
> +    0xCE, 0xD3, 0xF4, 0xE9, 0xBA, 0xA7, 0x80, 0x9D,
> +    0xEB, 0xF6, 0xD1, 0xCC, 0x9F, 0x82, 0xA5, 0xB8,
> +    0x03, 0x1E, 0x39, 0x24, 0x77, 0x6A, 0x4D, 0x50,
> +    0xA1, 0xBC, 0x9B, 0x86, 0xD5, 0xC8, 0xEF, 0xF2,
> +    0x49, 0x54, 0x73, 0x6E, 0x3D, 0x20, 0x07, 0x1A,
> +    0x6C, 0x71, 0x56, 0x4B, 0x18, 0x05, 0x22, 0x3F,
> +    0x84, 0x99, 0xBE, 0xA3, 0xF0, 0xED, 0xCA, 0xD7,
> +    0x35, 0x28, 0x0F, 0x12, 0x41, 0x5C, 0x7B, 0x66,
> +    0xDD, 0xC0, 0xE7, 0xFA, 0xA9, 0xB4, 0x93, 0x8E,
> +    0xF8, 0xE5, 0xC2, 0xDF, 0x8C, 0x91, 0xB6, 0xAB,
> +    0x10, 0x0D, 0x2A, 0x37, 0x64, 0x79, 0x5E, 0x43,
> +    0xB2, 0xAF, 0x88, 0x95, 0xC6, 0xDB, 0xFC, 0xE1,
> +    0x5A, 0x47, 0x60, 0x7D, 0x2E, 0x33, 0x14, 0x09,
> +    0x7F, 0x62, 0x45, 0x58, 0x0B, 0x16, 0x31, 0x2C,
> +    0x97, 0x8A, 0xAD, 0xB0, 0xE3, 0xFE, 0xD9, 0xC4
> +};
> +
> +uint8_t ff_sbc_crc8(const uint8_t *data, size_t len)
> +{
> +    uint8_t crc = 0x0f;
> +    size_t i;
> +    uint8_t octet;
> +
> +    for (i = 0; i < len / 8; i++)
> +        crc = crc_table[crc ^ data[i]];
> +
> +    octet = data[i];
> +    for (i = 0; i < len % 8; i++) {
> +        char bit = ((octet ^ crc) & 0x80) >> 7;
> +
> +        crc = ((crc & 0x7f) << 1) ^ (bit ? 0x1d : 0);
> +
> +        octet = octet << 1;
> +    }
> +
> +    return crc;
> +}
>


We have CRC functions already, look in libavutil/crc.h



> +                        if (subbands == 4)
> +                            loudness = frame->scale_factor[ch][sb] -
> sbc_offset4[sf][sb];
> +                        else
> +                            loudness = frame->scale_factor[ch][sb] -
> sbc_offset8[sf][sb];
> +                        if (loudness > 0)
> +                            bitneed[ch][sb] = loudness / 2;
> +                        else
> +                            bitneed[ch][sb] = loudness;
>


bitneed[ch][sb] = loudness >> (loudness > 0);



>
> +
> +static int sbc_decode_frame(AVCodecContext *avctx,
> +                            void *data, int *got_frame_ptr,
> +                            AVPacket *avpkt)
> +{
> +    SBCDecContext *sbc = avctx->priv_data;
> +    int i, ch, samples, ret;
> +    AVFrame *frame = data;
> +    int16_t *ptr;
> +
> +    if (!sbc)
> +        return AVERROR(EIO);
> +
> +    sbc->frame.length = sbc->unpack_frame(avpkt->data, &sbc->frame,
> avpkt->size);
> +    if (sbc->frame.length <= 0)
> +        return sbc->frame.length;
> +
> +    samples = sbc_synthesize_audio(&sbc->dsp, &sbc->frame);
> +
> +    frame->nb_samples = samples;
> +    if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
> +        return ret;
> +    ptr = (int16_t *)frame->data[0];
> +
> +    for (i = 0; i < samples; i++)
> +        for (ch = 0; ch < sbc->frame.channels; ch++)
> +            *ptr++ = sbc->frame.pcm_sample[ch][i];
> +
>

Once again, use planar sample formats



> +    *got_frame_ptr = 1;
> +
> +    return sbc->frame.length;
> +}
> +
> +#if CONFIG_SBC_DECODER
> +AVCodec ff_sbc_decoder = {
> +    .name                  = "sbc",
> +    .long_name             = NULL_IF_CONFIG_SMALL("SBC (low-complexity
> subband codec)"),
> +    .type                  = AVMEDIA_TYPE_AUDIO,
> +    .id                    = AV_CODEC_ID_SBC,
> +    .priv_data_size        = sizeof(SBCDecContext),
> +    .init                  = sbc_decode_init,
> +    .decode                = sbc_decode_frame,
> +    .capabilities          = AV_CODEC_CAP_DR1,
> +    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
> +                                                  AV_CH_LAYOUT_STEREO, 0},
> +    .sample_fmts           = (const enum AVSampleFormat[]) {
> AV_SAMPLE_FMT_S16,
>


AV_SAMPLE_FMT_S16P



> +
>  AV_SAMPLE_FMT_NONE },
> +    .supported_samplerates = (const int[]) { 16000, 32000, 44100, 48000,
> 0 },
> +};
> +#endif
> +
> +#if CONFIG_MSBC_DECODER
> +AVCodec ff_msbc_decoder = {
> +    .name                  = "msbc",
> +    .long_name             = NULL_IF_CONFIG_SMALL("mSBC (wideband speech
> mono SBC)"),
> +    .type                  = AVMEDIA_TYPE_AUDIO,
> +    .id                    = AV_CODEC_ID_MSBC,
> +    .priv_data_size        = sizeof(SBCDecContext),
> +    .init                  = msbc_decode_init,
> +    .decode                = sbc_decode_frame,
> +    .capabilities          = AV_CODEC_CAP_DR1,
> +    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO, 0},
> +    .sample_fmts           = (const enum AVSampleFormat[]) {
> AV_SAMPLE_FMT_S16,
>


AV_SAMPLE_FMT_S16P



> +
> +/*
> + * A reference C code of analysis filter with SIMD-friendly tables
> + * reordering and code layout. This code can be used to develop platform
> + * specific SIMD optimizations. Also it may be used as some kind of test
> + * for compiler autovectorization capabilities (who knows, if the compiler
> + * is very good at this stuff, hand optimized assembly may be not strictly
> + * needed for some platform).
> + *
> + * Note: It is also possible to make a simple variant of analysis filter,
> + * which needs only a single constants table without taking care about
> + * even/odd cases. This simple variant of filter can be implemented
> without
> + * input data permutation. The only thing that would be lost is the
> + * possibility to use pairwise SIMD multiplications. But for some simple
> + * CPU cores without SIMD extensions it can be useful. If anybody is
> + * interested in implementing such variant of a filter, sourcecode from
> + * bluez versions 4.26/4.27 can be used as a reference and the history of
> + * the changes in git repository done around that time may be worth
> checking.
> + */
> +
> +static void sbc_analyze_4_simd(const int16_t *in, int32_t *out,
> +                               const int16_t *consts)
> +{
> +    int32_t t1[4];
> +    int16_t t2[4];
> +    int hop = 0;
> +
> +    /* rounding coefficient */
> +    t1[0] = t1[1] = t1[2] = t1[3] =
> +        (int32_t) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
> +
> +    /* low pass polyphase filter */
> +    for (hop = 0; hop < 40; hop += 8) {
> +        t1[0] += (int32_t) in[hop] * consts[hop];
> +        t1[0] += (int32_t) in[hop + 1] * consts[hop + 1];
> +        t1[1] += (int32_t) in[hop + 2] * consts[hop + 2];
> +        t1[1] += (int32_t) in[hop + 3] * consts[hop + 3];
> +        t1[2] += (int32_t) in[hop + 4] * consts[hop + 4];
> +        t1[2] += (int32_t) in[hop + 5] * consts[hop + 5];
> +        t1[3] += (int32_t) in[hop + 6] * consts[hop + 6];
> +        t1[3] += (int32_t) in[hop + 7] * consts[hop + 7];
> +    }
> +
> +    /* scaling */
> +    t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
> +    t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
> +    t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
> +    t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
> +
> +    /* do the cos transform */
> +    t1[0]  = (int32_t) t2[0] * consts[40 + 0];
> +    t1[0] += (int32_t) t2[1] * consts[40 + 1];
> +    t1[1]  = (int32_t) t2[0] * consts[40 + 2];
> +    t1[1] += (int32_t) t2[1] * consts[40 + 3];
> +    t1[2]  = (int32_t) t2[0] * consts[40 + 4];
> +    t1[2] += (int32_t) t2[1] * consts[40 + 5];
> +    t1[3]  = (int32_t) t2[0] * consts[40 + 6];
> +    t1[3] += (int32_t) t2[1] * consts[40 + 7];
> +
> +    t1[0] += (int32_t) t2[2] * consts[40 + 8];
> +    t1[0] += (int32_t) t2[3] * consts[40 + 9];
> +    t1[1] += (int32_t) t2[2] * consts[40 + 10];
> +    t1[1] += (int32_t) t2[3] * consts[40 + 11];
> +    t1[2] += (int32_t) t2[2] * consts[40 + 12];
> +    t1[2] += (int32_t) t2[3] * consts[40 + 13];
> +    t1[3] += (int32_t) t2[2] * consts[40 + 14];
> +    t1[3] += (int32_t) t2[3] * consts[40 + 15];
> +
> +    out[0] = t1[0] >>
> +        (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> +    out[1] = t1[1] >>
> +        (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> +    out[2] = t1[2] >>
> +        (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> +    out[3] = t1[3] >>
> +        (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
> +}
> +
> +static void sbc_analyze_8_simd(const int16_t *in, int32_t *out,
> +                               const int16_t *consts)
> +{
> +    int32_t t1[8];
> +    int16_t t2[8];
> +    int i, hop;
> +
> +    /* rounding coefficient */
> +    t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
> +        (int32_t) 1 << (SBC_PROTO_FIXED8_SCALE-1);
> +
> +    /* low pass polyphase filter */
> +    for (hop = 0; hop < 80; hop += 16) {
> +        t1[0] += (int32_t) in[hop] * consts[hop];
> +        t1[0] += (int32_t) in[hop + 1] * consts[hop + 1];
> +        t1[1] += (int32_t) in[hop + 2] * consts[hop + 2];
> +        t1[1] += (int32_t) in[hop + 3] * consts[hop + 3];
> +        t1[2] += (int32_t) in[hop + 4] * consts[hop + 4];
> +        t1[2] += (int32_t) in[hop + 5] * consts[hop + 5];
> +        t1[3] += (int32_t) in[hop + 6] * consts[hop + 6];
> +        t1[3] += (int32_t) in[hop + 7] * consts[hop + 7];
> +        t1[4] += (int32_t) in[hop + 8] * consts[hop + 8];
> +        t1[4] += (int32_t) in[hop + 9] * consts[hop + 9];
> +        t1[5] += (int32_t) in[hop + 10] * consts[hop + 10];
> +        t1[5] += (int32_t) in[hop + 11] * consts[hop + 11];
> +        t1[6] += (int32_t) in[hop + 12] * consts[hop + 12];
> +        t1[6] += (int32_t) in[hop + 13] * consts[hop + 13];
> +        t1[7] += (int32_t) in[hop + 14] * consts[hop + 14];
> +        t1[7] += (int32_t) in[hop + 15] * consts[hop + 15];
> +    }
> +
> +    /* scaling */
> +    t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
> +    t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
> +
> +
> +    /* do the cos transform */
> +    t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0;
> +
> +    for (i = 0; i < 4; i++) {
> +        t1[0] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 0];
> +        t1[0] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 1];
> +        t1[1] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 2];
> +        t1[1] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 3];
> +        t1[2] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 4];
> +        t1[2] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 5];
> +        t1[3] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 6];
> +        t1[3] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 7];
> +        t1[4] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 8];
> +        t1[4] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 9];
> +        t1[5] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 10];
> +        t1[5] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 11];
> +        t1[6] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 12];
> +        t1[6] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 13];
> +        t1[7] += (int32_t) t2[i * 2 + 0] * consts[80 + i * 16 + 14];
> +        t1[7] += (int32_t) t2[i * 2 + 1] * consts[80 + i * 16 + 15];
> +    }
> +
> +    for (i = 0; i < 8; i++)
> +        out[i] = t1[i] >>
> +            (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
> +}
> +
>


What does it do here? A PQF into an FFT?
I might investigate using lavc's fixed point mdct for this maybe, I hate
custom fixed-point analysis transforms.



> +static inline void sbc_analyze_4b_4s_simd(SBCDSPContext *s,
> +                                          int16_t *x, int32_t *out, int
> out_stride)
> +{
> +    /* Analyze blocks */
> +    s->sbc_analyze_4(x + 12, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_odd);
> +    out += out_stride;
> +    s->sbc_analyze_4(x + 8, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_even);
> +    out += out_stride;
> +    s->sbc_analyze_4(x + 4, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_odd);
> +    out += out_stride;
> +    s->sbc_analyze_4(x + 0, out, ff_sbcdsp_analysis_consts_
> fixed4_simd_even);
> +
> +    emms_c();
> +}
> +
> +static inline void sbc_analyze_4b_8s_simd(SBCDSPContext *s,
> +                                          int16_t *x, int32_t *out, int
> out_stride)
> +{
> +    /* Analyze blocks */
> +    s->sbc_analyze_8(x + 24, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_odd);
> +    out += out_stride;
> +    s->sbc_analyze_8(x + 16, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_even);
> +    out += out_stride;
> +    s->sbc_analyze_8(x + 8, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_odd);
> +    out += out_stride;
> +    s->sbc_analyze_8(x + 0, out, ff_sbcdsp_analysis_consts_
> fixed8_simd_even);
> +
> +    emms_c();
> +}
> +
> +static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
> +                                               int16_t *x, int32_t *out,
> +                                               int out_stride);
> +
> +static inline void sbc_analyze_1b_8s_simd_odd(SBCDSPContext *s,
> +                                              int16_t *x, int32_t *out,
> +                                              int out_stride)
> +{
> +    s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_odd);
> +    s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_even;
> +
> +    emms_c();
> +}
> +
> +static inline void sbc_analyze_1b_8s_simd_even(SBCDSPContext *s,
> +                                               int16_t *x, int32_t *out,
> +                                               int out_stride)
> +{
> +    s->sbc_analyze_8(x, out, ff_sbcdsp_analysis_consts_fixed8_simd_even);
> +    s->sbc_analyze_8s = sbc_analyze_1b_8s_simd_odd;
> +
> +    emms_c();
> +}
> +
> +#define PCM(i)  AV_RN16(pcm + 2*(i))
>

Don't use a define, just substitute it directly.


> +
> +/*
> + * Internal helper functions for input data processing. In order to get
> + * optimal performance, it is important to have "nsamples" and "nchannels"
> + * arguments used with this inline function as compile time constants.
> + */
> +
> +static av_always_inline int sbc_encoder_process_input_s4_internal(
> +    int position, const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
> +    int nsamples, int nchannels)
> +{
> +    /* handle X buffer wraparound */
> +    if (position < nsamples) {
> +        if (nchannels > 0)
> +            memcpy(&X[0][SBC_X_BUFFER_SIZE - 40], &X[0][position],
> +                            36 * sizeof(int16_t));
> +        if (nchannels > 1)
> +            memcpy(&X[1][SBC_X_BUFFER_SIZE - 40], &X[1][position],
> +                            36 * sizeof(int16_t));
> +        position = SBC_X_BUFFER_SIZE - 40;
> +    }
> +
> +    /* copy/permutate audio samples */
> +    while ((nsamples -= 8) >= 0) {
> +        position -= 8;
> +        if (nchannels > 0) {
> +            int16_t *x = &X[0][position];
> +            x[0]  = PCM(0 + 7 * nchannels);
> +            x[1]  = PCM(0 + 3 * nchannels);
> +            x[2]  = PCM(0 + 6 * nchannels);
> +            x[3]  = PCM(0 + 4 * nchannels);
> +            x[4]  = PCM(0 + 0 * nchannels);
> +            x[5]  = PCM(0 + 2 * nchannels);
> +            x[6]  = PCM(0 + 1 * nchannels);
> +            x[7]  = PCM(0 + 5 * nchannels);
> +        }
> +        if (nchannels > 1) {
> +            int16_t *x = &X[1][position];
> +            x[0]  = PCM(1 + 7 * nchannels);
> +            x[1]  = PCM(1 + 3 * nchannels);
> +            x[2]  = PCM(1 + 6 * nchannels);
> +            x[3]  = PCM(1 + 4 * nchannels);
> +            x[4]  = PCM(1 + 0 * nchannels);
> +            x[5]  = PCM(1 + 2 * nchannels);
> +            x[6]  = PCM(1 + 1 * nchannels);
> +            x[7]  = PCM(1 + 5 * nchannels);
> +        }
> +        pcm += 16 * nchannels;
> +    }
> +
> +    return position;
> +}
> +
> +static av_always_inline int sbc_encoder_process_input_s8_internal(
> +    int position, const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
> +    int nsamples, int nchannels)
> +{
> +    /* handle X buffer wraparound */
> +    if (position < nsamples) {
> +        if (nchannels > 0)
> +            memcpy(&X[0][SBC_X_BUFFER_SIZE - 72], &X[0][position],
> +                            72 * sizeof(int16_t));
> +        if (nchannels > 1)
> +            memcpy(&X[1][SBC_X_BUFFER_SIZE - 72], &X[1][position],
> +                            72 * sizeof(int16_t));
> +        position = SBC_X_BUFFER_SIZE - 72;
> +    }
> +
> +    if (position % 16 == 8) {
> +        position -= 8;
> +        nsamples -= 8;
> +        if (nchannels > 0) {
> +            int16_t *x = &X[0][position];
> +            x[0]  = PCM(0 + (15-8) * nchannels);
> +            x[2]  = PCM(0 + (14-8) * nchannels);
> +            x[3]  = PCM(0 + (8-8) * nchannels);
> +            x[4]  = PCM(0 + (13-8) * nchannels);
> +            x[5]  = PCM(0 + (9-8) * nchannels);
> +            x[6]  = PCM(0 + (12-8) * nchannels);
> +            x[7]  = PCM(0 + (10-8) * nchannels);
> +            x[8]  = PCM(0 + (11-8) * nchannels);
> +        }
> +        if (nchannels > 1) {
> +            int16_t *x = &X[1][position];
> +            x[0]  = PCM(1 + (15-8) * nchannels);
> +            x[2]  = PCM(1 + (14-8) * nchannels);
> +            x[3]  = PCM(1 + (8-8) * nchannels);
> +            x[4]  = PCM(1 + (13-8) * nchannels);
> +            x[5]  = PCM(1 + (9-8) * nchannels);
> +            x[6]  = PCM(1 + (12-8) * nchannels);
> +            x[7]  = PCM(1 + (10-8) * nchannels);
> +            x[8]  = PCM(1 + (11-8) * nchannels);
> +        }
> +
> +        pcm += 16 * nchannels;
> +    }
> +
> +    /* copy/permutate audio samples */
> +    while (nsamples >= 16) {
> +        position -= 16;
> +        if (nchannels > 0) {
> +            int16_t *x = &X[0][position];
> +            x[0]  = PCM(0 + 15 * nchannels);
> +            x[1]  = PCM(0 + 7 * nchannels);
> +            x[2]  = PCM(0 + 14 * nchannels);
> +            x[3]  = PCM(0 + 8 * nchannels);
> +            x[4]  = PCM(0 + 13 * nchannels);
> +            x[5]  = PCM(0 + 9 * nchannels);
> +            x[6]  = PCM(0 + 12 * nchannels);
> +            x[7]  = PCM(0 + 10 * nchannels);
> +            x[8]  = PCM(0 + 11 * nchannels);
> +            x[9]  = PCM(0 + 3 * nchannels);
> +            x[10] = PCM(0 + 6 * nchannels);
> +            x[11] = PCM(0 + 0 * nchannels);
> +            x[12] = PCM(0 + 5 * nchannels);
> +            x[13] = PCM(0 + 1 * nchannels);
> +            x[14] = PCM(0 + 4 * nchannels);
> +            x[15] = PCM(0 + 2 * nchannels);
> +        }
> +        if (nchannels > 1) {
> +            int16_t *x = &X[1][position];
> +            x[0]  = PCM(1 + 15 * nchannels);
> +            x[1]  = PCM(1 + 7 * nchannels);
> +            x[2]  = PCM(1 + 14 * nchannels);
> +            x[3]  = PCM(1 + 8 * nchannels);
> +            x[4]  = PCM(1 + 13 * nchannels);
> +            x[5]  = PCM(1 + 9 * nchannels);
> +            x[6]  = PCM(1 + 12 * nchannels);
> +            x[7]  = PCM(1 + 10 * nchannels);
> +            x[8]  = PCM(1 + 11 * nchannels);
> +            x[9]  = PCM(1 + 3 * nchannels);
> +            x[10] = PCM(1 + 6 * nchannels);
> +            x[11] = PCM(1 + 0 * nchannels);
> +            x[12] = PCM(1 + 5 * nchannels);
> +            x[13] = PCM(1 + 1 * nchannels);
> +            x[14] = PCM(1 + 4 * nchannels);
> +            x[15] = PCM(1 + 2 * nchannels);
> +        }
> +        pcm += 32 * nchannels;
> +        nsamples -= 16;
> +    }
> +
> +    if (nsamples == 8) {
> +        position -= 8;
> +        if (nchannels > 0) {
> +            int16_t *x = &X[0][position];
> +            x[-7] = PCM(0 + 7 * nchannels);
> +            x[1]  = PCM(0 + 3 * nchannels);
> +            x[2]  = PCM(0 + 6 * nchannels);
> +            x[3]  = PCM(0 + 0 * nchannels);
> +            x[4]  = PCM(0 + 5 * nchannels);
> +            x[5]  = PCM(0 + 1 * nchannels);
> +            x[6]  = PCM(0 + 4 * nchannels);
> +            x[7]  = PCM(0 + 2 * nchannels);
> +        }
> +        if (nchannels > 1) {
> +            int16_t *x = &X[1][position];
> +            x[-7] = PCM(1 + 7 * nchannels);
> +            x[1]  = PCM(1 + 3 * nchannels);
> +            x[2]  = PCM(1 + 6 * nchannels);
> +            x[3]  = PCM(1 + 0 * nchannels);
> +            x[4]  = PCM(1 + 5 * nchannels);
> +            x[5]  = PCM(1 + 1 * nchannels);
> +            x[6]  = PCM(1 + 4 * nchannels);
> +            x[7]  = PCM(1 + 2 * nchannels);
> +        }
> +    }
> +
> +    return position;
> +}
> +
> +/*
> + * Input data processing functions. The data is endian converted if
> needed,
> + * channels are deintrleaved and audio samples are reordered for use in
> + * SIMD-friendly analysis filter function. The results are put into "X"
> + * array, getting appended to the previous data (or it is better to say
> + * prepended, as the buffer is filled from top to bottom). Old data is
> + * discarded when neededed, but availability of (10 * nrof_subbands)
> + * contiguous samples is always guaranteed for the input to the analysis
> + * filter. This is achieved by copying a sufficient part of old data
> + * to the top of the buffer on buffer wraparound.
> + */
> +
> +static int sbc_enc_process_input_4s(int position, const uint8_t *pcm,
> +                                    int16_t X[2][SBC_X_BUFFER_SIZE],
> +                                    int nsamples, int nchannels)
> +{
> +    if (nchannels > 1)
> +        return sbc_encoder_process_input_s4_internal(
> +            position, pcm, X, nsamples, 2);
> +    else
> +        return sbc_encoder_process_input_s4_internal(
> +            position, pcm, X, nsamples, 1);
> +}
>

That's just silly, do
return sbc_encoder_process_input_s4_internal(position, pcm, X, nsamples, 1
+ (nchannels > 1));

Or better yet remove the wrapper function.



> +
> +static int sbc_enc_process_input_8s(int position, const uint8_t *pcm,
> +                                    int16_t X[2][SBC_X_BUFFER_SIZE],
> +                                    int nsamples, int nchannels)
> +{
> +    if (nchannels > 1)
> +        return sbc_encoder_process_input_s8_internal(
> +            position, pcm, X, nsamples, 2);
> +    else
> +        return sbc_encoder_process_input_s8_internal(
> +            position, pcm, X, nsamples, 1);
> +}
> +
>

Same here.


>
> +++ b/libavcodec/sbcdsp_data.c
> @@ -0,0 +1,335 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017  Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2008-2010  Nokia Corporation
> + * Copyright (C) 2004-2010  Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005  Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2006  Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * miscellaneous SBC tables
> + */
> +
> +#include "sbcdsp_data.h"
> +
> +#define F_PROTO4(x) (int32_t) ((x * 2) * \
> +    ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +#define F_COS4(x) (int32_t) ((x) * \
> +    ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +#define F_PROTO8(x) (int32_t) ((x * 2) * \
> +    ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +#define F_COS8(x) (int32_t) ((x) * \
> +    ((int32_t) 1 << (sizeof(int16_t) * CHAR_BIT - 1)) + 0.5)
> +
>


We require 8 bit bytes, so s/CHAR_BIT/8/g throughout.


+++ b/libavcodec/sbcdsp_data.h
> @@ -0,0 +1,57 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017  Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2008-2010  Nokia Corporation
> + * Copyright (C) 2004-2010  Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005  Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2006  Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * miscellaneous SBC tables
> + */
> +
> +#ifndef AVCODEC_SBCDSP_DATA_H
> +#define AVCODEC_SBCDSP_DATA_H
> +
> +#include "sbc.h"
> +
> +#define SBC_PROTO_FIXED4_SCALE      ((sizeof(int16_t) * CHAR_BIT - 1) + 1)
> +#define SBC_COS_TABLE_FIXED4_SCALE  ((sizeof(int16_t) * CHAR_BIT - 1)    )
> +#define SBC_PROTO_FIXED8_SCALE      ((sizeof(int16_t) * CHAR_BIT - 1) + 1)
> +#define SBC_COS_TABLE_FIXED8_SCALE  ((sizeof(int16_t) * CHAR_BIT - 1)    )
> +
>

Same


>
> +
> +    /* align the last crc byte */
> +    if (crc_pos % 8)
> +        crc_header[crc_pos >> 3] <<= 8 - (crc_pos % 8);
>
>
put_bits_align?


> +    avpkt->data[3] = ff_sbc_crc8(crc_header, crc_pos);
> +
> +    ff_sbc_calculate_bits(frame, bits);
> +
> +    for (ch = 0; ch < frame_channels; ch++) {
> +        for (sb = 0; sb < frame_subbands; sb++) {
> +            levels[ch][sb] = ((1 << bits[ch][sb]) - 1) <<
> +                (32 - (frame->scale_factor[ch][sb] +
> +                    SCALE_OUT_BITS + 2));
> +            sb_sample_delta[ch][sb] = (uint32_t) 1 <<
> +                (frame->scale_factor[ch][sb] +
> +                    SCALE_OUT_BITS + 1);
> +        }
> +    }
> +
> +    for (blk = 0; blk < frame->blocks; blk++) {
> +        for (ch = 0; ch < frame_channels; ch++) {
> +            for (sb = 0; sb < frame_subbands; sb++) {
> +
> +                if (bits[ch][sb] == 0)
> +                    continue;
> +
> +                audio_sample = ((uint64_t) levels[ch][sb] *
> +                    (sb_sample_delta[ch][sb] +
> +                    frame->sb_sample_f[blk][ch][sb])) >> 32;
> +
> +                put_bits(&pb, bits[ch][sb], audio_sample);
> +            }
> +        }
> +    }
> +
> +    flush_put_bits(&pb);
> +
> +    return (put_bits_count(&pb) + 7) / 8;
> +}
> +
> +static size_t sbc_pack_frame(AVPacket *avpkt, struct sbc_frame *frame,
> int joint)
> +{
> +    int frame_subbands = 4;
> +
> +    avpkt->data[0] = SBC_SYNCWORD;
> +
> +    avpkt->data[1] = (frame->frequency & 0x03) << 6;
> +    avpkt->data[1] |= (frame->block_mode & 0x03) << 4;
> +    avpkt->data[1] |= (frame->mode & 0x03) << 2;
> +    avpkt->data[1] |= (frame->allocation & 0x01) << 1;
> +
>

Use put_bits?


> +
> +    if (frame->subbands == 4) {
> +        if (frame->channels == 1)
> +            return sbc_pack_frame_internal(avpkt, frame, 4, 1, joint);
>

return sbc_pack_frame_internal(avpkt, frame, 4, 1 + (frame->channels == 1),
joint);


> +            return sbc_pack_frame_internal(avpkt, frame, 8, 1, joint);
> +        else
> +            return sbc_pack_frame_internal(avpkt, frame, 8, 2, joint);
>

return sbc_pack_frame_internal(avpkt, frame, 8, 1 + (frame->channels == 1),
joint);


> +    }
> +}
> +
> +static size_t msbc_pack_frame(AVPacket *avpkt, struct sbc_frame *frame,
> int joint)
> +{
> +    avpkt->data[0] = MSBC_SYNCWORD;
> +    avpkt->data[1] = 0;
> +    avpkt->data[2] = 0;
> +
> +    return sbc_pack_frame_internal(avpkt, frame, 8, 1, joint);
> +}
> +
> +static void sbc_encoder_init(bool msbc, SBCDSPContext *s,
> +                             const struct sbc_frame *frame)
> +{
> +    memset(&s->X, 0, sizeof(s->X));
> +    s->position = (SBC_X_BUFFER_SIZE - frame->subbands * 9) & ~7;
> +    if (msbc)
> +        s->increment = 1;
> +    else
> +        s->increment = 4;

+
>

Save a line, use a ternary.


> +
> +    sbc->pack_frame = sbc_pack_frame;
> +
> +    sbc->frequency = SBC_FREQ_44100;
>


Yet in the AVCodec structure the encoder specifies it supports 16khz, 32khz
and 48khz.
You should remove the SBC_FREQ macros and use avctx->sample_rate directly.
Also remove any unsupported samplerates.



> +    sbc->mode = SBC_MODE_STEREO;
> +    if (sbc->joint_stereo)
> +        sbc->mode = SBC_MODE_JOINT_STEREO;
> +    else if (sbc->dual_channel)
> +        sbc->mode = SBC_MODE_DUAL_CHANNEL;
> +    sbc->subbands >>= 3;
> +    sbc->blocks = (sbc->blocks >> 2) - 1;
> +
> +    if (!avctx->frame_size)
> +        avctx->frame_size = 4*(sbc->subbands + 1) * 4*(sbc->blocks + 1);
> +
> +    for (int i = 0; avctx->codec->supported_samplerates[i]; i++)
> +        if (avctx->sample_rate == avctx->codec->supported_samplerates[i])
> +            sbc->frequency = i;
> +
> +    if (avctx->channels == 1)
> +        sbc->mode = SBC_MODE_MONO;
> +
> +    return 0;
> +}
> +
> +static int msbc_encode_init(AVCodecContext *avctx)
> +{
> +    SBCEncContext *sbc = avctx->priv_data;
> +
> +    sbc->msbc = true;
> +    sbc->pack_frame = msbc_pack_frame;
> +
> +    sbc->frequency = SBC_FREQ_16000;
> +    sbc->blocks = MSBC_BLOCKS;
> +    sbc->subbands = SBC_SB_8;
> +    sbc->mode = SBC_MODE_MONO;
> +    sbc->allocation = SBC_AM_LOUDNESS;
> +    sbc->bitpool = 26;
> +
> +    if (!avctx->frame_size)
> +        avctx->frame_size = 8 * MSBC_BLOCKS;
> +
>

Does the encoder actually accept arbitrary custom frame sizes?


>
> +
> +#if CONFIG_SBC_ENCODER
> +AVCodec ff_sbc_encoder = {
> +    .name                  = "sbc",
> +    .long_name             = NULL_IF_CONFIG_SMALL("SBC (low-complexity
> subband codec)"),
> +    .type                  = AVMEDIA_TYPE_AUDIO,
> +    .id                    = AV_CODEC_ID_SBC,
> +    .priv_data_size        = sizeof(SBCEncContext),
> +    .init                  = sbc_encode_init,
> +    .encode2               = sbc_encode_frame,
> +    .channel_layouts       = (const uint64_t[]) { AV_CH_LAYOUT_MONO,
> +                                                  AV_CH_LAYOUT_STEREO, 0},
> +    .sample_fmts           = (const enum AVSampleFormat[]) {
> AV_SAMPLE_FMT_S16,
> +
>  AV_SAMPLE_FMT_NONE },
>

Planar?


> +    .supported_samplerates = (const int[]) { 16000, 32000, 44100, 48000,
> 0 },
>

Remove the samplerates the encoder doesn't support.
Also add the internal codec cap about threadsafe init since the encoder
doesn't init any global tables to both this and the aptX encoders.


>
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t
> *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
> +    movq          m0, [inq]
> +    movq          m1, [inq+8]
> +    pmaddwd       m0, [constsq]
> +    pmaddwd       m1, [constsq+8]
> +    paddd         m0, [scale_mask]
> +    paddd         m1, [scale_mask]
> +
> +    movq          m2, [inq+16]
> +    movq          m3, [inq+24]
> +    pmaddwd       m2, [constsq+16]
> +    pmaddwd       m3, [constsq+24]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
> +    movq          m2, [inq+32]
> +    movq          m3, [inq+40]
> +    pmaddwd       m2, [constsq+32]
> +    pmaddwd       m3, [constsq+40]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
> +    movq          m2, [inq+48]
> +    movq          m3, [inq+56]
> +    pmaddwd       m2, [constsq+48]
> +    pmaddwd       m3, [constsq+56]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
> +    movq          m2, [inq+64]
> +    movq          m3, [inq+72]
> +    pmaddwd       m2, [constsq+64]
> +    pmaddwd       m3, [constsq+72]
> +    paddd         m0, m2
> +    paddd         m1, m3
> +
>

Loops?


> +    psrad         m0, 16    ; SBC_PROTO_FIXED4_SCALE
> +    psrad         m1, 16    ; SBC_PROTO_FIXED4_SCALE
> +    packssdw      m0, m0
> +    packssdw      m1, m1
> +
> +    movq          m2, m0
> +    pmaddwd       m0, [constsq+80]
> +    pmaddwd       m2, [constsq+88]
> +
> +    movq          m3, m1
> +    pmaddwd       m1, [constsq+96]
> +    pmaddwd       m3, [constsq+104]
> +    paddd         m0, m1
> +    paddd         m2, m3
> +
> +    movq          [outq  ], m0
> +    movq          [outq+8], m2
> +
> +    RET
> +
> +
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t
> *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
> +    movq          m0, [inq]
> +    movq          m1, [inq+8]
> +    movq          m2, [inq+16]
> +    movq          m3, [inq+24]
> +    pmaddwd       m0, [constsq]
> +    pmaddwd       m1, [constsq+8]
> +    pmaddwd       m2, [constsq+16]
> +    pmaddwd       m3, [constsq+24]
> +    paddd         m0, [scale_mask]
> +    paddd         m1, [scale_mask]
> +    paddd         m2, [scale_mask]
> +    paddd         m3, [scale_mask]
> +
> +    movq          m4, [inq+32]
> +    movq          m5, [inq+40]
> +    movq          m6, [inq+48]
> +    movq          m7, [inq+56]
> +    pmaddwd       m4, [constsq+32]
> +    pmaddwd       m5, [constsq+40]
> +    pmaddwd       m6, [constsq+48]
> +    pmaddwd       m7, [constsq+56]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    movq          m4, [inq+64]
> +    movq          m5, [inq+72]
> +    movq          m6, [inq+80]
> +    movq          m7, [inq+88]
> +    pmaddwd       m4, [constsq+64]
> +    pmaddwd       m5, [constsq+72]
> +    pmaddwd       m6, [constsq+80]
> +    pmaddwd       m7, [constsq+88]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    movq          m4, [inq+96]
> +    movq          m5, [inq+104]
> +    movq          m6, [inq+112]
> +    movq          m7, [inq+120]
> +    pmaddwd       m4, [constsq+96]
> +    pmaddwd       m5, [constsq+104]
> +    pmaddwd       m6, [constsq+112]
> +    pmaddwd       m7, [constsq+120]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    movq          m4, [inq+128]
> +    movq          m5, [inq+136]
> +    movq          m6, [inq+144]
> +    movq          m7, [inq+152]
> +    pmaddwd       m4, [constsq+128]
> +    pmaddwd       m5, [constsq+136]
> +    pmaddwd       m6, [constsq+144]
> +    pmaddwd       m7, [constsq+152]
> +    paddd         m0, m4
> +    paddd         m1, m5
> +    paddd         m2, m6
> +    paddd         m3, m7
> +
> +    psrad         m0, 16    ; SBC_PROTO_FIXED8_SCALE
> +    psrad         m1, 16    ; SBC_PROTO_FIXED8_SCALE
> +    psrad         m2, 16    ; SBC_PROTO_FIXED8_SCALE
> +    psrad         m3, 16    ; SBC_PROTO_FIXED8_SCALE
> +
> +    packssdw      m0, m0
> +    packssdw      m1, m1
> +    packssdw      m2, m2
> +    packssdw      m3, m3
> +
> +    movq          m4, m0
> +    movq          m5, m0
> +    pmaddwd       m4, [constsq+160]
> +    pmaddwd       m5, [constsq+168]
> +
> +    movq          m6, m1
> +    movq          m7, m1
> +    pmaddwd       m6, [constsq+192]
> +    pmaddwd       m7, [constsq+200]
> +    paddd         m4, m6
> +    paddd         m5, m7
> +
> +    movq          m6, m2
> +    movq          m7, m2
> +    pmaddwd       m6, [constsq+224]
> +    pmaddwd       m7, [constsq+232]
> +    paddd         m4, m6
> +    paddd         m5, m7
> +
> +    movq          m6, m3
> +    movq          m7, m3
> +    pmaddwd       m6, [constsq+256]
> +    pmaddwd       m7, [constsq+264]
> +    paddd         m4, m6
> +    paddd         m5, m7
> +
> +    movq          [outq  ], m4
> +    movq          [outq+8], m5
> +
> +    movq          m5, m0
> +    pmaddwd       m0, [constsq+176]
> +    pmaddwd       m5, [constsq+184]
> +
> +    movq          m7, m1
> +    pmaddwd       m1, [constsq+208]
> +    pmaddwd       m7, [constsq+216]
> +    paddd         m0, m1
> +    paddd         m5, m7
> +
> +    movq          m7, m2
> +    pmaddwd       m2, [constsq+240]
> +    pmaddwd       m7, [constsq+248]
> +    paddd         m0, m2
> +    paddd         m5, m7
> +
> +    movq          m7, m3
> +    pmaddwd       m3, [constsq+272]
> +    pmaddwd       m7, [constsq+280]
> +    paddd         m0, m3
> +    paddd         m5, m7
> +


Has the person writing the SIMD seriously not heard of loops?
I see no reason for this to not work on larger registers if loops were used
here.
This seems trivial do to properly so if you can't be bothered to fix it
leave it to me or jamrial to do after the core of the encoder has been
merged.


> +    movq          [outq+16], m0
> +    movq          [outq+24], m5
> +
> +    RET
> +
> +
> +;*******************************************************************
> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
> +;                              uint32_t scale_factor[2][8],
> +;                              int blocks, int channels, int subbands)
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_calc_scalefactors, 5, 9, 3, sb_sample_f, scale_factor,
> blocks, channels, subbands, ch, sb, sa, sf, blk
> +    shl           channelsd, 5
> +    mov           chq, 0
> +.loop_1:
> +    lea           saq, [sb_sample_fq + chq]
> +    lea           sfq, [scale_factorq + chq]
> +
> +    mov           sbd, 0
> +.loop_2:
> +    ; blk = (blocks - 1) * 64;
> +    lea           blkq, [blocksq - 1]
> +    shl           blkd, 6
> +
> +    movq          m0, [scale_mask]
> +.loop_3:
> +    movq          m1, [saq+blkq]
> +    pxor          m2, m2
> +    pcmpgtd       m1, m2
> +    paddd         m1, [saq+blkq]
> +    pcmpgtd       m2, m1
> +    pxor          m1, m2
> +
> +    por           m0, m1
> +
> +    sub           blkd, 64
> +    jns           .loop_3
> +
> +    movd          blkd, m0
> +    psrlq         m0,   32
> +    bsr           blkd, blkd
> +    sub           blkd, 15    ; SCALE_OUT_BITS
> +    mov           [sfq], blkd
> +
> +    movd          blkd, m0
> +    bsr           blkd, blkd
> +    sub           blkd, 15    ; SCALE_OUT_BITS
> +    mov           [sfq+4], blkd
> +
> +    add           saq, 8
> +    add           sfq, 8
> +
> +    add           sbd, 2
> +    cmp           sbd, subbandsd
> +    jl            .loop_2
> +
> +    add           chd, 32
> +    cmp           chd, channelsd
> +    jl            .loop_1
> +
>

This function's hardly doing SIMD and I would like to see comparison to the
C version before accepting it. I somehow doubt it'll be faster.

+
> +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_MMX(cpu_flags)) {
> +        s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
> +        s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
> +        s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
> +    }
> +}
>


MMX? In this day and age?



Anyway, its mostly not bad, will need some work before its cleaned of
libsbc's NIH.


More information about the ffmpeg-devel mailing list