[FFmpeg-devel] [PATCH 6/8] sbcenc: add MMX optimizations

James Almer jamrial at gmail.com
Wed Dec 20 23:26:38 EET 2017


On 12/20/2017 5:06 PM, Aurelien Jacobs wrote:
> On Wed, Dec 20, 2017 at 03:47:35PM -0300, James Almer wrote:
>> On 12/17/2017 6:47 PM, Aurelien Jacobs wrote:
>>> This was originally based on libsbc, and was fully integrated into ffmpeg.
>>> ---
>>>  libavcodec/sbcdsp.c          |   3 +
>>>  libavcodec/sbcdsp.h          |   2 +
>>>  libavcodec/x86/Makefile      |   2 +
>>>  libavcodec/x86/sbcdsp.asm    | 284 +++++++++++++++++++++++++++++++++++++++++++
>>>  libavcodec/x86/sbcdsp_init.c |  51 ++++++++
>>>  5 files changed, 342 insertions(+)
>>>  create mode 100644 libavcodec/x86/sbcdsp.asm
>>>  create mode 100644 libavcodec/x86/sbcdsp_init.c
>>>
>>> diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
>>> index 16faf5ba9b..9bb60cdd5e 100644
>>> --- a/libavcodec/sbcdsp.c
>>> +++ b/libavcodec/sbcdsp.c
>>> @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
>>>      /* Default implementation for scale factors calculation */
>>>      s->sbc_calc_scalefactors = sbc_calc_scalefactors;
>>>      s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
>>> +
>>> +    if (ARCH_X86)
>>> +        ff_sbcdsp_init_x86(s);
>>>  }
>>> diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
>>> index 66ed7d324e..127e6a8a11 100644
>>> --- a/libavcodec/sbcdsp.h
>>> +++ b/libavcodec/sbcdsp.h
>>> @@ -80,4 +80,6 @@ struct sbc_dsp_context {
>>>   */
>>>  void ff_sbcdsp_init(SBCDSPContext *s);
>>>  
>>> +void ff_sbcdsp_init_x86(SBCDSPContext *s);
>>> +
>>>  #endif /* AVCODEC_SBCDSP_H */
>>> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
>>> index a805cd37b4..2350c8bbee 100644
>>> --- a/libavcodec/x86/Makefile
>>> +++ b/libavcodec/x86/Makefile
>>> @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
>>>  OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
>>>  OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
>>>  OBJS-$(CONFIG_RV40_DECODER)            += x86/rv40dsp_init.o
>>> +OBJS-$(CONFIG_SBC_ENCODER)             += x86/sbcdsp_init.o
>>>  OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
>>>  OBJS-$(CONFIG_TAK_DECODER)             += x86/takdsp_init.o
>>>  OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
>>> @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
>>>  X86ASM-OBJS-$(CONFIG_PRORES_DECODER)   += x86/proresdsp.o
>>>  X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
>>>  X86ASM-OBJS-$(CONFIG_RV40_DECODER)     += x86/rv40dsp.o
>>> +X86ASM-OBJS-$(CONFIG_SBC_ENCODER)      += x86/sbcdsp.o
>>>  X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER)     += x86/svq1enc.o
>>>  X86ASM-OBJS-$(CONFIG_TAK_DECODER)      += x86/takdsp.o
>>>  X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER)   += x86/mlpdsp.o
>>> diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
>>> new file mode 100644
>>> index 0000000000..00b48a821b
>>> --- /dev/null
>>> +++ b/libavcodec/x86/sbcdsp.asm
>>> @@ -0,0 +1,284 @@
>>> +;******************************************************************************
>>> +;* SIMD optimized SBC encoder DSP functions
>>> +;*
>>> +;* Copyright (C) 2017  Aurelien Jacobs <aurel at gnuage.org>
>>> +;* Copyright (C) 2008-2010  Nokia Corporation
>>> +;* Copyright (C) 2004-2010  Marcel Holtmann <marcel at holtmann.org>
>>> +;* Copyright (C) 2004-2005  Henryk Ploetz <henryk at ploetzli.ch>
>>> +;* Copyright (C) 2005-2006  Brad Midgley <bmidgley at xmission.com>
>>> +;*
>>> +;* This file is part of FFmpeg.
>>> +;*
>>> +;* FFmpeg is free software; you can redistribute it and/or
>>> +;* modify it under the terms of the GNU Lesser General Public
>>> +;* License as published by the Free Software Foundation; either
>>> +;* version 2.1 of the License, or (at your option) any later version.
>>> +;*
>>> +;* FFmpeg is distributed in the hope that it will be useful,
>>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +;* Lesser General Public License for more details.
>>> +;*
>>> +;* You should have received a copy of the GNU Lesser General Public
>>> +;* License along with FFmpeg; if not, write to the Free Software
>>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> +;******************************************************************************
>>> +
>>> +%include "libavutil/x86/x86util.asm"
>>> +
>>> +SECTION_RODATA
>>> +
>>> +scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
>>> +
>>> +SECTION .text
>>> +
>>> +;*******************************************************************
>>> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
>>> +;*******************************************************************
>>> +INIT_MMX mmx
>>> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
>>> +    movq          m0, [inq]
>>> +    movq          m1, [inq+8]
>>> +    pmaddwd       m0, [constsq]
>>> +    pmaddwd       m1, [constsq+8]
>>> +    paddd         m0, [scale_mask]
>>> +    paddd         m1, [scale_mask]
>>> +
>>> +    movq          m2, [inq+16]
>>> +    movq          m3, [inq+24]
>>> +    pmaddwd       m2, [constsq+16]
>>> +    pmaddwd       m3, [constsq+24]
>>> +    paddd         m0, m2
>>> +    paddd         m1, m3
>>> +
>>> +    movq          m2, [inq+32]
>>> +    movq          m3, [inq+40]
>>> +    pmaddwd       m2, [constsq+32]
>>> +    pmaddwd       m3, [constsq+40]
>>> +    paddd         m0, m2
>>> +    paddd         m1, m3
>>> +
>>> +    movq          m2, [inq+48]
>>> +    movq          m3, [inq+56]
>>> +    pmaddwd       m2, [constsq+48]
>>> +    pmaddwd       m3, [constsq+56]
>>> +    paddd         m0, m2
>>> +    paddd         m1, m3
>>> +
>>> +    movq          m2, [inq+64]
>>> +    movq          m3, [inq+72]
>>> +    pmaddwd       m2, [constsq+64]
>>> +    pmaddwd       m3, [constsq+72]
>>> +    paddd         m0, m2
>>> +    paddd         m1, m3
>>> +
>>> +    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
>>> +    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
>>> +    packssdw      m0, m0
>>> +    packssdw      m1, m1
>>> +
>>> +    movq          m2, m0
>>> +    pmaddwd       m0, [constsq+80]
>>> +    pmaddwd       m2, [constsq+88]
>>> +
>>> +    movq          m3, m1
>>> +    pmaddwd       m1, [constsq+96]
>>> +    pmaddwd       m3, [constsq+104]
>>> +    paddd         m0, m1
>>> +    paddd         m2, m3
>>> +
>>> +    movq          [outq  ], m0
>>> +    movq          [outq+8], m2
>>> +
>>> +    RET
>>> +
>>> +
>>> +
>>> +;*******************************************************************
>>> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
>>> +;*******************************************************************
>>> +INIT_MMX mmx
>>> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
>>> +    movq          m0, [inq]
>>> +    movq          m1, [inq+8]
>>> +    movq          m2, [inq+16]
>>> +    movq          m3, [inq+24]
>>> +    pmaddwd       m0, [constsq]
>>> +    pmaddwd       m1, [constsq+8]
>>> +    pmaddwd       m2, [constsq+16]
>>> +    pmaddwd       m3, [constsq+24]
>>> +    paddd         m0, [scale_mask]
>>> +    paddd         m1, [scale_mask]
>>> +    paddd         m2, [scale_mask]
>>> +    paddd         m3, [scale_mask]
>>> +
>>> +    movq          m4, [inq+32]
>>> +    movq          m5, [inq+40]
>>> +    movq          m6, [inq+48]
>>> +    movq          m7, [inq+56]
>>> +    pmaddwd       m4, [constsq+32]
>>> +    pmaddwd       m5, [constsq+40]
>>> +    pmaddwd       m6, [constsq+48]
>>> +    pmaddwd       m7, [constsq+56]
>>> +    paddd         m0, m4
>>> +    paddd         m1, m5
>>> +    paddd         m2, m6
>>> +    paddd         m3, m7
>>> +
>>> +    movq          m4, [inq+64]
>>> +    movq          m5, [inq+72]
>>> +    movq          m6, [inq+80]
>>> +    movq          m7, [inq+88]
>>> +    pmaddwd       m4, [constsq+64]
>>> +    pmaddwd       m5, [constsq+72]
>>> +    pmaddwd       m6, [constsq+80]
>>> +    pmaddwd       m7, [constsq+88]
>>> +    paddd         m0, m4
>>> +    paddd         m1, m5
>>> +    paddd         m2, m6
>>> +    paddd         m3, m7
>>> +
>>> +    movq          m4, [inq+96]
>>> +    movq          m5, [inq+104]
>>> +    movq          m6, [inq+112]
>>> +    movq          m7, [inq+120]
>>> +    pmaddwd       m4, [constsq+96]
>>> +    pmaddwd       m5, [constsq+104]
>>> +    pmaddwd       m6, [constsq+112]
>>> +    pmaddwd       m7, [constsq+120]
>>> +    paddd         m0, m4
>>> +    paddd         m1, m5
>>> +    paddd         m2, m6
>>> +    paddd         m3, m7
>>> +
>>> +    movq          m4, [inq+128]
>>> +    movq          m5, [inq+136]
>>> +    movq          m6, [inq+144]
>>> +    movq          m7, [inq+152]
>>> +    pmaddwd       m4, [constsq+128]
>>> +    pmaddwd       m5, [constsq+136]
>>> +    pmaddwd       m6, [constsq+144]
>>> +    pmaddwd       m7, [constsq+152]
>>> +    paddd         m0, m4
>>> +    paddd         m1, m5
>>> +    paddd         m2, m6
>>> +    paddd         m3, m7
>>> +
>>> +    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
>>> +    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
>>> +    psrad         m2, 16    ; SBC_PROTO_FIXED_SCALE
>>> +    psrad         m3, 16    ; SBC_PROTO_FIXED_SCALE
>>> +
>>> +    packssdw      m0, m0
>>> +    packssdw      m1, m1
>>> +    packssdw      m2, m2
>>> +    packssdw      m3, m3
>>> +
>>> +    movq          m4, m0
>>> +    movq          m5, m0
>>> +    pmaddwd       m4, [constsq+160]
>>> +    pmaddwd       m5, [constsq+168]
>>> +
>>> +    movq          m6, m1
>>> +    movq          m7, m1
>>> +    pmaddwd       m6, [constsq+192]
>>> +    pmaddwd       m7, [constsq+200]
>>> +    paddd         m4, m6
>>> +    paddd         m5, m7
>>> +
>>> +    movq          m6, m2
>>> +    movq          m7, m2
>>> +    pmaddwd       m6, [constsq+224]
>>> +    pmaddwd       m7, [constsq+232]
>>> +    paddd         m4, m6
>>> +    paddd         m5, m7
>>> +
>>> +    movq          m6, m3
>>> +    movq          m7, m3
>>> +    pmaddwd       m6, [constsq+256]
>>> +    pmaddwd       m7, [constsq+264]
>>> +    paddd         m4, m6
>>> +    paddd         m5, m7
>>> +
>>> +    movq          [outq  ], m4
>>> +    movq          [outq+8], m5
>>> +
>>> +    movq          m5, m0
>>> +    pmaddwd       m0, [constsq+176]
>>> +    pmaddwd       m5, [constsq+184]
>>> +
>>> +    movq          m7, m1
>>> +    pmaddwd       m1, [constsq+208]
>>> +    pmaddwd       m7, [constsq+216]
>>> +    paddd         m0, m1
>>> +    paddd         m5, m7
>>> +
>>> +    movq          m7, m2
>>> +    pmaddwd       m2, [constsq+240]
>>> +    pmaddwd       m7, [constsq+248]
>>> +    paddd         m0, m2
>>> +    paddd         m5, m7
>>> +
>>> +    movq          m7, m3
>>> +    pmaddwd       m3, [constsq+272]
>>> +    pmaddwd       m7, [constsq+280]
>>> +    paddd         m0, m3
>>> +    paddd         m5, m7
>>> +
>>> +    movq          [outq+16], m0
>>> +    movq          [outq+24], m5
>>> +
>>> +    RET
>>> +
>>> +
>>> +;*******************************************************************
>>> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
>>> +;                              uint32_t scale_factor[2][8],
>>> +;                              int blocks, int channels, int subbands)
>>> +;*******************************************************************
>>> +INIT_MMX mmx
>>> +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
>>> +    ; subbands = 4 * subbands * channels
>>> +    shl  subbandsq, 2
>>> +    cmp  channelsq, 2
>>> +    jl   .loop_1
>>> +    shl  subbandsq, 1
>>> +
>>> +.loop_1:
>>> +    sub           subbandsq, 8
>>> +    lea           ptrq, [sb_sample_fq + subbandsq]
>>> +
>>> +    ; blk = (blocks - 1) * 64;
>>> +    lea           blkq, [blocksq - 1]
>>> +    shl           blkq, 6
>>> +
>>> +    movq          m0, [scale_mask]
>>> +.loop_2:
>>> +    movq          m1, [ptrq+blkq]
>>> +    pxor          m2, m2
>>> +    pcmpgtd       m1, m2
>>> +    paddd         m1, [ptrq+blkq]
>>> +    pcmpgtd       m2, m1
>>> +    pxor          m1, m2
>>> +
>>> +    por           m0, m1
>>> +
>>> +    sub           blkq, 64
>>> +    jns           .loop_2
>>> +
>>> +    movd          blkd, m0
>>> +    psrlq         m0,   32
>>> +    bsr           blkd, blkd
>>> +    sub           blkd, 15    ; SCALE_OUT_BITS
>>> +    mov           [scale_factorq + subbandsq], blkd
>>> +
>>> +    movd          blkd, m0
>>> +    bsr           blkd, blkd
>>> +    sub           blkd, 15    ; SCALE_OUT_BITS
>>> +    mov           [scale_factorq + subbandsq + 4], blkd
>>> +
>>> +    cmp           subbandsq, 0
>>> +    jg            .loop_1
>>> +
>>> +    emms
>>> +    RET
>>
>> These should be done in SSE2. There's no reason for them to be MMX.
> 
> There is at least one reason for it to be MMX. It is existing legacy
> code that is ported to ffmpeg. So the MMX code is here and working.

The code had to be adapted to x86inc syntax, so it's not a copy paste
done without effort. Said effort could have also been used to turn them
into SSE2, as it's trivial at least for the first two.

I'm not discrediting your work in porting these, I'm saying that said
work should have gone into making them up to date as well.
I'm also not blocking this patch if you decide to not make them SSE2.

I'll point a few things about the existing asm in a separate email.


More information about the ffmpeg-devel mailing list