[FFmpeg-devel] [PATCH 2/2] x86/dcadec: add ff_lfe_fir0_float_{sse, sse2, avx, fma3}

Fri Feb 5 20:46:20 CET 2016

Up to ~4 times faster on x86_64, ~8 times on x86_32 if compiling using x87 fp math.

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/dcadsp.c          |   3 +
 libavcodec/dcadsp.h          |   1 +
 libavcodec/x86/Makefile      |   4 +-
 libavcodec/x86/dcadsp.asm    | 199 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/dcadsp_init.c |  45 ++++++++++
 5 files changed, 250 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/x86/dcadsp.asm
 create mode 100644 libavcodec/x86/dcadsp_init.c

diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 6acfe0b..09faee5 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -410,4 +410,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
     s->dmix_scale_inv = dmix_scale_inv_c;
 
     s->assemble_freq_bands = assemble_freq_bands_c;
+
+    if (ARCH_X86)
+        ff_dcadsp_init_x86(s);
 }
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index f594abf..c82b7b1 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -87,5 +87,6 @@ typedef struct DCADSPContext {
 } DCADSPContext;
 
 av_cold void ff_dcadsp_init(DCADSPContext *s);
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s);
 
 #endif
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index ce06b90..7f53b73 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -44,7 +44,7 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER)      += x86/g722dsp_init.o
 OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
-OBJS-$(CONFIG_DCA_DECODER)             += x86/synth_filter_init.o
+OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
@@ -132,7 +132,7 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
 YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 YASM-OBJS-$(CONFIG_ALAC_DECODER)       += x86/alacdsp.o
 YASM-OBJS-$(CONFIG_APNG_DECODER)       += x86/pngdsp.o
-YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/synth_filter.o
+YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o x86/synth_filter.o
 YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o \
                                           x86/dwt_yasm.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
new file mode 100644
index 0000000..7036e48
--- /dev/null
+++ b/libavcodec/x86/dcadsp.asm
@@ -0,0 +1,199 @@
+;******************************************************************************
+;* SIMD-optimized functions for the DCA decoder
+;* Copyright (C) 2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%define sizeof_float 4
+%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
+
+%macro LFE_FIR0_FLOAT 0
+cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
+    shr nblocksd, 1
+    sub     lfeq, 7*sizeof_float
+    mov    cnt1d, 32*sizeof_float
+    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET
+    lea   coeffq, [coeffq+cnt1q*8]
+    add samplesq, cnt1q
+    neg    cnt1q
+
+.loop:
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq+16]
+    cvtdq2ps  m5, [lfeq   ]
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
+%elif cpuflag(sse2)
+    movu      m4, [lfeq+16]
+    movu      m5, [lfeq   ]
+    cvtdq2ps  m4, m4
+    cvtdq2ps  m5, m5
+    pshufd    m7, m4, q0123
+    pshufd    m6, m5, q0123
+%else
+    cvtpi2ps  m4, [lfeq+16]
+    cvtpi2ps  m0, [lfeq+24]
+    cvtpi2ps  m5, [lfeq   ]
+    cvtpi2ps  m1, [lfeq+8 ]
+    shufps    m4, m0, q1010
+    shufps    m5, m1, q1010
+    shufps    m7, m4, m4, q0123
+    shufps    m6, m5, m5, q0123
+%endif
+
+.inner_loop:
+%if ARCH_X86_64
+    movaps    m8, [coeffq+cnt1q*8   ]
+    movaps    m9, [coeffq+cnt1q*8+16]
+    movaps   m10, [coeffq+cnt1q*8+32]
+    movaps   m11, [coeffq+cnt1q*8+48]
+%if cpuflag(fma3)
+    movaps   m12, [coeffq+cnt1q*8+64]
+    movaps   m13, [coeffq+cnt1q*8+80]
+    movaps   m14, [coeffq+cnt1q*8+96]
+    movaps   m15, [coeffq+cnt1q*8+112]
+    mulps     m0, m7, m8
+    mulps     m1, m7, m10
+    mulps     m2, m7, m12
+    mulps     m3, m7, m14
+    fmaddps   m0, m6, m9, m0
+    fmaddps   m1, m6, m11, m1
+    fmaddps   m2, m6, m13, m2
+    fmaddps   m3, m6, m15, m3
+
+    haddps    m0, m1
+    haddps    m2, m3
+    haddps    m0, m2
+    movaps [samplesq+cnt1q], m0
+%else
+    mulps     m0, m7, m8
+    mulps     m1, m6, m9
+    mulps     m2, m7, m10
+    mulps     m3, m6, m11
+    addps     m0, m1
+    addps     m2, m3
+
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m7, [coeffq+cnt1q*8   ]
+    movaps        m1, [coeffq+cnt1q*8+16]
+    mulps     m2, m7, [coeffq+cnt1q*8+32]
+    fmaddps   m0, m6, m1, m0
+    fmaddps   m2, m6, [coeffq+cnt1q*8+48], m2
+%else
+    mulps     m0, m7, [coeffq+cnt1q*8   ]
+    mulps     m1, m6, [coeffq+cnt1q*8+16]
+    mulps     m2, m7, [coeffq+cnt1q*8+32]
+    mulps     m3, m6, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+%endif
+    unpckhps  m3, m0, m2
+    unpcklps  m0, m2
+    addps     m3, m0
+    movhlps   m2, m3
+    addps     m2, m3
+    movlps [samplesq+cnt1q], m2
+%endif; ARCH
+
+%if ARCH_X86_64
+%if cpuflag(fma3)
+    mulps     m8, m5
+    mulps    m10, m5
+    mulps    m12, m5
+    mulps    m14, m5
+    fmaddps   m8, m4, m9, m8
+    fmaddps  m10, m4, m11, m10
+    fmaddps  m12, m4, m13, m12
+    fmaddps  m14, m4, m15, m14
+
+    haddps   m10, m8
+    haddps   m14, m12
+    haddps   m14, m10
+    movaps [samplesq+cnt2q], m14
+%else
+    mulps     m8, m5
+    mulps     m9, m4
+    mulps    m10, m5
+    mulps    m11, m4
+    addps     m8, m9
+    addps    m10, m11
+
+    unpckhps m11, m10, m8
+    unpcklps m10, m8
+    addps    m11, m10
+    movhlps   m8, m11
+    addps     m8, m11
+    movlps [samplesq+cnt2q], m8
+%endif
+%else ; ARCH_X86_32
+%if cpuflag(fma3)
+    mulps     m0, m5, [coeffq+cnt1q*8   ]
+    mulps     m2, m5, [coeffq+cnt1q*8+32]
+    fmaddps   m0, m4, m1, m0
+    fmaddps   m2, m4, [coeffq+cnt1q*8+48], m2
+%else
+    mulps     m0, m5, [coeffq+cnt1q*8   ]
+    mulps     m1, m4, [coeffq+cnt1q*8+16]
+    mulps     m2, m5, [coeffq+cnt1q*8+32]
+    mulps     m3, m4, [coeffq+cnt1q*8+48]
+    addps     m0, m1
+    addps     m2, m3
+%endif
+    unpckhps  m3, m2, m0
+    unpcklps  m2, m0
+    addps     m3, m2
+    movhlps   m0, m3
+    addps     m0, m3
+    movlps [samplesq+cnt2q], m0
+%endif; ARCH
+
+    sub    cnt2d, 8 + FMA3_OFFSET
+    add    cnt1q, 8 + FMA3_OFFSET
+    jl .inner_loop
+
+    add     lfeq, 4
+    add samplesq,  64*sizeof_float
+    mov    cnt1q, -32*sizeof_float
+    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
+    sub nblocksd, 1
+    jg .loop
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_XMM sse
+LFE_FIR0_FLOAT
+%endif
+INIT_XMM sse2
+LFE_FIR0_FLOAT
+INIT_XMM avx
+LFE_FIR0_FLOAT
+INIT_XMM fma3
+LFE_FIR0_FLOAT
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
new file mode 100644
index 0000000..ed53b7b
--- /dev/null
+++ b/libavcodec/x86/dcadsp_init.c
@@ -0,0 +1,45 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dcadsp.h"
+
+#define LFE_FIR1_FLOAT_FUNC(opt)                                               \
+void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples,         \
+                             const float *filter_coeff, ptrdiff_t npcmblocks);
+
+LFE_FIR1_FLOAT_FUNC(sse)
+LFE_FIR1_FLOAT_FUNC(sse2)
+LFE_FIR1_FLOAT_FUNC(avx)
+LFE_FIR1_FLOAT_FUNC(fma3)
+
+av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
+    if (EXTERNAL_AVX(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+    if (EXTERNAL_FMA3(cpu_flags))
+        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
+}
-- 
2.7.0