[FFmpeg-devel] [PATCH 2/2] x86/mlpdec: add ff_mlp_rematrix_channel_{sse4, avx2}

Tue Sep 30 20:50:33 CEST 2014

The new SSE4 and AVX2/BMI2 implementations (x86-64 only) are 2x to 2.5x faster than the C version.

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/mlpdec.c                        |   4 +-
 libavcodec/x86/Makefile                    |   6 +-
 libavcodec/x86/mlpdsp.asm                  | 198 +++++++++++++++++++++++++++++
 libavcodec/x86/{mlpdsp.c => mlpdsp_init.c} |  22 +++-
 4 files changed, 225 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/x86/mlpdsp.asm
 rename libavcodec/x86/{mlpdsp.c => mlpdsp_init.c} (86%)

diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 2c5426c..d26c277 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -105,7 +105,7 @@ typedef struct SubStream {
     /// Whether the LSBs of the matrix output are encoded in the bitstream.
     uint8_t     lsb_bypass[MAX_MATRICES];
     /// Matrix coefficients, stored as 2.14 fixed point.
-    int32_t     matrix_coeff[MAX_MATRICES][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, matrix_coeff)[MAX_MATRICES][MAX_CHANNELS];
     /// Left shift to apply to noise values in 0x31eb substreams.
     uint8_t     matrix_noise_shift[MAX_MATRICES];
     //@}
@@ -159,7 +159,7 @@ typedef struct MLPDecodeContext {
 
     int8_t      noise_buffer[MAX_BLOCKSIZE_POW2];
     int8_t      bypassed_lsbs[MAX_BLOCKSIZE][MAX_CHANNELS];
-    int32_t     sample_buffer[MAX_BLOCKSIZE][MAX_CHANNELS];
+    DECLARE_ALIGNED(32, int32_t, sample_buffer)[MAX_BLOCKSIZE][MAX_CHANNELS];
 
     MLPDSPContext dsp;
 } MLPDecodeContext;
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 9f34abd..2fa56b9 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -41,7 +41,7 @@ OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
 OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
@@ -52,7 +52,7 @@ OBJS-$(CONFIG_RV40_DECODER)            += x86/rv34dsp_init.o            \
 OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
 OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
 OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
-OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
@@ -132,6 +132,7 @@ YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
                                           x86/hevc_deblock.o            \
                                           x86/hevc_idct.o               \
                                           x86/hevc_res_add.o
+YASM-OBJS-$(CONFIG_MLP_DECODER)        += x86/mlpdsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
@@ -139,6 +140,7 @@ YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                           x86/rv40dsp.o
 YASM-OBJS-$(CONFIG_SVQ1_ENCODER)       += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER)     += x86/mlpdsp.o
 YASM-OBJS-$(CONFIG_TTA_DECODER)        += x86/ttadsp.o
 YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000..c2b53a7
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,198 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%if ARCH_X86_64
+
+%macro SHLX 2 ; %1 = register to shift left in place, %2 = register holding the shift count
+%if cpuflag(bmi2)
+   shlx %1, %1, %2q ; BMI2 form: the count may live in any register
+%else
+   shl  %1, %2b ; legacy form: uses the byte register of %2, which the caller must have mapped to cl
+%endif
+%endmacro
+
+%macro REMATRIX 0 ; leaves two 64-bit partial sums of samples[0..7] * coeffs[0..7] in xm0
+    movdqa        m0, [samplesq]        ; aligned load of samples (4 dwords with xmm, 8 with ymm)
+    movdqa        m1, [coeffsq ]        ; aligned load of the matching coeffs
+    pshufd        m2, m0, 0xb1          ; swap adjacent dwords: odd samples move to the even lanes
+    pshufd        m3, m1, 0xb1          ; same swap for the coeffs
+    pmuldq        m0, m1                ; signed 32x32->64 products of the even dwords
+    pmuldq        m3, m2                ; signed 32x32->64 products of the (swapped) odd dwords
+    paddq         m0, m3                ; add both sets of 64-bit products
+%if notcpuflag(avx2)
+    movdqa        m1, [samplesq + 16]   ; xmm only: repeat the steps above for the next 16 bytes
+    movdqa        m2, [coeffsq  + 16]
+    pshufd        m3, m1, 0xb1
+    pshufd        m4, m2, 0xb1
+    pmuldq        m1, m2
+    pmuldq        m4, m3
+    paddq         m0, m1                ; accumulate the second half into m0
+    paddq         m0, m4
+%else
+    vextracti128 xm1, m0, 1             ; ymm: fetch the upper 128 bits of the products
+    paddq        xm0, xm1               ; and fold them into the lower 128 bits
+%endif
+%endmacro
+
+%macro LOOP_END 0 ; finish one sample (no noise) and leave the flags of the end-of-block compare
+    pshufd       xm1, xm0, 0x4e                     ; swap the two 64-bit halves of xm0
+    paddq        xm0, xm1                           ; low qword = complete sum of products
+    movq      accumq, xm0                           ; accum = sum of products
+    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq                          ; reached the end pointer? (the caller branches with jne)
+%endmacro
+
+%macro LOOP_SHIFT_END 0 ; like LOOP_END, but adds shifted noise to accum before the >> 14
+    pshufd       xm1, xm0, 0x4e                     ; swap the two 64-bit halves of xm0
+    paddq        xm0, xm1                           ; low qword = complete sum of products
+    movq      accumq, xm0                           ; accum = sum of products
+    and       indexd, auspd                         ; index &= access_unit_size_pow2 - 1 (ausp was decremented on entry)
+    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index], sign-extended
+    add       indexd, index2d                       ; index += index2
+    SHLX      noiseq, mns                           ; noise <<= matrix_noise_shift + 7 (mns already holds shift + 7)
+    add       accumq, noiseq                        ; accum += shifted noise
+    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, noised                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq                          ; reached the end pointer? (the caller branches with jne)
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+;                             int index, unsigned int dest_ch, uint16_t blockpos,
+;                             unsigned int maxchan, int matrix_noise_shift,
+;                             int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0 ; emits one function for the currently selected instruction set
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+                                        index, dest_ch, blockpos, maxchan, mns, \
+                                        accum, mask, cnt
+    mov         mnsd, mnsm                          ; load matrix_noise_shift
+    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
+    mov     maxchand, maxchanm                      ; load maxchan
+    mov        maskd, maskm                         ; load mask
+%if WIN64
+    mov     dest_chd, dest_chm                      ; load dest_ch (not needed on UNIX64)
+%endif
+    shl     dest_chd, 2                             ; dest_ch *= 4: byte offset of an int32_t within samples
+    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = bypassed_lsbs + blockpos * MAX_CHANNELS (end pointer)
+    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
+    jne .shift                                      ; jump if true
+    cmp     maxchand, 4                             ; is maxchan < 4?
+    jl .loop_one                                    ; jump if true
+
+align 16
+.loop:
+    ; Process 5 or more channels (all 8 lanes are multiplied; presumably coeffs above maxchan are zero - TODO confirm)
+    REMATRIX
+    LOOP_END
+    jne .loop                                       ; NOTE(review): bottom-tested loop, blockpos == 0 would miss the exit compare - confirm callers pass blockpos > 0
+    RET
+
+align 16
+.loop_one:
+    ; Process up to 4 channels (only the first 4 lanes are loaded)
+    movdqa       xm0, [samplesq]                    ; same multiply/accumulate as REMATRIX, on one 128-bit load
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, 0xb1
+    pshufd       xm3, xm1, 0xb1
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_END
+    jne .loop_one
+    RET
+
+.shift: ; matrix_noise_shift != 0: noise is added to every output sample
+%if WIN64
+    mov       indexd, indexm         ; load index
+%endif
+    mov          r9d, r9m            ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+    ; bmi2 has shift functions that accept any reg, not just cl, so keep things in place.
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+                index, dest_ch, accum, index2, mns, \
+                ausp, mask, cnt, noise
+    add         mnsd, 7              ; matrix_noise_shift += 7
+%else ; sse4
+    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+    ; r0 = rcx, which holds samples (now in r6)
+    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, \
+                index, dest_ch, samples, index2, accum, \
+                ausp, mask, cnt, noise
+%else ; UNIX64
+    ; r3 = rcx, which holds noise_buffer (now in r6)
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, \
+                index, dest_ch, noise_buffer, index2, accum, \
+                ausp, mask, cnt, noise
+%endif
+    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
+%endif
+    sub        auspd, 1              ; access_unit_size_pow2 -= 1
+    cmp          r7d, 4              ; is maxchan < 4? (r7 still holds maxchan; it becomes index2 on the next line)
+    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; (lea leaves the cmp flags untouched)
+    jl .loop_shift_one               ; jump if maxchan < 4
+
+align 16
+.loop_shift:
+    ; Process 5 or more channels (all 8 lanes are multiplied, as in .loop)
+    REMATRIX
+    LOOP_SHIFT_END
+    jne .loop_shift
+    RET
+
+align 16
+.loop_shift_one:
+    ; Process up to 4 channels (only the first 4 lanes are loaded)
+    movdqa       xm0, [samplesq]     ; same multiply/accumulate as REMATRIX, on one 128-bit load
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, 0xb1
+    pshufd       xm3, xm1, 0xb1
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_SHIFT_END
+    jne .loop_shift_one
+    RET
+%endmacro
+
+INIT_XMM sse4 ; 128-bit registers: REMATRIX takes its two-load (notcpuflag(avx2)) path
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2 ; 256-bit registers plus shlx; declared in C as ff_mlp_rematrix_channel_avx2_bmi2
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
similarity index 86%
rename from libavcodec/x86/mlpdsp.c
rename to libavcodec/x86/mlpdsp_init.c
index f090fd7..dc0bc58 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -26,6 +26,22 @@
 #include "libavcodec/mlpdsp.h"
 #include "libavcodec/mlp.h"
 
+#define REMATRIX_CHANNEL_FUNC(opt) \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+                                   const int32_t *coeffs, \
+                                   const uint8_t *bypassed_lsbs, \
+                                   const int8_t *noise_buffer, \
+                                   int index, \
+                                   unsigned int dest_ch, \
+                                   uint16_t blockpos, \
+                                   unsigned int maxchan, \
+                                   int matrix_noise_shift, \
+                                   int access_unit_size_pow2, \
+                                   int32_t mask);
+/* Prototypes for the external asm implementations, one per instruction-set suffix. */
+REMATRIX_CHANNEL_FUNC(sse4)      /* declares ff_mlp_rematrix_channel_sse4 */
+REMATRIX_CHANNEL_FUNC(avx2_bmi2) /* declares ff_mlp_rematrix_channel_avx2_bmi2 */
+
 #if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
 
 extern char ff_mlp_firorder_8;
@@ -178,9 +194,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
 
 av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
 {
-#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
     int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
     if (INLINE_MMX(cpu_flags))
         c->mlp_filter_channel = mlp_filter_channel_x86;
 #endif
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags)) /* the asm body is wrapped in %if ARCH_X86_64 */
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+    if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2) /* the asm uses shlx, so BMI2 is required too */
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2; /* checked last, so it replaces the SSE4 pointer */
 }
-- 
1.8.5.5