[FFmpeg-devel] [PATCH 7/9] sbcenc: add MMX optimizations
Aurelien Jacobs
aurel at gnuage.org
Sat Dec 23 20:01:45 EET 2017
This is originally based on libsbc and has been fully integrated into FFmpeg.
Rough speed test:
C version: speed= 592x
MMX version: speed= 785x
---
libavcodec/sbcdsp.c | 3 +
libavcodec/sbcdsp.h | 2 +
libavcodec/x86/Makefile | 2 +
libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/sbcdsp_init.c | 51 ++++++++
5 files changed, 342 insertions(+)
create mode 100644 libavcodec/x86/sbcdsp.asm
create mode 100644 libavcodec/x86/sbcdsp_init.c
diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
index e155387f0d..2d0addcf28 100644
--- a/libavcodec/sbcdsp.c
+++ b/libavcodec/sbcdsp.c
@@ -379,4 +379,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
/* Default implementation for scale factors calculation */
s->sbc_calc_scalefactors = sbc_calc_scalefactors;
s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
+
+ if (ARCH_X86)
+ ff_sbcdsp_init_x86(s);
}
diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
index 66ed7d324e..127e6a8a11 100644
--- a/libavcodec/sbcdsp.h
+++ b/libavcodec/sbcdsp.h
@@ -80,4 +80,6 @@ struct sbc_dsp_context {
*/
void ff_sbcdsp_init(SBCDSPContext *s);
+void ff_sbcdsp_init_x86(SBCDSPContext *s);
+
#endif /* AVCODEC_SBCDSP_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a805cd37b4..2350c8bbee 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
+OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
@@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
+X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
new file mode 100644
index 0000000000..0538705fb7
--- /dev/null
+++ b/libavcodec/x86/sbcdsp.asm
@@ -0,0 +1,284 @@
+;******************************************************************************
+;* SIMD optimized SBC encoder DSP functions
+;*
+;* Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
+;* Copyright (C) 2008-2010 Nokia Corporation
+;* Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
+;* Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
+;* Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1): rounding bias in sbc_analyze_4/8; also the 1 << 15 seed loaded by sbc_calc_scalefactors
+
+SECTION .text
+
+;*******************************************************************
+;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
+ movq m0, [inq] ; stage 1: m0 and m1 each hold two 32-bit sums; 80 bytes of in are read in 8-byte steps
+ movq m1, [inq+8]
+ pmaddwd m0, [constsq] ; multiply the int16 lanes and add adjacent products
+ pmaddwd m1, [constsq+8]
+ paddd m0, [scale_mask] ; add 0x8000 per lane: rounding for the >> 16 below
+ paddd m1, [scale_mask]
+
+ movq m2, [inq+16] ; m2/m3 are scratch for the remaining four rounds
+ movq m3, [inq+24]
+ pmaddwd m2, [constsq+16]
+ pmaddwd m3, [constsq+24]
+ paddd m0, m2 ; even quadwords accumulate into m0
+ paddd m1, m3 ; odd quadwords accumulate into m1
+
+ movq m2, [inq+32]
+ movq m3, [inq+40]
+ pmaddwd m2, [constsq+32]
+ pmaddwd m3, [constsq+40]
+ paddd m0, m2
+ paddd m1, m3
+
+ movq m2, [inq+48]
+ movq m3, [inq+56]
+ pmaddwd m2, [constsq+48]
+ pmaddwd m3, [constsq+56]
+ paddd m0, m2
+ paddd m1, m3
+
+ movq m2, [inq+64] ; last round: bytes 64..79 of in and consts
+ movq m3, [inq+72]
+ pmaddwd m2, [constsq+64]
+ pmaddwd m3, [constsq+72]
+ paddd m0, m2
+ paddd m1, m3
+
+ psrad m0, 16 ; SBC_PROTO_FIXED_SCALE
+ psrad m1, 16 ; SBC_PROTO_FIXED_SCALE
+ packssdw m0, m0 ; saturate to int16; the two sums are duplicated into the upper half
+ packssdw m1, m1
+
+ movq m2, m0 ; stage 2: combine the four int16 sums with consts+80..+111
+ pmaddwd m0, [constsq+80]
+ pmaddwd m2, [constsq+88]
+
+ movq m3, m1
+ pmaddwd m1, [constsq+96]
+ pmaddwd m3, [constsq+104]
+ paddd m0, m1 ; m0 = out[0..1]
+ paddd m2, m3 ; m2 = out[2..3]
+
+ movq [outq ], m0 ; four int32 results, 16 bytes in total
+ movq [outq+8], m2 ; NOTE(review): no emms before returning - presumably the caller clears the MMX state; confirm
+
+ RET
+
+
+
+;*******************************************************************
+;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
+ movq m0, [inq] ; stage 1: m0..m3 each hold two 32-bit sums; 160 bytes of in are read in 8-byte steps
+ movq m1, [inq+8]
+ movq m2, [inq+16]
+ movq m3, [inq+24]
+ pmaddwd m0, [constsq] ; multiply the int16 lanes and add adjacent products
+ pmaddwd m1, [constsq+8]
+ pmaddwd m2, [constsq+16]
+ pmaddwd m3, [constsq+24]
+ paddd m0, [scale_mask] ; add 0x8000 per lane: rounding for the >> 16 below
+ paddd m1, [scale_mask]
+ paddd m2, [scale_mask]
+ paddd m3, [scale_mask]
+
+ movq m4, [inq+32] ; NOTE(review): m4-m7 are used although cglobal above declares only 4 vector registers - confirm this is intended for MMX
+ movq m5, [inq+40]
+ movq m6, [inq+48]
+ movq m7, [inq+56]
+ pmaddwd m4, [constsq+32]
+ pmaddwd m5, [constsq+40]
+ pmaddwd m6, [constsq+48]
+ pmaddwd m7, [constsq+56]
+ paddd m0, m4 ; every fourth quadword accumulates into the same register
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m7
+
+ movq m4, [inq+64]
+ movq m5, [inq+72]
+ movq m6, [inq+80]
+ movq m7, [inq+88]
+ pmaddwd m4, [constsq+64]
+ pmaddwd m5, [constsq+72]
+ pmaddwd m6, [constsq+80]
+ pmaddwd m7, [constsq+88]
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m7
+
+ movq m4, [inq+96]
+ movq m5, [inq+104]
+ movq m6, [inq+112]
+ movq m7, [inq+120]
+ pmaddwd m4, [constsq+96]
+ pmaddwd m5, [constsq+104]
+ pmaddwd m6, [constsq+112]
+ pmaddwd m7, [constsq+120]
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m7
+
+ movq m4, [inq+128] ; last round: bytes 128..159 of in and consts
+ movq m5, [inq+136]
+ movq m6, [inq+144]
+ movq m7, [inq+152]
+ pmaddwd m4, [constsq+128]
+ pmaddwd m5, [constsq+136]
+ pmaddwd m6, [constsq+144]
+ pmaddwd m7, [constsq+152]
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m7
+
+ psrad m0, 16 ; SBC_PROTO_FIXED_SCALE
+ psrad m1, 16 ; SBC_PROTO_FIXED_SCALE
+ psrad m2, 16 ; SBC_PROTO_FIXED_SCALE
+ psrad m3, 16 ; SBC_PROTO_FIXED_SCALE
+
+ packssdw m0, m0 ; saturate to int16; the two sums are duplicated into the upper half
+ packssdw m1, m1
+ packssdw m2, m2
+ packssdw m3, m3
+
+ movq m4, m0 ; stage 2, first half: m0..m3 are kept intact because the second half needs them again
+ movq m5, m0
+ pmaddwd m4, [constsq+160] ; each of m0..m3 has its own 32-byte group of consts starting at +160
+ pmaddwd m5, [constsq+168]
+
+ movq m6, m1
+ movq m7, m1
+ pmaddwd m6, [constsq+192]
+ pmaddwd m7, [constsq+200]
+ paddd m4, m6
+ paddd m5, m7
+
+ movq m6, m2
+ movq m7, m2
+ pmaddwd m6, [constsq+224]
+ pmaddwd m7, [constsq+232]
+ paddd m4, m6
+ paddd m5, m7
+
+ movq m6, m3
+ movq m7, m3
+ pmaddwd m6, [constsq+256]
+ pmaddwd m7, [constsq+264]
+ paddd m4, m6
+ paddd m5, m7
+
+ movq [outq ], m4 ; out[0..1]
+ movq [outq+8], m5 ; out[2..3]
+
+ movq m5, m0 ; stage 2, second half: m0..m3 may now be overwritten in place
+ pmaddwd m0, [constsq+176]
+ pmaddwd m5, [constsq+184]
+
+ movq m7, m1
+ pmaddwd m1, [constsq+208]
+ pmaddwd m7, [constsq+216]
+ paddd m0, m1
+ paddd m5, m7
+
+ movq m7, m2
+ pmaddwd m2, [constsq+240]
+ pmaddwd m7, [constsq+248]
+ paddd m0, m2
+ paddd m5, m7
+
+ movq m7, m3
+ pmaddwd m3, [constsq+272]
+ pmaddwd m7, [constsq+280]
+ paddd m0, m3
+ paddd m5, m7
+
+ movq [outq+16], m0 ; out[4..5]
+ movq [outq+24], m5 ; out[6..7]; NOTE(review): no emms before returning - presumably the caller clears the MMX state; confirm
+
+ RET
+
+
+;*******************************************************************
+;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
+; uint32_t scale_factor[2][8],
+; int blocks, int channels, int subbands)
+;*******************************************************************
+INIT_MMX mmx
+cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
+ ; per subband: scale_factor = bsr((1 << 15) | OR over blocks of (|sample| - 1)) - 15; two subbands per pass
+ shl subbandsd, 2 ; subbands = end byte offset of the channel 0 subbands (4 bytes each)
+ cmp channelsd, 2
+ jl .loop_1 ; mono: only the channel 0 row is scanned
+ add subbandsd, 32 ; stereo: the channel 1 row always starts 8 * 4 bytes after channel 0, also with 4 subbands
+
+.loop_1:
+ sub subbandsq, 8 ; step back to the previous pair of 32-bit entries
+ lea ptrq, [sb_sample_fq + subbandsq]
+
+ ; blk = (blocks - 1) * 64;
+ lea blkq, [blocksq - 1]
+ shl blkd, 6 ; 64 bytes per block ([2][8] int32); on x86-64 the 32-bit shift also clears the upper half of blkq
+
+ movq m0, [scale_mask] ; seed with bit 15 set in both lanes, so bsr below never sees zero
+.loop_2:
+ movq m1, [ptrq+blkq]
+ pxor m2, m2
+ pcmpgtd m1, m2 ; m1 = -1 where sample > 0
+ paddd m1, [ptrq+blkq] ; sample - 1 for positive samples, unchanged otherwise
+ pcmpgtd m2, m1 ; m2 = -1 where that value is negative
+ pxor m1, m2 ; complement negative values: |sample| - 1 (zero stays zero)
+
+ por m0, m1 ; OR together over all blocks
+
+ sub blkq, 64
+ jns .loop_2 ; until the block offset goes negative
+
+ movd blkd, m0
+ psrlq m0, 32 ; bring the second subband of the pair into the low lane
+ bsr blkd, blkd ; index of the highest set bit
+ sub blkd, 15 ; SCALE_OUT_BITS
+ mov [scale_factorq + subbandsq], blkd
+
+ movd blkd, m0
+ bsr blkd, blkd
+ sub blkd, 15 ; SCALE_OUT_BITS
+ mov [scale_factorq + subbandsq + 4], blkd
+
+ cmp subbandsq, 0
+ jg .loop_1 ; with 4 subbands in stereo the unused upper half of the channel 0 row is scanned too
+
+ emms
+ RET
diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c
new file mode 100644
index 0000000000..86effecfdf
--- /dev/null
+++ b/libavcodec/x86/sbcdsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
+ * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
+ * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC MMX optimizations for some basic "building blocks"
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/sbcdsp.h"
+
+void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands);
+
+av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    /* Leave the function pointers untouched unless assembled MMX code is usable. */
+    if (!EXTERNAL_MMX(flags))
+        return;
+    s->sbc_analyze_4         = ff_sbc_analyze_4_mmx;
+    s->sbc_analyze_8         = ff_sbc_analyze_8_mmx;
+    s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
+}
--
2.15.1
More information about the ffmpeg-devel
mailing list