[FFmpeg-devel] [PATCH 8/9] sbcenc: add armv6 and neon asm optimizations
Aurelien Jacobs
aurel at gnuage.org
Thu Feb 22 00:37:17 EET 2018
This was originally based on libsbc, and was fully integrated into ffmpeg.
---
libavcodec/arm/Makefile | 3 +
libavcodec/arm/sbcdsp_armv6.S | 245 ++++++++++++++
libavcodec/arm/sbcdsp_init_arm.c | 105 ++++++
libavcodec/arm/sbcdsp_neon.S | 714 +++++++++++++++++++++++++++++++++++++++
libavcodec/sbcdsp.c | 2 +
libavcodec/sbcdsp.h | 1 +
6 files changed, 1070 insertions(+)
create mode 100644 libavcodec/arm/sbcdsp_armv6.S
create mode 100644 libavcodec/arm/sbcdsp_init_arm.c
create mode 100644 libavcodec/arm/sbcdsp_neon.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 1eeac5449e..fd2401f4e5 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -42,6 +42,7 @@ OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
+OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \
@@ -81,6 +82,7 @@ ARMV6-OBJS-$(CONFIG_VP8DSP) += arm/vp8_armv6.o \
# decoders/encoders
ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
+ARMV6-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_armv6.o
# VFP optimizations
@@ -140,6 +142,7 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
+NEON-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \
diff --git a/libavcodec/arm/sbcdsp_armv6.S b/libavcodec/arm/sbcdsp_armv6.S
new file mode 100644
index 0000000000..f1ff845798
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_armv6.S
@@ -0,0 +1,245 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
+ * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
+ * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARMv6 optimizations. The instructions are scheduled for ARM11 pipeline.
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_sbc_analyze_4_armv6, export=1
+ @ 4-subband analysis: r0 = in (int16), r1 = out (4 x int32), r2 = consts (int16)
+ push {r1, r3-r7, lr}
+ push {r8-r12, r14} @ r14 saved again: it is reused as an accumulator below
+ ldrd r4, r5, [r0, #0]
+ ldrd r6, r7, [r2, #0]
+ ldrd r8, r9, [r0, #16]
+ ldrd r10, r11, [r2, #16]
+ mov r14, #0x8000 @ rounding bias (1 << 15) for the later 'asr #16'
+ smlad r3, r4, r6, r14 @ each smlad adds two 16x16 products to the accumulator
+ smlad r12, r5, r7, r14
+ ldrd r4, r5, [r0, #32]
+ ldrd r6, r7, [r2, #32]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #48]
+ ldrd r10, r11, [r2, #48]
+ smlad r3, r4, r6, r3
+ smlad r12, r5, r7, r12
+ ldrd r4, r5, [r0, #64]
+ ldrd r6, r7, [r2, #64]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #8]
+ ldrd r10, r11, [r2, #8]
+ smlad r3, r4, r6, r3 @ t1[0] is done
+ smlad r12, r5, r7, r12 @ t1[1] is done
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [r2, #24]
+ pkhtb r3, r12, r3, asr #16 @ combine t1[0] and t1[1]
+ smlad r12, r8, r10, r14 @ restart from the rounding bias for t1[2]
+ smlad r14, r9, r11, r14 @ r14 now becomes the t1[3] accumulator
+ ldrd r8, r9, [r0, #40]
+ ldrd r10, r11, [r2, #40]
+ smlad r12, r4, r6, r12
+ smlad r14, r5, r7, r14
+ ldrd r4, r5, [r0, #56]
+ ldrd r6, r7, [r2, #56]
+ smlad r12, r8, r10, r12
+ smlad r14, r9, r11, r14
+ ldrd r8, r9, [r0, #72]
+ ldrd r10, r11, [r2, #72]
+ smlad r12, r4, r6, r12
+ smlad r14, r5, r7, r14
+ ldrd r4, r5, [r2, #80] @ start loading cos table
+ smlad r12, r8, r10, r12 @ t1[2] is done
+ smlad r14, r9, r11, r14 @ t1[3] is done
+ ldrd r6, r7, [r2, #88]
+ ldrd r8, r9, [r2, #96]
+ ldrd r10, r11, [r2, #104] @ cos table fully loaded
+ pkhtb r12, r14, r12, asr #16 @ combine t1[2] and t1[3]
+ smuad r4, r3, r4 @ out[0]: contribution of t1[0:1]
+ smuad r5, r3, r5 @ out[1]: contribution of t1[0:1]
+ smlad r4, r12, r8, r4 @ out[0] += contribution of t1[2:3]
+ smlad r5, r12, r9, r5 @ out[1] += contribution of t1[2:3]
+ smuad r6, r3, r6 @ out[2]: contribution of t1[0:1]
+ smuad r7, r3, r7 @ out[3]: contribution of t1[0:1]
+ smlad r6, r12, r10, r6 @ out[2] += contribution of t1[2:3]
+ smlad r7, r12, r11, r7 @ out[3] += contribution of t1[2:3]
+ pop {r8-r12, r14}
+ stmia r1, {r4, r5, r6, r7} @ store out[0..3]
+ pop {r1, r3-r7, pc}
+endfunc
+
+function ff_sbc_analyze_8_armv6, export=1
+ @ 8-subband analysis: r0 = in (int16), r1 = out (8 x int32), r2 = consts (int16)
+ push {r1, r3-r7, lr}
+ push {r8-r12, r14} @ r14 saved again: it is reused as scratch below
+ ldrd r4, r5, [r0, #24]
+ ldrd r6, r7, [r2, #24]
+ ldrd r8, r9, [r0, #56]
+ ldrd r10, r11, [r2, #56]
+ mov r14, #0x8000 @ rounding bias (1 << 15) for the later 'asr #16'
+ smlad r3, r4, r6, r14
+ smlad r12, r5, r7, r14
+ ldrd r4, r5, [r0, #88]
+ ldrd r6, r7, [r2, #88]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #120]
+ ldrd r10, r11, [r2, #120]
+ smlad r3, r4, r6, r3
+ smlad r12, r5, r7, r12
+ ldrd r4, r5, [r0, #152]
+ ldrd r6, r7, [r2, #152]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #16]
+ ldrd r10, r11, [r2, #16]
+ smlad r3, r4, r6, r3 @ t1[6] is done
+ smlad r12, r5, r7, r12 @ t1[7] is done
+ ldrd r4, r5, [r0, #48]
+ ldrd r6, r7, [r2, #48]
+ pkhtb r3, r12, r3, asr #16 @ combine t1[6] and t1[7]
+ str r3, [sp, #-4]! @ save to stack
+ smlad r3, r8, r10, r14
+ smlad r12, r9, r11, r14
+ ldrd r8, r9, [r0, #80]
+ ldrd r10, r11, [r2, #80]
+ smlad r3, r4, r6, r3
+ smlad r12, r5, r7, r12
+ ldrd r4, r5, [r0, #112]
+ ldrd r6, r7, [r2, #112]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #144]
+ ldrd r10, r11, [r2, #144]
+ smlad r3, r4, r6, r3
+ smlad r12, r5, r7, r12
+ ldrd r4, r5, [r0, #0]
+ ldrd r6, r7, [r2, #0]
+ smlad r3, r8, r10, r3 @ t1[4] is done
+ smlad r12, r9, r11, r12 @ t1[5] is done
+ ldrd r8, r9, [r0, #32]
+ ldrd r10, r11, [r2, #32]
+ pkhtb r3, r12, r3, asr #16 @ combine t1[4] and t1[5]
+ str r3, [sp, #-4]! @ save to stack
+ smlad r3, r4, r6, r14
+ smlad r12, r5, r7, r14
+ ldrd r4, r5, [r0, #64]
+ ldrd r6, r7, [r2, #64]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #96]
+ ldrd r10, r11, [r2, #96]
+ smlad r3, r4, r6, r3
+ smlad r12, r5, r7, r12
+ ldrd r4, r5, [r0, #128]
+ ldrd r6, r7, [r2, #128]
+ smlad r3, r8, r10, r3
+ smlad r12, r9, r11, r12
+ ldrd r8, r9, [r0, #8]
+ ldrd r10, r11, [r2, #8]
+ smlad r3, r4, r6, r3 @ t1[0] is done
+ smlad r12, r5, r7, r12 @ t1[1] is done
+ ldrd r4, r5, [r0, #40]
+ ldrd r6, r7, [r2, #40]
+ pkhtb r3, r12, r3, asr #16 @ combine t1[0] and t1[1]
+ smlad r12, r8, r10, r14
+ smlad r14, r9, r11, r14 @ r14 now becomes the t1[3] accumulator
+ ldrd r8, r9, [r0, #72]
+ ldrd r10, r11, [r2, #72]
+ smlad r12, r4, r6, r12
+ smlad r14, r5, r7, r14
+ ldrd r4, r5, [r0, #104]
+ ldrd r6, r7, [r2, #104]
+ smlad r12, r8, r10, r12
+ smlad r14, r9, r11, r14
+ ldrd r8, r9, [r0, #136]
+ ldrd r10, r11, [r2, #136]! @ writeback: r2 += 136, later offsets are (N - 136)
+ smlad r12, r4, r6, r12
+ smlad r14, r5, r7, r14
+ ldrd r4, r5, [r2, #(160 - 136 + 0)] @ first data past the 160 bytes consumed above
+ smlad r12, r8, r10, r12 @ t1[2] is done
+ smlad r14, r9, r11, r14 @ t1[3] is done
+ ldrd r6, r7, [r2, #(160 - 136 + 8)]
+ smuad r4, r3, r4
+ smuad r5, r3, r5
+ pkhtb r12, r14, r12, asr #16 @ combine t1[2] and t1[3]
+ @ r3 = t2[0:1]
+ @ r12 = t2[2:3]
+ pop {r0, r14} @ t2[4:5], t2[6:7]
+ ldrd r8, r9, [r2, #(160 - 136 + 32)]
+ smuad r6, r3, r6
+ smuad r7, r3, r7
+ ldrd r10, r11, [r2, #(160 - 136 + 40)]
+ smlad r4, r12, r8, r4
+ smlad r5, r12, r9, r5
+ ldrd r8, r9, [r2, #(160 - 136 + 64)]
+ smlad r6, r12, r10, r6
+ smlad r7, r12, r11, r7
+ ldrd r10, r11, [r2, #(160 - 136 + 72)]
+ smlad r4, r0, r8, r4
+ smlad r5, r0, r9, r5
+ ldrd r8, r9, [r2, #(160 - 136 + 96)]
+ smlad r6, r0, r10, r6
+ smlad r7, r0, r11, r7
+ ldrd r10, r11, [r2, #(160 - 136 + 104)]
+ smlad r4, r14, r8, r4
+ smlad r5, r14, r9, r5
+ ldrd r8, r9, [r2, #(160 - 136 + 16 + 0)]
+ smlad r6, r14, r10, r6
+ smlad r7, r14, r11, r7
+ ldrd r10, r11, [r2, #(160 - 136 + 16 + 8)]
+ stmia r1!, {r4, r5} @ store out[0], out[1]
+ smuad r4, r3, r8
+ smuad r5, r3, r9
+ ldrd r8, r9, [r2, #(160 - 136 + 16 + 32)]
+ stmia r1!, {r6, r7} @ store out[2], out[3]
+ smuad r6, r3, r10
+ smuad r7, r3, r11
+ ldrd r10, r11, [r2, #(160 - 136 + 16 + 40)]
+ smlad r4, r12, r8, r4
+ smlad r5, r12, r9, r5
+ ldrd r8, r9, [r2, #(160 - 136 + 16 + 64)]
+ smlad r6, r12, r10, r6
+ smlad r7, r12, r11, r7
+ ldrd r10, r11, [r2, #(160 - 136 + 16 + 72)]
+ smlad r4, r0, r8, r4
+ smlad r5, r0, r9, r5
+ ldrd r8, r9, [r2, #(160 - 136 + 16 + 96)]
+ smlad r6, r0, r10, r6
+ smlad r7, r0, r11, r7
+ ldrd r10, r11, [r2, #(160 - 136 + 16 + 104)]
+ smlad r4, r14, r8, r4
+ smlad r5, r14, r9, r5
+ smlad r6, r14, r10, r6
+ smlad r7, r14, r11, r7
+ pop {r8-r12, r14}
+ stmia r1!, {r4, r5, r6, r7} @ store out[4..7]
+ pop {r1, r3-r7, pc}
+endfunc
diff --git a/libavcodec/arm/sbcdsp_init_arm.c b/libavcodec/arm/sbcdsp_init_arm.c
new file mode 100644
index 0000000000..6bf7e729ef
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_init_arm.c
@@ -0,0 +1,105 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
+ * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
+ * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARM (ARMv6 and NEON) optimizations for some basic "building blocks"
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/sbcdsp.h"
+
+void ff_sbc_analyze_4_armv6(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_armv6(const int16_t *in, int32_t *out, const int16_t *consts);
+
+void ff_sbc_analyze_4_neon(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_neon(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_calc_scalefactors_neon(int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands);
+int ff_sbc_calc_scalefactors_j_neon(int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int subbands);
+int ff_sbc_enc_process_input_4s_neon(int position, const uint8_t *pcm,
+ int16_t X[2][SBC_X_BUFFER_SIZE],
+ int nsamples, int nchannels);
+int ff_sbc_enc_process_input_8s_neon(int position, const uint8_t *pcm,
+ int16_t X[2][SBC_X_BUFFER_SIZE],
+ int nsamples, int nchannels);
+
+DECLARE_ALIGNED(SBC_ALIGN, int32_t, ff_sbcdsp_joint_bits_mask)[8] = {
+ 8, 4, 2, 1, 128, 64, 32, 16 /* read 4 at a time by ff_sbc_calc_scalefactors_j_neon: first {8,4,2,1}, then {128,64,32,16} */
+};
+
+#if HAVE_BIGENDIAN /* PERM: turn four 16-bit sample indices into eight byte indices (2 * i, 2 * i + 1) */
+#define PERM(a, b, c, d) { \
+ (a * 2) + 1, (a * 2) + 0, \
+ (b * 2) + 1, (b * 2) + 0, \
+ (c * 2) + 1, (c * 2) + 0, \
+ (d * 2) + 1, (d * 2) + 0 \
+ }
+#else /* !HAVE_BIGENDIAN: same byte pairs, not swapped */
+#define PERM(a, b, c, d) { \
+ (a * 2) + 0, (a * 2) + 1, \
+ (b * 2) + 0, (b * 2) + 1, \
+ (c * 2) + 0, (c * 2) + 1, \
+ (d * 2) + 0, (d * 2) + 1 \
+ }
+#endif /* HAVE_BIGENDIAN */
+
+DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_4)[2][8] = {
+ PERM(7, 3, 6, 4),
+ PERM(0, 2, 1, 5)
+};
+
+DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_8)[4][8] = {
+ PERM(15, 7, 14, 8),
+ PERM(13, 9, 12, 10),
+ PERM(11, 3, 6, 0),
+ PERM( 5, 1, 4, 2)
+};
+
+av_cold void ff_sbcdsp_init_arm(SBCDSPContext *s) /* install ARM-optimized SBC DSP routines into *s */
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_armv6(cpu_flags)) {
+ s->sbc_analyze_4 = ff_sbc_analyze_4_armv6;
+ s->sbc_analyze_8 = ff_sbc_analyze_8_armv6;
+ }
+
+ if (have_neon(cpu_flags)) { /* tested after ARMv6, so the NEON versions win when both are available */
+ s->sbc_analyze_4 = ff_sbc_analyze_4_neon;
+ s->sbc_analyze_8 = ff_sbc_analyze_8_neon;
+ s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_neon;
+ s->sbc_calc_scalefactors_j = ff_sbc_calc_scalefactors_j_neon;
+ if (s->increment != 1) { /* NOTE(review): NEON input routines consume 8/16 samples per pass; presumably unsuitable for increment == 1 - confirm */
+ s->sbc_enc_process_input_4s = ff_sbc_enc_process_input_4s_neon;
+ s->sbc_enc_process_input_8s = ff_sbc_enc_process_input_8s_neon;
+ }
+ }
+}
diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
new file mode 100644
index 0000000000..d83d21d202
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -0,0 +1,714 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
+ * Copyright (C) 2008-2010 Nokia Corporation
+ * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
+ * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
+ * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARM NEON optimizations
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define SBC_PROTO_FIXED_SCALE 16
+
+function ff_sbc_analyze_4_neon, export=1 /* r0 = in (int16), r1 = out (4 x int32), r2 = consts (int16) */
+ /* TODO: merge even and odd cases (or even merge all four calls to this
+ * function) in order to have only aligned reads from 'in' array
+ * and reduce number of load instructions */
+ vld1.16 {d4, d5}, [r0, :64]! /* 8 input samples per load, 40 in total */
+ vld1.16 {d16, d17}, [r2, :128]! /* consts go to q8/q9: d8-d15 are callee-saved (AAPCS) and must not be clobbered */
+
+ vmull.s16 q0, d4, d16
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmull.s16 q1, d5, d17
+ vld1.16 {d18, d19}, [r2, :128]!
+
+ vmlal.s16 q0, d6, d18
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q1, d7, d19
+ vld1.16 {d16, d17}, [r2, :128]!
+
+ vmlal.s16 q0, d4, d16
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q1, d5, d17
+ vld1.16 {d18, d19}, [r2, :128]!
+
+ vmlal.s16 q0, d6, d18
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q1, d7, d19
+ vld1.16 {d16, d17}, [r2, :128]!
+
+ vmlal.s16 q0, d4, d16
+ vmlal.s16 q1, d5, d17
+
+ vpadd.s32 d0, d0, d1 /* pairwise add: 8 partial sums -> 4 sums in q0 */
+ vpadd.s32 d1, d2, d3
+
+ vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE /* rounding shift, narrow to 4 x int16 */
+
+ vld1.16 {d2, d3, d4, d5}, [r2, :128]! /* 16 more consts for the second stage */
+
+ vdup.i32 d1, d0[1] /* TODO: can be eliminated */
+ vdup.i32 d0, d0[0] /* TODO: can be eliminated */
+
+ vmull.s16 q3, d2, d0
+ vmull.s16 q8, d3, d0
+ vmlal.s16 q3, d4, d1
+ vmlal.s16 q8, d5, d1
+
+ vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */
+ vpadd.s32 d1, d16, d17 /* TODO: can be eliminated */
+
+ vst1.32 {d0, d1}, [r1, :128] /* store out[0..3] */
+
+ bx lr
+endfunc
+
+function ff_sbc_analyze_8_neon, export=1 /* r0 = in (int16), r1 = out (8 x int32), r2 = consts (int16) */
+ /* TODO: merge even and odd cases (or even merge all four calls to this
+ * function) in order to have only aligned reads from 'in' array
+ * and reduce number of load instructions */
+ vld1.16 {d4, d5}, [r0, :64]! /* 8 input samples per load, 80 in total */
+ vld1.16 {d20, d21}, [r2, :128]! /* consts in q10/q11, sums in q12/q13/q8/q9: d8-d15 are callee-saved (AAPCS) */
+
+ vmull.s16 q12, d4, d20
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmull.s16 q13, d5, d21
+ vld1.16 {d22, d23}, [r2, :128]!
+ vmull.s16 q8, d6, d22
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmull.s16 q9, d7, d23
+ vld1.16 {d20, d21}, [r2, :128]!
+
+ vmlal.s16 q12, d4, d20
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q13, d5, d21
+ vld1.16 {d22, d23}, [r2, :128]!
+ vmlal.s16 q8, d6, d22
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q9, d7, d23
+ vld1.16 {d20, d21}, [r2, :128]!
+
+ vmlal.s16 q12, d4, d20
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q13, d5, d21
+ vld1.16 {d22, d23}, [r2, :128]!
+ vmlal.s16 q8, d6, d22
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q9, d7, d23
+ vld1.16 {d20, d21}, [r2, :128]!
+
+ vmlal.s16 q12, d4, d20
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q13, d5, d21
+ vld1.16 {d22, d23}, [r2, :128]!
+ vmlal.s16 q8, d6, d22
+ vld1.16 {d4, d5}, [r0, :64]!
+ vmlal.s16 q9, d7, d23
+ vld1.16 {d20, d21}, [r2, :128]!
+
+ vmlal.s16 q12, d4, d20
+ vld1.16 {d6, d7}, [r0, :64]!
+ vmlal.s16 q13, d5, d21
+ vld1.16 {d22, d23}, [r2, :128]!
+
+ vmlal.s16 q8, d6, d22
+ vmlal.s16 q9, d7, d23
+
+ vpadd.s32 d0, d24, d25 /* pairwise add: 16 partial sums -> 8 sums in q0/q1 */
+ vpadd.s32 d1, d26, d27
+ vpadd.s32 d2, d16, d17
+ vpadd.s32 d3, d18, d19
+
+ vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE /* rounding shift ... */
+ vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE
+ vmovn.s32 d0, q0 /* ... then narrow to 8 x int16 */
+ vmovn.s32 d1, q1
+
+ vdup.i32 d3, d1[1] /* TODO: can be eliminated */
+ vdup.i32 d2, d1[0] /* TODO: can be eliminated */
+ vdup.i32 d1, d0[1] /* TODO: can be eliminated */
+ vdup.i32 d0, d0[0] /* TODO: can be eliminated */
+
+ vld1.16 {d4, d5}, [r2, :128]! /* second stage: 64 more consts, 8 per load */
+ vmull.s16 q12, d4, d0
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmull.s16 q13, d5, d0
+ vmull.s16 q8, d6, d0
+ vmull.s16 q9, d7, d0
+
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmlal.s16 q12, d4, d1
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmlal.s16 q13, d5, d1
+ vmlal.s16 q8, d6, d1
+ vmlal.s16 q9, d7, d1
+
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmlal.s16 q12, d4, d2
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmlal.s16 q13, d5, d2
+ vmlal.s16 q8, d6, d2
+ vmlal.s16 q9, d7, d2
+
+ vld1.16 {d4, d5}, [r2, :128]!
+ vmlal.s16 q12, d4, d3
+ vld1.16 {d6, d7}, [r2, :128]!
+ vmlal.s16 q13, d5, d3
+ vmlal.s16 q8, d6, d3
+ vmlal.s16 q9, d7, d3
+
+ vpadd.s32 d0, d24, d25 /* TODO: can be eliminated */
+ vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */
+ vpadd.s32 d2, d16, d17 /* TODO: can be eliminated */
+ vpadd.s32 d3, d18, d19 /* TODO: can be eliminated */
+
+ vst1.32 {d0, d1, d2, d3}, [r1, :128] /* store out[0..7] */
+
+ bx lr
+endfunc
+
+function ff_sbc_calc_scalefactors_neon, export=1
+ @ parameters
+ @ r0 = sb_sample_f
+ @ r1 = scale_factor
+ @ r2 = blocks
+ @ r3 = channels
+ @ r4 = subbands (5th argument, loaded from the stack)
+ @ local variables
+ @ r5 = in_loop_1
+ @ r6 = in
+ @ r7 = out_loop_1
+ @ r8 = out
+ @ r9 = ch
+ @ r10 = sb
+ @ r11 = inc
+ @ r12 = blk
+
+ push {r1-r2, r4-r12} @ 11 registers = 44 bytes
+ ldr r4, [sp, #44] @ first stack argument, just above the saved registers
+ mov r11, #64 @ bytes per block: 2 channels * 8 subbands * 4
+
+ mov r9, #0
+1:
+ add r5, r0, r9, lsl#5 @ &sb_sample_f[0][ch][0]
+ add r7, r1, r9, lsl#5 @ &scale_factor[ch][0]
+
+ mov r10, #0
+2:
+ add r6, r5, r10, lsl#2 @ &sb_sample_f[0][ch][sb]
+ add r8, r7, r10, lsl#2 @ &scale_factor[ch][sb]
+ mov r12, r2 @ blk = blocks
+
+ vmov.s32 q0, #0
+ vmov.s32 q1, #0x8000 @ 1 << SCALE_OUT_BITS
+ vmov.s32 q14, #1
+ vmov.s32 q15, #16 @ 31 - SCALE_OUT_BITS
+ vadd.s32 q1, q1, q14 @ q1 = (1 << SCALE_OUT_BITS) + 1
+3:
+ vld1.32 {d16, d17}, [r6, :128], r11 @ 4 subbands of one block, then step to the next block
+ vabs.s32 q8, q8
+ vld1.32 {d18, d19}, [r6, :128], r11
+ vabs.s32 q9, q9
+ vld1.32 {d20, d21}, [r6, :128], r11
+ vabs.s32 q10, q10
+ vld1.32 {d22, d23}, [r6, :128], r11
+ vabs.s32 q11, q11
+ vmax.s32 q0, q0, q8
+ vmax.s32 q1, q1, q9
+ vmax.s32 q0, q0, q10
+ vmax.s32 q1, q1, q11
+ subs r12, r12, #4 @ four blocks consumed per iteration
+ bgt 3b
+ vmax.s32 q0, q0, q1 @ merge the two running maxima
+ vsub.s32 q0, q0, q14 @ max - 1
+ vclz.s32 q0, q0
+ vsub.s32 q0, q15, q0 @ scale factor = 16 - clz(max - 1)
+ vst1.32 {d0, d1}, [r8, :128] @ four scale factors at once
+
+ add r10, r10, #4 @ four subbands handled per pass
+ cmp r10, r4
+ blt 2b
+
+ add r9, r9, #1
+ cmp r9, r3
+ blt 1b
+
+ pop {r1-r2, r4-r12}
+ bx lr
+endfunc
+
+/*
+ * constants: q13 = (31 - SCALE_OUT_BITS)
+ * q14 = 1
+ * input: q0 - ((1 << SCALE_OUT_BITS) + 1)
+ * r5 - samples for channel 0
+ * r6 - samples for channel 1
+ * output: q0, q1 - scale factors without joint stereo
+ * q2, q3 - scale factors with joint stereo
+ * q15 - joint stereo selection mask
+ */
+.macro calc_scalefactors
+ vmov.s32 q1, q0 @ all four running maxima start from the input value of q0
+ vmov.s32 q2, q0
+ vmov.s32 q3, q0
+ mov r3, r2 @ loop counter = blocks
+1:
+ vld1.32 {d18, d19}, [r6, :128], r11 @ channel 1, 4 subbands; advance by r11
+ vbic.s32 q11, q9, q14 @ channel 1 with bit 0 cleared
+ vld1.32 {d16, d17}, [r5, :128], r11 @ channel 0, 4 subbands; advance by r11
+ vhadd.s32 q10, q8, q11 @ (ch0 + ch1') >> 1
+ vhsub.s32 q11, q8, q11 @ (ch0 - ch1') >> 1
+ vabs.s32 q8, q8
+ vabs.s32 q9, q9
+ vabs.s32 q10, q10
+ vabs.s32 q11, q11
+ vmax.s32 q0, q0, q8 @ max |ch0|
+ vmax.s32 q1, q1, q9 @ max |ch1|
+ vmax.s32 q2, q2, q10 @ max |half sum|
+ vmax.s32 q3, q3, q11 @ max |half difference|
+ subs r3, r3, #1
+ bgt 1b
+ vsub.s32 q0, q0, q14 @ max - 1
+ vsub.s32 q1, q1, q14
+ vsub.s32 q2, q2, q14
+ vsub.s32 q3, q3, q14
+ vclz.s32 q0, q0
+ vclz.s32 q1, q1
+ vclz.s32 q2, q2
+ vclz.s32 q3, q3
+ vsub.s32 q0, q13, q0 @ scale factor = q13 - clz(max - 1)
+ vsub.s32 q1, q13, q1
+ vsub.s32 q2, q13, q2
+ vsub.s32 q3, q13, q3
+.endm
+
+/*
+ * constants: q14 = 1
+ * input: q15 - joint stereo selection mask
+ * r5 - value set by calc_scalefactors macro
+ * r6 - value set by calc_scalefactors macro
+ */
+.macro update_joint_stereo_samples
+ sub r8, r6, r11 @ r8 = channel 1, last block (r6 is one block past the end)
+ sub r7, r5, r11 @ r7 = channel 0, last block
+ sub r6, r6, r11, asl #1 @ r6 = channel 1, second to last block
+ sub r5, r5, r11, asl #1 @ r5 = channel 0, second to last block
+ vld1.32 {d18, d19}, [r6, :128]
+ vbic.s32 q11, q9, q14 @ channel 1 with bit 0 cleared
+ vld1.32 {d16, d17}, [r5, :128]
+ vld1.32 {d2, d3}, [r8, :128]
+ vbic.s32 q3, q1, q14
+ vld1.32 {d0, d1}, [r7, :128]
+ vhsub.s32 q10, q8, q11 @ new channel 1 value: (ch0 - ch1') >> 1
+ vhadd.s32 q11, q8, q11 @ new channel 0 value: (ch0 + ch1') >> 1
+ vhsub.s32 q2, q0, q3
+ vhadd.s32 q3, q0, q3
+ vbif.s32 q10, q9, q15 @ where the mask is clear, keep the original sample
+ vbif.s32 d22, d16, d30
+ sub r11, r10, r11, asl #1 @ r11 = -2 * inc (r10 is zero): walk backwards, two blocks per pass
+ sub r3, r2, #2 @ two blocks are already loaded
+2:
+ vbif.s32 d23, d17, d31
+ vst1.32 {d20, d21}, [r6, :128], r11
+ vbif.s32 d4, d2, d30
+ vld1.32 {d18, d19}, [r6, :128]
+ vbif.s32 d5, d3, d31
+ vst1.32 {d22, d23}, [r5, :128], r11
+ vbif.s32 d6, d0, d30
+ vld1.32 {d16, d17}, [r5, :128]
+ vbif.s32 d7, d1, d31
+ vst1.32 {d4, d5}, [r8, :128], r11
+ vbic.s32 q11, q9, q14
+ vld1.32 {d2, d3}, [r8, :128]
+ vst1.32 {d6, d7}, [r7, :128], r11
+ vbic.s32 q3, q1, q14
+ vld1.32 {d0, d1}, [r7, :128]
+ vhsub.s32 q10, q8, q11
+ vhadd.s32 q11, q8, q11
+ vhsub.s32 q2, q0, q3
+ vhadd.s32 q3, q0, q3
+ vbif.s32 q10, q9, q15
+ vbif.s32 d22, d16, d30
+ subs r3, r3, #2
+ bgt 2b
+ sub r11, r10, r11, asr #1 @ restore r11 = inc for the next macro use
+ vbif.s32 d23, d17, d31
+ vst1.32 {d20, d21}, [r6, :128] @ final pair of blocks: store without stepping
+ vbif.s32 q2, q1, q15
+ vst1.32 {d22, d23}, [r5, :128]
+ vbif.s32 q3, q0, q15
+ vst1.32 {d4, d5}, [r8, :128]
+ vst1.32 {d6, d7}, [r7, :128]
+.endm
+
+function ff_sbc_calc_scalefactors_j_neon, export=1
+ @ parameters
+ @ r0 = in = sb_sample_f
+ @ r1 = out = scale_factor
+ @ r2 = blocks
+ @ r3 = subbands
+ @ local variables
+ @ r4 = consts = ff_sbcdsp_joint_bits_mask
+ @ r5 = in0
+ @ r6 = in1
+ @ r7 = out0
+ @ r8 = out1
+ @ r10 = zero
+ @ r11 = inc
+ @ return r0 = joint
+
+ push {r3-r11}
+ movrelx r4, X(ff_sbcdsp_joint_bits_mask)
+ mov r10, #0
+ mov r11, #64 @ bytes per block: 2 channels * 8 subbands * 4
+
+ vmov.s32 q14, #1
+ vmov.s32 q13, #16 @ 31 - SCALE_OUT_BITS
+
+ cmp r3, #4
+ bne 8f @ any value other than 4 takes the 8-subband path
+
+4: @ 4 subbands
+ add r5, r0, #0 @ channel 0, subbands 0..3
+ add r6, r0, #32 @ channel 1, subbands 0..3
+ add r7, r1, #0
+ add r8, r1, #32
+ vmov.s32 q0, #0x8000 @ 1 << SCALE_OUT_BITS
+ vadd.s32 q0, q0, q14
+
+ calc_scalefactors
+
+ @ check whether to use joint stereo for subbands 0, 1, 2
+ vadd.s32 q15, q0, q1
+ vadd.s32 q9, q2, q3
+ vmov.s32 d31[1], r10 @ last subband -> no joint
+ vld1.32 {d16, d17}, [r4, :128]! @ joint bit values {8, 4, 2, 1}
+ vcgt.s32 q15, q15, q9 @ joint where the plain sum of scale factors is larger
+
+ @ calculate the 'joint' variable and return it in r0
+ @ update and save scale factors to memory
+ vand.s32 q8, q8, q15
+ vbit.s32 q0, q2, q15
+ vpadd.s32 d16, d16, d17
+ vbit.s32 q1, q3, q15
+ vpadd.s32 d16, d16, d16
+ vst1.32 {d0, d1}, [r7, :128]
+ vst1.32 {d2, d3}, [r8, :128]
+ vmov.32 r0, d16[0] @ return value; r0 is no longer needed as a pointer
+
+ update_joint_stereo_samples
+ b 9f
+
+8: @ 8 subbands
+ add r5, r0, #16 @ channel 0, subbands 4..7
+ add r6, r0, #48 @ channel 1, subbands 4..7
+ add r7, r1, #16
+ add r8, r1, #48
+ vmov.s32 q0, #0x8000 @ 1 << SCALE_OUT_BITS
+ vadd.s32 q0, q0, q14
+
+ calc_scalefactors
+
+ @ check whether to use joint stereo for subbands 4, 5, 6
+ vadd.s32 q15, q0, q1
+ vadd.s32 q9, q2, q3
+ vmov.s32 d31[1], r10 @ last subband -> no joint
+ vld1.32 {d16, d17}, [r4, :128]! @ joint bit values {8, 4, 2, 1}
+ vcgt.s32 q15, q15, q9
+
+ @ calculate part of 'joint' variable and save it to d24
+ @ update and save scale factors to memory
+ vand.s32 q8, q8, q15
+ vbit.s32 q0, q2, q15
+ vpadd.s32 d16, d16, d17
+ vbit.s32 q1, q3, q15
+ vst1.32 {d0, d1}, [r7, :128]
+ vst1.32 {d2, d3}, [r8, :128]
+ vpadd.s32 d24, d16, d16
+
+ update_joint_stereo_samples
+
+ add r5, r0, #0 @ channel 0, subbands 0..3
+ add r6, r0, #32 @ channel 1, subbands 0..3
+ add r7, r1, #0
+ add r8, r1, #32
+ vmov.s32 q0, #0x8000 @ 1 << SCALE_OUT_BITS
+ vadd.s32 q0, q0, q14
+
+ calc_scalefactors
+
+ @ check whether to use joint stereo for subbands 0, 1, 2, 3
+ vadd.s32 q15, q0, q1
+ vadd.s32 q9, q2, q3
+ vld1.32 {d16, d17}, [r4, :128]! @ joint bit values {128, 64, 32, 16}
+ vcgt.s32 q15, q15, q9
+
+ @ combine last part of 'joint' with d24 and return it in r0
+ @ update and save scale factors to memory
+ vand.s32 q8, q8, q15
+ vbit.s32 q0, q2, q15
+ vpadd.s32 d16, d16, d17
+ vbit.s32 q1, q3, q15
+ vpadd.s32 d16, d16, d16
+ vst1.32 {d0, d1}, [r7, :128]
+ vadd.s32 d16, d16, d24
+ vst1.32 {d2, d3}, [r8, :128]
+ vmov.32 r0, d16[0] @ return value; r0 is no longer needed as a pointer
+
+ update_joint_stereo_samples
+9:
+ pop {r3-r11}
+ bx lr
+endfunc
+
+function ff_sbc_enc_process_input_4s_neon, export=1
+ @ parameters
+ @ r0 = position (the updated position is returned in r0)
+ @ r1 = pcm
+ @ r2 = X
+ @ r3 = nsamples
+ @ r4 = nchannels (5th argument, loaded from the stack)
+ @ local variables
+ @ r5 = ff_sbc_input_perm_4
+ @ r6 = src / x
+ @ r7 = dst / y
+
+ push {r1, r3-r7} @ 6 registers = 24 bytes
+ ldr r4, [sp, #24] @ first stack argument, just above the saved registers
+ movrelx r5, X(ff_sbc_input_perm_4)
+
+ @ handle X buffer wraparound
+ cmp r0, r3
+ bge 1f @ if (position < nsamples)
+ add r7, r2, #576 @ &X[0][SBC_X_BUFFER_SIZE - 40]
+ add r6, r2, r0, lsl#1 @ &X[0][position]
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]! @ copy 36 samples (72 bytes) in total
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0}, [r6, :64]!
+ vst1.16 {d0}, [r7, :64]!
+ cmp r4, #1
+ ble 2f @ if (nchannels > 1)
+ add r7, r2, #1232 @ &X[1][SBC_X_BUFFER_SIZE - 40]
+ add r6, r2, #656 @ &X[1][0]
+ add r6, r6, r0, lsl#1 @ &X[1][position]
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0}, [r6, :64]!
+ vst1.16 {d0}, [r7, :64]!
+2:
+ mov r0, #288 @ SBC_X_BUFFER_SIZE - 40
+1:
+
+ add r6, r2, r0, lsl#1 @ &X[0][position]
+ add r7, r6, #656 @ &X[1][position]
+
+ cmp r4, #1
+ ble 8f @ if (nchannels > 1)
+ tst r1, #1
+ beq 7f @ if (pcm & 1)
+ @ poor 'pcm' alignment
+ vld1.8 {d0, d1}, [r5, :128] @ byte permutation indices for vtbl
+1:
+ sub r6, r6, #16 @ X is filled backwards, 8 samples per pass
+ sub r7, r7, #16
+ sub r0, r0, #8 @ position -= 8
+ vld1.8 {d4, d5}, [r1]! @ byte loads: no alignment requirement on pcm
+ vuzp.16 d4, d5 @ de-interleave the two channels
+ vld1.8 {d20, d21}, [r1]!
+ vuzp.16 d20, d21
+ vswp d5, d20 @ {d4, d5} = one channel, {d20, d21} = the other
+ vtbl.8 d16, {d4, d5}, d0
+ vtbl.8 d17, {d4, d5}, d1
+ vtbl.8 d18, {d20, d21}, d0
+ vtbl.8 d19, {d20, d21}, d1
+ vst1.16 {d16, d17}, [r6, :128]
+ vst1.16 {d18, d19}, [r7, :128]
+ subs r3, r3, #8
+ bgt 1b
+ b 9f
+7:
+ @ proper 'pcm' alignment
+ vld1.8 {d0, d1}, [r5, :128] @ byte permutation indices for vtbl
+1:
+ sub r6, r6, #16
+ sub r7, r7, #16
+ sub r0, r0, #8 @ position -= 8
+ vld2.16 {d4, d5}, [r1]! @ vld2 de-interleaves the two channels while loading
+ vld2.16 {d20, d21}, [r1]!
+ vswp d5, d20 @ {d4, d5} = one channel, {d20, d21} = the other
+ vtbl.8 d16, {d4, d5}, d0
+ vtbl.8 d17, {d4, d5}, d1
+ vtbl.8 d18, {d20, d21}, d0
+ vtbl.8 d19, {d20, d21}, d1
+ vst1.16 {d16, d17}, [r6, :128]
+ vst1.16 {d18, d19}, [r7, :128]
+ subs r3, r3, #8
+ bgt 1b
+ b 9f
+8:
+ @ mono
+ vld1.8 {d0, d1}, [r5, :128] @ byte permutation indices for vtbl
+1:
+ sub r6, r6, #16
+ sub r0, r0, #8 @ position -= 8
+ vld1.8 {d4, d5}, [r1]!
+ vtbl.8 d16, {d4, d5}, d0
+ vtbl.8 d17, {d4, d5}, d1
+ vst1.16 {d16, d17}, [r6, :128]
+ subs r3, r3, #8
+ bgt 1b
+9:
+ pop {r1, r3-r7}
+ bx lr
+endfunc
+
+function ff_sbc_enc_process_input_8s_neon, export=1
+ @ parameters
+ @ r0 = position (the updated position is returned in r0)
+ @ r1 = pcm
+ @ r2 = X
+ @ r3 = nsamples
+ @ r4 = nchannels (5th argument, loaded from the stack)
+ @ local variables
+ @ r5 = ff_sbc_input_perm_8
+ @ r6 = src
+ @ r7 = dst
+
+ push {r1, r3-r7} @ 6 registers = 24 bytes
+ ldr r4, [sp, #24] @ first stack argument, just above the saved registers
+ movrelx r5, X(ff_sbc_input_perm_8)
+
+ @ handle X buffer wraparound
+ cmp r0, r3
+ bge 1f @ if (position < nsamples)
+ add r7, r2, #512 @ &X[0][SBC_X_BUFFER_SIZE - 72]
+ add r6, r2, r0, lsl#1 @ &X[0][position]
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]! @ copy 72 samples (144 bytes) in total
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1}, [r6, :128]!
+ vst1.16 {d0, d1}, [r7, :128]!
+ cmp r4, #1
+ ble 2f @ if (nchannels > 1)
+ add r7, r2, #1168 @ &X[1][SBC_X_BUFFER_SIZE - 72]
+ add r6, r2, #656 @ &X[1][0]
+ add r6, r6, r0, lsl#1 @ &X[1][position]
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1, d2, d3}, [r6, :128]!
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]!
+ vld1.16 {d0, d1}, [r6, :128]!
+ vst1.16 {d0, d1}, [r7, :128]!
+2:
+ mov r0, #256 @ SBC_X_BUFFER_SIZE - 72
+1:
+
+ add r6, r2, r0, lsl#1 @ &X[0][position]
+ add r7, r6, #656 @ &X[1][position]
+
+ cmp r4, #1
+ ble 8f @ if (nchannels > 1)
+ tst r1, #1
+ beq 7f @ if (pcm & 1)
+ @ poor 'pcm' alignment
+ vld1.8 {d0, d1, d2, d3}, [r5, :128] @ byte permutation indices for vtbl
+1:
+ sub r6, r6, #32 @ X is filled backwards, 16 samples per pass
+ sub r7, r7, #32
+ sub r0, r0, #16 @ position -= 16
+ vld1.8 {d4, d5, d6, d7}, [r1]! @ byte loads: no alignment requirement on pcm
+ vuzp.16 q2, q3 @ de-interleave the two channels
+ vld1.8 {d20, d21, d22, d23}, [r1]!
+ vuzp.16 q10, q11
+ vswp q3, q10 @ {q2, q3} = one channel, {q10, q11} = the other
+ vtbl.8 d16, {d4, d5, d6, d7}, d0
+ vtbl.8 d17, {d4, d5, d6, d7}, d1
+ vtbl.8 d18, {d4, d5, d6, d7}, d2
+ vtbl.8 d19, {d4, d5, d6, d7}, d3
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]
+ vtbl.8 d16, {d20, d21, d22, d23}, d0
+ vtbl.8 d17, {d20, d21, d22, d23}, d1
+ vtbl.8 d18, {d20, d21, d22, d23}, d2
+ vtbl.8 d19, {d20, d21, d22, d23}, d3
+ vst1.16 {d16, d17, d18, d19}, [r7, :128]
+ subs r3, r3, #16
+ bgt 1b
+ b 9f
+7:
+ @ proper 'pcm' alignment
+ vld1.8 {d0, d1, d2, d3}, [r5, :128] @ byte permutation indices for vtbl
+1:
+ sub r6, r6, #32
+ sub r7, r7, #32
+ sub r0, r0, #16 @ position -= 16
+ vld2.16 {d4, d5, d6, d7}, [r1]! @ vld2 de-interleaves the two channels while loading
+ vld2.16 {d20, d21, d22, d23}, [r1]!
+ vswp q3, q10 @ {q2, q3} = one channel, {q10, q11} = the other
+ vtbl.8 d16, {d4, d5, d6, d7}, d0
+ vtbl.8 d17, {d4, d5, d6, d7}, d1
+ vtbl.8 d18, {d4, d5, d6, d7}, d2
+ vtbl.8 d19, {d4, d5, d6, d7}, d3
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]
+ vtbl.8 d16, {d20, d21, d22, d23}, d0
+ vtbl.8 d17, {d20, d21, d22, d23}, d1
+ vtbl.8 d18, {d20, d21, d22, d23}, d2
+ vtbl.8 d19, {d20, d21, d22, d23}, d3
+ vst1.16 {d16, d17, d18, d19}, [r7, :128]
+ subs r3, r3, #16
+ bgt 1b
+ b 9f
+8:
+ @ mono
+ vld1.8 {d0, d1, d2, d3}, [r5, :128] @ byte permutation indices for vtbl
+1:
+ sub r6, r6, #32
+ sub r0, r0, #16 @ position -= 16
+ vld1.8 {d4, d5, d6, d7}, [r1]!
+ vtbl.8 d16, {d4, d5, d6, d7}, d0
+ vtbl.8 d17, {d4, d5, d6, d7}, d1
+ vtbl.8 d18, {d4, d5, d6, d7}, d2
+ vtbl.8 d19, {d4, d5, d6, d7}, d3
+ vst1.16 {d16, d17, d18, d19}, [r6, :128]
+ subs r3, r3, #16
+ bgt 1b
+9:
+ pop {r1, r3-r7}
+ bx lr
+endfunc
diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
index 2d0addcf28..e745595da0 100644
--- a/libavcodec/sbcdsp.c
+++ b/libavcodec/sbcdsp.c
@@ -380,6 +380,8 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
s->sbc_calc_scalefactors = sbc_calc_scalefactors;
s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
+ if (ARCH_ARM)
+ ff_sbcdsp_init_arm(s);
if (ARCH_X86)
ff_sbcdsp_init_x86(s);
}
diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
index 127e6a8a11..334c058e6d 100644
--- a/libavcodec/sbcdsp.h
+++ b/libavcodec/sbcdsp.h
@@ -80,6 +80,7 @@ struct sbc_dsp_context {
*/
void ff_sbcdsp_init(SBCDSPContext *s);
+void ff_sbcdsp_init_arm(SBCDSPContext *s);
void ff_sbcdsp_init_x86(SBCDSPContext *s);
#endif /* AVCODEC_SBCDSP_H */
--
2.16.1
More information about the ffmpeg-devel
mailing list