[FFmpeg-cvslog] arm64: port synth_filter_float_neon from arm

Sat Jan 2 11:15:06 CET 2016

ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Tue Dec  1 13:37:41 2015 +0100| [705f5e5e155f6f280a360af220fc5b30cfcee702] | committer: Janne Grunau

arm64: port synth_filter_float_neon from arm

~25% faster DTS decoding overall. The checkasm CPU cycle numbers are
not that useful since synth_filter_float() calls FFTContext.imdct_half().

                         cortex-a57   cortex-a53
synth_filter_float_c:    1866.2       3490.9
synth_filter_float_neon:  915.0       1531.5

With fftc.imdct_half forced to imdct_half_neon:
                         cortex-a57   cortex-a53
synth_filter_float_c:    1718.4       3025.3
synth_filter_float_neon:  926.2       1530.1

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=705f5e5e155f6f280a360af220fc5b30cfcee702
---

 libavcodec/aarch64/Makefile            |    3 +-
 libavcodec/aarch64/asm-offsets.h       |    3 +
 libavcodec/aarch64/dcadsp_init.c       |   16 +++++
 libavcodec/aarch64/synth_filter_neon.S |  119 ++++++++++++++++++++++++++++++++
 libavcodec/synth_filter.c              |    8 ++-
 libavcodec/synth_filter.h              |    1 +
 6 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 0b614a3..2175578 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -16,7 +16,8 @@ OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
 
 ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
 
-NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
+                                           aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 45b5c40..60e32dd 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -27,4 +27,7 @@
 #define CELT_TMP                        0x10
 #define CELT_TWIDDLE                    (CELT_TMP + 0x8)    // loaded as pair
 
+/* FFTContext */
+#define IMDCT_HALF                      0x48
+
 #endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c
index ad91070..c66ec3f 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -22,7 +22,15 @@
 
 #include "libavutil/aarch64/cpu.h"
 #include "libavutil/attributes.h"
+#include "libavutil/internal.h"
 #include "libavcodec/dcadsp.h"
+#include "libavcodec/fft.h"
+
+#include "asm-offsets.h"
+
+#if HAVE_NEON || HAVE_VFP
+AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
+#endif
 
 void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
 void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
@@ -49,3 +57,11 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
         s->decode_hf  = ff_decode_hf_neon;
     }
 }
+
+av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
+{
+    int cpu_flags = av_get_cpu_flags(); /* queried once per init call */
+
+    if (have_neon(cpu_flags)) /* otherwise s->synth_filter_float is left untouched */
+        s->synth_filter_float = ff_synth_filter_float_neon;
+}
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
new file mode 100644
index 0000000..9551bff
--- /dev/null
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2015 Janne Grunau <janne-libav at jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm-offsets.h"
+
+#include "libavutil/aarch64/asm.S"
+
+.macro inner_loop
+        ld1             {v29.4s},  [x9],  x15   // load 4 floats, then advance x9 by x15 bytes
+        ld1             {v28.4s},  [x8],  x15   // x8/x11 data gets lane-reversed below
+        ld1             {v30.4s},  [x10], x15
+        ld1             {v31.4s},  [x11], x15
+        rev64           v28.4s, v28.4s          // swap the 32-bit lanes inside each 64-bit half
+        ld1             {v24.4s},  [x4],  x15   // x4..x7: window pointers set up by the caller
+        ld1             {v25.4s},  [x5],  x15
+        rev64           v31.4s, v31.4s
+        ld1             {v26.4s},  [x6],  x15
+        fmla            v5.4s,  v25.4s, v29.4s  // v5 += v25 * v29
+        ld1             {v27.4s},  [x7],  x15
+        ext             v28.16b, v28.16b, v28.16b, #8   // swap 64-bit halves: v28 now fully reversed
+        ext             v31.16b, v31.16b, v31.16b, #8   // same for v31
+        fmla            v6.4s,  v26.4s, v30.4s  // v6 += v26 * v30
+        fmls            v4.4s,  v24.4s, v28.4s  // v4 -= v24 * reversed(v28)
+        fmla            v7.4s,  v27.4s, v31.4s  // v7 += v27 * reversed(v31)
+.endm
+
+function ff_synth_filter_float_neon, export=1
+        ldr             w7,  [x2]               // *synth_buf_offset
+        ldr             x9,  [x0, #IMDCT_HALF]  // imdct_half function pointer
+        sxtw            x7,  w7                 // sign-extend the offset to 64 bits
+        stp             x3,  x4,  [sp, #-64]!   // reserve 64 bytes of stack; save x3, x4
+        add             x1,  x1,  x7,  lsl #2   // synth_buf
+        sub             w8,  w7,  #32           // offset - 32 ...
+        stp             x5,  x1,  [sp, #16]     // save x5 and the adjusted synth_buf
+        bic             x7,  x7,  #63           // offset with the low 6 bits cleared
+        and             w8,  w8,  #511          // ... wrapped modulo 512
+        stp             x7,  x30, [sp, #32]     // save masked offset and return address
+        str             w8,  [x2]               // write the updated offset back
+        str             s0,  [sp, #48]          // keep s0 across the call (multiplier used at the end)
+
+        mov             x2,  x6                 // in
+
+        blr             x9                      // imdct_half(x0, synth_buf, in)
+
+        ldp             x2,  x4,  [sp]          // synth_buf_2, window
+        ldp             x13, x9,  [sp, #16]     // out, synth_buf
+        ldp             x0,  x30, [sp, #32]     // (*synth_buf_offset & ~63), return address
+        ldr             s0,  [sp, #48]
+
+        add             x3,  x2,  #16*4         // synth_buf_2 + 16
+        add             x14, x13, #16*4         // out + 16
+        add             x8,  x9,  #12*4         // synth_buf + 12
+        mov             x15, #64*4              // post-increment used by inner_loop: 64 floats
+        mov             x1,  #4                 // outer loop counter: 4 passes of 4 floats each
+1:
+        add             x10, x9,  #16*4         // synth_buf + 16
+        add             x11, x8,  #16*4
+        add             x5,  x4,  #16*4         // window + 16
+        add             x6,  x4,  #32*4         // window + 32
+        add             x7,  x4,  #48*4         // window + 48
+
+        ld1             {v4.4s},   [x2]         // a
+        ld1             {v5.4s},   [x3]         // b
+        movi            v6.4s,  #0              // c
+        movi            v7.4s,  #0              // d
+
+        mov             x12, #512
+2:
+        sub             x12, x12, #64
+        cmp             x12, x0                 // flags consumed by b.gt; inner_loop leaves them intact
+        inner_loop
+        b.gt            2b                      // repeat while x12 > x0
+
+        sub             x8,  x8,  #512*4        // step back 512 floats (presumably buffer wrap-around; confirm)
+        sub             x9,  x9,  #512*4
+        cbz             x12, 4f                 // x12 already 0: skip the second loop
+        sub             x10, x10, #512*4
+        sub             x11, x11, #512*4
+3:
+        subs            x12, x12, #64
+        inner_loop
+        b.gt            3b                      // repeat while x12 > 0
+4:
+        subs            x1,  x1,  #1            // flags consumed by b.le below; fmul/st1 leave them intact
+        fmul            v4.4s,  v4.4s,  v0.s[0] // a *= s0
+        fmul            v5.4s,  v5.4s,  v0.s[0] // b *= s0
+        st1             {v6.4s},   [x2],  #16   // c -> synth_buf_2
+        st1             {v7.4s},   [x3],  #16   // d -> synth_buf_2 + 16
+        st1             {v4.4s},   [x13], #16   // a -> out
+        st1             {v5.4s},   [x14], #16   // b -> out + 16
+        b.le            10f                     // all 4 passes done
+
+        sub             x4,  x4,  #508*4        // window
+        add             x9,  x9,  #4*4          // synth_buf
+        sub             x8,  x8,  #4*4          // synth_buf
+        b               1b
+
+10:
+        add             sp,  sp,  #64           // release the stack frame
+        ret
+endfunc
diff --git a/libavcodec/synth_filter.c b/libavcodec/synth_filter.c
index d0ace40..708bd4e 100644
--- a/libavcodec/synth_filter.c
+++ b/libavcodec/synth_filter.c
@@ -60,6 +60,10 @@ av_cold void ff_synth_filter_init(SynthFilterContext *c)
 {
     c->synth_filter_float = synth_filter_float;
 
-    if (ARCH_ARM) ff_synth_filter_init_arm(c);
-    if (ARCH_X86) ff_synth_filter_init_x86(c);
+    if (ARCH_AARCH64)
+        ff_synth_filter_init_aarch64(c);
+    if (ARCH_ARM)
+        ff_synth_filter_init_arm(c);
+    if (ARCH_X86)
+        ff_synth_filter_init_x86(c);
 }
diff --git a/libavcodec/synth_filter.h b/libavcodec/synth_filter.h
index f842c70..a93dc4f 100644
--- a/libavcodec/synth_filter.h
+++ b/libavcodec/synth_filter.h
@@ -32,6 +32,7 @@ typedef struct SynthFilterContext {
 } SynthFilterContext;
 
 void ff_synth_filter_init(SynthFilterContext *c);
+void ff_synth_filter_init_aarch64(SynthFilterContext *c);
 void ff_synth_filter_init_arm(SynthFilterContext *c);
 void ff_synth_filter_init_x86(SynthFilterContext *c);