[FFmpeg-devel] [PATCH 1/2] imdct15: remove the AArch64 assembly
Rostislav Pehlivanov
atomnuker at gmail.com
Fri Jan 6 00:33:06 EET 2017
On 4 January 2017 at 10:16, Rostislav Pehlivanov <atomnuker at gmail.com>
wrote:
> Prep work for the next commit, which will add a new FFT algorithm
> that makes the iMDCT over 3x faster than it is currently (standalone,
> the FFT is over 10x faster with some frame sizes).
>
> The new FFT algorithm uses the thoroughly SIMD'd power-of-two
> FFT, which already has SIMD for AArch64, so users of that platform will
> still see an improvement.
>
> The previous FFT+SIMD was barely 2.5x faster than the C versions on these
> platforms.
>
> Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
> ---
> libavcodec/aarch64/Makefile | 2 -
> libavcodec/aarch64/imdct15_init.c | 46 ---
> libavcodec/aarch64/imdct15_neon.S | 647 ------------------------------
> --------
> libavcodec/imdct15.c | 3 -
> libavcodec/imdct15.h | 3 -
> 5 files changed, 701 deletions(-)
> delete mode 100644 libavcodec/aarch64/imdct15_init.c
> delete mode 100644 libavcodec/aarch64/imdct15_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index b7bb898713..5593863a75 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -6,7 +6,6 @@ OBJS-$(CONFIG_H264DSP) +=
> aarch64/h264dsp_init_aarch64.o
> OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
> OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.
> o
> OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
> -OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o
> OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
> OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
> OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
> @@ -35,7 +34,6 @@ NEON-OBJS-$(CONFIG_H264PRED) +=
> aarch64/h264pred_neon.o
> NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o
> \
> aarch64/hpeldsp_neon.o
> NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
> -NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o
> NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
> NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
>
> diff --git a/libavcodec/aarch64/imdct15_init.c
> b/libavcodec/aarch64/imdct15_init.c
> deleted file mode 100644
> index 58af9f00c0..0000000000
> --- a/libavcodec/aarch64/imdct15_init.c
> +++ /dev/null
> @@ -1,46 +0,0 @@
> -/*
> - * This file is part of FFmpeg.
> - *
> - * FFmpeg is free software; you can redistribute it and/or
> - * modify it under the terms of the GNU Lesser General Public
> - * License as published by the Free Software Foundation; either
> - * version 2.1 of the License, or (at your option) any later version.
> - *
> - * FFmpeg is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - * Lesser General Public License for more details.
> - *
> - * You should have received a copy of the GNU Lesser General Public
> - * License along with FFmpeg; if not, write to the Free Software
> - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> - */
> -
> -#include <stddef.h>
> -
> -#include "libavutil/cpu.h"
> -#include "libavutil/aarch64/cpu.h"
> -#include "libavutil/internal.h"
> -
> -#include "libavcodec/imdct15.h"
> -
> -#include "asm-offsets.h"
> -
> -AV_CHECK_OFFSET(IMDCT15Context, exptab, CELT_EXPTAB);
> -AV_CHECK_OFFSET(IMDCT15Context, fft_n, CELT_FFT_N);
> -AV_CHECK_OFFSET(IMDCT15Context, len2, CELT_LEN2);
> -AV_CHECK_OFFSET(IMDCT15Context, len4, CELT_LEN4);
> -AV_CHECK_OFFSET(IMDCT15Context, tmp, CELT_TMP);
> -AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE);
> -
> -void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float
> *src,
> - ptrdiff_t stride, float scale);
> -
> -void ff_imdct15_init_aarch64(IMDCT15Context *s)
> -{
> - int cpu_flags = av_get_cpu_flags();
> -
> - if (have_neon(cpu_flags)) {
> - s->imdct_half = ff_celt_imdct_half_neon;
> - }
> -}
> diff --git a/libavcodec/aarch64/imdct15_neon.S
> b/libavcodec/aarch64/imdct15_neon.S
> deleted file mode 100644
> index 97e1442ccc..0000000000
> --- a/libavcodec/aarch64/imdct15_neon.S
> +++ /dev/null
> @@ -1,647 +0,0 @@
> -/*
> - * Copyright (c) 2014 Janne Grunau <janne-libav at jannau.net>
> - *
> - * This file is part of FFmpeg.
> - *
> - * FFmpeg is free software; you can redistribute it and/or
> - * modify it under the terms of the GNU Lesser General Public
> - * License as published by the Free Software Foundation; either
> - * version 2.1 of the License, or (at your option) any later version.
> - *
> - * FFmpeg is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - * Lesser General Public License for more details.
> - *
> - * You should have received a copy of the GNU Lesser General Public
> - * License along with FFmpeg; if not, write to the Free Software
> - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> - */
> -
> -#include "libavutil/aarch64/asm.S"
> -
> -#include "asm-offsets.h"
> -
> -.macro shuffle a, b, c, d
> -const shuffle_\a\b\c\d, align=4
> - .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
> - .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
> - .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
> - .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
> -endconst
> -.endm
> -
> -shuffle 0, 2, 1, 3
> -shuffle 1, 0, 3, 2
> -shuffle 2, 3, 0, 1
> -shuffle 3, 1, 2, 0
> -
> -
> -function fft5_neon
> - lsl x2, x2, #3
> - ld1 {v24.2s}, [x1], x2
> - ld2 {v25.s,v26.s}[0], [x1], x2
> - ld2 {v25.s,v26.s}[1], [x1], x2
> - ld2 {v25.s,v26.s}[2], [x1], x2
> - ld2 {v25.s,v26.s}[3], [x1]
> - dup v6.4s, v24.s[0]
> - dup v7.4s, v24.s[1]
> -
> - faddp v0.4s, v25.4s, v26.4s
> - // z[][0], z[][3]
> - fmul v16.4s, v25.4s, v15.s[0] // rr
> - fmul v17.4s, v25.4s, v15.s[1] // ri
> - fmul v18.4s, v26.4s, v15.s[0] // ir
> - fmul v19.4s, v26.4s, v15.s[1] // ii
> - faddp v0.4s, v0.4s, v0.4s
> - // z[][1], z[][2]
> - fmul v20.4s, v25.4s, v15.s[2] // rr
> - fmul v21.4s, v25.4s, v15.s[3] // ri
> - fmul v22.4s, v26.4s, v15.s[2] // ir
> - fmul v23.4s, v26.4s, v15.s[3] // ii
> - fadd v0.2s, v24.2s, v0.2s // out[0]
> -
> - // z[0123][0], z[0123][3]
> - fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii;
> - fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii;
> - ld1 {v16.16b}, [x11]
> - ld1 {v19.16b}, [x14]
> - fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir;
> - fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir;
> - ld1 {v17.16b}, [x12]
> - // z[0123][1], z[0123][2]
> - fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii;
> - fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii;
> - ld1 {v18.16b}, [x13]
> - fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir;
> - fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir;
> -
> - //real
> - tbl v20.16b, {v24.16b}, v16.16b
> - tbl v21.16b, {v25.16b}, v17.16b
> - tbl v22.16b, {v26.16b}, v18.16b
> - tbl v23.16b, {v27.16b}, v19.16b
> - //imag
> - tbl v16.16b, {v28.16b}, v16.16b
> - tbl v17.16b, {v29.16b}, v17.16b
> - tbl v18.16b, {v30.16b}, v18.16b
> - tbl v19.16b, {v31.16b}, v19.16b
> -
> - fadd v6.4s, v6.4s, v20.4s
> - fadd v22.4s, v22.4s, v23.4s
> - fadd v7.4s, v7.4s, v16.4s
> - fadd v18.4s, v18.4s, v19.4s
> -
> - fadd v21.4s, v21.4s, v22.4s
> - fadd v17.4s, v17.4s, v18.4s
> - fadd v6.4s, v6.4s, v21.4s
> - fadd v7.4s, v7.4s, v17.4s
> -
> - ret
> -endfunc
> -
> -function fft15_neon
> - mov x8, x1
> - mov x9, x30
> - add x2, x3, x3, lsl #1 // 3 * stride
> -
> - add x1, x8, x3, lsl #3 // in + 1 * stride
> - bl fft5_neon
> - mov v1.8b, v0.8b
> - mov v2.16b, v6.16b
> - mov v3.16b, v7.16b
> -
> - add x1, x8, x3, lsl #4 // in + 2 * stride
> - add x2, x3, x3, lsl #1 // 3 * stride
> - bl fft5_neon
> - zip1 v1.4s, v1.4s, v0.4s
> - mov v4.16b, v6.16b
> - mov v5.16b, v7.16b
> -
> - mov x1, x8 // in + 0 * stride
> - add x2, x3, x3, lsl #1 // 3 * stride
> - bl fft5_neon
> -
> - faddp v20.4s, v1.4s, v1.4s
> -
> - ext v18.16b, v8.16b, v8.16b, #4
> - ext v19.16b, v9.16b, v9.16b, #4
> - mov v16.16b, v6.16b
> - mov v17.16b, v7.16b
> - fadd v20.2s, v20.2s, v0.2s
> -
> - uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re
> - uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im
> -
> - st1 {v20.2s}, [x0], #8 // out[0]
> -
> - fmla v16.4s, v2.4s, v8.4s
> - fmls v16.4s, v3.4s, v9.4s
> -
> - fmla v17.4s, v2.4s, v9.4s
> - fmla v17.4s, v3.4s, v8.4s
> -
> - fmla v16.4s, v4.4s, v18.4s
> - fmls v16.4s, v5.4s, v19.4s
> -
> - fmla v17.4s, v4.4s, v19.4s
> - fmla v17.4s, v5.4s, v18.4s
> -
> - zip1 v18.4s, v16.4s, v17.4s
> - zip2 v19.4s, v16.4s, v17.4s
> -
> - rev64 v31.4s, v14.4s
> - trn1 v28.2d, v1.2d, v1.2d
> - trn2 v29.2d, v1.2d, v1.2d
> - zip1 v30.2d, v14.2d, v31.2d
> - zip2 v31.2d, v14.2d, v31.2d
> -
> - st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4]
> -
> - fmul v16.4s, v28.4s, v30.4s
> - fmul v17.4s, v29.4s, v30.4s
> - fmls v16.4s, v29.4s, v31.4s
> - fmla v17.4s, v28.4s, v31.4s
> - faddp v16.4s, v16.4s, v16.4s
> - faddp v17.4s, v17.4s, v17.4s
> - zip1 v18.2s, v16.2s, v17.2s
> - zip2 v19.2s, v16.2s, v17.2s
> -
> - fadd v18.2s, v18.2s, v0.2s
> - fadd v0.2s, v19.2s, v0.2s
> -
> - ext v30.16b, v12.16b, v12.16b, #4
> - ext v31.16b, v13.16b, v13.16b, #4
> - mov v16.16b, v6.16b
> - mov v17.16b, v7.16b
> -
> - uzp1 v30.4s, v30.4s, v8.4s
> - uzp1 v31.4s, v31.4s, v9.4s
> -
> - st1 {v18.2s}, [x0], #8 // out[5]
> -
> - fmla v16.4s, v2.4s, v10.4s
> - fmls v16.4s, v3.4s, v11.4s
> -
> - fmla v17.4s, v2.4s, v11.4s
> - fmla v17.4s, v3.4s, v10.4s
> -
> - fmla v16.4s, v4.4s, v30.4s
> - fmls v16.4s, v5.4s, v31.4s
> -
> - fmla v17.4s, v4.4s, v31.4s
> - fmla v17.4s, v5.4s, v30.4s
> -
> - zip1 v18.4s, v16.4s, v17.4s
> - zip2 v19.4s, v16.4s, v17.4s
> -
> - ext v30.16b, v10.16b, v10.16b, #4
> - ext v31.16b, v11.16b, v11.16b, #4
> -
> - fmla v6.4s, v2.4s, v12.4s
> - fmls v6.4s, v3.4s, v13.4s
> -
> - st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9]
> -
> - uzp1 v30.4s, v30.4s, v12.4s
> - uzp1 v31.4s, v31.4s, v13.4s
> -
> - fmla v7.4s, v2.4s, v13.4s
> - fmla v7.4s, v3.4s, v12.4s
> -
> - st1 {v0.2s}, [x0], #8 // out[10]
> -
> - fmla v6.4s, v4.4s, v30.4s
> - fmls v6.4s, v5.4s, v31.4s
> -
> - fmla v7.4s, v4.4s, v31.4s
> - fmla v7.4s, v5.4s, v30.4s
> -
> - zip1 v18.4s, v6.4s, v7.4s
> - zip2 v19.4s, v6.4s, v7.4s
> -
> - st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14]
> -
> - ret x9
> -endfunc
> -
> -// x0: out, x1: out+len2, x2: exptab, x3: len2
> -function fft15_pass
> - ands x6, x3, #3
> - mov x4, x0
> - mov x5, x1
> - b.eq 9f
> - ld1 {v0.2s}, [x0], #8
> - ld1 {v1.2s}, [x1], #8
> - sub x3, x3, x6
> - subs x6, x6, #1
> - fadd v2.2s, v0.2s, v1.2s
> - fsub v3.2s, v0.2s, v1.2s
> - add x2, x2, #8
> - st1 {v2.2s}, [x4], #8
> - st1 {v3.2s}, [x5], #8
> - b.eq 9f
> -1:
> - subs x6, x6, #1
> - ldp s4, s5, [x2], #8
> - ldp s2, s3, [x1], #8
> - ldp s0, s1, [x0], #8
> -
> - fmul s6, s2, s4
> - fmul s7, s2, s5
> - fmls s6, s3, v5.s[0]
> - fmla s7, s3, v4.s[0]
> -
> - fsub s2, s0, s6
> - fsub s3, s1, s7
> - fadd s0, s0, s6
> - fadd s1, s1, s7
> -
> - stp s2, s3, [x5], #8
> - stp s0, s1, [x4], #8
> - b.gt 1b
> -9:
> - ld1 {v4.4s,v5.4s}, [x2], #32
> - ld2 {v2.4s,v3.4s}, [x1], #32
> - uzp1 v6.4s, v4.4s, v5.4s
> - uzp2 v7.4s, v4.4s, v5.4s
> - ld2 {v0.4s,v1.4s}, [x0], #32
> -8:
> - subs x3, x3, #8
> -
> - fmul v4.4s, v2.4s, v6.4s
> - fmul v5.4s, v2.4s, v7.4s
> - b.lt 4f
> -
> - ld1 {v18.4s,v19.4s}, [x2], #32
> -
> - fmls v4.4s, v3.4s, v7.4s
> - fmla v5.4s, v3.4s, v6.4s
> -
> - ld2 {v22.4s,v23.4s}, [x1], #32
> -
> - fsub v2.4s, v0.4s, v4.4s
> - fadd v0.4s, v0.4s, v4.4s
> - fsub v3.4s, v1.4s, v5.4s
> - fadd v1.4s, v1.4s, v5.4s
> -
> - uzp1 v16.4s, v18.4s, v19.4s
> - uzp2 v17.4s, v18.4s, v19.4s
> -
> - st2 {v2.4s,v3.4s}, [x5], #32
> - st2 {v0.4s,v1.4s}, [x4], #32
> - ld2 {v20.4s,v21.4s}, [x0], #32
> -
> - fmul v18.4s, v22.4s, v16.4s
> - fmul v19.4s, v22.4s, v17.4s
> - b.eq 0f
> -
> - ld1 {v4.4s,v5.4s}, [x2], #32
> -
> - fmls v18.4s, v23.4s, v17.4s
> - fmla v19.4s, v23.4s, v16.4s
> -
> - ld2 {v2.4s,v3.4s}, [x1], #32
> -
> - fsub v22.4s, v20.4s, v18.4s
> - fadd v20.4s, v20.4s, v18.4s
> - fsub v23.4s, v21.4s, v19.4s
> - fadd v21.4s, v21.4s, v19.4s
> -
> - uzp1 v6.4s, v4.4s, v5.4s
> - uzp2 v7.4s, v4.4s, v5.4s
> -
> - st2 {v22.4s,v23.4s}, [x5], #32
> - st2 {v20.4s,v21.4s}, [x4], #32
> - ld2 {v0.4s,v1.4s}, [x0], #32
> -
> - b 8b
> -4:
> - fmls v4.4s, v3.4s, v7.4s
> - fmla v5.4s, v3.4s, v6.4s
> -
> - fsub v2.4s, v0.4s, v4.4s
> - fadd v0.4s, v0.4s, v4.4s
> - fsub v3.4s, v1.4s, v5.4s
> - fadd v1.4s, v1.4s, v5.4s
> -
> - st2 {v2.4s,v3.4s}, [x5], #32
> - st2 {v0.4s,v1.4s}, [x4], #32
> -
> - ret
> -0:
> - fmls v18.4s, v23.4s, v17.4s
> - fmla v19.4s, v23.4s, v16.4s
> -
> - fsub v22.4s, v20.4s, v18.4s
> - fadd v20.4s, v20.4s, v18.4s
> - fsub v23.4s, v21.4s, v19.4s
> - fadd v21.4s, v21.4s, v19.4s
> -
> - st2 {v22.4s,v23.4s}, [x5], #32
> - st2 {v20.4s,v21.4s}, [x4], #32
> -
> - ret
> -endfunc
> -
> -function fft30_neon, align=6
> - sub sp, sp, #0x20
> - stp x20, x21, [sp]
> - stp x22, x30, [sp, #0x10]
> - mov x21, x1
> - mov x22, x2
> - mov x20, x4
> - mov x0, x21
> - mov x1, x22
> - lsl x3, x20, #1
> - bl fft15_neon
> -
> - add x0, x21, #15*8
> - add x1, x22, x20, lsl #3
> - lsl x3, x20, #1
> - bl fft15_neon
> -
> - ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1]
> - add x0, x21, #0
> - add x1, x21, #15*8
> - mov x3, #15
> - ldp x20, x21, [sp]
> - ldp x22, x30, [sp, #0x10]
> - add sp, sp, #0x20
> - b fft15_pass
> -endfunc
> -
> -.macro def_fft n, n2
> -function fft\n\()_neon, align=6
> - sub sp, sp, #0x30
> - stp x20, x21, [sp]
> - stp x22, x30, [sp, #0x10]
> - stp x23, x24, [sp, #0x20]
> - mov x21, x1
> - mov x22, x2
> - mov x23, x3
> - mov x20, x4
> - sub x3, x3, #1
> - lsl x4, x4, #1
> - bl fft\n2\()_neon
> -
> - add x1, x21, #(\n2 * 8)
> - add x2, x22, x20, lsl #3
> - sub x3, x23, #1
> - lsl x4, x20, #1
> - bl fft\n2\()_neon
> -
> - add x5, x10, #CELT_EXPTAB
> - mov x0, x21
> - ldr x2, [x5, x23, lsl #3] // s->exptab[N]
> - add x1, x21, #(\n2 * 8)
> - mov x3, #\n2
> - ldp x20, x21, [sp]
> - ldp x22, x30, [sp, #0x10]
> - ldp x23, x24, [sp, #0x20]
> - add sp, sp, #0x30
> - b fft15_pass
> -endfunc
> -.endm
> -
> - def_fft 60, 30
> - def_fft 120, 60
> - def_fft 240, 120
> - def_fft 480, 240
> - def_fft 960, 480
> -
> -function fft_b15_calc_neon
> - sub sp, sp, #0x50
> - ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0]
> - movrel x6, fact5
> - movrel x11, shuffle_0213
> - movrel x12, shuffle_1032
> - movrel x13, shuffle_2301
> - movrel x14, shuffle_3120
> - add x8, x8, #8
> - movrel x5, fft_tab_neon
> - stp x20, x30, [sp]
> - stp d8, d9, [sp, #0x10]
> - stp d10, d11, [sp, #0x20]
> - stp d12, d13, [sp, #0x30]
> - stp d14, d15, [sp, #0x40]
> - ld1 {v15.4s}, [x6]
> - ld1 {v0.4s,v1.4s}, [x8], #32
> - ld1 {v6.2s}, [x8], #8
> - ld1 {v2.4s,v3.4s}, [x8], #32
> - ld1 {v7.2s}, [x8], #8
> - ld1 {v4.4s,v5.4s}, [x8], #32
> - uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re
> - uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im
> - uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re
> - uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im
> - uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re
> - uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im
> - zip1 v14.4s, v6.4s, v7.4s //
> exp[5,10].re/exp[5,10].im
> - add x5, x5, x3, lsl #3
> - ldr x5, [x5]
> - mov x10, x0
> - blr x5
> - ldp x20, x30, [sp]
> - ldp d8, d9, [sp, #0x10]
> - ldp d10, d11, [sp, #0x20]
> - ldp d12, d13, [sp, #0x30]
> - ldp d14, d15, [sp, #0x40]
> - add sp, sp, #0x50
> - ret
> -endfunc
> -
> -const fft_tab_neon, relocate=1
> - .quad fft15_neon
> - .quad fft30_neon
> - .quad fft60_neon
> - .quad fft120_neon
> - .quad fft240_neon
> - .quad fft480_neon
> - .quad fft960_neon
> -endconst
> -
> -function ff_celt_imdct_half_neon, export=1
> - sub sp, sp, #0x20
> - stp x21, x30, [sp]
> - str s0, [sp, #0x10]
> -
> - ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4
> - mov x10, x0
> - mov x21, x1
> - sub w5, w5, #1
> - lsl x7, x3, #3 // 2 * stride *
> sizeof(float)
> - sub x8, xzr, x3, lsl #3 // -2 * stride *
> sizeof(float)
> - mul x5, x5, x3
> - ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE
> - ldr w3, [x0, #CELT_FFT_N]
> - add x5, x2, x5, lsl #2
> - mov x11, x9
> -
> - sub w6, w6, #4
> - ld1 {v0.s}[0], [x5], x8
> - ld1 {v1.s}[0], [x2], x7
> - ld1 {v4.4s,v5.4s}, [x10], #32
> - ld1 {v0.s}[1], [x5], x8
> - ld1 {v1.s}[1], [x2], x7
> - uzp1 v2.4s, v4.4s, v5.4s
> - ld1 {v0.s}[2], [x5], x8
> - ld1 {v1.s}[2], [x2], x7
> - uzp2 v3.4s, v4.4s, v5.4s
> - ld1 {v0.s}[3], [x5], x8
> - ld1 {v1.s}[3], [x2], x7
> -1:
> - subs w6, w6, #4
> -
> - ld1 {v20.s}[0], [x5], x8
> - ld1 {v21.s}[0], [x2], x7
> - ld1 {v4.4s,v5.4s}, [x10], #32
> -
> - fmul v6.4s, v0.4s, v2.4s
> - fmul v7.4s, v0.4s, v3.4s
> -
> - ld1 {v20.s}[1], [x5], x8
> - ld1 {v21.s}[1], [x2], x7
> -
> - fmls v6.4s, v1.4s, v3.4s
> - fmla v7.4s, v1.4s, v2.4s
> -
> - ld1 {v20.s}[2], [x5], x8
> - ld1 {v21.s}[2], [x2], x7
> -
> - uzp1 v2.4s, v4.4s, v5.4s
> - uzp2 v3.4s, v4.4s, v5.4s
> - ld1 {v20.s}[3], [x5], x8
> - ld1 {v21.s}[3], [x2], x7
> -
> - zip1 v4.4s, v6.4s, v7.4s
> - zip2 v5.4s, v6.4s, v7.4s
> -
> - fmul v6.4s, v20.4s, v2.4s
> - fmul v7.4s, v20.4s, v3.4s
> -
> - st1 {v4.4s,v5.4s}, [x9], #32
> -
> - fmls v6.4s, v21.4s, v3.4s
> - fmla v7.4s, v21.4s, v2.4s
> -
> - b.eq 3f
> -
> - subs w6, w6, #4
> - ld1 {v4.4s,v5.4s}, [x10], #32
> - ld1 {v0.s}[0], [x5], x8
> - ld1 {v1.s}[0], [x2], x7
> - uzp1 v2.4s, v4.4s, v5.4s
> - ld1 {v0.s}[1], [x5], x8
> - ld1 {v1.s}[1], [x2], x7
> - uzp2 v3.4s, v4.4s, v5.4s
> - ld1 {v0.s}[2], [x5], x8
> - ld1 {v1.s}[2], [x2], x7
> - zip1 v4.4s, v6.4s, v7.4s
> - zip2 v5.4s, v6.4s, v7.4s
> - ld1 {v0.s}[3], [x5], x8
> - ld1 {v1.s}[3], [x2], x7
> -
> - st1 {v4.4s,v5.4s}, [x9], #32
> -
> - b.gt 1b
> -
> - fmul v6.4s, v0.4s, v2.4s
> - fmul v7.4s, v0.4s, v3.4s
> - fmls v6.4s, v1.4s, v3.4s
> - fmla v7.4s, v1.4s, v2.4s
> -3:
> - zip1 v4.4s, v6.4s, v7.4s
> - zip2 v5.4s, v6.4s, v7.4s
> - st1 {v4.4s,v5.4s}, [x9], #32
> -
> - mov x2, x11
> - mov x4, #1
> -
> - bl fft_b15_calc_neon
> -
> - ldr w5, [x10, #CELT_LEN4]
> - ldr x6, [x10, #CELT_TWIDDLE]
> - ldr s31, [sp, #0x10]
> -
> - add x1, x21, x5, lsl #2
> - add x3, x6, x5, lsl #2
> - sub x0, x1, #16
> - sub x2, x3, #16
> - mov x8, #-16
> - mov x7, #16
> - mov x10, x0
> - mov x11, x1
> -
> - sub w5, w5, #4
> -
> - ld1 {v0.4s}, [x0], x8
> - ld1 {v1.4s}, [x1], x7
> - ld1 {v2.4s}, [x2], x8
> - ld1 {v3.4s}, [x3], x7
> -
> - uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i,
> i+1].re
> - uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i,
> i+1].im
> -
> - uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2,
> -i-1, +i, i+1].re
> - uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2,
> -i-1, +i, i+1].im
> -
> - fmul v1.4s, v6.4s, v5.4s
> - fmul v0.4s, v6.4s, v7.4s
> -2:
> - subs w5, w5, #4
> -
> - ld1 {v20.4s}, [x0], x8
> -
> - fmla v1.4s, v4.4s, v7.4s
> - fmls v0.4s, v4.4s, v5.4s
> -
> - ld1 {v21.4s}, [x1], x7
> -
> - ext v1.16b, v1.16b, v1.16b, #8
> - fmul v0.4s, v0.4s, v31.s[0]
> -
> - ld1 {v2.4s}, [x2], x8
> -
> - rev64 v1.4s, v1.4s
> - fmul v1.4s, v1.4s, v31.s[0]
> -
> - ld1 {v3.4s}, [x3], x7
> -
> - zip1 v5.4s, v0.4s, v1.4s
> - zip2 v7.4s, v0.4s, v1.4s
> -
> - uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i,
> i+1].re
> - uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i,
> i+1].im
> -
> - st1 {v5.4s}, [x10], x8
> - st1 {v7.4s}, [x11], x7
> -
> - uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2,
> -i-1, +i, i+1].re
> - uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2,
> -i-1, +i, i+1].im
> -
> - fmul v1.4s, v6.4s, v5.4s
> - fmul v0.4s, v6.4s, v7.4s
> - b.gt 2b
> -
> - fmla v1.4s, v4.4s, v7.4s
> - fmls v0.4s, v4.4s, v5.4s
> - ext v1.16b, v1.16b, v1.16b, #8
> - fmul v0.4s, v0.4s, v31.s[0]
> - rev64 v1.4s, v1.4s
> - fmul v1.4s, v1.4s, v31.s[0]
> - zip1 v5.4s, v0.4s, v1.4s
> - zip2 v7.4s, v0.4s, v1.4s
> - st1 {v5.4s}, [x10], x8
> - st1 {v7.4s}, [x11], x7
> -
> - ldp x21, x30, [sp]
> - add sp, sp, #0x20
> - ret
> -endfunc
> -
> -// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
> -const fact5, align=4
> - .float 0.30901699437494745, 0.95105651629515353
> - .float -0.80901699437494734, 0.58778525229247325
> -endconst
> diff --git a/libavcodec/imdct15.c b/libavcodec/imdct15.c
> index e91aa11085..7481c026cf 100644
> --- a/libavcodec/imdct15.c
> +++ b/libavcodec/imdct15.c
> @@ -136,9 +136,6 @@ av_cold int ff_imdct15_init(IMDCT15Context **ps, int N)
>
> s->imdct_half = imdct15_half;
>
> - if (ARCH_AARCH64)
> - ff_imdct15_init_aarch64(s);
> -
> *ps = s;
>
> return 0;
> diff --git a/libavcodec/imdct15.h b/libavcodec/imdct15.h
> index 1979aa76af..7a58aac8b3 100644
> --- a/libavcodec/imdct15.h
> +++ b/libavcodec/imdct15.h
> @@ -51,7 +51,4 @@ int ff_imdct15_init(IMDCT15Context **s, int N);
> */
> void ff_imdct15_uninit(IMDCT15Context **s);
>
> -
> -void ff_imdct15_init_aarch64(IMDCT15Context *s);
> -
> #endif /* AVCODEC_IMDCT15_H */
> --
> 2.11.0.390.gc69c2f50cf
>
>
Applied
More information about the ffmpeg-devel
mailing list