[FFmpeg-devel] [PATCH] mdct15: add assembly optimizations for the 15-point FFT

James Almer jamrial at gmail.com
Thu Jun 22 03:03:31 EEST 2017


On 6/21/2017 7:33 PM, Rostislav Pehlivanov wrote:
> fft15_c:    1961 decicycles in fft15, 4193738 runs,    566 skips
> fft15_fma3:  995 decicycles in fft15, 4193984 runs,    320 skips
> 
> I believe the FFT5 macro can be improved further, so suggestions
> are welcome.
> ---
>  libavcodec/mdct15.c          | 186 +++++++++++++++++++++----------------------
>  libavcodec/mdct15.h          |  26 +++---
>  libavcodec/x86/Makefile      |   2 +
>  libavcodec/x86/mdct15.asm    | 149 ++++++++++++++++++++++++++++++++++
>  libavcodec/x86/mdct15_init.c |  93 ++++++++++++++++++++++
>  5 files changed, 345 insertions(+), 111 deletions(-)
>  create mode 100644 libavcodec/x86/mdct15.asm
>  create mode 100644 libavcodec/x86/mdct15_init.c
> 
> diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
> index 8c42ece483..0cf734caf9 100644
> --- a/libavcodec/mdct15.c
> +++ b/libavcodec/mdct15.c
> @@ -57,11 +57,6 @@ av_cold void ff_mdct15_uninit(MDCT15Context **ps)
>      av_freep(ps);
>  }
>  
> -static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
> -
> -static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
> -                         ptrdiff_t stride, float scale);
> -
>  static inline int init_pfa_reindex_tabs(MDCT15Context *s)
>  {
>      int i, j;
> @@ -93,88 +88,8 @@ static inline int init_pfa_reindex_tabs(MDCT15Context *s)
>      return 0;
>  }
>  
> -av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> -{
> -    MDCT15Context *s;
> -    double alpha, theta;
> -    int len2 = 15 * (1 << N);
> -    int len  = 2 * len2;
> -    int i;
> -
> -    /* Tested and verified to work on everything in between */
> -    if ((N < 2) || (N > 13))
> -        return AVERROR(EINVAL);
> -
> -    s = av_mallocz(sizeof(*s));
> -    if (!s)
> -        return AVERROR(ENOMEM);
> -
> -    s->fft_n = N - 1;
> -    s->len4 = len2 / 2;
> -    s->len2 = len2;
> -    s->inverse = inverse;
> -    s->mdct = mdct15;
> -    s->imdct_half = imdct15_half;
> -
> -    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> -        goto fail;
> -
> -    if (init_pfa_reindex_tabs(s))
> -        goto fail;
> -
> -    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
> -    if (!s->tmp)
> -        goto fail;
> -
> -    s->twiddle_exptab  = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
> -    if (!s->twiddle_exptab)
> -        goto fail;
> -
> -    theta = 0.125f + (scale < 0 ? s->len4 : 0);
> -    scale = sqrt(fabs(scale));
> -    for (i = 0; i < s->len4; i++) {
> -        alpha = 2 * M_PI * (i + theta) / len;
> -        s->twiddle_exptab[i].re = cos(alpha) * scale;
> -        s->twiddle_exptab[i].im = sin(alpha) * scale;
> -    }
> -
> -    /* 15-point FFT exptab */
> -    for (i = 0; i < 19; i++) {
> -        if (i < 15) {
> -            double theta = (2.0f * M_PI * i) / 15.0f;
> -            if (!s->inverse)
> -                theta *= -1;
> -            s->exptab[i].re = cos(theta);
> -            s->exptab[i].im = sin(theta);
> -        } else { /* Wrap around to simplify fft15 */
> -            s->exptab[i] = s->exptab[i - 15];
> -        }
> -    }
> -
> -    /* 5-point FFT exptab */
> -    s->exptab[19].re = cos(2.0f * M_PI / 5.0f);
> -    s->exptab[19].im = sin(2.0f * M_PI / 5.0f);
> -    s->exptab[20].re = cos(1.0f * M_PI / 5.0f);
> -    s->exptab[20].im = sin(1.0f * M_PI / 5.0f);
> -
> -    /* Invert the phase for an inverse transform, do nothing for a forward transform */
> -    if (s->inverse) {
> -        s->exptab[19].im *= -1;
> -        s->exptab[20].im *= -1;
> -    }
> -
> -    *ps = s;
> -
> -    return 0;
> -
> -fail:
> -    ff_mdct15_uninit(&s);
> -    return AVERROR(ENOMEM);
> -}
> -
>  /* Stride is hardcoded to 3 */
> -static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
> -                        const FFTComplex *in)
> +static inline void fft5(FFTComplex *out, FFTComplex *in, FFTComplex exptab[2])
>  {
>      FFTComplex z0[4], t[6];
>  
> @@ -219,14 +134,14 @@ static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
>      out[4].im = in[0].im + z0[3].im;
>  }
>  
> -static void fft15(const FFTComplex exptab[22], FFTComplex *out, const FFTComplex *in, size_t stride)
> +static void fft15(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, size_t stride)
>  {
>      int k;
>      FFTComplex tmp1[5], tmp2[5], tmp3[5];
>  
> -    fft5(exptab + 19, tmp1, in + 0);
> -    fft5(exptab + 19, tmp2, in + 1);
> -    fft5(exptab + 19, tmp3, in + 2);
> +    fft5(tmp1, in + 0, exptab + 19);
> +    fft5(tmp2, in + 1, exptab + 19);
> +    fft5(tmp3, in + 2, exptab + 19);
>  
>      for (k = 0; k < 5; k++) {
>          FFTComplex t[2];
> @@ -253,10 +168,10 @@ static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
>      int i, j;
>      const int len4 = s->len4, len3 = len4 * 3, len8 = len4 >> 1;
>      const int l_ptwo = 1 << s->ptwo_fft.nbits;
> -    FFTComplex fft15in[15];
>  
>      /* Folding and pre-reindexing */
>      for (i = 0; i < l_ptwo; i++) {
> +        DECLARE_ALIGNED(32, FFTComplex, fft15in)[32];
>          for (j = 0; j < 15; j++) {
>              float re, im;
>              const int k = s->pfa_prereindex[i*15 + j];
> @@ -269,7 +184,7 @@ static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
>              }
>              CMUL(fft15in[j].re, fft15in[j].im, re, im, s->twiddle_exptab[k].re, -s->twiddle_exptab[k].im);
>          }
> -        fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
> +        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
>      }
>  
>      /* Then a 15xN FFT (where N is a power of two) */
> @@ -294,19 +209,19 @@ static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
>  static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
>                           ptrdiff_t stride, float scale)
>  {
> -    FFTComplex fft15in[15];
>      FFTComplex *z = (FFTComplex *)dst;
>      int i, j, len8 = s->len4 >> 1, l_ptwo = 1 << s->ptwo_fft.nbits;
>      const float *in1 = src, *in2 = src + (s->len2 - 1) * stride;
>  
>      /* Reindex input, putting it into a buffer and doing an Nx15 FFT */
>      for (i = 0; i < l_ptwo; i++) {
> +        DECLARE_ALIGNED(32, FFTComplex, fft15in)[32];
>          for (j = 0; j < 15; j++) {
>              const int k = s->pfa_prereindex[i*15 + j];
>              FFTComplex tmp = { *(in2 - 2*k*stride), *(in1 + 2*k*stride) };
>              CMUL3(fft15in[j], tmp, s->twiddle_exptab[k]);
>          }
> -        fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
> +        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
>      }
>  
>      /* Then a 15xN FFT (where N is a power of two) */
> @@ -327,3 +242,86 @@ static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
>          z[i0].im = scale * im1;
>      }
>  }
> +
> +av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
> +{
> +    MDCT15Context *s;
> +    double alpha, theta;
> +    int len2 = 15 * (1 << N);
> +    int len  = 2 * len2;
> +    int i;
> +
> +    /* Tested and verified to work on everything in between */
> +    if ((N < 2) || (N > 13))
> +        return AVERROR(EINVAL);
> +
> +    s = av_mallocz(sizeof(*s));
> +    if (!s)
> +        return AVERROR(ENOMEM);
> +
> +    s->fft_n      = N - 1;
> +    s->len4       = len2 / 2;
> +    s->len2       = len2;
> +    s->inverse    = inverse;
> +    s->fft15      = fft15;
> +    s->mdct       = mdct15;
> +    s->imdct_half = imdct15_half;
> +
> +    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
> +        goto fail;
> +
> +    if (init_pfa_reindex_tabs(s))
> +        goto fail;
> +
> +    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
> +    if (!s->tmp)
> +        goto fail;
> +
> +    s->twiddle_exptab  = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
> +    if (!s->twiddle_exptab)
> +        goto fail;
> +
> +    theta = 0.125f + (scale < 0 ? s->len4 : 0);
> +    scale = sqrt(fabs(scale));
> +    for (i = 0; i < s->len4; i++) {
> +        alpha = 2 * M_PI * (i + theta) / len;
> +        s->twiddle_exptab[i].re = cos(alpha) * scale;
> +        s->twiddle_exptab[i].im = sin(alpha) * scale;

cosf, sinf

> +    }
> +
> +    /* 15-point FFT exptab */
> +    for (i = 0; i < 19; i++) {
> +        if (i < 15) {
> +            double theta = (2.0f * M_PI * i) / 15.0f;
> +            if (!s->inverse)
> +                theta *= -1;
> +            s->exptab[i].re = cos(theta);
> +            s->exptab[i].im = sin(theta);
> +        } else { /* Wrap around to simplify fft15 */
> +            s->exptab[i] = s->exptab[i - 15];
> +        }
> +    }
> +
> +    /* 5-point FFT exptab */
> +    s->exptab[19].re = cos(2.0f * M_PI / 5.0f);
> +    s->exptab[19].im = sin(2.0f * M_PI / 5.0f);
> +    s->exptab[20].re = cos(1.0f * M_PI / 5.0f);
> +    s->exptab[20].im = sin(1.0f * M_PI / 5.0f);

Same

> +
> +    /* Invert the phase for an inverse transform, do nothing for a forward transform */
> +    if (s->inverse) {
> +        s->exptab[19].im *= -1;
> +        s->exptab[20].im *= -1;
> +    }
> +
> +    if (ARCH_X86)
> +        ff_mdct15_init_x86(s);
> +
> +    *ps = s;
> +
> +    return 0;
> +
> +fail:
> +    ff_mdct15_uninit(&s);
> +    return AVERROR(ENOMEM);
> +}
> diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
> index ef94edff6c..15a91220b6 100644
> --- a/libavcodec/mdct15.h
> +++ b/libavcodec/mdct15.h
> @@ -34,34 +34,26 @@ typedef struct MDCT15Context {
>      int *pfa_postreindex;
>  
>      FFTContext ptwo_fft;
> -
>      FFTComplex *tmp;
> -
>      FFTComplex *twiddle_exptab;
>  
> -    /* 0 - 18: fft15 twiddles, 19 - 20: fft5 twiddles */
> -    FFTComplex exptab[21];
> +    DECLARE_ALIGNED(32, FFTComplex, exptab)[64];
>  
> -    /**
> -     * Calculate a full 2N -> N MDCT
> -     */
> +    /* 15-point FFT */
> +    void (*fft15)(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, size_t stride);

Make stride ptrdiff_t.

> +
> +    /* Calculate a full 2N -> N MDCT */
>      void (*mdct)(struct MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
>  
> -    /**
> -     * Calculate the middle half of the iMDCT
> -     */
> +    /* Calculate the middle half of the iMDCT */
>      void (*imdct_half)(struct MDCT15Context *s, float *dst, const float *src,
>                         ptrdiff_t src_stride, float scale);
>  } MDCT15Context;
>  
> -/**
> - * Init an (i)MDCT of the length 2 * 15 * (2^N)
> - */
> +/* Init an (i)MDCT of the length 2 * 15 * (2^N) */
>  int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
> -
> -/**
> - * Frees a context
> - */
>  void ff_mdct15_uninit(MDCT15Context **ps);
>  
> +void ff_mdct15_init_x86(MDCT15Context *s);
> +
>  #endif /* AVCODEC_MDCT15_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index e65118d134..12f6b84372 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -53,6 +53,7 @@ OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_ini
>  OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
>  OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
>  OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
> +OBJS-$(CONFIG_MDCT15)                  += x86/mdct15_init.o

Move this to the subsystems section (top of the file).

>  OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
>  OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
>  OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
> @@ -158,6 +159,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER)     += x86/hevc_add_res.o            \
>                                            x86/hevc_sao.o                \
>                                            x86/hevc_sao_10bit.o
>  X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
> +X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
>  X86ASM-OBJS-$(CONFIG_MLP_DECODER)      += x86/mlpdsp.o
>  X86ASM-OBJS-$(CONFIG_MPEG4_DECODER)    += x86/xvididct.o
>  X86ASM-OBJS-$(CONFIG_PNG_DECODER)      += x86/pngdsp.o
> diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
> new file mode 100644
> index 0000000000..437df7bea6
> --- /dev/null
> +++ b/libavcodec/x86/mdct15.asm
> @@ -0,0 +1,149 @@
> +;******************************************************************************
> +;* SIMD optimized non-power of two MDCT functions
> +;*
> +;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA 32
> +
> +sign_adjust_5: times 2 dd 1.0, -1.0, -1.0, 1.0

"times 2 dd 0, 0x80000000, 0x80000000, 0" and using xorps instead of
mulps is much faster.

> +
> +SECTION .text
> +
> +%macro FFT5 7 ; src, offset, dst1 (1/2 filled), dst2, exptab1, exptab2, signadjust (uses m0-m4)
> +    movddup xm0, [%1q + 0*16 + 0 + %2]  ; in[ 0].re, in[ 0].im, in[ 1].re, in[ 1].im
> +    movsd   xm1, [%1q + 1*16 + 8 + %2]  ; in[ 3].re, in[ 3].im, in[ 4].re, in[ 4].im
> +    movups  xm2, [%1q + 2*16 + 8 + %2]  ; in[ 5].re, in[ 5].im, in[ 6].re, in[ 6].im
> +    movups  xm3, [%1q + 4*16 + 0 + %2]  ; in[ 8].re, in[ 8].im, in[ 9].re, in[ 9].im
> +    movsd   xm4, [%1q + 6*16 + 0 + %2]  ; in[12].re, in[12].im, in[13].re, in[13].im
> +
> +    shufps      xm1,  xm2, q3210        ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
> +    shufps      xm4,  xm3, q3210        ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
> +
> +    subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
> +    addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im
> +
> +    shufps      %3,   xm1, xm1, q3120   ; t[0].re, t[1].re, t[0].im, t[1].im
> +    haddps      %3,   xm4

haddps is very slow, so unless you need to do an horizontal sum that
fits the behavior of haddps perfectly, you're better manually shuffling
data and using addps instead.

> +    addps       %3,   xm0               ; DC[0].re, DC[0].im, junk...
> +    movlhps     %3,   %3                ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
> +
> +    shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
> +    shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im
> +
> +    mulps       xm%4, xm1, %5
> +    mulps       xm4,  xm3, %6
> +    mulps       xm1,  %6
> +
> +    mulps       xm1,  %7
> +    fmaddsubps  xm3,  xm3, %5, xm1      ; t[0].re, t[0].im, t[2].re, t[2].im
> +    subps       xm%4, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im
> +
> +    mova        xm4,  %7
> +
> +    unpckhpd    xm2,  xm3, xm%4         ; t[2].re, t[2].im, t[5].re, t[5].im
> +    unpcklpd    xm3,  xm%4              ; t[0].re, t[0].im, t[4].re, t[4].im

Use movhlps and movlhps.

> +
> +    mova        xm%4, xm4
> +    fmaddps     xm%4, xm2, xm%4, xm3

Right, so if you make the sign_adjust_5 constant work with xorps, this
will not work anymore.
I still think it's better to do it, so you'd have to replace this
fmaddps with i think xorps + addps.

> +    addps       xm%4, xm0               ; ACs(1)
> +
> +    fnmaddps    xm2,  xm2, xm4, xm3

I think this one would become xorps + subps.

> +    shufps      xm2,  xm2, q1032
> +    addps       xm2,  xm0               ; ACs(2)
> +    vinsertf128 m%4,  m%4, xm2, 1       ; All ACs (tmp[1] through to tmp[4])
> +%endmacro
> +
> +%macro BUTTERFLIES_DC 7 ; exptab, exptab_offset, src1, src2, src3, out, out_offset, (uses m0-m1)
> +    movaps m0, [%1q + %2]
> +    vextractf128 xm1, m0, 1

movaps xm0, [%1q + %2]
movaps xm1, [%1q + %2 + 16]

Might be faster.

> +
> +    mulps xm1, %5
> +    mulps xm0, %4
> +
> +    haddps xm0, xm1
> +    shufps xm0, xm0, q3120              ; t[0].re, t[1].re, t[0].im, t[1].im
> +
> +    haddps xm0, xm0

This is specifically what you need to avoid. shufps/movhlps + addps is
faster.

> +    addps  xm0, %3
> +
> +    movsd [%6q + %7], xm0
> +%endmacro
> +
> +%macro BUTTERFLIES_AC 8 ; exptab, exptab_offset, src1, src2, src3, out, strideq, k_off
> +    mulps m0, %4, [%1q + 64*0 + 0*mmsize + %2]
> +    mulps m1, %4, [%1q + 64*0 + 1*mmsize + %2]
> +    mulps m2, %5, [%1q + 64*1 + 0*mmsize + %2]
> +    mulps m3, %5, [%1q + 64*1 + 1*mmsize + %2]
> +
> +    shufps m1, m1, q2301
> +    shufps m3, m3, q2301
> +
> +    addps m0, m1
> +    addps m2, m3
> +    addps m0, m2
> +    addps m0, %3
> +
> +    vextractf128 xmm1, m0, 1
> +

> +    mov r5q, %8
> +    add r5q, strideq
> +    movlps [%6q + r5q], xmm0
> +
> +    add r5q, strideq
> +    movhps [%6q + r5q], xmm0
> +
> +    add r5q, strideq
> +    movlps [%6q + r5q], xmm1
> +
> +    add r5q, strideq
> +    movhps [%6q + r5q], xmm1
> +%endmacro

This macros are not optimal, including the stride handling code before
calling them.
You need to have two regs, strideq and stride3q, with the corresponding
values. With those two you can derive all the effective addresses if you
instead increment the outq pointer.

> +
> +;***************************************************************************************
> +;void ff_fft15_fma3(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, size_t stride);
> +;***************************************************************************************
> +INIT_YMM fma3

Ideally, you'd add an AVX version as well that doesn't use fma at all.

> +cglobal fft15, 4, 6, 14, out, in, exptab, stride
> +
> +    shl strideq, 3
> +
> +    movaps m5, [exptabq + 480]
> +    vextractf128 xm6, m5, 1

Same here, two movaps from memory with xmm as dest might be faster.

> +    movaps xm7, [sign_adjust_5]

If you only need 16 bytes, no need to declare the constant with times 2.

> +
> +    FFT5 in,  0,  xm8, 11, xm5, xm6, xm7
> +    FFT5 in,  8,  xm9, 12, xm5, xm6, xm7
> +    FFT5 in, 16, xm10, 13, xm5, xm6, xm7
> +
> +    BUTTERFLIES_DC exptab, (8*6 + 4*0)*2*4, xm8, xm9, xm10, out,   0
> +    imul r4q, strideq, 5
> +    BUTTERFLIES_DC exptab, (8*6 + 4*1)*2*4, xm8, xm9, xm10, out, r4q
> +    imul r4q, 2 ; stride * 10
> +    BUTTERFLIES_DC exptab, (8*6 + 4*2)*2*4, xm8, xm9, xm10, out, r4q
> +
> +    BUTTERFLIES_AC exptab, (8*0)*2*4, m11, m12, m13, out, strideq,   0
> +    imul r4q, strideq, 5
> +    BUTTERFLIES_AC exptab, (8*2)*2*4, m11, m12, m13, out, strideq, r4q
> +    imul r4q, 2 ; stride * 10
> +    BUTTERFLIES_AC exptab, (8*4)*2*4, m11, m12, m13, out, strideq, r4q

Based on what i said above, try the following (just guidelines, and
untested):

%macro BUTTERFLIES_DC
...
movsd [outq + %7], xm0
%endmacro

%macro BUTTERFLIES_AC
...
movlps [outq + strideq],   xm0
movhps [outq + strideq*2], xm0
movlps [outq + stride3q],  xm1
movhps [outq + strideq*4], xm1
%endmacro

cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride3
...
BUTTERFLIES_DC ... , 0
lea outq, [outq + strideq*4]
BUTTERFLIES_DC ... , strideq
lea outq, [outq + strideq*4]
BUTTERFLIES_DC ... , strideq
lea outq, [outq + strideq*4]

lea stride3q, [strideq + strideq * 2]
BUTTERFLIES_AC ...
lea outq, [outq + strideq*4]
BUTTERFLIES_AC ...
lea outq, [outq + strideq*4]
BUTTERFLIES_AC ...
lea outq, [outq + strideq*4]
RET

> +
> +    RET
> diff --git a/libavcodec/x86/mdct15_init.c b/libavcodec/x86/mdct15_init.c
> new file mode 100644
> index 0000000000..16306b893f
> --- /dev/null
> +++ b/libavcodec/x86/mdct15_init.c
> @@ -0,0 +1,93 @@
> +/*
> + * Non power of two MDCT assembly optimizations
> + * Copyright (C) 2017 Rostislav Pehlivanov <atomnuker at gmail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/float_dsp.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/mdct15.h"
> +
> +void ff_fft15_fma3(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, size_t stride);
> +
> +static void perm_twiddles(MDCT15Context *s)
> +{
> +    int k;
> +
> +    FFTComplex tmp[64], tmp2[64];
> +    memcpy(tmp, s->exptab, sizeof(FFTComplex)*32);
> +
> +    /* 15-point FFT twiddles */
> +    for (k = 0; k < 5; k++) {
> +        tmp2[6*k + 0] = tmp[k +  0];
> +        tmp2[6*k + 2] = tmp[k +  5];
> +        tmp2[6*k + 4] = tmp[k + 10];
> +
> +        tmp2[6*k + 1] = tmp[2 * (k + 0)];
> +        tmp2[6*k + 3] = tmp[2 * (k + 5)];
> +        tmp2[6*k + 5] = tmp[2 *  k + 5 ];
> +    }
> +
> +    /* Newtab */
> +
> +    for (k = 0; k < 6; k++) {
> +        FFTComplex ac_exp[] = {
> +            { tmp2[6*1 + k].re,  tmp2[6*1 + k].re },
> +            { tmp2[6*2 + k].re,  tmp2[6*2 + k].re },
> +            { tmp2[6*3 + k].re,  tmp2[6*3 + k].re },
> +            { tmp2[6*4 + k].re,  tmp2[6*4 + k].re },
> +            { tmp2[6*1 + k].im, -tmp2[6*1 + k].im },
> +            { tmp2[6*2 + k].im, -tmp2[6*2 + k].im },
> +            { tmp2[6*3 + k].im, -tmp2[6*3 + k].im },
> +            { tmp2[6*4 + k].im, -tmp2[6*4 + k].im },
> +        };
> +        memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
> +    }
> +
> +    /* Specialcase when k = 0 */
> +    for (k = 0; k < 3; k++) {
> +        FFTComplex dc_exp[] = {
> +            { tmp2[2*k + 0].re, -tmp2[2*k + 0].im },
> +            { tmp2[2*k + 0].im,  tmp2[2*k + 0].re },
> +            { tmp2[2*k + 1].re, -tmp2[2*k + 1].im },
> +            { tmp2[2*k + 1].im,  tmp2[2*k + 1].re },
> +        };
> +        memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
> +    }
> +
> +    /* 5-point FFT twiddles */
> +    FFTComplex exp_5point[] = {
> +        { tmp[19].re, tmp[19].re },
> +        { tmp[19].im, tmp[19].im },
> +        { tmp[20].re, tmp[20].re },
> +        { tmp[20].im, tmp[20].im },
> +    };
> +    memcpy(s->exptab + 8*6 + 4*3, exp_5point, 4*sizeof(FFTComplex));
> +}
> +
> +av_cold void ff_mdct15_init_x86(MDCT15Context *s)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (ARCH_X86_64 && EXTERNAL_FMA3(cpu_flags)) {
> +        perm_twiddles(s);
> +        s->fft15 = ff_fft15_fma3;
> +    }
> +}
> 



More information about the ffmpeg-devel mailing list