[FFmpeg-devel] [PATCH] split-radix FFT

Tue Jul 29 02:59:15 CEST 2008

On Fri, Jul 25, 2008 at 08:14:00PM -0600, Loren Merritt wrote:
> $subject, vaguely based on djbfft.
> Changed from djb:
> * added simd.
> * removed the hand-scheduled pentium-pro code. gcc's output from simple C 
> is better on all cpus I have access to.
> * removed the distinction between fft and ifft. they're just permutations 
> of eachother, so the difference belongs in revtab[] and not in the code.
> * removed the distinction between pass() and pass_big(). C can always use 
> the memory-efficient version, and simd never does because the shuffles are 
> too costly.
> * made an entirely different pass_big(), to avoid store->load aliasing.
>
> I tried the tangent FFT, but I couldn't make it faster than split-radix. 
> Tangent has asymptotically 5% fewer arithmetic ops, but only 1-2% for sizes 
> typical of audio codecs, and even a couple extra shuffles or other overhead 
> pushes it over.
>
> I tried an in-place fft_permute, but it wasn't any faster than out-of-place 
> + memcpy, and quite a bit more complex.
[...]
> -    for(i=0;i<n;i++) {
> -        m=0;
> -        for(j=0;j<nbits;j++) {
> -            m |= ((i >> j) & 1) << (nbits-j-1);
> +        for(i=0;i<n;i++) {
> +            m=0;
> +            for(j=0;j<nbits;j++) {
> +                m |= ((i >> j) & 1) << (nbits-j-1);
> +            }
> +            s->revtab[i]=m;
>          }
> -        s->revtab[i]=m;
>      }

The reindentation should be in a separate commit from the rest.

[...]
> @@ -23,105 +22,209 @@
>  #include "libavutil/x86_cpu.h"
>  #include "libavcodec/dsputil.h"
>  
> -static const int p1m1[2] __attribute__((aligned(8))) =
> -    { 0, 1 << 31 };
> +DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
> +DECLARE_ALIGNED_8(static const int, m1p1[2]) = { 1<<31, 0 };
[...]
>  
> -static const int m1p1[2] __attribute__((aligned(8))) =
> -    { 1 << 31, 0 };

Ideally, this should also be in a separate commit from the functional
changes.

> +#ifdef EMULATE_3DNOWEXT
> +#define PSWAPD(s,d)\
> +    "movq "#s","#d"\n"\
> +    "psrlq $32,"#d"\n"\
> +    "punpckldq "#s","#d"\n"

> +#define PSWAPD_UNARY(s)\
> +    "sub $8, %%"REG_SP"\n"\
> +    "movd "#s", 4(%%"REG_SP")\n"\
> +    "punpckhdq (%%"REG_SP"), "#s"\n"\
> +    "add $8, %%"REG_SP"\n"

Did gcc fail with a "+m" constraint here?

> +#define ff_fft_calc_3dn2 ff_fft_calc_3dn
> +#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
> +#define ff_imdct_half_3dn2 ff_imdct_half_3dn
> +#else
> +#define PSWAPD(s,d)     "pswapd "#s","#d"\n"
> +#define PSWAPD_UNARY(s) "pswapd "#s","#s"\n"
> +#endif
>  
> -void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
> +#define T2(m0,m1,z0,z1)\
> +    asm volatile(\
> +        "movq     %0,  "#z0" \n"\
> +        "movq   "#z0", "#z1" \n"\
> +        "pfadd    %1,  "#z0" \n"\
> +        "pfsub    %1,  "#z1" \n"\
> +        ::"m"(m0), "m"(m1)\
> +    );
> +
> +#define T4(z0,z1,z2,z3,t0,t1)\
> +    asm volatile(\
> +        "movq   "#z2", "#t1" \n" /* FIXME redundant */\
> +        "movq   "#z2", "#t0" \n" /* {r2,i2} */\
> +        "pfsub  "#z3", "#t1" \n"\
> +        "pfadd  "#z3", "#t0" \n" /* {t6,t5} */\
> +        "pxor     %0,  "#t1" \n" /* {t8,t7} */\
> +        "movq   "#z0", "#z2" \n"\
> +        PSWAPD_UNARY(t1)\
> +        "pfadd  "#t0", "#z0" \n" /* {r0,i0} */\
> +        "pfsub  "#t0", "#z2" \n" /* {r2,i2} */\
> +        "movq   "#z1", "#z3" \n"\
> +        "pfadd  "#t1", "#z1" \n" /* {r1,i1} */\
> +        "pfsub  "#t1", "#z3" \n" /* {r3,i3} */\
> +        ::"m"(*m1p1)\
> +    );
> +
> +#define LOAD(mem,mmx)\
> +    asm volatile("movq %0, "#mmx ::"m"(mem))
> +
> +#define SAVE(mem,mmx)\
> +    asm volatile("movq "#mmx", %0" :"=m"(mem))
> +
> +#define PUNPCK(a,b,c)\
> +    asm volatile(\
> +        "movq      "#a", "#c"\n"\
> +        "punpckldq "#b", "#a"\n"\
> +        "punpckhdq "#b", "#c"\n"\
> +    :);
> +
> +#define PUNPCK_MEM(a,b,c)\
> +    asm volatile(\
> +        "movq      "#a", "#c"\n"\
> +        "punpckldq  %0,  "#a"\n"\
> +        "punpckhdq  %0,  "#c"\n"\
> +        ::"m"(b)\
> +    );
> +
> +static void fft4(FFTComplex *z)
>  {
> -    int ln = s->nbits;
> -    long j;
> -    x86_reg i;
> -    long nblocks, nloops;
> -    FFTComplex *p, *cptr;
> +    T2(z[0], z[1], %%mm0, %%mm1);
> +    LOAD(z[2], %%mm2);
> +    LOAD(z[3], %%mm3);
> +    T4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5);
> +    PUNPCK(%%mm0, %%mm1, %%mm4);
> +    PUNPCK(%%mm2, %%mm3, %%mm5);
> +    SAVE(z[0], %%mm0);
> +    SAVE(z[1], %%mm4);
> +    SAVE(z[2], %%mm2);
> +    SAVE(z[3], %%mm5);
> +}

Is there any reason why separate asm() statements are chained? I think a single
asm block, or even nasm/yasm if you prefer, would be better.
The way it's written is almost asking for gcc to put something in between;
I am especially concerned about the -fPIC case and gcc putting all the GOT
"magic" in between the asms ...

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The misfortune of the wise is better than the prosperity of the fool.
-- Epicurus
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080729/41574ded/attachment.pgp>