00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024 #include "fft.h"
00025
/*
 * Two 32-bit words, each with only bit 31 set (1U<<31).
 * ff_imdct_calc_3dn2() loads the pair into %mm7 and XORs it into %mm0,
 * which flips the top bit of both 32-bit lanes (the sign bit when the
 * lanes hold single-precision floats).
 * NOTE(review): the first DECLARE_ALIGNED argument (8) is presumably the
 * alignment in bytes; confirm against the macro definition.
 */
00026 DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 };
00027
/*
 * PSWAPD(s,d): emit asm text that writes into MMX register d the value of s
 * with its two 32-bit halves exchanged.
 *
 * With EMULATE_3DNOWEXT defined, the swap is built from three instructions
 * (copy s to d, shift the high half of d down, then unpack the low half of
 * s into the high half of d). s appears twice in the expansion, so a memory
 * operand is read twice. In this mode every *_3dn2 name used in this file
 * is also redefined to the matching *_3dn name, so the functions below are
 * compiled under the *_3dn names.
 *
 * Without EMULATE_3DNOWEXT, the single "pswapd" instruction is emitted.
 */
00028 #ifdef EMULATE_3DNOWEXT
00029 #define PSWAPD(s,d)\
00030 "movq "#s","#d"\n"\
00031 "psrlq $32,"#d"\n"\
00032 "punpckldq "#s","#d"\n"
00033 #define ff_fft_calc_3dn2 ff_fft_calc_3dn
00034 #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
00035 #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
00036 #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
00037 #define ff_imdct_half_3dn2 ff_imdct_half_3dn
00038 #else
00039 #define PSWAPD(s,d) "pswapd "#s","#d"\n"
00040 #endif
00041
/*
 * FFT kernels that are only declared here; their definitions are not in the
 * visible part of this file. Both take the complex buffer z and the
 * transform size exponent nbits.
 * NOTE(review): judging by the name and by the fix-up in ff_fft_calc_3dn2(),
 * the "interleave" variant presumably also converts the result to
 * interleaved re/im order; confirm against the implementation.
 */
00042 void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
00043 void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
00044
/**
 * Run the FFT on z in place using the interleaving dispatch kernel.
 *
 * @param s FFT context; only s->nbits is read here (transform length is
 *          1 << s->nbits)
 * @param z complex buffer, processed in place
 *
 * For lengths of at most 8 elements, each pair z[i].im / z[i+1].re (i even)
 * is swapped afterwards.
 * NOTE(review): presumably the small-size kernels do not produce the
 * interleaved layout themselves; confirm against the dispatch code.
 */
00045 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
00046 {
00047 int n = 1<<s->nbits;
00048 int i;
00049 ff_fft_dispatch_interleave_3dn2(z, s->nbits);
/* Leave MMX/3DNow! state before any C code touches float values. */
00050 __asm__ volatile("femms");
00051 if(n <= 8)
00052 for(i=0; i<n; i+=2)
00053 FFSWAP(FFTSample, z[i].im, z[i+1].re);
00054 }
00055
/**
 * Half-size inverse MDCT: pre-rotation, in-place FFT, post-rotation.
 *
 * @param s      context; reads mdct_size, nbits, revtab, tcos and tsin
 * @param output destination buffer, treated as an FFTComplex array (z) and
 *               used both as FFT workspace and for the final result
 * @param input  source coefficients; read from the start upwards (in1) and
 *               from index mdct_size/2 - 1 downwards (in2)
 *
 * NOTE(review): the byte offsets used in the asm assume a 4-byte FFTSample
 * and an 8-byte FFTComplex; confirm against the type definitions.
 */
00056 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
00057 {
00058 x86_reg j, k;
00059 long n = s->mdct_size;
00060 long n2 = n >> 1;
00061 long n4 = n >> 2;
00062 long n8 = n >> 3;
00063 const uint16_t *revtab = s->revtab;
00064 const FFTSample *tcos = s->tcos;
00065 const FFTSample *tsin = s->tsin;
00066 const FFTSample *in1, *in2;
00067 FFTComplex *z = (FFTComplex *)output;
00068
00069
/* Pre-rotation: in1 walks forward from input[0], in2 backward from input[n2-1]. */
00070 in1 = input;
00071 in2 = input + n2 - 1;
00072 #ifdef EMULATE_3DNOWEXT
/* Emulation only: low lane of %mm7 = bit 31 mask, high lane = 0; used by
 * the pxor below to negate just the low lane. */
00073 __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31));
00074 #endif
00075 for(k = 0; k < n4; k++) {
00076
/*
 * Complex multiply of (in2[-2k], in1[2k]) by (tcos[k], tsin[k]):
 *   low lane  of %mm0 = in2[-2k]*tcos[k] - in1[2k]*tsin[k]
 *   high lane of %mm0 = in2[-2k]*tsin[k] + in1[2k]*tcos[k]
 * NOTE(review): this statement declares no outputs or clobbers although it
 * writes %mm0-%mm3, and the result is handed to the next asm statement in
 * %mm0; the code relies on the compiler not using MMX registers in between.
 */
00077 __asm__ volatile(
/* %mm0 = (in2[-2k], in1[2k]), %mm1 = (tcos[k], tsin[k]) as (low, high) */
00078 "movd %0, %%mm0 \n"
00079 "movd %2, %%mm1 \n"
00080 "punpckldq %1, %%mm0 \n"
00081 "punpckldq %3, %%mm1 \n"
/* %mm2 = copy of the inputs, %mm3 = (tsin[k], tcos[k]) */
00082 "movq %%mm0, %%mm2 \n"
00083 PSWAPD( %%mm1, %%mm3 )
/* %mm0 = (in2*cos, in1*sin), %mm2 = (in2*sin, in1*cos) */
00084 "pfmul %%mm1, %%mm0 \n"
00085 "pfmul %%mm3, %%mm2 \n"
00086 #ifdef EMULATE_3DNOWEXT
/* Emulated pfpnacc: regroup to %mm0 = (in1*sin, in1*cos) and
 * %mm1 = (in2*cos, in2*sin), negate the low lane of %mm0, then add. */
00087 "movq %%mm0, %%mm1 \n"
00088 "punpckhdq %%mm2, %%mm0 \n"
00089 "punpckldq %%mm2, %%mm1 \n"
00090 "pxor %%mm7, %%mm0 \n"
00091 "pfadd %%mm1, %%mm0 \n"
00092 #else
/* low = %mm0.low - %mm0.high, high = %mm2.low + %mm2.high */
00093 "pfpnacc %%mm2, %%mm0 \n"
00094 #endif
00095 ::"m"(in2[-2*k]), "m"(in1[2*k]),
00096 "m"(tcos[k]), "m"(tsin[k])
00097 );
/* Store the rotated value at the position given by revtab[k]. */
00098 __asm__ volatile(
00099 "movq %%mm0, %0 \n\t"
00100 :"=m"(z[revtab[k]])
00101 );
00102 }
00103
/* In-place FFT over z using the non-interleaving dispatch kernel. */
00104 ff_fft_dispatch_3dn2(z, s->nbits);
00105
/*
 * CMUL(j,mm0,mm1): asm text for one post-rotation step at byte offset j.
 * Loads 8 bytes at (%2 + 2*j) into %mm6 (call it A) and the following 8
 * bytes into mm0 (call it B), then computes lane-wise with the 8 bytes of
 * cosines at (%3 + j) and sines at (%4 + j):
 *   mm0 = B*sin - A*cos
 *   mm1 = A*sin + B*cos
 * Clobbers %mm6 and %mm7.
 * NOTE(review): A and B are presumably the packed real and imaginary parts
 * of two neighbouring elements as left by ff_fft_dispatch_3dn2(); confirm
 * against that kernel's output layout.
 */
00106 #define CMUL(j,mm0,mm1)\
00107 "movq (%2,"#j",2), %%mm6 \n"\
00108 "movq 8(%2,"#j",2), "#mm0"\n"\
00109 "movq %%mm6, "#mm1"\n"\
00110 "movq "#mm0",%%mm7 \n"\
00111 "pfmul (%3,"#j"), %%mm6 \n"\
00112 "pfmul (%4,"#j"), "#mm0"\n"\
00113 "pfmul (%4,"#j"), "#mm1"\n"\
00114 "pfmul (%3,"#j"), %%mm7 \n"\
00115 "pfsub %%mm6, "#mm0"\n"\
00116 "pfadd %%mm7, "#mm1"\n"
00117
00118
/*
 * Post-rotation. j and k are byte offsets relative to the middle of the
 * tables (tcos+n8, tsin+n8) and, scaled by 2, relative to z+n8. j climbs
 * from -n2 towards 0 while k descends from n2-8, so each iteration handles
 * one 16-byte group from the front and its mirror group from the back.
 */
00119 j = -n2;
00120 k = n2-8;
00121 __asm__ volatile(
00122 "1: \n"
/* %mm0/%mm1 = results for the group at j, %mm2/%mm3 = results for k */
00123 CMUL(%0, %%mm0, %%mm1)
00124 CMUL(%1, %%mm2, %%mm3)
/* Low lanes: %mm0 and %mm2 stay in their own group (offset 0), while
 * %mm1 and %mm3 go to offset 12 of the opposite group. */
00125 "movd %%mm0, (%2,%0,2) \n"
00126 "movd %%mm1,12(%2,%1,2) \n"
00127 "movd %%mm2, (%2,%1,2) \n"
00128 "movd %%mm3,12(%2,%0,2) \n"
/* Bring the high lanes down so movd can store them. */
00129 "psrlq $32, %%mm0 \n"
00130 "psrlq $32, %%mm1 \n"
00131 "psrlq $32, %%mm2 \n"
00132 "psrlq $32, %%mm3 \n"
/* High lanes: %mm0/%mm2 to offset 8 of their own group, %mm1/%mm3 to
 * offset 4 of the opposite group. */
00133 "movd %%mm0, 8(%2,%0,2) \n"
00134 "movd %%mm1, 4(%2,%1,2) \n"
00135 "movd %%mm2, 8(%2,%1,2) \n"
00136 "movd %%mm3, 4(%2,%0,2) \n"
/* The add comes last so that jl tests j: loop while j is still negative. */
00137 "sub $8, %1 \n"
00138 "add $8, %0 \n"
00139 "jl 1b \n"
00140 :"+r"(j), "+r"(k)
00141 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
00142 :"memory"
00143 );
/* Leave MMX/3DNow! state before returning to C code. */
00144 __asm__ volatile("femms");
00145 }
00146
/**
 * Full inverse MDCT built on ff_imdct_half_3dn2().
 *
 * @param s      context; mdct_size is read here, and s is passed on to
 *               ff_imdct_half_3dn2()
 * @param output destination buffer; the half transform is written starting
 *               at output + mdct_size/4, then the regions below and above
 *               it are filled by mirroring
 * @param input  source coefficients, passed unchanged to
 *               ff_imdct_half_3dn2()
 *
 * NOTE(review): the byte offsets below assume a 4-byte FFTSample; confirm
 * against the type definition.
 */
00147 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
00148 {
00149 x86_reg j, k;
00150 long n = s->mdct_size;
00151 long n4 = n >> 2;
00152
00153 ff_imdct_half_3dn2(s, output+n4, input);
00154
/*
 * j and k are byte offsets: j climbs from -n towards 0 and k descends from
 * n-8, 8 bytes (two samples) per iteration. With 4-byte samples this sets
 *   output[0 .. n4)     = negated, reversed copy of output[n4 .. n2)
 *   output[3*n4 .. n)   = reversed copy of output[n2 .. 3*n4)
 * NOTE(review): the asm stores through %2 and %3 but declares neither a
 * "memory" clobber nor memory output operands, unlike the post-rotation
 * loop in ff_imdct_half_3dn2(); confirm this is intentional. %mm0, %mm1
 * and %mm7 are also not declared as clobbered.
 */
00155 j = -n;
00156 k = n-8;
00157 __asm__ volatile(
/* %mm7 = bit 31 mask for both lanes (m1m1) */
00158 "movq %4, %%mm7 \n"
00159 "1: \n"
/* Load two samples from each source region with their order swapped. */
00160 PSWAPD((%2,%1), %%mm0)
00161 PSWAPD((%3,%0), %%mm1)
/* Flip bit 31 of both lanes of %mm0, negating both samples. */
00162 "pxor %%mm7, %%mm0 \n"
00163 "movq %%mm1, (%3,%1) \n"
00164 "movq %%mm0, (%2,%0) \n"
/* The add comes last so that jl tests j: loop while j is still negative. */
00165 "sub $8, %1 \n"
00166 "add $8, %0 \n"
00167 "jl 1b \n"
00168 :"+r"(j), "+r"(k)
00169 :"r"(output+n4), "r"(output+n4*3),
00170 "m"(*m1m1)
00171 );
/* Leave MMX/3DNow! state before returning to C code. */
00172 __asm__ volatile("femms");
00173 }