00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86/asm.h"
00024 #include "libavutil/x86/cpu.h"
00025 #include "libavcodec/dsputil.h"
00026 #include "libavcodec/mpegaudiodsp.h"
00027
00028 #define DECL(CPU)\
00029 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
00030 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
00031
00032 DECL(sse)
00033 DECL(sse2)
00034 DECL(sse3)
00035 DECL(ssse3)
00036 DECL(avx)
00037
00038 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
00039 float *tmpbuf);
00040 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
00041 float *tmpbuf);
00042
00043 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00044
00045 #if HAVE_SSE2_INLINE
00046
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op (MACS or MLSS) to sum for the eight products
 * w[k * 64] * p[k * 64], k = 0..7.
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement and
 * stays valid inside an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                     \
do {                                            \
    op(sum, (w)[0 * 64], (p)[0 * 64]);          \
    op(sum, (w)[1 * 64], (p)[1 * 64]);          \
    op(sum, (w)[2 * 64], (p)[2 * 64]);          \
    op(sum, (w)[3 * 64], (p)[3 * 64]);          \
    op(sum, (w)[4 * 64], (p)[4 * 64]);          \
    op(sum, (w)[5 * 64], (p)[5 * 64]);          \
    op(sum, (w)[6 * 64], (p)[6 * 64]);          \
    op(sum, (w)[7 * 64], (p)[7 * 64]);          \
} while (0)
00061
00062 static void apply_window(const float *buf, const float *win1,
00063 const float *win2, float *sum1, float *sum2, int len)
00064 {
00065 x86_reg count = - 4*len;
00066 const float *win1a = win1+len;
00067 const float *win2a = win2+len;
00068 const float *bufa = buf+len;
00069 float *sum1a = sum1+len;
00070 float *sum2a = sum2+len;
00071
00072
00073 #define MULT(a, b) \
00074 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
00075 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
00076 "mulps %%xmm2, %%xmm1 \n\t" \
00077 "subps %%xmm1, %%xmm0 \n\t" \
00078 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
00079 "subps %%xmm2, %%xmm4 \n\t" \
00080
00081 __asm__ volatile(
00082 "1: \n\t"
00083 "xorps %%xmm0, %%xmm0 \n\t"
00084 "xorps %%xmm4, %%xmm4 \n\t"
00085
00086 MULT( 0, 0)
00087 MULT( 256, 64)
00088 MULT( 512, 128)
00089 MULT( 768, 192)
00090 MULT(1024, 256)
00091 MULT(1280, 320)
00092 MULT(1536, 384)
00093 MULT(1792, 448)
00094
00095 "movaps %%xmm0, (%4,%0) \n\t"
00096 "movaps %%xmm4, (%5,%0) \n\t"
00097 "add $16, %0 \n\t"
00098 "jl 1b \n\t"
00099 :"+&r"(count)
00100 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00101 );
00102
00103 #undef MULT
00104 }
00105
00106 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00107 int incr)
00108 {
00109 LOCAL_ALIGNED_16(float, suma, [17]);
00110 LOCAL_ALIGNED_16(float, sumb, [17]);
00111 LOCAL_ALIGNED_16(float, sumc, [17]);
00112 LOCAL_ALIGNED_16(float, sumd, [17]);
00113
00114 float sum;
00115
00116
00117 __asm__ volatile(
00118 "movaps 0(%0), %%xmm0 \n\t" \
00119 "movaps 16(%0), %%xmm1 \n\t" \
00120 "movaps 32(%0), %%xmm2 \n\t" \
00121 "movaps 48(%0), %%xmm3 \n\t" \
00122 "movaps %%xmm0, 0(%1) \n\t" \
00123 "movaps %%xmm1, 16(%1) \n\t" \
00124 "movaps %%xmm2, 32(%1) \n\t" \
00125 "movaps %%xmm3, 48(%1) \n\t" \
00126 "movaps 64(%0), %%xmm0 \n\t" \
00127 "movaps 80(%0), %%xmm1 \n\t" \
00128 "movaps 96(%0), %%xmm2 \n\t" \
00129 "movaps 112(%0), %%xmm3 \n\t" \
00130 "movaps %%xmm0, 64(%1) \n\t" \
00131 "movaps %%xmm1, 80(%1) \n\t" \
00132 "movaps %%xmm2, 96(%1) \n\t" \
00133 "movaps %%xmm3, 112(%1) \n\t"
00134 ::"r"(in), "r"(in+512)
00135 :"memory"
00136 );
00137
00138 apply_window(in + 16, win , win + 512, suma, sumc, 16);
00139 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00140
00141 SUM8(MACS, suma[0], win + 32, in + 48);
00142
00143 sumc[ 0] = 0;
00144 sumb[16] = 0;
00145 sumd[16] = 0;
00146
00147 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
00148 "movups " #sumd "(%4), %%xmm0 \n\t" \
00149 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00150 "subps " #suma "(%1), %%xmm0 \n\t" \
00151 "movaps %%xmm0," #out1 "(%0) \n\t" \
00152 \
00153 "movups " #sumc "(%3), %%xmm0 \n\t" \
00154 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00155 "addps " #sumb "(%2), %%xmm0 \n\t" \
00156 "movaps %%xmm0," #out2 "(%0) \n\t"
00157
00158 if (incr == 1) {
00159 __asm__ volatile(
00160 SUMS( 0, 48, 4, 52, 0, 112)
00161 SUMS(16, 32, 20, 36, 16, 96)
00162 SUMS(32, 16, 36, 20, 32, 80)
00163 SUMS(48, 0, 52, 4, 48, 64)
00164
00165 :"+&r"(out)
00166 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00167 :"memory"
00168 );
00169 out += 16*incr;
00170 } else {
00171 int j;
00172 float *out2 = out + 32 * incr;
00173 out[0 ] = -suma[ 0];
00174 out += incr;
00175 out2 -= incr;
00176 for(j=1;j<16;j++) {
00177 *out = -suma[ j] + sumd[16-j];
00178 *out2 = sumb[16-j] + sumc[ j];
00179 out += incr;
00180 out2 -= incr;
00181 }
00182 }
00183
00184 sum = 0;
00185 SUM8(MLSS, sum, win + 16 + 32, in + 32);
00186 *out = sum;
00187 }
00188
00189 #endif
00190
00191 #if HAVE_YASM
/*
 * Define imdct36_blocks_<CPU1>(): process `count` blocks.
 *
 * As long as a whole group of four blocks remains, the group is handed to
 * ff_four_imdct36_float_<CPU2>() with a window from mdct_win_sse; the
 * remaining (count & 3) blocks are processed one at a time by
 * ff_imdct36_float_<CPU1>() with a window from ff_mdct_win_float.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                        \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,       \
                                    int count, int switch_point,             \
                                    int block_type)                          \
{                                                                            \
    const int grouped = count - (count & 3);                                 \
    int blk = 0;                                                             \
                                                                             \
    /* four blocks per call */                                               \
    for (; blk < grouped; blk += 4) {                                        \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                             \
        /* table 1 is only selected for the first group, and only when */    \
        /* switch_point is set */                                            \
        const int first_mixed = switch_point && blk < 4;                     \
        float *win = mdct_win_sse[first_mixed][block_type];                  \
                                                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);           \
                                                                             \
        out += 4;                                                            \
        buf += 4*18;                                                         \
        in  += 4*18;                                                         \
    }                                                                        \
                                                                             \
    /* leftover blocks, one per call */                                      \
    for (; blk < count; blk++) {                                             \
        int win_idx = block_type;                                            \
        float *win;                                                          \
                                                                             \
        /* window 0 for the first two blocks when switch_point is set */     \
        if (switch_point && blk < 2)                                         \
            win_idx = 0;                                                     \
        /* odd blocks use the second half of the window table */             \
        if (blk & 1)                                                         \
            win_idx += 4;                                                    \
        win = ff_mdct_win_float[win_idx];                                    \
                                                                             \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                        \
                                                                             \
        out++;                                                               \
        buf++;                                                               \
        in += 18;                                                            \
    }                                                                        \
}

/* All SSE-family wrappers share the SSE four-block routine. */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
00233 #endif
00234
00235 void ff_mpadsp_init_mmx(MPADSPContext *s)
00236 {
00237 int mm_flags = av_get_cpu_flags();
00238
00239 int i, j;
00240 for (j = 0; j < 4; j++) {
00241 for (i = 0; i < 40; i ++) {
00242 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
00243 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00244 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
00245 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00246 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
00247 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
00248 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
00249 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00250 }
00251 }
00252
00253 #if HAVE_SSE2_INLINE
00254 if (mm_flags & AV_CPU_FLAG_SSE2) {
00255 s->apply_window_float = apply_window_mp3;
00256 }
00257 #endif
00258
00259 #if HAVE_YASM
00260 if (EXTERNAL_AVX(mm_flags)) {
00261 s->imdct36_blocks_float = imdct36_blocks_avx;
00262 } else if (EXTERNAL_SSSE3(mm_flags)) {
00263 s->imdct36_blocks_float = imdct36_blocks_ssse3;
00264 } else if (EXTERNAL_SSE3(mm_flags)) {
00265 s->imdct36_blocks_float = imdct36_blocks_sse3;
00266 } else if (EXTERNAL_SSE2(mm_flags)) {
00267 s->imdct36_blocks_float = imdct36_blocks_sse2;
00268 } else if (EXTERNAL_SSE(mm_flags)) {
00269 s->imdct36_blocks_float = imdct36_blocks_sse;
00270 }
00271 #endif
00272 }