00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026
/* Multiply-accumulate: rt += ra * rb. rt must be a modifiable lvalue. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. rt must be a modifiable lvalue. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7, in increasing k.
 *
 * op  - a three-argument accumulate macro such as MACS or MLSS
 * sum - lvalue that receives the accumulated result
 * w,p - pointer expressions; elements 0, 64, ..., 448 of each are read
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement
 * and can be used safely inside unbraced if/else bodies.
 */
#define SUM8(op, sum, w, p)                 \
do {                                        \
    op(sum, (w)[0 * 64], (p)[0 * 64]);      \
    op(sum, (w)[1 * 64], (p)[1 * 64]);      \
    op(sum, (w)[2 * 64], (p)[2 * 64]);      \
    op(sum, (w)[3 * 64], (p)[3 * 64]);      \
    op(sum, (w)[4 * 64], (p)[4 * 64]);      \
    op(sum, (w)[5 * 64], (p)[5 * 64]);      \
    op(sum, (w)[6 * 64], (p)[6 * 64]);      \
    op(sum, (w)[7 * 64], (p)[7 * 64]);      \
} while (0)
00041
00042 static void apply_window(const float *buf, const float *win1,
00043 const float *win2, float *sum1, float *sum2, int len)
00044 {
00045 x86_reg count = - 4*len;
00046 const float *win1a = win1+len;
00047 const float *win2a = win2+len;
00048 const float *bufa = buf+len;
00049 float *sum1a = sum1+len;
00050 float *sum2a = sum2+len;
00051
00052
00053 #define MULT(a, b) \
00054 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
00055 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
00056 "mulps %%xmm2, %%xmm1 \n\t" \
00057 "subps %%xmm1, %%xmm0 \n\t" \
00058 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
00059 "subps %%xmm2, %%xmm4 \n\t" \
00060
00061 __asm__ volatile(
00062 "1: \n\t"
00063 "xorps %%xmm0, %%xmm0 \n\t"
00064 "xorps %%xmm4, %%xmm4 \n\t"
00065
00066 MULT( 0, 0)
00067 MULT( 256, 64)
00068 MULT( 512, 128)
00069 MULT( 768, 192)
00070 MULT(1024, 256)
00071 MULT(1280, 320)
00072 MULT(1536, 384)
00073 MULT(1792, 448)
00074
00075 "movaps %%xmm0, (%4,%0) \n\t"
00076 "movaps %%xmm4, (%5,%0) \n\t"
00077 "add $16, %0 \n\t"
00078 "jl 1b \n\t"
00079 :"+&r"(count)
00080 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00081 );
00082
00083 #undef MULT
00084 }
00085
00086 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00087 int incr)
00088 {
00089 LOCAL_ALIGNED_16(float, suma, [17]);
00090 LOCAL_ALIGNED_16(float, sumb, [17]);
00091 LOCAL_ALIGNED_16(float, sumc, [17]);
00092 LOCAL_ALIGNED_16(float, sumd, [17]);
00093
00094 float sum;
00095
00096
00097 memcpy(in + 512, in, 32 * sizeof(*in));
00098
00099 apply_window(in + 16, win , win + 512, suma, sumc, 16);
00100 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00101
00102 SUM8(MACS, suma[0], win + 32, in + 48);
00103
00104 sumc[ 0] = 0;
00105 sumb[16] = 0;
00106 sumd[16] = 0;
00107
00108 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
00109 "movups " #sumd "(%4), %%xmm0 \n\t" \
00110 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00111 "subps " #suma "(%1), %%xmm0 \n\t" \
00112 "movaps %%xmm0," #out1 "(%0) \n\t" \
00113 \
00114 "movups " #sumc "(%3), %%xmm0 \n\t" \
00115 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00116 "addps " #sumb "(%2), %%xmm0 \n\t" \
00117 "movaps %%xmm0," #out2 "(%0) \n\t"
00118
00119 if (incr == 1) {
00120 __asm__ volatile(
00121 SUMS( 0, 48, 4, 52, 0, 112)
00122 SUMS(16, 32, 20, 36, 16, 96)
00123 SUMS(32, 16, 36, 20, 32, 80)
00124 SUMS(48, 0, 52, 4, 48, 64)
00125
00126 :"+&r"(out)
00127 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00128 :"memory"
00129 );
00130 out += 16*incr;
00131 } else {
00132 int j;
00133 float *out2 = out + 32 * incr;
00134 out[0 ] = -suma[ 0];
00135 out += incr;
00136 out2 -= incr;
00137 for(j=1;j<16;j++) {
00138 *out = -suma[ j] + sumd[16-j];
00139 *out2 = sumb[16-j] + sumc[ j];
00140 out += incr;
00141 out2 -= incr;
00142 }
00143 }
00144
00145 sum = 0;
00146 SUM8(MLSS, sum, win + 16 + 32, in + 32);
00147 *out = sum;
00148 }
00149
00150 void ff_mpadsp_init_mmx(MPADSPContext *s)
00151 {
00152 int mm_flags = av_get_cpu_flags();
00153
00154 if (mm_flags & AV_CPU_FLAG_SSE2) {
00155 s->apply_window_float = apply_window_mp3;
00156 }
00157 }