FFmpeg: libavcodec/x86/mpegaudiodec.c Source File

00001 /*
00002  * MMX optimized MP3 decoding functions
00003  * Copyright (c) 2010 Vitor Sessak
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86/asm.h"
00024 #include "libavutil/x86/cpu.h"
00025 #include "libavcodec/dsputil.h"
00026 #include "libavcodec/mpegaudiodsp.h"
00027 
00028 #define DECL(CPU)\
00029 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
00030 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
00031 
00032 DECL(sse)
00033 DECL(sse2)
00034 DECL(sse3)
00035 DECL(ssse3)
00036 DECL(avx)
00037 
00038 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
00039                                float *tmpbuf);
00040 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
00041                                float *tmpbuf);
00042 
00043 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00044 
00045 #if HAVE_SSE2_INLINE
00046 
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[64*k], p[64*k]) for k = 0..7, i.e. fold eight products of
 * elements spaced 64 entries apart into sum. op is MACS or MLSS.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and can
 * be used safely in an unbraced if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
00061 
00062 static void apply_window(const float *buf, const float *win1,
00063                          const float *win2, float *sum1, float *sum2, int len)
00064 {
00065     x86_reg count = - 4*len;
00066     const float *win1a = win1+len;
00067     const float *win2a = win2+len;
00068     const float *bufa  = buf+len;
00069     float *sum1a = sum1+len;
00070     float *sum2a = sum2+len;
00071 
00072 
00073 #define MULT(a, b)                                 \
00074     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
00075     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
00076     "mulps         %%xmm2, %%xmm1           \n\t"  \
00077     "subps         %%xmm1, %%xmm0           \n\t"  \
00078     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
00079     "subps         %%xmm2, %%xmm4           \n\t"  \
00080 
00081     __asm__ volatile(
00082             "1:                                   \n\t"
00083             "xorps       %%xmm0, %%xmm0           \n\t"
00084             "xorps       %%xmm4, %%xmm4           \n\t"
00085 
00086             MULT(   0,   0)
00087             MULT( 256,  64)
00088             MULT( 512, 128)
00089             MULT( 768, 192)
00090             MULT(1024, 256)
00091             MULT(1280, 320)
00092             MULT(1536, 384)
00093             MULT(1792, 448)
00094 
00095             "movaps      %%xmm0, (%4,%0)          \n\t"
00096             "movaps      %%xmm4, (%5,%0)          \n\t"
00097             "add            $16,  %0              \n\t"
00098             "jl              1b                   \n\t"
00099             :"+&r"(count)
00100             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00101             );
00102 
00103 #undef MULT
00104 }
00105 
00106 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00107                              int incr)
00108 {
00109     LOCAL_ALIGNED_16(float, suma, [17]);
00110     LOCAL_ALIGNED_16(float, sumb, [17]);
00111     LOCAL_ALIGNED_16(float, sumc, [17]);
00112     LOCAL_ALIGNED_16(float, sumd, [17]);
00113 
00114     float sum;
00115 
00116     /* copy to avoid wrap */
00117     __asm__ volatile(
00118             "movaps    0(%0), %%xmm0   \n\t" \
00119             "movaps   16(%0), %%xmm1   \n\t" \
00120             "movaps   32(%0), %%xmm2   \n\t" \
00121             "movaps   48(%0), %%xmm3   \n\t" \
00122             "movaps   %%xmm0,   0(%1) \n\t" \
00123             "movaps   %%xmm1,  16(%1) \n\t" \
00124             "movaps   %%xmm2,  32(%1) \n\t" \
00125             "movaps   %%xmm3,  48(%1) \n\t" \
00126             "movaps   64(%0), %%xmm0   \n\t" \
00127             "movaps   80(%0), %%xmm1   \n\t" \
00128             "movaps   96(%0), %%xmm2   \n\t" \
00129             "movaps  112(%0), %%xmm3   \n\t" \
00130             "movaps   %%xmm0,  64(%1) \n\t" \
00131             "movaps   %%xmm1,  80(%1) \n\t" \
00132             "movaps   %%xmm2,  96(%1) \n\t" \
00133             "movaps   %%xmm3, 112(%1) \n\t"
00134             ::"r"(in), "r"(in+512)
00135             :"memory"
00136             );
00137 
00138     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
00139     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00140 
00141     SUM8(MACS, suma[0], win + 32, in + 48);
00142 
00143     sumc[ 0] = 0;
00144     sumb[16] = 0;
00145     sumd[16] = 0;
00146 
00147 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
00148             "movups " #sumd "(%4),       %%xmm0          \n\t" \
00149             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00150             "subps  " #suma "(%1),       %%xmm0          \n\t" \
00151             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
00152 \
00153             "movups " #sumc "(%3),       %%xmm0          \n\t" \
00154             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00155             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
00156             "movaps        %%xmm0," #out2 "(%0)          \n\t"
00157 
00158     if (incr == 1) {
00159         __asm__ volatile(
00160             SUMS( 0, 48,  4, 52,  0, 112)
00161             SUMS(16, 32, 20, 36, 16,  96)
00162             SUMS(32, 16, 36, 20, 32,  80)
00163             SUMS(48,  0, 52,  4, 48,  64)
00164 
00165             :"+&r"(out)
00166             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00167             :"memory"
00168             );
00169         out += 16*incr;
00170     } else {
00171         int j;
00172         float *out2 = out + 32 * incr;
00173         out[0  ]  = -suma[   0];
00174         out += incr;
00175         out2 -= incr;
00176         for(j=1;j<16;j++) {
00177             *out  = -suma[   j] + sumd[16-j];
00178             *out2 =  sumb[16-j] + sumc[   j];
00179             out  += incr;
00180             out2 -= incr;
00181         }
00182     }
00183 
00184     sum = 0;
00185     SUM8(MLSS, sum, win + 16 + 32, in + 32);
00186     *out = sum;
00187 }
00188 
00189 #endif /* HAVE_SSE2_INLINE */
00190 
#if HAVE_YASM
/*
 * Define imdct36_blocks_<CPU1>(): process count blocks, four at a time
 * through ff_four_imdct36_float_<CPU2>() and any remaining (count & 3)
 * blocks one by one through ff_imdct36_float_<CPU1>().
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    int quads_end = count - (count & 3);                                    \
    int blk = 0;                                                            \
                                                                            \
    /* groups of four blocks */                                             \
    while (blk < quads_end) {                                               \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* window set 1 only for the first group when switch_point is set */\
        float *win = mdct_win_sse[switch_point && blk < 4][block_type];     \
                                                                            \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                            \
        in  += 4*18;                                                        \
        buf += 4*18;                                                        \
        out += 4;                                                           \
        blk += 4;                                                           \
    }                                                                       \
                                                                            \
    /* leftover blocks, one at a time */                                    \
    for (; blk < count; blk++) {                                            \
        int win_idx = block_type;                                           \
        float *win;                                                         \
                                                                            \
        /* first two blocks use window 0 when switch_point is set */        \
        if (switch_point && blk < 2)                                        \
            win_idx = 0;                                                    \
        /* odd blocks take the table entry four rows further on */          \
        win = ff_mdct_win_float[win_idx + ((blk & 1) ? 4 : 0)];             \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in += 18;                                                           \
        buf++;                                                              \
        out++;                                                              \
    }                                                                       \
}

/* All plain-SSE flavours share the SSE four-block routine. */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */
00234 
00235 void ff_mpadsp_init_mmx(MPADSPContext *s)
00236 {
00237     int mm_flags = av_get_cpu_flags();
00238 
00239     int i, j;
00240     for (j = 0; j < 4; j++) {
00241         for (i = 0; i < 40; i ++) {
00242             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
00243             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00244             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00245             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00246             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
00247             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
00248             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00249             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00250         }
00251     }
00252 
00253 #if HAVE_SSE2_INLINE
00254     if (mm_flags & AV_CPU_FLAG_SSE2) {
00255         s->apply_window_float = apply_window_mp3;
00256     }
00257 #endif /* HAVE_SSE2_INLINE */
00258 
00259 #if HAVE_YASM
00260     if (EXTERNAL_AVX(mm_flags)) {
00261         s->imdct36_blocks_float = imdct36_blocks_avx;
00262     } else if (EXTERNAL_SSSE3(mm_flags)) {
00263         s->imdct36_blocks_float = imdct36_blocks_ssse3;
00264     } else if (EXTERNAL_SSE3(mm_flags)) {
00265         s->imdct36_blocks_float = imdct36_blocks_sse3;
00266     } else if (EXTERNAL_SSE2(mm_flags)) {
00267         s->imdct36_blocks_float = imdct36_blocks_sse2;
00268     } else if (EXTERNAL_SSE(mm_flags)) {
00269         s->imdct36_blocks_float = imdct36_blocks_sse;
00270     }
00271 #endif /* HAVE_YASM */
00272 }