FFmpeg
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
mpegaudiodec.c
Go to the documentation of this file.
1 /*
2  * MMX optimized MP3 decoding functions
3  * Copyright (c) 2010 Vitor Sessak
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavutil/cpu.h"
23 #include "libavutil/x86/asm.h"
24 #include "libavutil/x86/cpu.h"
25 #include "libavcodec/dsputil.h"
27 
28 #define DECL(CPU)\
29 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
30 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
31 
32 DECL(sse)
33 DECL(sse2)
34 DECL(sse3)
35 DECL(ssse3)
36 DECL(avx)
37 
38 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
39  float *tmpbuf);
40 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
41  float *tmpbuf);
42 
43 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
44 
45 #if HAVE_SSE2_INLINE
46 
/*
 * Accumulator primitives passed as the `op` argument of SUM8():
 *   MACS: rt += ra * rb
 *   MLSS: rt -= ra * rb
 * All arguments and the whole expansion are parenthesised so the macros are
 * safe with arbitrary expressions.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply `op` to `sum` for eight taps taken from `w` and `p` at a stride of
 * 64 elements: op(sum, w[0], p[0]), op(sum, w[64], p[64]), ...,
 * op(sum, w[448], p[448]).
 *
 * Wrapped in do { } while (0) so that `SUM8(...);` is exactly one statement
 * and can be used in an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                 \
    do {                                    \
        op(sum, (w)[0 * 64], (p)[0 * 64]);  \
        op(sum, (w)[1 * 64], (p)[1 * 64]);  \
        op(sum, (w)[2 * 64], (p)[2 * 64]);  \
        op(sum, (w)[3 * 64], (p)[3 * 64]);  \
        op(sum, (w)[4 * 64], (p)[4 * 64]);  \
        op(sum, (w)[5 * 64], (p)[5 * 64]);  \
        op(sum, (w)[6 * 64], (p)[6 * 64]);  \
        op(sum, (w)[7 * 64], (p)[7 * 64]);  \
    } while (0)
61 
62 static void apply_window(const float *buf, const float *win1,
63  const float *win2, float *sum1, float *sum2, int len)
64 {
65  x86_reg count = - 4*len;
66  const float *win1a = win1+len;
67  const float *win2a = win2+len;
68  const float *bufa = buf+len;
69  float *sum1a = sum1+len;
70  float *sum2a = sum2+len;
71 
72 
73 #define MULT(a, b) \
74  "movaps " #a "(%1,%0), %%xmm1 \n\t" \
75  "movaps " #a "(%3,%0), %%xmm2 \n\t" \
76  "mulps %%xmm2, %%xmm1 \n\t" \
77  "subps %%xmm1, %%xmm0 \n\t" \
78  "mulps " #b "(%2,%0), %%xmm2 \n\t" \
79  "subps %%xmm2, %%xmm4 \n\t" \
80 
81  __asm__ volatile(
82  "1: \n\t"
83  "xorps %%xmm0, %%xmm0 \n\t"
84  "xorps %%xmm4, %%xmm4 \n\t"
85 
86  MULT( 0, 0)
87  MULT( 256, 64)
88  MULT( 512, 128)
89  MULT( 768, 192)
90  MULT(1024, 256)
91  MULT(1280, 320)
92  MULT(1536, 384)
93  MULT(1792, 448)
94 
95  "movaps %%xmm0, (%4,%0) \n\t"
96  "movaps %%xmm4, (%5,%0) \n\t"
97  "add $16, %0 \n\t"
98  "jl 1b \n\t"
99  :"+&r"(count)
100  :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
101  );
102 
103 #undef MULT
104 }
105 
/*
 * Windowing stage: combines the samples in `in` with the coefficients in
 * `win` and writes 32 results to out[0], out[incr], ..., out[31 * incr].
 *
 * in     - sample buffer; also written: its first 32 floats are copied to
 *          in[512..543] before any computation
 * win    - window coefficients
 * unused - never referenced in this function
 * out    - destination; with incr == 1 it is written with aligned 16-byte
 *          stores
 * incr   - distance, in floats, between consecutive outputs
 *
 * NOTE(review): every movaps below needs a 16-byte aligned address, so
 * `in` (and `out` when incr == 1) are presumably aligned by the caller -
 * confirm at the call site.
 */
106 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
107  int incr)
108 {
/* Four scratch arrays of 17 floats each, filled by apply_window() below. */
109  LOCAL_ALIGNED_16(float, suma, [17]);
110  LOCAL_ALIGNED_16(float, sumb, [17]);
111  LOCAL_ALIGNED_16(float, sumc, [17]);
112  LOCAL_ALIGNED_16(float, sumd, [17]);
113 
114  float sum;
115 
116  /* copy to avoid wrap */
/* Copies 128 bytes (32 floats) from `in` (%0) to `in + 512` (%1), 64 bytes
 * at a time through xmm0-xmm3. */
117  __asm__ volatile(
118  "movaps 0(%0), %%xmm0 \n\t" \
119  "movaps 16(%0), %%xmm1 \n\t" \
120  "movaps 32(%0), %%xmm2 \n\t" \
121  "movaps 48(%0), %%xmm3 \n\t" \
122  "movaps %%xmm0, 0(%1) \n\t" \
123  "movaps %%xmm1, 16(%1) \n\t" \
124  "movaps %%xmm2, 32(%1) \n\t" \
125  "movaps %%xmm3, 48(%1) \n\t" \
126  "movaps 64(%0), %%xmm0 \n\t" \
127  "movaps 80(%0), %%xmm1 \n\t" \
128  "movaps 96(%0), %%xmm2 \n\t" \
129  "movaps 112(%0), %%xmm3 \n\t" \
130  "movaps %%xmm0, 64(%1) \n\t" \
131  "movaps %%xmm1, 80(%1) \n\t" \
132  "movaps %%xmm2, 96(%1) \n\t" \
133  "movaps %%xmm3, 112(%1) \n\t"
134  ::"r"(in), "r"(in+512)
135  :"memory"
136  );
137 
/* Fill entries 0..15 of all four scratch arrays: the first call writes
 * suma/sumc, the second writes sumb/sumd. */
138  apply_window(in + 16, win , win + 512, suma, sumc, 16);
139  apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
140 
/* Add eight more products (stride 64) into suma[0]. */
141  SUM8(MACS, suma[0], win + 32, in + 48);
142 
/* sumc[0] is discarded; entry 16 of sumb/sumd was not written by
 * apply_window() and is zeroed so it can be read below. */
143  sumc[ 0] = 0;
144  sumb[16] = 0;
145  sumd[16] = 0;
146 
/* SUMS(): the first four arguments are byte offsets into suma/sumb/sumc/sumd,
 * the last two are byte offsets into out. It produces two groups of four
 * outputs:
 *   out1 group = lane-reversed(sumd group) - suma group
 *   out2 group = lane-reversed(sumc group) + sumb group
 * The reversed operands are loaded with movups because their offsets are
 * not multiples of 16; shufps $0x1b reverses the four lanes. */
147 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
148  "movups " #sumd "(%4), %%xmm0 \n\t" \
149  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
150  "subps " #suma "(%1), %%xmm0 \n\t" \
151  "movaps %%xmm0," #out1 "(%0) \n\t" \
152 \
153  "movups " #sumc "(%3), %%xmm0 \n\t" \
154  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
155  "addps " #sumb "(%2), %%xmm0 \n\t" \
156  "movaps %%xmm0," #out2 "(%0) \n\t"
157 
158  if (incr == 1) {
/* Contiguous output: four SUMS() expansions store out[0..31] (bytes 0..127),
 * yielding the same values as the scalar loop in the else branch. */
159  __asm__ volatile(
160  SUMS( 0, 48, 4, 52, 0, 112)
161  SUMS(16, 32, 20, 36, 16, 96)
162  SUMS(32, 16, 36, 20, 32, 80)
163  SUMS(48, 0, 52, 4, 48, 64)
164 
165  :"+&r"(out)
166  :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
167  :"memory"
168  );
/* Leave out pointing at output 16, which is (re)written at the end. */
169  out += 16*incr;
170  } else {
/* Strided output: out walks upwards from output 0, out2 walks downwards
 * from output 31; the loop ends with out at output 16. */
171  int j;
172  float *out2 = out + 32 * incr;
173  out[0 ] = -suma[ 0];
174  out += incr;
175  out2 -= incr;
176  for(j=1;j<16;j++) {
177  *out = -suma[ j] + sumd[16-j];
178  *out2 = sumb[16-j] + sumc[ j];
179  out += incr;
180  out2 -= incr;
181  }
182  }
183 
/* Output 16 is computed separately as the negated eight-tap sum of
 * win[48 + 64*k] * in[32 + 64*k]. */
184  sum = 0;
185  SUM8(MLSS, sum, win + 16 + 32, in + 32);
186  *out = sum;
187 }
188 
189 #endif /* HAVE_SSE2_INLINE */
190 
191 #if HAVE_YASM
/*
 * DECL_IMDCT_BLOCKS(CPU1, CPU2) defines imdct36_blocks_<CPU1>().
 *
 * The generated function processes `count` consecutive 18-sample blocks:
 *   - as many full groups of four as possible go through
 *     ff_four_imdct36_float_<CPU2>() with a lane-interleaved window from
 *     mdct_win_sse;
 *   - the remaining 0..3 blocks go through ff_imdct36_float_<CPU1>() one at
 *     a time with a plain window from ff_mdct_win_float.
 * Per the original in-line comments, the kernels apply the window and
 * overlap with the previous buffer contents.
 *
 * When switch_point is non-zero, the first two blocks use window 0 instead
 * of block_type; odd-numbered blocks use the window 4 rows further down.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                        \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,       \
                                    int count, int switch_point,             \
                                    int block_type)                          \
{                                                                            \
    const int quad_end = count & ~3;                                         \
    int blk = 0;                                                             \
                                                                             \
    /* full groups of four blocks */                                         \
    while (blk < quad_end) {                                                 \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                             \
        /* only the very first group can contain switch-point blocks */      \
        const int variant = switch_point && blk < 4;                         \
        float *win = mdct_win_sse[variant][block_type];                      \
                                                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);           \
                                                                             \
        in  += 4 * 18;                                                       \
        buf += 4 * 18;                                                       \
        out += 4;                                                            \
        blk += 4;                                                            \
    }                                                                        \
                                                                             \
    /* leftover blocks, one at a time */                                     \
    for (; blk < count; blk++) {                                             \
        int row = block_type;                                                \
        float *win;                                                          \
                                                                             \
        if (switch_point && blk < 2)                                         \
            row = 0;                                                         \
        if (blk & 1)                                                         \
            row += 4;                                                        \
        win = ff_mdct_win_float[row];                                        \
                                                                             \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                        \
                                                                             \
        in  += 18;                                                           \
        buf += 1;                                                            \
        out += 1;                                                            \
    }                                                                        \
}
223 
/* SSE-family wrappers: each has its own single-block routine, while all of
 * them share the SSE four-block routine. */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,   sse)
DECL_IMDCT_BLOCKS(sse2,  sse)
DECL_IMDCT_BLOCKS(sse3,  sse)
DECL_IMDCT_BLOCKS(ssse3, sse)
#endif
/* AVX wrapper: AVX routines for both the four-block and single-block paths. */
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,   avx)
#endif
233 #endif /* HAVE_YASM */
234 
/*
 * Initialisation body: fills the mdct_win_sse table and installs the
 * CPU-specific imdct36_blocks_* routine into the context `s`.
 *
 * NOTE(review): the function signature that should precede this brace is
 * missing from this copy (the embedded listing jumps from 234 to 236), so
 * the function name, its return type and the declaration of `s` cannot be
 * confirmed from here.
 */
236 {
237  int mm_flags = av_get_cpu_flags();
238 
239  int i, j;
/*
 * Build the lane-interleaved windows. For each of the 4 block types j and
 * each of the 40 coefficients i, four consecutive floats (one per lane) are
 * stored:
 *   mdct_win_sse[0][j]: lanes 0/2 <- ff_mdct_win_float[j],
 *                       lanes 1/3 <- ff_mdct_win_float[j + 4]
 *   mdct_win_sse[1][j]: lane 0 <- row 0, lane 1 <- row 4,
 *                       lane 2 <- row j, lane 3 <- row j + 4
 * Variant 1 is the one selected by imdct36_blocks_*() for the first group
 * of four blocks when switch_point is set.
 */
240  for (j = 0; j < 4; j++) {
241  for (i = 0; i < 40; i ++) {
242  mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
243  mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
244  mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
245  mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
246  mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
247  mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
248  mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
249  mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
250  }
251  }
252 
253 #if HAVE_SSE2_INLINE
/* NOTE(review): the body of this branch is missing from this copy (the
 * embedded listing jumps from 254 to 256). As written the branch does
 * nothing, and apply_window_mp3() is not referenced anywhere in this file;
 * presumably the lost line installed it - verify against the upstream
 * source. */
254  if (mm_flags & AV_CPU_FLAG_SSE2) {
256  }
257 #endif /* HAVE_SSE2_INLINE */
258 
259 #if HAVE_YASM
/* Choose one wrapper; the checks run from AVX down to SSE and the first
 * one that matches is used. If none matches, the field is left as it was. */
260  if (EXTERNAL_AVX(mm_flags)) {
261  s->imdct36_blocks_float = imdct36_blocks_avx;
262  } else if (EXTERNAL_SSSE3(mm_flags)) {
263  s->imdct36_blocks_float = imdct36_blocks_ssse3;
264  } else if (EXTERNAL_SSE3(mm_flags)) {
265  s->imdct36_blocks_float = imdct36_blocks_sse3;
266  } else if (EXTERNAL_SSE2(mm_flags)) {
267  s->imdct36_blocks_float = imdct36_blocks_sse2;
268  } else if (EXTERNAL_SSE(mm_flags)) {
269  s->imdct36_blocks_float = imdct36_blocks_sse;
270  }
271 #endif /* HAVE_YASM */
272 }