FFmpeg
mpegaudiodsp.c
Go to the documentation of this file.
/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
29 
/*
 * Declare the two entry points that belong to one instruction-set flavour:
 *   imdct36_blocks_<CPU>   - static multi-block wrapper; the bodies are
 *                            generated further down by DECL_IMDCT_BLOCKS()
 *   ff_imdct36_float_<CPU> - single-block IMDCT36 routine; only declared
 *                            here (no definition in this file, presumably
 *                            external assembly)
 */
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if HAVE_X86ASM
#if ARCH_X86_32
/* The plain SSE flavour is only declared for 32-bit builds. */
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */
43 
44 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
45  float *tmpbuf);
46 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
47  float *tmpbuf);
48 
49 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
50 
51 #if HAVE_6REGS && HAVE_SSE_INLINE
52 
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[64 * k], p[64 * k]) for k = 0..7, in increasing k.
 *
 * op is MACS or MLSS (or any macro with the same three-argument shape).
 * w and p are evaluated eight times each, so they must be free of side
 * effects. The expansion is a single statement and needs a trailing
 * semicolon, which makes it safe inside an unbraced if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
67 
68 static void apply_window(const float *buf, const float *win1,
69  const float *win2, float *sum1, float *sum2, int len)
70 {
71  x86_reg count = - 4*len;
72  const float *win1a = win1+len;
73  const float *win2a = win2+len;
74  const float *bufa = buf+len;
75  float *sum1a = sum1+len;
76  float *sum2a = sum2+len;
77 
78 
79 #define MULT(a, b) \
80  "movaps " #a "(%1,%0), %%xmm1 \n\t" \
81  "movaps " #a "(%3,%0), %%xmm2 \n\t" \
82  "mulps %%xmm2, %%xmm1 \n\t" \
83  "subps %%xmm1, %%xmm0 \n\t" \
84  "mulps " #b "(%2,%0), %%xmm2 \n\t" \
85  "subps %%xmm2, %%xmm4 \n\t" \
86 
87  __asm__ volatile(
88  "1: \n\t"
89  "xorps %%xmm0, %%xmm0 \n\t"
90  "xorps %%xmm4, %%xmm4 \n\t"
91 
92  MULT( 0, 0)
93  MULT( 256, 64)
94  MULT( 512, 128)
95  MULT( 768, 192)
96  MULT(1024, 256)
97  MULT(1280, 320)
98  MULT(1536, 384)
99  MULT(1792, 448)
100 
101  "movaps %%xmm0, (%4,%0) \n\t"
102  "movaps %%xmm4, (%5,%0) \n\t"
103  "add $16, %0 \n\t"
104  "jl 1b \n\t"
105  :"+&r"(count)
106  :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
107  );
108 
109 #undef MULT
110 }
111 
112 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
113  ptrdiff_t incr)
114 {
115  LOCAL_ALIGNED_16(float, suma, [17]);
116  LOCAL_ALIGNED_16(float, sumb, [17]);
117  LOCAL_ALIGNED_16(float, sumc, [17]);
118  LOCAL_ALIGNED_16(float, sumd, [17]);
119 
120  float sum;
121 
122  /* copy to avoid wrap */
123  __asm__ volatile(
124  "movaps 0(%0), %%xmm0 \n\t" \
125  "movaps 16(%0), %%xmm1 \n\t" \
126  "movaps 32(%0), %%xmm2 \n\t" \
127  "movaps 48(%0), %%xmm3 \n\t" \
128  "movaps %%xmm0, 0(%1) \n\t" \
129  "movaps %%xmm1, 16(%1) \n\t" \
130  "movaps %%xmm2, 32(%1) \n\t" \
131  "movaps %%xmm3, 48(%1) \n\t" \
132  "movaps 64(%0), %%xmm0 \n\t" \
133  "movaps 80(%0), %%xmm1 \n\t" \
134  "movaps 96(%0), %%xmm2 \n\t" \
135  "movaps 112(%0), %%xmm3 \n\t" \
136  "movaps %%xmm0, 64(%1) \n\t" \
137  "movaps %%xmm1, 80(%1) \n\t" \
138  "movaps %%xmm2, 96(%1) \n\t" \
139  "movaps %%xmm3, 112(%1) \n\t"
140  ::"r"(in), "r"(in+512)
141  :"memory"
142  );
143 
144  apply_window(in + 16, win , win + 512, suma, sumc, 16);
145  apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
146 
147  SUM8(MACS, suma[0], win + 32, in + 48);
148 
149  sumc[ 0] = 0;
150  sumb[16] = 0;
151  sumd[16] = 0;
152 
153 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
154  "movups " #sumd "(%4), %%xmm0 \n\t" \
155  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
156  "subps " #suma "(%1), %%xmm0 \n\t" \
157  "movaps %%xmm0," #out1 "(%0) \n\t" \
158 \
159  "movups " #sumc "(%3), %%xmm0 \n\t" \
160  "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
161  "addps " #sumb "(%2), %%xmm0 \n\t" \
162  "movaps %%xmm0," #out2 "(%0) \n\t"
163 
164  if (incr == 1) {
165  __asm__ volatile(
166  SUMS( 0, 48, 4, 52, 0, 112)
167  SUMS(16, 32, 20, 36, 16, 96)
168  SUMS(32, 16, 36, 20, 32, 80)
169  SUMS(48, 0, 52, 4, 48, 64)
170 
171  :"+&r"(out)
172  :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
173  :"memory"
174  );
175  out += 16*incr;
176  } else {
177  int j;
178  float *out2 = out + 32 * incr;
179  out[0 ] = -suma[ 0];
180  out += incr;
181  out2 -= incr;
182  for(j=1;j<16;j++) {
183  *out = -suma[ j] + sumd[16-j];
184  *out2 = sumb[16-j] + sumc[ j];
185  out += incr;
186  out2 -= incr;
187  }
188  }
189 
190  sum = 0;
191  SUM8(MLSS, sum, win + 16 + 32, in + 32);
192  *out = sum;
193 }
194 
195 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */
196 
#if HAVE_X86ASM
/*
 * Define imdct36_blocks_<CPU1>(), which runs the IMDCT36 over count blocks.
 *
 * Blocks are processed four at a time with ff_four_imdct36_float_<CPU2>()
 * and the interleaved mdct_win_sse table; the remaining (count & 3) blocks
 * go one by one through ff_imdct36_float_<CPU1>() with ff_mdct_win_float.
 *
 * Per group of four: in and buf advance by 4*18 floats, out by 4.
 * Per single block:  in advances by 18 floats, buf and out by 1.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                    \
    int j;                                                                  \
    for (j = 0; j < align_end; j+= 4) {                                     \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* select window: variant 1 only for the first group of four  */    \
        /* blocks, and only when switch_point is set                  */    \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
                                                                            \
        /* apply window & overlap with previous buffer */                   \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
        in      += 4*18;                                                    \
        buf     += 4*18;                                                    \
        out     += 4;                                                       \
    }                                                                       \
    for (; j < count; j++) {                                                \
        /* select window: type 0 for the first two blocks when        */    \
        /* switch_point is set; odd blocks use the entries at +4      */    \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];           \
                                                                            \
        /* apply window & overlap with previous buffer */                   \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in  += 18;                                                          \
        buf++;                                                              \
        out++;                                                              \
    }                                                                       \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
/* The SSE2/SSE3/SSSE3 flavours share the SSE four-block routine. */
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */
242 
244 {
245  int i, j;
246  for (j = 0; j < 4; j++) {
247  for (i = 0; i < 40; i ++) {
248  mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
249  mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
250  mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
251  mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
252  mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
253  mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
254  mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
255  mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
256  }
257  }
258 }
259 
261 {
263 
264 #if HAVE_6REGS && HAVE_SSE_INLINE
265  if (INLINE_SSE(cpu_flags)) {
266  s->apply_window_float = apply_window_mp3;
267  }
268 #endif /* HAVE_SSE_INLINE */
269 
270 #if HAVE_X86ASM
271 #if HAVE_SSE
272 #if ARCH_X86_32
273  if (EXTERNAL_SSE(cpu_flags)) {
274  s->imdct36_blocks_float = imdct36_blocks_sse;
275  }
276 #endif
277  if (EXTERNAL_SSE2(cpu_flags)) {
278  s->imdct36_blocks_float = imdct36_blocks_sse2;
279  }
280  if (EXTERNAL_SSE3(cpu_flags)) {
281  s->imdct36_blocks_float = imdct36_blocks_sse3;
282  }
283  if (EXTERNAL_SSSE3(cpu_flags)) {
284  s->imdct36_blocks_float = imdct36_blocks_ssse3;
285  }
286 #endif
287 #if HAVE_AVX_EXTERNAL
288  if (EXTERNAL_AVX(cpu_flags)) {
289  s->imdct36_blocks_float = imdct36_blocks_avx;
290  }
291 #endif
292 #endif /* HAVE_X86ASM */
293 }
static float win(SuperEqualizerContext *s, float n, int N)
static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, int stride)
static atomic_int cpu_flags
Definition: cpu.c:50
#define EXTERNAL_SSE(flags)
Definition: cpu.h:58
float ff_mdct_win_float[8][MDCT_BUF_SIZE]
Macro definitions for various function/variable attributes.
#define LOCAL_ALIGNED_16(t, v,...)
Definition: mem_internal.h:130
#define MACS(rt, ra, rb)
#define av_cold
Definition: attributes.h:88
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:117
#define EXTERNAL_SSE3(flags)
Definition: cpu.h:62
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
#define SUM8(op, sum, w, p)
void(* apply_window_float)(float *synth_buf, float *window, int *dither_state, float *samples, ptrdiff_t incr)
Definition: mpegaudiodsp.h:28
#define DECL(CPU)
Definition: mpegaudiodsp.c:30
#define MULT(c, x, n)
Definition: xvididct.c:145
GLsizei count
Definition: opengl_enc.c:108
common internal API header
void(* imdct36_blocks_float)(float *out, float *buf, float *in, int count, int switch_point, int block_type)
Definition: mpegaudiodsp.h:37
#define s(width, name)
Definition: cbs_vp9.c:257
static float mdct_win_sse[2][4][4 *40]
Definition: mpegaudiodsp.c:49
#define INLINE_SSE(flags)
Definition: cpu.h:88
static void(*const apply_window[4])(AVFloatDSPContext *fdsp, SingleChannelElement *sce, const float *audio)
Definition: aacenc.c:188
#define MLSS(rt, ra, rb)
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
Definition: mpegaudiodsp.c:260
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:95
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, float *tmpbuf)
__asm__(".macro parse_r var r\n\t""\\var = -1\n\t"_IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31)".iflt \\var\n\t"".error \"Unable to parse register name \\r\"\n\t"".endif\n\t"".endm")
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, float *tmpbuf)
int x86_reg
Definition: asm.h:72
int len
FILE * out
Definition: movenc.c:54
av_cold void ff_mpadsp_init_x86_tabs(void)
Definition: mpegaudiodsp.c:243
#define EXTERNAL_AVX(flags)
Definition: cpu.h:70
int i
Definition: input.c:407
#define av_unused
Definition: attributes.h:131