FFmpeg
tx_template.c
1 /*
2  * Copyright (c) Lynne
3  *
4  * Power of two FFT:
5  * Copyright (c) Lynne
6  * Copyright (c) 2008 Loren Merritt
7  * Copyright (c) 2002 Fabrice Bellard
8  * Partly based on libdjbfft by D. J. Bernstein
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #define TABLE_DEF(name, size) \
28  DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
29 
30 #define SR_TABLE(len) \
31  TABLE_DEF(len, len/4 + 1)
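/* Added note: for the single-precision template, SR_TABLE(8) expands to
 * roughly DECLARE_ALIGNED(32, float, ff_tx_tab_8_float)[3], i.e. a
 * quarter-wave cosine table with len/4 + 1 entries; the exact symbol name
 * depends on how TX_TAB() and TXSample are defined for the current type. */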
32 
33 /* Power of two tables */
34 SR_TABLE(8);
35 SR_TABLE(16);
36 SR_TABLE(32);
37 SR_TABLE(64);
38 SR_TABLE(128);
39 SR_TABLE(256);
40 SR_TABLE(512);
41 SR_TABLE(1024);
42 SR_TABLE(2048);
43 SR_TABLE(4096);
44 SR_TABLE(8192);
45 SR_TABLE(16384);
46 SR_TABLE(32768);
47 SR_TABLE(65536);
48 SR_TABLE(131072);
49 
50 /* Other factors' tables */
51 TABLE_DEF(53, 12);
52 TABLE_DEF( 7, 6);
53 TABLE_DEF( 9, 8);
54 
55 typedef struct FFSRTabsInitOnce {
56  void (*func)(void);
57  AVOnce control;
58  int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
59 } FFSRTabsInitOnce;
60 
61 #define INIT_FF_SR_TAB(len) \
62 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \
63 { \
64  double freq = 2*M_PI/len; \
65  TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \
66  \
67  for (int i = 0; i < len/4; i++) \
68  *tab++ = RESCALE(cos(i*freq)); \
69  \
70  *tab = 0; \
71 }
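/* Added note: as an example, the function generated by INIT_FF_SR_TAB(8)
 * fills its table with { cos(0), cos(2*M_PI/8), 0 }, i.e.
 * { 1.0, 0.7071067811865476, 0.0 } before RESCALE(): a quarter wave of
 * cosines whose last entry is forced to exactly zero. */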
72 
73 INIT_FF_SR_TAB(8)
74 INIT_FF_SR_TAB(16)
75 INIT_FF_SR_TAB(32)
76 INIT_FF_SR_TAB(64)
77 INIT_FF_SR_TAB(128)
78 INIT_FF_SR_TAB(256)
79 INIT_FF_SR_TAB(512)
80 INIT_FF_SR_TAB(1024)
81 INIT_FF_SR_TAB(2048)
82 INIT_FF_SR_TAB(4096)
83 INIT_FF_SR_TAB(8192)
84 INIT_FF_SR_TAB(16384)
85 INIT_FF_SR_TAB(32768)
86 INIT_FF_SR_TAB(65536)
87 INIT_FF_SR_TAB(131072)
88 
89 static FFSRTabsInitOnce sr_tabs_init_once[] = {
90  { TX_TAB(ff_tx_init_tab_8), AV_ONCE_INIT },
91  { TX_TAB(ff_tx_init_tab_16), AV_ONCE_INIT },
92  { TX_TAB(ff_tx_init_tab_32), AV_ONCE_INIT },
93  { TX_TAB(ff_tx_init_tab_64), AV_ONCE_INIT },
94  { TX_TAB(ff_tx_init_tab_128), AV_ONCE_INIT },
95  { TX_TAB(ff_tx_init_tab_256), AV_ONCE_INIT },
96  { TX_TAB(ff_tx_init_tab_512), AV_ONCE_INIT },
97  { TX_TAB(ff_tx_init_tab_1024), AV_ONCE_INIT },
98  { TX_TAB(ff_tx_init_tab_2048), AV_ONCE_INIT },
99  { TX_TAB(ff_tx_init_tab_4096), AV_ONCE_INIT },
100  { TX_TAB(ff_tx_init_tab_8192), AV_ONCE_INIT },
101  { TX_TAB(ff_tx_init_tab_16384), AV_ONCE_INIT },
102  { TX_TAB(ff_tx_init_tab_32768), AV_ONCE_INIT },
103  { TX_TAB(ff_tx_init_tab_65536), AV_ONCE_INIT },
104  { TX_TAB(ff_tx_init_tab_131072), AV_ONCE_INIT },
105 };
106 
107 static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
108 {
109  /* 5pt, doubled to eliminate AVX lane shuffles */
110  TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
111  TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
112  TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
113  TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
114  TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
115  TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
116  TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
117  TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
118 
119  /* 3pt */
120  TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
121  TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
122  TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
123  TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
124 }
125 
126 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
127 {
128  TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
129  TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));
130  TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
131  TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
132  TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
133  TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
134 }
135 
136 static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
137 {
138  TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
139  TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));
140  TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9));
141  TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9));
142  TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
143  TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
144  TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
145  TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
146 }
147 
148 static FFSRTabsInitOnce nptwo_tabs_init_once[] = {
149  { TX_TAB(ff_tx_init_tab_53), AV_ONCE_INIT, { 15, 5, 3 } },
150  { TX_TAB(ff_tx_init_tab_9), AV_ONCE_INIT, { 9 } },
151  { TX_TAB(ff_tx_init_tab_7), AV_ONCE_INIT, { 7 } },
152 };
153 
154 av_cold void TX_TAB(ff_tx_init_tabs)(int len)
155 {
156  int factor_2 = ff_ctz(len);
157  if (factor_2) {
158  int idx = factor_2 - 3;
159  for (int i = 0; i <= idx; i++)
160  ff_thread_once(&sr_tabs_init_once[i].control,
161  sr_tabs_init_once[i].func);
162  len >>= factor_2;
163  }
164 
165  for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_once); i++) {
166  int f, f_idx = 0;
167 
168  if (len <= 1)
169  return;
170 
171  while ((f = nptwo_tabs_init_once[i].factors[f_idx++])) {
172  if (f % len)
173  continue;
174 
175  ff_thread_once(&nptwo_tabs_init_once[i].control,
176  nptwo_tabs_init_once[i].func);
177  len /= f;
178  break;
179  }
180  }
181 }
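/* Worked example (added): ff_tx_init_tabs(960) sees 960 = 2^6 * 15, so
 * factor_2 = 6 and the power-of-two tables for lengths 8, 16, 32 and 64 are
 * initialized through their AVOnce controls. The remaining odd part, 15,
 * then matches the { 15, 5, 3 } factor list, so the combined 5/3-point
 * table is initialized as well; the function returns on the next pass
 * once len has dropped to 1. */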
182 
183 static av_always_inline void fft3(TXComplex *out, TXComplex *in,
184  ptrdiff_t stride)
185 {
186  TXComplex tmp[2];
187  const TXSample *tab = TX_TAB(ff_tx_tab_53);
188 #ifdef TX_INT32
189  int64_t mtmp[4];
190 #endif
191 
192  BF(tmp[0].re, tmp[1].im, in[1].im, in[2].im);
193  BF(tmp[0].im, tmp[1].re, in[1].re, in[2].re);
194 
195  out[0*stride].re = in[0].re + tmp[1].re;
196  out[0*stride].im = in[0].im + tmp[1].im;
197 
198 #ifdef TX_INT32
199  mtmp[0] = (int64_t)tab[ 8] * tmp[0].re;
200  mtmp[1] = (int64_t)tab[ 9] * tmp[0].im;
201  mtmp[2] = (int64_t)tab[10] * tmp[1].re;
202  mtmp[3] = (int64_t)tab[10] * tmp[1].im;
203  out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
204  out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
205  out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
206  out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
207 #else
208  tmp[0].re = tab[ 8] * tmp[0].re;
209  tmp[0].im = tab[ 9] * tmp[0].im;
210  tmp[1].re = tab[10] * tmp[1].re;
211  tmp[1].im = tab[10] * tmp[1].im;
212  out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re;
213  out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im;
214  out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re;
215  out[2*stride].im = in[0].im - tmp[1].im + tmp[0].im;
216 #endif
217 }
218 
219 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \
220 static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
221  ptrdiff_t stride) \
222 { \
223  TXComplex z0[4], t[6]; \
224  const TXSample *tab = TX_TAB(ff_tx_tab_53); \
225  \
226  BF(t[1].im, t[0].re, in[1].re, in[4].re); \
227  BF(t[1].re, t[0].im, in[1].im, in[4].im); \
228  BF(t[3].im, t[2].re, in[2].re, in[3].re); \
229  BF(t[3].re, t[2].im, in[2].im, in[3].im); \
230  \
231  out[D0*stride].re = in[0].re + t[0].re + t[2].re; \
232  out[D0*stride].im = in[0].im + t[0].im + t[2].im; \
233  \
234  SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
235  SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
236  CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
237  CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
238  \
239  BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
240  BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \
241  BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \
242  BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \
243  \
244  out[D1*stride].re = in[0].re + z0[3].re; \
245  out[D1*stride].im = in[0].im + z0[0].im; \
246  out[D2*stride].re = in[0].re + z0[2].re; \
247  out[D2*stride].im = in[0].im + z0[1].im; \
248  out[D3*stride].re = in[0].re + z0[1].re; \
249  out[D3*stride].im = in[0].im + z0[2].im; \
250  out[D4*stride].re = in[0].re + z0[0].re; \
251  out[D4*stride].im = in[0].im + z0[3].im; \
252 }
253 
254 DECL_FFT5(fft5, 0, 1, 2, 3, 4)
255 DECL_FFT5(fft5_m1, 0, 6, 12, 3, 9)
256 DECL_FFT5(fft5_m2, 10, 1, 7, 13, 4)
257 DECL_FFT5(fft5_m3, 5, 11, 2, 8, 14)
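/* Added note: fft5_m1/fft5_m2/fft5_m3 are the same 5-point transform as
 * fft5, but their output indices (the D0..D4 macro arguments) scatter the
 * results straight into the interleaved positions that fft15() below
 * expects, so the 15-point prime-factor transform needs no separate output
 * permutation pass. */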
258 
259 static av_always_inline void fft7(TXComplex *out, TXComplex *in,
260  ptrdiff_t stride)
261 {
262  TXComplex t[6], z[3];
263  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
264 #ifdef TX_INT32
265  int64_t mtmp[12];
266 #endif
267 
268  BF(t[1].re, t[0].re, in[1].re, in[6].re);
269  BF(t[1].im, t[0].im, in[1].im, in[6].im);
270  BF(t[3].re, t[2].re, in[2].re, in[5].re);
271  BF(t[3].im, t[2].im, in[2].im, in[5].im);
272  BF(t[5].re, t[4].re, in[3].re, in[4].re);
273  BF(t[5].im, t[4].im, in[3].im, in[4].im);
274 
275  out[0*stride].re = in[0].re + t[0].re + t[2].re + t[4].re;
276  out[0*stride].im = in[0].im + t[0].im + t[2].im + t[4].im;
277 
278 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
279  mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
280  mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
281  mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
282  mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
283  mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
284  mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
285 
286  mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
287  mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
288  mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
289  mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
290  mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
291  mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
292 
293  z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
294  z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
295  z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
296  z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
297  z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
298  z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
299 
300  t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
301  t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
302  t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
303  t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
304  t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
305  t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
306 #else
307  z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
308  z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
309  z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
310  z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
311  z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
312  z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
313 
314  /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
315  * multiplying the sum of all with the average of the twiddles */
316 
317  t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
318  t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
319  t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
320  t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
321  t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
322  t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
323 #endif
324 
325  BF(t[1].re, z[0].re, z[0].re, t[4].re);
326  BF(t[3].re, z[1].re, z[1].re, t[2].re);
327  BF(t[5].re, z[2].re, z[2].re, t[0].re);
328  BF(t[1].im, z[0].im, z[0].im, t[0].im);
329  BF(t[3].im, z[1].im, z[1].im, t[2].im);
330  BF(t[5].im, z[2].im, z[2].im, t[4].im);
331 
332  out[1*stride].re = in[0].re + z[0].re;
333  out[1*stride].im = in[0].im + t[1].im;
334  out[2*stride].re = in[0].re + t[3].re;
335  out[2*stride].im = in[0].im + z[1].im;
336  out[3*stride].re = in[0].re + z[2].re;
337  out[3*stride].im = in[0].im + t[5].im;
338  out[4*stride].re = in[0].re + t[5].re;
339  out[4*stride].im = in[0].im + z[2].im;
340  out[5*stride].re = in[0].re + z[1].re;
341  out[5*stride].im = in[0].im + t[3].im;
342  out[6*stride].re = in[0].re + t[1].re;
343  out[6*stride].im = in[0].im + z[0].im;
344 }
345 
346 static av_always_inline void fft9(TXComplex *out, TXComplex *in,
347  ptrdiff_t stride)
348 {
349  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
350  TXComplex t[16], w[4], x[5], y[5], z[2];
351 #ifdef TX_INT32
352  int64_t mtmp[12];
353 #endif
354 
355  BF(t[1].re, t[0].re, in[1].re, in[8].re);
356  BF(t[1].im, t[0].im, in[1].im, in[8].im);
357  BF(t[3].re, t[2].re, in[2].re, in[7].re);
358  BF(t[3].im, t[2].im, in[2].im, in[7].im);
359  BF(t[5].re, t[4].re, in[3].re, in[6].re);
360  BF(t[5].im, t[4].im, in[3].im, in[6].im);
361  BF(t[7].re, t[6].re, in[4].re, in[5].re);
362  BF(t[7].im, t[6].im, in[4].im, in[5].im);
363 
364  w[0].re = t[0].re - t[6].re;
365  w[0].im = t[0].im - t[6].im;
366  w[1].re = t[2].re - t[6].re;
367  w[1].im = t[2].im - t[6].im;
368  w[2].re = t[1].re - t[7].re;
369  w[2].im = t[1].im - t[7].im;
370  w[3].re = t[3].re + t[7].re;
371  w[3].im = t[3].im + t[7].im;
372 
373  z[0].re = in[0].re + t[4].re;
374  z[0].im = in[0].im + t[4].im;
375 
376  z[1].re = t[0].re + t[2].re + t[6].re;
377  z[1].im = t[0].im + t[2].im + t[6].im;
378 
379  out[0*stride].re = z[0].re + z[1].re;
380  out[0*stride].im = z[0].im + z[1].im;
381 
382 #ifdef TX_INT32
383  mtmp[0] = t[1].re - t[3].re + t[7].re;
384  mtmp[1] = t[1].im - t[3].im + t[7].im;
385 
386  y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
387  y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
388 
389  mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
390  mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
391  mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
392  mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
393 
394  x[3].re = z[0].re + (int32_t)mtmp[0];
395  x[3].im = z[0].im + (int32_t)mtmp[1];
396  z[0].re = in[0].re + (int32_t)mtmp[2];
397  z[0].im = in[0].im + (int32_t)mtmp[3];
398 
399  mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
400  mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
401  mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
402  mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
403  mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
404  mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
405  mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
406  mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
407 
408  x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
409  x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
410  x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
411  x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
412  y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
413  y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
414  y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
415  y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
416 
417  y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
418  y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
419 
420 #else
421  y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
422  y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
423 
424  x[3].re = z[0].re + tab[0].re*z[1].re;
425  x[3].im = z[0].im + tab[0].re*z[1].im;
426  z[0].re = in[0].re + tab[0].re*t[4].re;
427  z[0].im = in[0].im + tab[0].re*t[4].im;
428 
429  x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
430  x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
431  x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
432  x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
433  y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
434  y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
435  y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
436  y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
437 
438  y[0].re = tab[0].im*t[5].re;
439  y[0].im = tab[0].im*t[5].im;
440 #endif
441 
442  x[4].re = x[1].re + x[2].re;
443  x[4].im = x[1].im + x[2].im;
444 
445  y[4].re = y[1].re - y[2].re;
446  y[4].im = y[1].im - y[2].im;
447  x[1].re = z[0].re + x[1].re;
448  x[1].im = z[0].im + x[1].im;
449  y[1].re = y[0].re + y[1].re;
450  y[1].im = y[0].im + y[1].im;
451  x[2].re = z[0].re + x[2].re;
452  x[2].im = z[0].im + x[2].im;
453  y[2].re = y[2].re - y[0].re;
454  y[2].im = y[2].im - y[0].im;
455  x[4].re = z[0].re - x[4].re;
456  x[4].im = z[0].im - x[4].im;
457  y[4].re = y[0].re - y[4].re;
458  y[4].im = y[0].im - y[4].im;
459 
460  out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
461  out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
462  out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
463  out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
464  out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
465  out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
466  out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
467  out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
468 }
469 
470 static av_always_inline void fft15(TXComplex *out, TXComplex *in,
471  ptrdiff_t stride)
472 {
473  TXComplex tmp[15];
474 
475  for (int i = 0; i < 5; i++)
476  fft3(tmp + i, in + i*3, 5);
477 
478  fft5_m1(out, tmp + 0, stride);
479  fft5_m2(out, tmp + 5, stride);
480  fft5_m3(out, tmp + 10, stride);
481 }
482 
483 #define BUTTERFLIES(a0, a1, a2, a3) \
484  do { \
485  r0=a0.re; \
486  i0=a0.im; \
487  r1=a1.re; \
488  i1=a1.im; \
489  BF(t3, t5, t5, t1); \
490  BF(a2.re, a0.re, r0, t5); \
491  BF(a3.im, a1.im, i1, t3); \
492  BF(t4, t6, t2, t6); \
493  BF(a3.re, a1.re, r1, t4); \
494  BF(a2.im, a0.im, i0, t6); \
495  } while (0)
496 
497 #define TRANSFORM(a0, a1, a2, a3, wre, wim) \
498  do { \
499  CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
500  CMUL(t5, t6, a3.re, a3.im, wre, wim); \
501  BUTTERFLIES(a0, a1, a2, a3); \
502  } while (0)
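/* Added note: TRANSFORM() multiplies a2 by the conjugate twiddle
 * (wre - i*wim) and a3 by (wre + i*wim) via CMUL(), and BUTTERFLIES() then
 * folds the rotated pair into the even-half outputs a0/a1 while producing
 * the new upper-half values in a2/a3 - the usual split-radix recombination
 * step used by ff_tx_fft_sr_combine() below. */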
503 
504 /* z[0...8n-1], w[1...2n-1] */
505 static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
506  const TXSample *cos, int len)
507 {
508  int o1 = 2*len;
509  int o2 = 4*len;
510  int o3 = 6*len;
511  const TXSample *wim = cos + o1 - 7;
512  TXSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
513 
514  for (int i = 0; i < len; i += 4) {
515  TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
516  TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
517  TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
518  TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
519 
520  TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
521  TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
522  TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
523  TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
524 
525  z += 2*4;
526  cos += 2*4;
527  wim -= 2*4;
528  }
529 }
530 
531 static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s,
532  const FFTXCodelet *cd,
533  uint64_t flags,
534  FFTXCodeletOptions *opts,
535  int len, int inv,
536  const void *scale)
537 {
538  TX_TAB(ff_tx_init_tabs)(len);
539  return ff_tx_gen_ptwo_revtab(s, opts ? opts->invert_lookup : 1);
540 }
541 
542 #define DECL_SR_CODELET_DEF(n) \
543 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
544  .name = TX_NAME_STR("fft" #n "_ns"), \
545  .function = TX_NAME(ff_tx_fft##n##_ns), \
546  .type = TX_TYPE(FFT), \
547  .flags = AV_TX_INPLACE | AV_TX_UNALIGNED | \
548  FF_TX_PRESHUFFLE, \
549  .factors[0] = 2, \
550  .min_len = n, \
551  .max_len = n, \
552  .init = TX_NAME(ff_tx_fft_sr_codelet_init), \
553  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
554  .prio = FF_TX_PRIO_BASE, \
555 };
556 
557 #define DECL_SR_CODELET(n, n2, n4) \
558 static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *dst, \
559  void *src, ptrdiff_t stride) \
560 { \
561  TXComplex *z = dst; \
562  const TXSample *cos = TX_TAB(ff_tx_tab_##n); \
563  \
564  TX_NAME(ff_tx_fft##n2##_ns)(s, z, z, stride); \
565  TX_NAME(ff_tx_fft##n4##_ns)(s, z + n4*2, z + n4*2, stride); \
566  TX_NAME(ff_tx_fft##n4##_ns)(s, z + n4*3, z + n4*3, stride); \
567  TX_NAME(ff_tx_fft_sr_combine)(z, cos, n4 >> 1); \
568 } \
569  \
570 DECL_SR_CODELET_DEF(n)
571 
572 static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *dst,
573  void *src, ptrdiff_t stride)
574 {
575  TXComplex *z = dst;
576  TXComplex tmp;
577 
578  BF(tmp.re, z[0].re, z[0].re, z[1].re);
579  BF(tmp.im, z[0].im, z[0].im, z[1].im);
580  z[1] = tmp;
581 }
582 
583 static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *dst,
584  void *src, ptrdiff_t stride)
585 {
586  TXComplex *z = dst;
587  TXSample t1, t2, t3, t4, t5, t6, t7, t8;
588 
589  BF(t3, t1, z[0].re, z[1].re);
590  BF(t8, t6, z[3].re, z[2].re);
591  BF(z[2].re, z[0].re, t1, t6);
592  BF(t4, t2, z[0].im, z[1].im);
593  BF(t7, t5, z[2].im, z[3].im);
594  BF(z[3].im, z[1].im, t4, t8);
595  BF(z[3].re, z[1].re, t3, t7);
596  BF(z[2].im, z[0].im, t2, t5);
597 }
598 
599 static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *dst,
600  void *src, ptrdiff_t stride)
601 {
602  TXComplex *z = dst;
603  TXSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
604  const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
605 
606  TX_NAME(ff_tx_fft4_ns)(s, z, z, stride);
607 
608  BF(t1, z[5].re, z[4].re, -z[5].re);
609  BF(t2, z[5].im, z[4].im, -z[5].im);
610  BF(t5, z[7].re, z[6].re, -z[7].re);
611  BF(t6, z[7].im, z[6].im, -z[7].im);
612 
613  BUTTERFLIES(z[0], z[2], z[4], z[6]);
614  TRANSFORM(z[1], z[3], z[5], z[7], cos, cos);
615 }
616 
617 static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *dst,
618  void *src, ptrdiff_t stride)
619 {
620  TXComplex *z = dst;
621  const TXSample *cos = TX_TAB(ff_tx_tab_16);
622 
623  TXSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
624  TXSample cos_16_1 = cos[1];
625  TXSample cos_16_2 = cos[2];
626  TXSample cos_16_3 = cos[3];
627 
628  TX_NAME(ff_tx_fft8_ns)(s, z + 0, z + 0, stride);
629  TX_NAME(ff_tx_fft4_ns)(s, z + 8, z + 8, stride);
630  TX_NAME(ff_tx_fft4_ns)(s, z + 12, z + 12, stride);
631 
632  t1 = z[ 8].re;
633  t2 = z[ 8].im;
634  t5 = z[12].re;
635  t6 = z[12].im;
636  BUTTERFLIES(z[0], z[4], z[8], z[12]);
637 
638  TRANSFORM(z[ 2], z[ 6], z[10], z[14], cos_16_2, cos_16_2);
639  TRANSFORM(z[ 1], z[ 5], z[ 9], z[13], cos_16_1, cos_16_3);
640  TRANSFORM(z[ 3], z[ 7], z[11], z[15], cos_16_3, cos_16_1);
641 }
642 
643 DECL_SR_CODELET_DEF(2)
644 DECL_SR_CODELET_DEF(4)
645 DECL_SR_CODELET_DEF(8)
646 DECL_SR_CODELET_DEF(16)
647 DECL_SR_CODELET(32,16,8)
648 DECL_SR_CODELET(64,32,16)
649 DECL_SR_CODELET(128,64,32)
650 DECL_SR_CODELET(256,128,64)
651 DECL_SR_CODELET(512,256,128)
652 DECL_SR_CODELET(1024,512,256)
653 DECL_SR_CODELET(2048,1024,512)
654 DECL_SR_CODELET(4096,2048,1024)
655 DECL_SR_CODELET(8192,4096,2048)
656 DECL_SR_CODELET(16384,8192,4096)
657 DECL_SR_CODELET(32768,16384,8192)
658 DECL_SR_CODELET(65536,32768,16384)
659 DECL_SR_CODELET(131072,65536,32768)
660 
661 static av_cold int TX_NAME(ff_tx_fft_sr_init)(AVTXContext *s,
662  const FFTXCodelet *cd,
663  uint64_t flags,
664  FFTXCodeletOptions *opts,
665  int len, int inv,
666  const void *scale)
667 {
668  int ret;
669  int is_inplace = !!(flags & AV_TX_INPLACE);
670  FFTXCodeletOptions sub_opts = { .invert_lookup = !is_inplace };
671 
672  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
673  flags |= AV_TX_INPLACE; /* in-place */
674  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
675 
676  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
677  return ret;
678 
679  if (is_inplace && (ret = ff_tx_gen_ptwo_inplace_revtab_idx(s)))
680  return ret;
681 
682  return 0;
683 }
684 
685 static void TX_NAME(ff_tx_fft_sr)(AVTXContext *s, void *_dst,
686  void *_src, ptrdiff_t stride)
687 {
688  TXComplex *src = _src;
689  TXComplex *dst = _dst;
690  int *map = s->sub[0].map;
691  int len = s->len;
692 
693  /* Compilers can't vectorize this anyway without assuming AVX2, which they
694  * generally don't, at least without -march=native -mtune=native */
695  for (int i = 0; i < len; i++)
696  dst[i] = src[map[i]];
697 
698  s->fn[0](&s->sub[0], dst, dst, stride);
699 }
700 
701 static void TX_NAME(ff_tx_fft_sr_inplace)(AVTXContext *s, void *_dst,
702  void *_src, ptrdiff_t stride)
703 {
704  TXComplex *dst = _dst;
705  TXComplex tmp;
706  const int *map = s->sub->map;
707  const int *inplace_idx = s->map;
708  int src_idx, dst_idx;
709 
710  src_idx = *inplace_idx++;
711  do {
712  tmp = dst[src_idx];
713  dst_idx = map[src_idx];
714  do {
715  FFSWAP(TXComplex, tmp, dst[dst_idx]);
716  dst_idx = map[dst_idx];
717  } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
718  dst[dst_idx] = tmp;
719  } while ((src_idx = *inplace_idx++));
720 
721  s->fn[0](&s->sub[0], dst, dst, stride);
722 }
723 
724 static const FFTXCodelet TX_NAME(ff_tx_fft_sr_def) = {
725  .name = TX_NAME_STR("fft_sr"),
726  .function = TX_NAME(ff_tx_fft_sr),
727  .type = TX_TYPE(FFT),
728  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
729  .factors[0] = 2,
730  .min_len = 2,
731  .max_len = TX_LEN_UNLIMITED,
732  .init = TX_NAME(ff_tx_fft_sr_init),
733  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
734  .prio = FF_TX_PRIO_BASE,
735 };
736 
737 static const FFTXCodelet TX_NAME(ff_tx_fft_sr_inplace_def) = {
738  .name = TX_NAME_STR("fft_sr_inplace"),
739  .function = TX_NAME(ff_tx_fft_sr_inplace),
740  .type = TX_TYPE(FFT),
741  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE,
742  .factors[0] = 2,
743  .min_len = 2,
744  .max_len = TX_LEN_UNLIMITED,
745  .init = TX_NAME(ff_tx_fft_sr_init),
746  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
747  .prio = FF_TX_PRIO_BASE,
748 };
749 
750 static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
751  ptrdiff_t stride)
752 {
753  TXComplex *src = _src;
754  TXComplex *dst = _dst;
755  const int n = s->len;
756  double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
757 
758  for(int i = 0; i < n; i++) {
759  TXComplex tmp = { 0 };
760  for(int j = 0; j < n; j++) {
761  const double factor = phase*i*j;
762  const TXComplex mult = {
763  RESCALE(cos(factor)),
764  RESCALE(sin(factor)),
765  };
766  TXComplex res;
767  CMUL3(res, src[j], mult);
768  tmp.re += res.re;
769  tmp.im += res.im;
770  }
771  dst[i] = tmp;
772  }
773 }
774 
775 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
776  .name = TX_NAME_STR("fft_naive"),
777  .function = TX_NAME(ff_tx_fft_naive),
778  .type = TX_TYPE(FFT),
779  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
780  .factors[0] = TX_FACTOR_ANY,
781  .min_len = 2,
782  .max_len = TX_LEN_UNLIMITED,
783  .init = NULL,
784  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
785  .prio = FF_TX_PRIO_MIN,
786 };
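/* Illustrative usage sketch (an assumption, not part of this file): none of
 * these codelets are called directly. Callers go through the public API in
 * libavutil/tx.h, which selects the highest-priority codelet matching the
 * requested type, length and flags. Error handling is kept minimal. */
#include "libavutil/tx.h"

static int example_forward_fft(AVComplexFloat *buf, int len)
{
    AVTXContext *ctx = NULL;
    av_tx_fn fn;
    float scale = 1.0f;
    int ret;

    /* AV_TX_INPLACE is requested so the same buffer can be passed as both
     * input and output below. */
    ret = av_tx_init(&ctx, &fn, AV_TX_FLOAT_FFT, 0, len, &scale, AV_TX_INPLACE);
    if (ret < 0)
        return ret;

    fn(ctx, buf, buf, sizeof(AVComplexFloat));

    av_tx_uninit(&ctx);
    return 0;
}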
787 
788 static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
789  const FFTXCodelet *cd,
790  uint64_t flags,
791  FFTXCodeletOptions *opts,
792  int len, int inv,
793  const void *scale)
794 {
795  int ret;
796  int sub_len = len / cd->factors[0];
797  FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
798 
799  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
800  flags |= AV_TX_INPLACE; /* in-place */
801  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
802 
803  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
804  sub_len, inv, scale)))
805  return ret;
806 
807  if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
808  return ret;
809 
810  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
811  return AVERROR(ENOMEM);
812 
813  TX_TAB(ff_tx_init_tabs)(len / sub_len);
814 
815  return 0;
816 }
817 
818 #define DECL_COMP_FFT(N) \
819 static void TX_NAME(ff_tx_fft_pfa_##N##xM)(AVTXContext *s, void *_out, \
820  void *_in, ptrdiff_t stride) \
821 { \
822  const int m = s->sub->len; \
823  const int *in_map = s->map, *out_map = in_map + s->len; \
824  const int *sub_map = s->sub->map; \
825  TXComplex *in = _in; \
826  TXComplex *out = _out; \
827  TXComplex fft##N##in[N]; \
828  \
829  for (int i = 0; i < m; i++) { \
830  for (int j = 0; j < N; j++) \
831  fft##N##in[j] = in[in_map[i*N + j]]; \
832  fft##N(s->tmp + sub_map[i], fft##N##in, m); \
833  } \
834  \
835  for (int i = 0; i < N; i++) \
836  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
837  \
838  for (int i = 0; i < N*m; i++) \
839  out[i] = s->tmp[out_map[i]]; \
840 } \
841  \
842 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_##N##xM_def) = { \
843  .name = TX_NAME_STR("fft_pfa_" #N "xM"), \
844  .function = TX_NAME(ff_tx_fft_pfa_##N##xM), \
845  .type = TX_TYPE(FFT), \
846  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, \
847  .factors = { N, TX_FACTOR_ANY }, \
848  .min_len = N*2, \
849  .max_len = TX_LEN_UNLIMITED, \
850  .init = TX_NAME(ff_tx_fft_pfa_init), \
851  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
852  .prio = FF_TX_PRIO_BASE, \
853 };
854 
855 DECL_COMP_FFT(3)
856 DECL_COMP_FFT(5)
857 DECL_COMP_FFT(7)
858 DECL_COMP_FFT(9)
859 DECL_COMP_FFT(15)
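/* Added note: these compound codelets implement the prime-factor algorithm:
 * a transform of length N*M, with N and M coprime, is split into M
 * transforms of length N followed by N transforms of length M. Because the
 * factors are coprime, the index maps from ff_tx_gen_compound_mapping()
 * reorder inputs and outputs so that no inter-stage twiddle multiplications
 * are needed. */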
860 
861 static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s,
862  const FFTXCodelet *cd,
863  uint64_t flags,
864  FFTXCodeletOptions *opts,
865  int len, int inv,
866  const void *scale)
867 {
868  s->scale_d = *((SCALE_TYPE *)scale);
869  s->scale_f = s->scale_d;
870  return 0;
871 }
872 
873 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
874  void *_src, ptrdiff_t stride)
875 {
876  TXSample *src = _src;
877  TXSample *dst = _dst;
878  double scale = s->scale_d;
879  int len = s->len;
880  const double phase = M_PI/(4.0*len);
881 
882  stride /= sizeof(*dst);
883 
884  for (int i = 0; i < len; i++) {
885  double sum = 0.0;
886  for (int j = 0; j < len*2; j++) {
887  int a = (2*j + 1 + len) * (2*i + 1);
888  sum += UNSCALE(src[j]) * cos(a * phase);
889  }
890  dst[i*stride] = RESCALE(sum*scale);
891  }
892 }
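/* Added note: with N = len, the loop above evaluates the textbook MDCT
 *   X[k] = scale * sum_{n=0..2N-1} x[n] * cos(pi/(4N) * (2n + 1 + N) * (2k + 1))
 * directly in O(N^2) time. It exists as a reference/fallback, which is why
 * its codelet below is registered with FF_TX_PRIO_MIN. */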
893 
894 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
895  void *_src, ptrdiff_t stride)
896 {
897  TXSample *src = _src;
898  TXSample *dst = _dst;
899  double scale = s->scale_d;
900  int len = s->len >> 1;
901  int len2 = len*2;
902  const double phase = M_PI/(4.0*len2);
903 
904  stride /= sizeof(*src);
905 
906  for (int i = 0; i < len; i++) {
907  double sum_d = 0.0;
908  double sum_u = 0.0;
909  double i_d = phase * (4*len - 2*i - 1);
910  double i_u = phase * (3*len2 + 2*i + 1);
911  for (int j = 0; j < len2; j++) {
912  double a = (2 * j + 1);
913  double a_d = cos(a * i_d);
914  double a_u = cos(a * i_u);
915  double val = UNSCALE(src[j*stride]);
916  sum_d += a_d * val;
917  sum_u += a_u * val;
918  }
919  dst[i + 0] = RESCALE( sum_d*scale);
920  dst[i + len] = RESCALE(-sum_u*scale);
921  }
922 }
923 
924 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
925  .name = TX_NAME_STR("mdct_naive_fwd"),
926  .function = TX_NAME(ff_tx_mdct_naive_fwd),
927  .type = TX_TYPE(MDCT),
928  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
929  .factors = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
930  .min_len = 2,
931  .max_len = TX_LEN_UNLIMITED,
932  .init = TX_NAME(ff_tx_mdct_naive_init),
933  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
934  .prio = FF_TX_PRIO_MIN,
935 };
936 
937 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
938  .name = TX_NAME_STR("mdct_naive_inv"),
939  .function = TX_NAME(ff_tx_mdct_naive_inv),
940  .type = TX_TYPE(MDCT),
941  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
942  .factors = { 2, TX_FACTOR_ANY },
943  .min_len = 2,
944  .max_len = TX_LEN_UNLIMITED,
945  .init = TX_NAME(ff_tx_mdct_naive_init),
946  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
947  .prio = FF_TX_PRIO_MIN,
948 };
949 
950 static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
951  const FFTXCodelet *cd,
952  uint64_t flags,
953  FFTXCodeletOptions *opts,
954  int len, int inv,
955  const void *scale)
956 {
957  int ret;
958  FFTXCodeletOptions sub_opts = { .invert_lookup = inv };
959 
960  s->scale_d = *((SCALE_TYPE *)scale);
961  s->scale_f = s->scale_d;
962 
963  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
964  flags |= AV_TX_INPLACE; /* in-place */
965  flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */
966 
967  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
968  inv, scale))) {
969  flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
970  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
971  inv, scale)))
972  return ret;
973  }
974 
975  s->map = av_malloc((len >> 1)*sizeof(*s->map));
976  if (!s->map)
977  return AVERROR(ENOMEM);
978 
979  /* If we need to preshuffle, copy the map from the subcontext */
980  if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
981  memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
982  } else {
983  for (int i = 0; i < len >> 1; i++)
984  s->map[i] = i;
985  }
986 
987  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
988  return ret;
989 
990  /* Saves a multiply in a hot path. */
991  if (inv)
992  for (int i = 0; i < (s->len >> 1); i++)
993  s->map[i] <<= 1;
994 
995  return 0;
996 }
997 
998 static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
999  ptrdiff_t stride)
1000 {
1001  TXSample *src = _src, *dst = _dst;
1002  TXComplex *exp = s->exp, tmp, *z = _dst;
1003  const int len2 = s->len >> 1;
1004  const int len4 = s->len >> 2;
1005  const int len3 = len2 * 3;
1006  const int *sub_map = s->map;
1007 
1008  stride /= sizeof(*dst);
1009 
1010  for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1011  const int k = 2*i;
1012  const int idx = sub_map[i];
1013  if (k < len2) {
1014  tmp.re = FOLD(-src[ len2 + k], src[1*len2 - 1 - k]);
1015  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1016  } else {
1017  tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1018  tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1019  }
1020  CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1021  }
1022 
1023  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1024 
1025  for (int i = 0; i < len4; i++) {
1026  const int i0 = len4 + i, i1 = len4 - i - 1;
1027  TXComplex src1 = { z[i1].re, z[i1].im };
1028  TXComplex src0 = { z[i0].re, z[i0].im };
1029 
1030  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1031  exp[i0].im, exp[i0].re);
1032  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1033  exp[i1].im, exp[i1].re);
1034  }
1035 }
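/* Added note: this is the usual "fold + half-length complex FFT +
 * post-rotation" forward MDCT: the 2*len real inputs are folded into len/2
 * complex values (the two FOLD() branches handle the wrap-around at the
 * window edges), the half-length FFT runs on them in place, and the final
 * loop applies the post-rotation twiddles while writing the len outputs in
 * the interleaved order expected by the API. */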
1036 
1037 static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
1038  ptrdiff_t stride)
1039 {
1040  TXComplex *z = _dst, *exp = s->exp;
1041  const TXSample *src = _src, *in1, *in2;
1042  const int len2 = s->len >> 1;
1043  const int len4 = s->len >> 2;
1044  const int *sub_map = s->map;
1045 
1046  stride /= sizeof(*src);
1047  in1 = src;
1048  in2 = src + ((len2*2) - 1) * stride;
1049 
1050  for (int i = 0; i < len2; i++) {
1051  int k = sub_map[i];
1052  TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1053  CMUL3(z[i], tmp, exp[i]);
1054  }
1055 
1056  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1057 
1058  exp += len2;
1059  for (int i = 0; i < len4; i++) {
1060  const int i0 = len4 + i, i1 = len4 - i - 1;
1061  TXComplex src1 = { z[i1].im, z[i1].re };
1062  TXComplex src0 = { z[i0].im, z[i0].re };
1063 
1064  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1065  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1066  }
1067 }
1068 
1069 static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
1070  .name = TX_NAME_STR("mdct_fwd"),
1071  .function = TX_NAME(ff_tx_mdct_fwd),
1072  .type = TX_TYPE(MDCT),
1073  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1074  .factors = { 2, TX_FACTOR_ANY },
1075  .min_len = 2,
1076  .max_len = TX_LEN_UNLIMITED,
1077  .init = TX_NAME(ff_tx_mdct_init),
1078  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
1079  .prio = FF_TX_PRIO_BASE,
1080 };
1081 
1082 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
1083  .name = TX_NAME_STR("mdct_inv"),
1084  .function = TX_NAME(ff_tx_mdct_inv),
1085  .type = TX_TYPE(MDCT),
1086  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1087  .factors = { 2, TX_FACTOR_ANY },
1088  .min_len = 2,
1089  .max_len = TX_LEN_UNLIMITED,
1090  .init = TX_NAME(ff_tx_mdct_init),
1091  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
1092  .prio = FF_TX_PRIO_BASE,
1093 };
1094 
1095 static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s,
1096  const FFTXCodelet *cd,
1097  uint64_t flags,
1098  FFTXCodeletOptions *opts,
1099  int len, int inv,
1100  const void *scale)
1101 {
1102  int ret;
1103 
1104  s->scale_d = *((SCALE_TYPE *)scale);
1105  s->scale_f = s->scale_d;
1106 
1107  flags &= ~AV_TX_FULL_IMDCT;
1108 
1109  if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1110  return ret;
1111 
1112  return 0;
1113 }
1114 
1115 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1116  void *_src, ptrdiff_t stride)
1117 {
1118  int len = s->len << 1;
1119  int len2 = len >> 1;
1120  int len4 = len >> 2;
1121  TXSample *dst = _dst;
1122 
1123  s->fn[0](&s->sub[0], dst + len4, _src, stride);
1124 
1125  stride /= sizeof(*dst);
1126 
1127  for (int i = 0; i < len4; i++) {
1128  dst[ i*stride] = -dst[(len2 - i - 1)*stride];
1129  dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride];
1130  }
1131 }
1132 
1133 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
1134  .name = TX_NAME_STR("mdct_inv_full"),
1135  .function = TX_NAME(ff_tx_mdct_inv_full),
1136  .type = TX_TYPE(MDCT),
1137  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1138  FF_TX_OUT_OF_PLACE | AV_TX_FULL_IMDCT,
1139  .factors = { 2, TX_FACTOR_ANY },
1140  .min_len = 2,
1141  .max_len = TX_LEN_UNLIMITED,
1142  .init = TX_NAME(ff_tx_mdct_inv_full_init),
1143  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
1144  .prio = FF_TX_PRIO_BASE,
1145 };
1146 
1147 static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
1148  const FFTXCodelet *cd,
1149  uint64_t flags,
1150  FFTXCodeletOptions *opts,
1151  int len, int inv,
1152  const void *scale)
1153 {
1154  int ret, sub_len;
1155  FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
1156 
1157  len >>= 1;
1158  sub_len = len / cd->factors[0];
1159 
1160  s->scale_d = *((SCALE_TYPE *)scale);
1161  s->scale_f = s->scale_d;
1162 
1163  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1164  flags |= AV_TX_INPLACE; /* in-place */
1165  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
1166 
1167  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1168  sub_len, inv, scale)))
1169  return ret;
1170 
1171  if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
1172  return ret;
1173 
1174  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1175  return ret;
1176 
1177  /* Saves multiplies in loops. */
1178  for (int i = 0; i < len; i++)
1179  s->map[i] <<= 1;
1180 
1181  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1182  return AVERROR(ENOMEM);
1183 
1184  TX_TAB(ff_tx_init_tabs)(len / sub_len);
1185 
1186  return 0;
1187 }
1188 
1189 #define DECL_COMP_IMDCT(N) \
1190 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
1191  void *_src, ptrdiff_t stride) \
1192 { \
1193  TXComplex fft##N##in[N]; \
1194  TXComplex *z = _dst, *exp = s->exp; \
1195  const TXSample *src = _src, *in1, *in2; \
1196  const int len4 = s->len >> 2; \
1197  const int len2 = s->len >> 1; \
1198  const int m = s->sub->len; \
1199  const int *in_map = s->map, *out_map = in_map + N*m; \
1200  const int *sub_map = s->sub->map; \
1201  \
1202  stride /= sizeof(*src); /* To convert it from bytes */ \
1203  in1 = src; \
1204  in2 = src + ((N*m*2) - 1) * stride; \
1205  \
1206  for (int i = 0; i < len2; i += N) { \
1207  for (int j = 0; j < N; j++) { \
1208  const int k = in_map[j]; \
1209  TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
1210  CMUL3(fft##N##in[j], tmp, exp[j]); \
1211  } \
1212  fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
1213  exp += N; \
1214  in_map += N; \
1215  } \
1216  \
1217  for (int i = 0; i < N; i++) \
1218  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1219  \
1220  for (int i = 0; i < len4; i++) { \
1221  const int i0 = len4 + i, i1 = len4 - i - 1; \
1222  const int s0 = out_map[i0], s1 = out_map[i1]; \
1223  TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \
1224  TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \
1225  \
1226  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \
1227  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \
1228  } \
1229 } \
1230  \
1231 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \
1232  .name = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \
1233  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \
1234  .type = TX_TYPE(MDCT), \
1235  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \
1236  .factors = { N, TX_FACTOR_ANY }, \
1237  .min_len = N*2, \
1238  .max_len = TX_LEN_UNLIMITED, \
1239  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1240  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1241  .prio = FF_TX_PRIO_BASE, \
1242 };
1243 
1244 DECL_COMP_IMDCT(3)
1245 DECL_COMP_IMDCT(5)
1246 DECL_COMP_IMDCT(7)
1247 DECL_COMP_IMDCT(9)
1248 DECL_COMP_IMDCT(15)
1249 
1250 #define DECL_COMP_MDCT(N) \
1251 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \
1252  void *_src, ptrdiff_t stride) \
1253 { \
1254  TXComplex fft##N##in[N]; \
1255  TXSample *src = _src, *dst = _dst; \
1256  TXComplex *exp = s->exp, tmp; \
1257  const int m = s->sub->len; \
1258  const int len4 = N*m; \
1259  const int len3 = len4 * 3; \
1260  const int len8 = s->len >> 2; \
1261  const int *in_map = s->map, *out_map = in_map + N*m; \
1262  const int *sub_map = s->sub->map; \
1263  \
1264  stride /= sizeof(*dst); \
1265  \
1266  for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ \
1267  for (int j = 0; j < N; j++) { \
1268  const int k = in_map[i*N + j]; \
1269  if (k < len4) { \
1270  tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \
1271  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \
1272  } else { \
1273  tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \
1274  tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \
1275  } \
1276  CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \
1277  exp[k >> 1].re, exp[k >> 1].im); \
1278  } \
1279  fft##N(s->tmp + sub_map[i], fft##N##in, m); \
1280  } \
1281  \
1282  for (int i = 0; i < N; i++) \
1283  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1284  \
1285  for (int i = 0; i < len8; i++) { \
1286  const int i0 = len8 + i, i1 = len8 - i - 1; \
1287  const int s0 = out_map[i0], s1 = out_map[i1]; \
1288  TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \
1289  TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \
1290  \
1291  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \
1292  exp[i0].im, exp[i0].re); \
1293  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \
1294  exp[i1].im, exp[i1].re); \
1295  } \
1296 } \
1297  \
1298 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \
1299  .name = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \
1300  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \
1301  .type = TX_TYPE(MDCT), \
1302  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
1303  .factors = { N, TX_FACTOR_ANY }, \
1304  .min_len = N*2, \
1305  .max_len = TX_LEN_UNLIMITED, \
1306  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1307  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1308  .prio = FF_TX_PRIO_BASE, \
1309 };
1310 
1311 DECL_COMP_MDCT(3)
1312 DECL_COMP_MDCT(5)
1313 DECL_COMP_MDCT(7)
1314 DECL_COMP_MDCT(9)
1315 DECL_COMP_MDCT(15)
1316 
1317 static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
1318  const FFTXCodelet *cd,
1319  uint64_t flags,
1320  FFTXCodeletOptions *opts,
1321  int len, int inv,
1322  const void *scale)
1323 {
1324  int ret;
1325  double f, m;
1326  TXSample *tab;
1327 
1328  s->scale_d = *((SCALE_TYPE *)scale);
1329  s->scale_f = s->scale_d;
1330 
1331  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1332  return ret;
1333 
1334  if (!(s->exp = av_mallocz((8 + (len >> 2) - 1)*sizeof(*s->exp))))
1335  return AVERROR(ENOMEM);
1336 
1337  tab = (TXSample *)s->exp;
1338 
1339  f = 2*M_PI/len;
1340 
1341  m = (inv ? 2*s->scale_d : s->scale_d);
1342 
1343  *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1344  *tab++ = RESCALE(inv ? 0.5*m : 1.0);
1345  *tab++ = RESCALE( m);
1346  *tab++ = RESCALE(-m);
1347 
1348  *tab++ = RESCALE( (0.5 - 0.0) * m);
1349  *tab++ = RESCALE( (0.0 - 0.5) * m);
1350  *tab++ = RESCALE( (0.5 - inv) * m);
1351  *tab++ = RESCALE(-(0.5 - inv) * m);
1352 
1353  for (int i = 0; i < len >> 2; i++)
1354  *tab++ = RESCALE(cos(i*f));
1355  for (int i = len >> 2; i >= 0; i--)
1356  *tab++ = RESCALE(cos(i*f) * (inv ? +1.0 : -1.0));
1357 
1358  return 0;
1359 }
1360 
1361 #define DECL_RDFT(name, inv) \
1362 static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst, \
1363  void *_src, ptrdiff_t stride) \
1364 { \
1365  const int len2 = s->len >> 1; \
1366  const int len4 = s->len >> 2; \
1367  const TXSample *fact = (void *)s->exp; \
1368  const TXSample *tcos = fact + 8; \
1369  const TXSample *tsin = tcos + len4; \
1370  TXComplex *data = inv ? _src : _dst; \
1371  TXComplex t[3]; \
1372  \
1373  if (!inv) \
1374  s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \
1375  else \
1376  data[0].im = data[len2].re; \
1377  \
1378  /* Both components of the DC value are real, but we need to turn them \
1379  * into complex values. Also, the middle of the array is special-cased. \
1380  * These operations can be done before or after the loop. */ \
1381  t[0].re = data[0].re; \
1382  data[0].re = t[0].re + data[0].im; \
1383  data[0].im = t[0].re - data[0].im; \
1384  data[ 0].re = MULT(fact[0], data[ 0].re); \
1385  data[ 0].im = MULT(fact[1], data[ 0].im); \
1386  data[len4].re = MULT(fact[2], data[len4].re); \
1387  data[len4].im = MULT(fact[3], data[len4].im); \
1388  \
1389  for (int i = 1; i < len4; i++) { \
1390  /* Separate even and odd FFTs */ \
1391  t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \
1392  t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \
1393  t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \
1394  t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \
1395  \
1396  /* Apply twiddle factors to the odd FFT and add to the even FFT */ \
1397  CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \
1398  \
1399  data[ i].re = t[0].re + t[2].re; \
1400  data[ i].im = t[2].im - t[0].im; \
1401  data[len2 - i].re = t[0].re - t[2].re; \
1402  data[len2 - i].im = t[2].im + t[0].im; \
1403  } \
1404  \
1405  if (inv) { \
1406  s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \
1407  } else { \
1408  /* Move [0].im to the last position, as convention requires */ \
1409  data[len2].re = data[0].im; \
1410  data[ 0].im = 0; \
1411  } \
1412 }
1413 
1414 DECL_RDFT(r2c, 0)
1415 DECL_RDFT(c2r, 1)
1416 
1417 static const FFTXCodelet TX_NAME(ff_tx_rdft_r2c_def) = {
1418  .name = TX_NAME_STR("rdft_r2c"),
1419  .function = TX_NAME(ff_tx_rdft_r2c),
1420  .type = TX_TYPE(RDFT),
1421  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1422  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1423  .factors = { 2, TX_FACTOR_ANY },
1424  .min_len = 2,
1425  .max_len = TX_LEN_UNLIMITED,
1426  .init = TX_NAME(ff_tx_rdft_init),
1427  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
1428  .prio = FF_TX_PRIO_BASE,
1429 };
1430 
1431 static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
1432  .name = TX_NAME_STR("rdft_c2r"),
1433  .function = TX_NAME(ff_tx_rdft_c2r),
1434  .type = TX_TYPE(RDFT),
1435  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1436  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1437  .factors = { 2, TX_FACTOR_ANY },
1438  .min_len = 2,
1439  .max_len = TX_LEN_UNLIMITED,
1440  .init = TX_NAME(ff_tx_rdft_init),
1441  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
1442  .prio = FF_TX_PRIO_BASE,
1443 };
1444 
1445 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
1446 {
1447  int off = 0;
1448  int len4 = s->len >> 1;
1449  double scale = s->scale_d;
1450  const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
1451  size_t alloc = pre_tab ? 2*len4 : len4;
1452 
1453  if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
1454  return AVERROR(ENOMEM);
1455 
1456  scale = sqrt(fabs(scale));
1457 
1458  if (pre_tab)
1459  off = len4;
1460 
1461  for (int i = 0; i < len4; i++) {
1462  const double alpha = M_PI_2 * (i + theta) / len4;
1463  s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
1464  RESCALE(sin(alpha) * scale) };
1465  }
1466 
1467  if (pre_tab)
1468  for (int i = 0; i < len4; i++)
1469  s->exp[i] = s->exp[len4 + pre_tab[i]];
1470 
1471  return 0;
1472 }
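/* Added note: the table built above holds the MDCT pre/post-rotation
 * twiddles exp[i] = sqrt(|scale|) * (cos(a), sin(a)) with
 * a = (pi/2) * (i + theta) / len4 and theta = 1/8, offset by len4 when the
 * requested scale is negative. When pre_tab is supplied (the inverse MDCT
 * path), a copy permuted by pre_tab is placed at the start of the array so
 * the pre-rotation can read its twiddles in the FFT's shuffled input order,
 * while the unpermuted copy remains at offset len4 for the post-rotation. */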
1473 
1474 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
1475  /* Split-Radix codelets */
1476  &TX_NAME(ff_tx_fft2_ns_def),
1477  &TX_NAME(ff_tx_fft4_ns_def),
1478  &TX_NAME(ff_tx_fft8_ns_def),
1479  &TX_NAME(ff_tx_fft16_ns_def),
1480  &TX_NAME(ff_tx_fft32_ns_def),
1481  &TX_NAME(ff_tx_fft64_ns_def),
1482  &TX_NAME(ff_tx_fft128_ns_def),
1483  &TX_NAME(ff_tx_fft256_ns_def),
1484  &TX_NAME(ff_tx_fft512_ns_def),
1485  &TX_NAME(ff_tx_fft1024_ns_def),
1486  &TX_NAME(ff_tx_fft2048_ns_def),
1487  &TX_NAME(ff_tx_fft4096_ns_def),
1488  &TX_NAME(ff_tx_fft8192_ns_def),
1489  &TX_NAME(ff_tx_fft16384_ns_def),
1490  &TX_NAME(ff_tx_fft32768_ns_def),
1491  &TX_NAME(ff_tx_fft65536_ns_def),
1492  &TX_NAME(ff_tx_fft131072_ns_def),
1493 
1494  /* Standalone transforms */
1495  &TX_NAME(ff_tx_fft_sr_def),
1496  &TX_NAME(ff_tx_fft_sr_inplace_def),
1497  &TX_NAME(ff_tx_fft_pfa_3xM_def),
1498  &TX_NAME(ff_tx_fft_pfa_5xM_def),
1499  &TX_NAME(ff_tx_fft_pfa_7xM_def),
1500  &TX_NAME(ff_tx_fft_pfa_9xM_def),
1501  &TX_NAME(ff_tx_fft_pfa_15xM_def),
1502  &TX_NAME(ff_tx_fft_naive_def),
1503  &TX_NAME(ff_tx_mdct_fwd_def),
1504  &TX_NAME(ff_tx_mdct_inv_def),
1505  &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
1506  &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
1507  &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
1508  &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
1509  &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
1510  &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
1511  &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
1512  &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
1513  &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
1514  &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
1515  &TX_NAME(ff_tx_mdct_naive_fwd_def),
1516  &TX_NAME(ff_tx_mdct_naive_inv_def),
1517  &TX_NAME(ff_tx_mdct_inv_full_def),
1518  &TX_NAME(ff_tx_rdft_r2c_def),
1519  &TX_NAME(ff_tx_rdft_c2r_def),
1520 
1521  NULL,
1522 };