FFmpeg
fft_mips.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Stanislav Ocovaj (socovaj@mips.com)
30  * Author: Zoran Lukic (zoranl@mips.com)
31  *
32  * Optimized MDCT/IMDCT and FFT transforms
33  *
34  * This file is part of FFmpeg.
35  *
36  * FFmpeg is free software; you can redistribute it and/or
37  * modify it under the terms of the GNU Lesser General Public
38  * License as published by the Free Software Foundation; either
39  * version 2.1 of the License, or (at your option) any later version.
40  *
41  * FFmpeg is distributed in the hope that it will be useful,
42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44  * Lesser General Public License for more details.
45  *
46  * You should have received a copy of the GNU Lesser General Public
47  * License along with FFmpeg; if not, write to the Free Software
48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49  */
50 #include "config.h"
51 #include "libavcodec/fft.h"
52 #include "libavcodec/fft_table.h"
53 #include "libavutil/mips/asmdefs.h"
54 
55 /**
56  * FFT transform
57  */
58 
59 #if HAVE_INLINE_ASM
60 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * In-place FFT of (1 << s->nbits) complex points stored in z.
 *
 * The work is done in three stages, each iterating over block offsets read
 * from ff_fft_offsets_lut:
 *  1. 4-point butterflies written in plain C;
 *  2. 8-point blocks in inline assembly (not reached when the size is below 8),
 *     using the constant 0.7071067812;
 *  3. for nbits = 4 .. s->nbits, passes that combine the four quarters of every
 *     (1 << nbits)-point block, with twiddle factors read from ff_cos_131072.
 *
 * NOTE(review): nothing in this function reorders z, so the caller presumably
 * supplies the data already permuted -- confirm against the generic FFT code.
 *
 * @param s FFT context; only s->nbits is read here
 * @param z array of complex samples, transformed in place
 */
61 static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
62 {
63  int nbits, i, n, num_transforms, offset, step;
64  int n4, n2, n34;
65  FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
66  FFTComplex *tmpz;
67  float w_re, w_im;
68  float *w_re_ptr, *w_im_ptr;
69  const int fft_size = (1 << s->nbits);
70  float pom, pom1, pom2, pom3;
71  float temp, temp1, temp3, temp4;
72  FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
73  FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
74 
   /* Number of 4-point blocks handled by stage 1. The "| 1" guarantees at
    * least one block. NOTE(review): 21845 equals the declared length of
    * ff_fft_offsets_lut; 17 presumably is log2 of the largest supported
    * size -- confirm. */
75  num_transforms = (21845 >> (17 - s->nbits)) | 1;
76 
   /* Stage 1: 4-point butterfly on each block of four complex values;
    * the LUT entry is scaled by 4 to obtain the element offset. */
77  for (n=0; n<num_transforms; n++) {
78  offset = ff_fft_offsets_lut[n] << 2;
79  tmpz = z + offset;
80 
81  tmp1 = tmpz[0].re + tmpz[1].re;
82  tmp5 = tmpz[2].re + tmpz[3].re;
83  tmp2 = tmpz[0].im + tmpz[1].im;
84  tmp6 = tmpz[2].im + tmpz[3].im;
85  tmp3 = tmpz[0].re - tmpz[1].re;
86  tmp8 = tmpz[2].im - tmpz[3].im;
87  tmp4 = tmpz[0].im - tmpz[1].im;
88  tmp7 = tmpz[2].re - tmpz[3].re;
89 
90  tmpz[0].re = tmp1 + tmp5;
91  tmpz[2].re = tmp1 - tmp5;
92  tmpz[0].im = tmp2 + tmp6;
93  tmpz[2].im = tmp2 - tmp6;
94  tmpz[1].re = tmp3 + tmp8;
95  tmpz[3].re = tmp3 - tmp8;
96  tmpz[1].im = tmp4 - tmp7;
97  tmpz[3].im = tmp4 + tmp7;
98 
99  }
100 
   /* Nothing further is computed for sizes below 8. */
101  if (fft_size < 8)
102  return;
103 
   /* Each following stage roughly halves the number of blocks. */
104  num_transforms = (num_transforms >> 1) | 1;
105 
   /* Stage 2: 8-point blocks; the LUT entry is scaled by 8 this time. */
106  for (n=0; n<num_transforms; n++) {
107  offset = ff_fft_offsets_lut[n] << 3;
108  tmpz = z + offset;
109 
   /* Byte offsets address the block directly: each complex value occupies
    * 8 bytes (re at +0, im at +4), so 0..28 is tmpz[0..3] and 32..60 is
    * tmpz[4..7]. The C-style trailing comments give the equivalent scalar
    * statement for the instruction they sit on. */
110  __asm__ volatile (
111  "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
112  "lwc1 %[pom], 40(%[tmpz]) \n\t"
113  "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
114  "lwc1 %[pom1], 56(%[tmpz]) \n\t"
115  "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
116  "lwc1 %[pom2], 44(%[tmpz]) \n\t"
117  "lwc1 %[pom3], 60(%[tmpz]) \n\t"
118  "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
119  "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
120  "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
121  "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
122  "lwc1 %[pom], 40(%[tmpz]) \n\t"
123  "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
124  "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
125  "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
   /* tmpz[4..7] are loaded a second time below because the registers that
    * held them were overwritten by the sums above. */
126  "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
127  "lwc1 %[pom1], 44(%[tmpz]) \n\t"
128  "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
129  "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
130  "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
131  "lwc1 %[pom2], 56(%[tmpz]) \n\t"
132  "lwc1 %[pom3], 60(%[tmpz]) \n\t"
133  "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
134  "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
135  "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
136  "lwc1 %[pom], 0(%[tmpz]) \n\t"
137  "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
138  "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
139  "lwc1 %[pom2], 4(%[tmpz]) \n\t"
140  "sub.s %[pom1], %[pom], %[tmp5] \n\t"
141  "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
142  "add.s %[pom3], %[pom], %[tmp5] \n\t"
143  "sub.s %[pom], %[pom2], %[tmp6] \n\t"
144  "add.s %[pom2], %[pom2], %[tmp6] \n\t"
145  "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
146  "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
147  "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
148  "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
149  "lwc1 %[pom1], 16(%[tmpz]) \n\t"
150  "lwc1 %[pom3], 20(%[tmpz]) \n\t"
151  "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
152  "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
153  "sub.s %[temp], %[pom1], %[tmp8] \n\t"
154  "add.s %[pom2], %[pom3], %[tmp7] \n\t"
155  "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
156  "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
157  "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
158  "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
159  "add.s %[pom1], %[pom1], %[tmp8] \n\t"
160  "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
161  "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
162  "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
163  "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
164  "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
165  "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
166  "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
167  "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
168  "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
169  "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
170  "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
171  "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
172  "lwc1 %[temp], 8(%[tmpz]) \n\t"
173  "lwc1 %[temp1],12(%[tmpz]) \n\t"
174  "lwc1 %[pom], 24(%[tmpz]) \n\t"
175  "lwc1 %[pom2], 28(%[tmpz]) \n\t"
176  "sub.s %[temp4],%[temp], %[tmp1] \n\t"
177  "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
178  "add.s %[temp], %[temp], %[tmp1] \n\t"
179  "add.s %[temp1],%[temp1], %[tmp2] \n\t"
180  "sub.s %[pom1], %[pom], %[tmp4] \n\t"
181  "add.s %[pom3], %[pom2], %[tmp3] \n\t"
182  "add.s %[pom], %[pom], %[tmp4] \n\t"
183  "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
184  "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
185  "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
186  "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
187  "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
188  "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
189  "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
190  "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
191  "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
   /* Every float is a scratch output; the data itself is reached only
    * through the tmpz pointer, hence the "memory" clobber. */
192  : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
193  [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
194  [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
195  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
196  : [tmpz]"r"(tmpz)
197  : "memory"
198  );
199  }
200 
   /* Stage 3 set-up: step is the twiddle-table stride for the first
    * (nbits == 4) pass and n4 the quarter length of its 16-point blocks;
    * step is halved and n4 doubled at the end of every pass. */
201  step = 1 << (MAX_LOG2_NFFT - 4);
202  n4 = 4;
203 
204  for (nbits=4; nbits<=s->nbits; nbits++) {
205  num_transforms = (num_transforms >> 1) | 1;
206  n2 = 2 * n4;
207  n34 = 3 * n4;
208 
209  for (n=0; n<num_transforms; n++) {
210  offset = ff_fft_offsets_lut[n] << nbits;
211  tmpz = z + offset;
212 
   /* Start of the second, third and fourth quarter of the block. */
213  tmpz_n2 = tmpz + n2;
214  tmpz_n4 = tmpz + n4;
215  tmpz_n34 = tmpz + n34;
216 
   /* First element of each quarter (i == 0): handled apart from the loop
    * below because it needs additions and subtractions only, with no
    * twiddle multiplication. */
217  __asm__ volatile (
218  "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
219  "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
220  "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
221  "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
222  "lwc1 %[temp1],0(%[tmpz]) \n\t"
223  "lwc1 %[temp3],4(%[tmpz]) \n\t"
224  "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
225  "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
226  "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
227  "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
228  "sub.s %[temp], %[temp1], %[tmp5] \n\t"
229  "add.s %[temp1],%[temp1], %[tmp5] \n\t"
230  "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
231  "add.s %[temp3],%[temp3], %[tmp6] \n\t"
232  "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
233  "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
234  "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
235  "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
236  "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
237  "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
238  "sub.s %[pom], %[pom1], %[tmp2] \n\t"
239  "add.s %[pom1], %[pom1], %[tmp2] \n\t"
240  "add.s %[temp1],%[temp], %[tmp1] \n\t"
241  "sub.s %[temp], %[temp], %[tmp1] \n\t"
242  "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
243  "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
244  "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
245  "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
246  : [tmp5]"=&f"(tmp5),
247  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
248  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3),
249  [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
250  : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
251  : "memory"
252  );
253 
   /* Twiddle factors come from one table read in opposite directions:
    * w_re walks upwards from index step, w_im downwards from
    * MAX_FFT_SIZE/4 - step, both moving by step per element. */
254  w_re_ptr = (float*)(ff_cos_131072 + step);
255  w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step);
256 
   /* Remaining elements of each quarter: same butterfly as above, but the
    * n2 and n34 inputs are first multiplied by (w_re, w_im) as spelled out
    * in the trailing comments of the msub.s/madd.s lines. */
257  for (i=1; i<n4; i++) {
258  w_re = w_re_ptr[0];
259  w_im = w_im_ptr[0];
260  tmpz_n2_i = tmpz_n2 + i;
261  tmpz_n4_i = tmpz_n4 + i;
262  tmpz_n34_i= tmpz_n34 + i;
263  tmpz_i = tmpz + i;
264 
265  __asm__ volatile (
266  "lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t"
267  "lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t"
268  "lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t"
269  "lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t"
270  "mul.s %[temp3], %[w_im], %[temp] \n\t"
271  "mul.s %[temp4], %[w_im], %[temp1] \n\t"
272  "mul.s %[pom2], %[w_im], %[pom1] \n\t"
273  "mul.s %[pom3], %[w_im], %[pom] \n\t"
274  "msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
275  "madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
276  "msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
277  "madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
278  "lwc1 %[temp], 0(%[tmpz_i]) \n\t"
279  "lwc1 %[pom], 4(%[tmpz_i]) \n\t"
280  "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
281  "sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3;
282  "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
283  "sub.s %[tmp2], %[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4;
284  "sub.s %[temp1], %[temp], %[tmp5] \n\t"
285  "add.s %[temp], %[temp], %[tmp5] \n\t"
286  "sub.s %[pom1], %[pom], %[tmp6] \n\t"
287  "add.s %[pom], %[pom], %[tmp6] \n\t"
288  "lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t"
289  "lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t"
290  "swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5;
291  "swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5;
292  "swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6;
293  "swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6;
294  "sub.s %[temp4], %[temp3], %[tmp2] \n\t"
295  "add.s %[pom3], %[pom2], %[tmp1] \n\t"
296  "add.s %[temp3], %[temp3], %[tmp2] \n\t"
297  "sub.s %[pom2], %[pom2], %[tmp1] \n\t"
298  "swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
299  "swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
300  "swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
301  "swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
302  : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
303  [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
304  [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
305  [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
306  : [w_re]"f"(w_re), [w_im]"f"(w_im),
307  [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
308  [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
309  : "memory"
310  );
311  w_re_ptr += step;
312  w_im_ptr -= step;
313  }
314  }
   /* The next pass works on blocks twice as long, so it needs a twiddle
    * stride half as large. */
315  step >>= 1;
316  n4 <<= 1;
317  }
318 }
319 
320 /**
321  * MDCT/IMDCT transforms.
322  */
323 
/**
 * Half-length inverse MDCT.
 *
 * With n = 1 << s->mdct_bits, the function treats output as an array of
 * FFTComplex and proceeds in three steps:
 *  1. pre-rotation: input[0 .. n/2 - 1] is combined with s->tcos / s->tsin and
 *     stored at the positions given by s->revtab;
 *  2. s->fft_calc() is run on that array;
 *  3. post-rotation: elements z[0 .. n/4 - 1] are rewritten in place, again
 *     using s->tcos / s->tsin.
 *
 * @param s      context supplying mdct_bits, revtab, tcos, tsin and fft_calc
 * @param output destination buffer, accessed as FFTComplex
 * @param input  source samples; indices 0 .. n/2 - 1 are read
 */
324 static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
325 {
326  int k, n8, n4, n2, n, j;
327  const uint16_t *revtab = s->revtab;
328  const FFTSample *tcos = s->tcos;
329  const FFTSample *tsin = s->tsin;
330  const FFTSample *in1, *in2, *in3, *in4;
331  FFTComplex *z = (FFTComplex *)output;
332 
333  int j1;
334  const float *tcos1, *tsin1, *tcos2, *tsin2;
335  float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
336  temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
337  FFTComplex *z1, *z2;
338 
339  n = 1 << s->mdct_bits;
340  n2 = n >> 1;
341  n4 = n >> 2;
342  n8 = n >> 3;
343 
344  /* pre rotation */
   /* in1/in3 walk the even-indexed inputs upwards (starting at 0 and 2),
    * in2/in4 walk the odd-indexed inputs downwards (starting at n2 - 1 and
    * n2 - 3); each pointer moves by four samples per iteration. */
345  in1 = input;
346  in2 = input + n2 - 1;
347  in3 = input + 2;
348  in4 = input + n2 - 3;
349 
350  tcos1 = tcos;
351  tsin1 = tsin;
352 
353  /* n4 = 64 or 128 */
   /* NOTE(review): the sizes quoted above are not checked here; the loop
    * itself only relies on n4 being even, since every iteration consumes
    * revtab[k] and revtab[k + 1]. */
354  for(k = 0; k < n4; k += 2) {
355  j = revtab[k ];
356  j1 = revtab[k + 1];
357 
   /* Two rotations per iteration: (temp9, temp10) from in2[0], in1[0] with
    * tcos1[0], tsin1[0], and (temp11, temp12) from in4[0], in3[0] with
    * tcos1[1], tsin1[1].
    * NOTE(review): presumably temp9 = in2*cos - in1*sin and
    * temp10 = in2*sin + in1*cos (likewise for temp11/temp12) -- confirm the
    * nmsub.s / madd.s operand semantics against the MIPS ISA manual. */
358  __asm__ volatile (
359  "lwc1 %[temp1], 0(%[in2]) \t\n"
360  "lwc1 %[temp2], 0(%[tcos1]) \t\n"
361  "lwc1 %[temp3], 0(%[tsin1]) \t\n"
362  "lwc1 %[temp4], 0(%[in1]) \t\n"
363  "lwc1 %[temp5], 0(%[in4]) \t\n"
364  "mul.s %[temp9], %[temp1], %[temp2] \t\n"
365  "mul.s %[temp10], %[temp1], %[temp3] \t\n"
366  "lwc1 %[temp6], 4(%[tcos1]) \t\n"
367  "lwc1 %[temp7], 4(%[tsin1]) \t\n"
368  "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
369  "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
370  "mul.s %[temp11], %[temp5], %[temp6] \t\n"
371  "mul.s %[temp12], %[temp5], %[temp7] \t\n"
372  "lwc1 %[temp8], 0(%[in3]) \t\n"
   /* Pointer updates for the next iteration are done inside the asm: the
    * table pointers advance by 8 bytes, the input pointers by +/-16 bytes. */
373  PTR_ADDIU " %[tcos1], %[tcos1], 8 \t\n"
374  PTR_ADDIU " %[tsin1], %[tsin1], 8 \t\n"
375  PTR_ADDIU " %[in1], %[in1], 16 \t\n"
376  "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
377  "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
378  PTR_ADDIU " %[in2], %[in2], -16 \t\n"
379  PTR_ADDIU " %[in3], %[in3], 16 \t\n"
380  PTR_ADDIU " %[in4], %[in4], -16 \t\n"
381 
   /* The six pointers are "+r" operands because the asm modifies them and
    * the updated values are carried into the next iteration. */
382  : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
383  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
384  [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
385  [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
386  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
387  [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
388  [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
389  [in1]"+r"(in1), [in2]"+r"(in2),
390  [in3]"+r"(in3), [in4]"+r"(in4)
391  :
392  : "memory"
393  );
394 
   /* Scatter the two results to the positions read from revtab. */
395  z[j ].re = temp9;
396  z[j ].im = temp10;
397  z[j1].re = temp11;
398  z[j1].im = temp12;
399  }
400 
   /* Transform the pre-rotated data in place through the context's hook. */
401  s->fft_calc(s, z);
402 
403  /* post rotation + reordering */
404  /* n8 = 32 or 64 */
   /* Each iteration handles two elements just below the midpoint n8 (z1)
    * and the two mirrored elements just above it (z2), with the matching
    * table entries. */
405  for(k = 0; k < n8; k += 2) {
406  tcos1 = &tcos[n8 - k - 2];
407  tsin1 = &tsin[n8 - k - 2];
408  tcos2 = &tcos[n8 + k];
409  tsin2 = &tsin[n8 + k];
410  z1 = &z[n8 - k - 2];
411  z2 = &z[n8 + k ];
412 
   /* Four rotations of the same form as in the pre-rotation:
    *   (temp9,  temp10) from z1[1] with tsin1[1], tcos1[1]
    *   (temp11, temp12) from z1[0] with tsin1[0], tcos1[0]
    *   (temp13, temp14) from z2[0] with tsin2[0], tcos2[0]
    *   (temp15, temp16) from z2[1] with tsin2[1], tcos2[1]
    * All loads happen inside this asm statement, before the C stores
    * below overwrite z1 and z2. */
413  __asm__ volatile (
414  "lwc1 %[temp1], 12(%[z1]) \t\n"
415  "lwc1 %[temp2], 4(%[tsin1]) \t\n"
416  "lwc1 %[temp3], 4(%[tcos1]) \t\n"
417  "lwc1 %[temp4], 8(%[z1]) \t\n"
418  "lwc1 %[temp5], 4(%[z1]) \t\n"
419  "mul.s %[temp9], %[temp1], %[temp2] \t\n"
420  "mul.s %[temp10], %[temp1], %[temp3] \t\n"
421  "lwc1 %[temp6], 0(%[tsin1]) \t\n"
422  "lwc1 %[temp7], 0(%[tcos1]) \t\n"
423  "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
424  "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
425  "mul.s %[temp11], %[temp5], %[temp6] \t\n"
426  "mul.s %[temp12], %[temp5], %[temp7] \t\n"
427  "lwc1 %[temp8], 0(%[z1]) \t\n"
428  "lwc1 %[temp1], 4(%[z2]) \t\n"
429  "lwc1 %[temp2], 0(%[tsin2]) \t\n"
430  "lwc1 %[temp3], 0(%[tcos2]) \t\n"
431  "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
432  "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
433  "mul.s %[temp13], %[temp1], %[temp2] \t\n"
434  "mul.s %[temp14], %[temp1], %[temp3] \t\n"
435  "lwc1 %[temp4], 0(%[z2]) \t\n"
436  "lwc1 %[temp5], 12(%[z2]) \t\n"
437  "lwc1 %[temp6], 4(%[tsin2]) \t\n"
438  "lwc1 %[temp7], 4(%[tcos2]) \t\n"
439  "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
440  "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
441  "mul.s %[temp15], %[temp5], %[temp6] \t\n"
442  "mul.s %[temp16], %[temp5], %[temp7] \t\n"
443  "lwc1 %[temp8], 8(%[z2]) \t\n"
444  "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
445  "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
446  : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
447  [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
448  [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
449  [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
450  [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
451  [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
452  [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
453  [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
454  : [z1]"r"(z1), [z2]"r"(z2),
455  [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
456  [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
457  : "memory"
458  );
459 
   /* Crosswise store (the "reordering"): the first result of each rotation
    * becomes the .re of the element it was computed from, while the second
    * result becomes the .im of the mirrored element (z1[1] <-> z2[0],
    * z1[0] <-> z2[1]). */
460  z1[1].re = temp9;
461  z1[1].im = temp14;
462  z2[0].re = temp13;
463  z2[0].im = temp10;
464 
465  z1[0].re = temp11;
466  z1[0].im = temp16;
467  z2[1].re = temp15;
468  z2[1].im = temp12;
469  }
470 }
471 
472 /**
473  * Compute inverse MDCT of size N = 2^nbits
474  * @param output N samples
475  * @param input N/2 samples
476  */
477 static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
478 {
479  int k;
480  int n = 1 << s->mdct_bits;
481  int n2 = n >> 1;
482  int n4 = n >> 2;
483 
484  ff_imdct_half_mips(s, output+n4, input);
485 
486  for(k = 0; k < n4; k+=4) {
487  output[k] = -output[n2-k-1];
488  output[k+1] = -output[n2-k-2];
489  output[k+2] = -output[n2-k-3];
490  output[k+3] = -output[n2-k-4];
491 
492  output[n-k-1] = output[n2+k];
493  output[n-k-2] = output[n2+k+1];
494  output[n-k-3] = output[n2+k+2];
495  output[n-k-4] = output[n2+k+3];
496  }
497 }
498 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
499 #endif /* HAVE_INLINE_ASM */
500 
502 {
503  int n=0;
504 
505  ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n);
507 
508 #if HAVE_INLINE_ASM
509 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
510  s->fft_calc = ff_fft_calc_mips;
511 #if CONFIG_MDCT
512  s->imdct_calc = ff_imdct_calc_mips;
513  s->imdct_half = ff_imdct_half_mips;
514 #endif
515 #endif
516 #endif
517 }
ff_init_ff_cos_tabs
#define ff_init_ff_cos_tabs
Definition: fft.h:141
n
int n
Definition: avisynth_c.h:760
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
ff_fft_init_mips
av_cold void ff_fft_init_mips(FFTContext *s)
FFT transform.
Definition: fft_mips.c:501
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
asmdefs.h
MAX_FFT_SIZE
#define MAX_FFT_SIZE
Definition: fft_table.h:60
av_cold
#define av_cold
Definition: attributes.h:84
s
#define s(width, name)
Definition: cbs_vp9.c:257
ff_fft_lut_init
void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
Definition: fft_init_table.c:317
MAX_LOG2_NFFT
#define MAX_LOG2_NFFT
Specifies maximum allowed fft size.
Definition: fft_table.h:59
FFTSample
float FFTSample
Definition: avfft.h:35
fft_table.h
FFTComplex::im
FFTSample im
Definition: avfft.h:38
FFTComplex::re
FFTSample re
Definition: avfft.h:38
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
FFTContext
Definition: fft.h:88
ff_fft_offsets_lut
uint16_t ff_fft_offsets_lut[21845]
Definition: fft_init_table.c:315
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
fft.h
config.h
temp
else temp
Definition: vf_mcdeint.c:256
PTR_ADDIU
#define PTR_ADDIU
Definition: asmdefs.h:48
FFTComplex
Definition: avfft.h:37