FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
aaccoder_mips.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Stanislav Ocovaj (socovaj@mips.com)
30  * Szabolcs Pal (sabolc@mips.com)
31  *
32  * AAC coefficients encoder optimized for MIPS floating-point architecture
33  *
34  * This file is part of FFmpeg.
35  *
36  * FFmpeg is free software; you can redistribute it and/or
37  * modify it under the terms of the GNU Lesser General Public
38  * License as published by the Free Software Foundation; either
39  * version 2.1 of the License, or (at your option) any later version.
40  *
41  * FFmpeg is distributed in the hope that it will be useful,
42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44  * Lesser General Public License for more details.
45  *
46  * You should have received a copy of the GNU Lesser General Public
47  * License along with FFmpeg; if not, write to the Free Software
48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49  */
50 
51 /**
52  * @file
53  * Reference: libavcodec/aaccoder.c
54  */
55 
56 #include "libavutil/libm.h"
57 
58 #include <float.h>
59 #include "libavutil/mathematics.h"
60 #include "libavcodec/avcodec.h"
61 #include "libavcodec/put_bits.h"
62 #include "libavcodec/aac.h"
63 #include "libavcodec/aacenc.h"
64 #include "libavcodec/aactab.h"
65 
66 #if HAVE_INLINE_ASM
/**
 * Per-band path record with three fields.
 *
 * NOTE(review): the field meanings below are inferred from their names only;
 * no user of this type is visible here — confirm against the code that fills it.
 */
typedef struct BandCodingPath {
    int   prev_idx; /* presumably the index of the preceding path entry */
    float cost;     /* presumably the accumulated cost of this path */
    int   run;      /* presumably the run length associated with this entry */
} BandCodingPath;
72 
/*
 * 64-entry table: entries 0..30 are 5, entries 31..62 are 10, entry 63 is 15.
 * NOTE(review): by its name this is the bit cost of a run value for long
 * windows — confirm at the point of use.
 */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
};

/*
 * 16-entry table: entries 0..6 are 3, entries 7..14 are 6, entry 15 is 9.
 * NOTE(review): by its name the short-window counterpart of the table above.
 */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
};

/*
 * Selector over the two tables above: [0] = long, [1] = short.
 * The initializer had been lost from this listing (the array was declared with
 * two elements but an empty brace list); the element order is reconstructed
 * from the table names — NOTE(review): confirm against the indexing code.
 */
static const uint8_t * const run_value_bits[2] = {
    run_value_bits_long, run_value_bits_short
};
87 
/*
 * Number of non-zero digits in each base-3 4-tuple index
 * ((q1 * 3 + q2) * 3 + q3) * 3 + q4, as built by get_band_numbits_UQUAD_mips.
 * One row below covers the nine (q3, q4) combinations of one (q1, q2) pair.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1,  1, 2, 2,  1, 2, 2,   /* q1 = 0, q2 = 0 */
    1, 2, 2,  2, 3, 3,  2, 3, 3,   /* q1 = 0, q2 = 1 */
    1, 2, 2,  2, 3, 3,  2, 3, 3,   /* q1 = 0, q2 = 2 */
    1, 2, 2,  2, 3, 3,  2, 3, 3,   /* q1 = 1, q2 = 0 */
    2, 3, 3,  3, 4, 4,  3, 4, 4,   /* q1 = 1, q2 = 1 */
    2, 3, 3,  3, 4, 4,  3, 4, 4,   /* q1 = 1, q2 = 2 */
    1, 2, 2,  2, 3, 3,  2, 3, 3,   /* q1 = 2, q2 = 0 */
    2, 3, 3,  3, 4, 4,  3, 4, 4,   /* q1 = 2, q2 = 1 */
    2, 3, 3,  3, 4, 4,  3, 4, 4    /* q1 = 2, q2 = 2 */
};
99 
/*
 * 8 x 8 table: entry [a * 8 + b] is the number of non-zero values among (a, b).
 * NOTE(review): presumably indexed with the 8 * q1 + q2 pair index used for the
 * UPAIR7 codebooks — the consumer is not visible in this part of the file.
 */
static const uint8_t upair7_sign_bits[64] = {
    0,  1, 1, 1, 1, 1, 1, 1,   /* a = 0 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 1 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 2 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 3 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 4 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 5 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 6 */
    1,  2, 2, 2, 2, 2, 2, 2,   /* a = 7 */
};
110 
/*
 * 13 x 13 table: entry [a * 13 + b] is the number of non-zero values among (a, b).
 * NOTE(review): presumably indexed with the 13 * q1 + q2 pair index used for the
 * UPAIR12 codebooks — the consumer is not visible in this part of the file.
 */
static const uint8_t upair12_sign_bits[169] = {
    0,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,   /* a =  0 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  1 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  2 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  3 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  4 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  5 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  6 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  7 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  8 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  9 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 10 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 11 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2    /* a = 12 */
};
126 
/*
 * 17 x 17 table: entry [a * 17 + b] is the number of non-zero values among (a, b).
 * NOTE(review): presumably indexed with the 17 * q1 + q2 pair index used for the
 * escape codebook — the consumer is not visible in this part of the file.
 */
static const uint8_t esc_sign_bits[289] = {
    0,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,   /* a =  0 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  1 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  2 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  3 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  4 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  5 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  6 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  7 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  8 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a =  9 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 10 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 11 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 12 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 13 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 14 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,   /* a = 15 */
    1,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2    /* a = 16 */
};
146 
147 static void abs_pow34_v(float *out, const float *in, const int size) {
148 #ifndef USE_REALLY_FULL_SEARCH
149  int i;
150  float a, b, c, d;
151  float ax, bx, cx, dx;
152 
153  for (i = 0; i < size; i += 4) {
154  a = fabsf(in[i ]);
155  b = fabsf(in[i+1]);
156  c = fabsf(in[i+2]);
157  d = fabsf(in[i+3]);
158 
159  ax = sqrtf(a);
160  bx = sqrtf(b);
161  cx = sqrtf(c);
162  dx = sqrtf(d);
163 
164  a = a * ax;
165  b = b * bx;
166  c = c * cx;
167  d = d * dx;
168 
169  out[i ] = sqrtf(a);
170  out[i+1] = sqrtf(b);
171  out[i+2] = sqrtf(c);
172  out[i+3] = sqrtf(d);
173  }
174 #endif /* USE_REALLY_FULL_SEARCH */
175 }
176 
/**
 * Return the largest value found in the first swb_size entries of each of
 * group_len rows of scaled, where consecutive rows are 128 floats apart.
 * The result is never below 0.0f, the starting value of the search.
 *
 * @param group_len number of rows to scan
 * @param swb_size  number of entries scanned per row
 * @param scaled    base of the data, addressed as scaled[row * 128 + column]
 * @return the maximum found, or 0.0f if nothing larger was seen
 */
static float find_max_val(int group_len, int swb_size, const float *scaled) {
    float peak = 0.0f;
    int row = 0;

    while (row < group_len) {
        int col = 0;

        while (col < swb_size) {
            const float candidate = scaled[row * 128 + col];

            /* keep the running peak only when it is strictly greater */
            if (!(peak > candidate))
                peak = candidate;
            col++;
        }
        row++;
    }
    return peak;
}
187 
188 static int find_min_book(float maxval, int sf) {
190  float Q34 = sqrtf(Q * sqrtf(Q));
191  int qmaxval, cb;
192  qmaxval = maxval * Q34 + 0.4054f;
193  if (qmaxval == 0) cb = 0;
194  else if (qmaxval == 1) cb = 1;
195  else if (qmaxval == 2) cb = 3;
196  else if (qmaxval <= 4) cb = 5;
197  else if (qmaxval <= 7) cb = 7;
198  else if (qmaxval <= 12) cb = 9;
199  else cb = 11;
200  return cb;
201 }
202 
203 /**
204  * Functions developed from template function and optimized for quantizing and encoding band
205  */
/**
 * Quantize a band and write it using a 4-tuple codebook with signed indices.
 *
 * Coefficients are handled four at a time: each value of |in|^(3/4) is
 * multiplied by Q34, offset by 0.4054 and truncated to int, reduced to 0 or 1,
 * negated when the matching input sample has its sign bit set, and the four
 * results are combined into one base-3 table index (offset by 40).
 *
 * @param s         encoder context; s->scoefs receives |in|^(3/4)
 * @param pb        bit writer the codewords are appended to
 * @param in        input coefficients (also read as raw 32-bit words for the sign)
 * @param scaled    value passed in is ignored; replaced by s->scoefs
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the tables of entry cb-1 are used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 */
206 static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
207  PutBitContext *pb, const float *in,
208  const float *scaled, int size, int scale_idx,
209  int cb, const float lambda, const float uplim,
210  int *bits)
211 {
212  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
213  int i;
214  int qc1, qc2, qc3, qc4;
215 
216  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
217  uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
218 
219  abs_pow34_v(s->scoefs, in, size);
220  scaled = s->scoefs;
221  for (i = 0; i < size; i += 4) {
222  int curidx;
/* the inputs are only accessed as integer words inside the asm below */
223  int *in_int = (int *)&in[i];
224  int t0, t1, t2, t3, t4, t5, t6, t7;
225 
226  qc1 = scaled[i ] * Q34 + 0.4054f;
227  qc2 = scaled[i+1] * Q34 + 0.4054f;
228  qc3 = scaled[i+2] * Q34 + 0.4054f;
229  qc4 = scaled[i+3] * Q34 + 0.4054f;
230 
231  __asm__ volatile (
232  ".set push \n\t"
233  ".set noreorder \n\t"
234 
/* qcN = (qcN > 0) */
235  "slt %[qc1], $zero, %[qc1] \n\t"
236  "slt %[qc2], $zero, %[qc2] \n\t"
237  "slt %[qc3], $zero, %[qc3] \n\t"
238  "slt %[qc4], $zero, %[qc4] \n\t"
/* t0..t3 = top (sign) bit of the four input words */
239  "lw %[t0], 0(%[in_int]) \n\t"
240  "lw %[t1], 4(%[in_int]) \n\t"
241  "lw %[t2], 8(%[in_int]) \n\t"
242  "lw %[t3], 12(%[in_int]) \n\t"
243  "srl %[t0], %[t0], 31 \n\t"
244  "srl %[t1], %[t1], 31 \n\t"
245  "srl %[t2], %[t2], 31 \n\t"
246  "srl %[t3], %[t3], 31 \n\t"
/* replace qcN by -qcN where the sign bit was set */
247  "subu %[t4], $zero, %[qc1] \n\t"
248  "subu %[t5], $zero, %[qc2] \n\t"
249  "subu %[t6], $zero, %[qc3] \n\t"
250  "subu %[t7], $zero, %[qc4] \n\t"
251  "movn %[qc1], %[t4], %[t0] \n\t"
252  "movn %[qc2], %[t5], %[t1] \n\t"
253  "movn %[qc3], %[t6], %[t2] \n\t"
254  "movn %[qc4], %[t7], %[t3] \n\t"
255 
256  ".set pop \n\t"
257 
258  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
259  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
260  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
261  [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
262  : [in_int]"r"(in_int)
263  : "memory"
264  );
265 
/* base-3 index of the four values in -1..1; 40 = 27 + 9 + 3 + 1 maps all-zero to the middle */
266  curidx = qc1;
267  curidx *= 3;
268  curidx += qc2;
269  curidx *= 3;
270  curidx += qc3;
271  curidx *= 3;
272  curidx += qc4;
273  curidx += 40;
274 
275  put_bits(pb, p_bits[curidx], p_codes[curidx]);
276  }
277 }
278 
/**
 * Quantize a band and write it using a 4-tuple codebook with unsigned indices
 * followed by explicit sign bits.
 *
 * Coefficients are handled four at a time: each value of |in|^(3/4) is
 * multiplied by Q34, offset by 0.4054, truncated to int and limited to 2.
 * The four results form a base-3 table index; one sign bit per non-zero
 * result is appended below the codeword.
 *
 * @param s         encoder context; s->scoefs receives |in|^(3/4)
 * @param pb        bit writer the codewords are appended to
 * @param in        input coefficients (also read as raw 32-bit words for the sign)
 * @param scaled    value passed in is ignored; replaced by s->scoefs
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the tables of entry cb-1 are used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 */
279 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
280  PutBitContext *pb, const float *in,
281  const float *scaled, int size, int scale_idx,
282  int cb, const float lambda, const float uplim,
283  int *bits)
284 {
285  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
286  int i;
287  int qc1, qc2, qc3, qc4;
288 
289  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
290  uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
291 
292  abs_pow34_v(s->scoefs, in, size);
293  scaled = s->scoefs;
294  for (i = 0; i < size; i += 4) {
295  int curidx, sign, count;
/* the inputs are only accessed as integer words inside the asm below */
296  int *in_int = (int *)&in[i];
297  uint8_t v_bits;
298  unsigned int v_codes;
299  int t0, t1, t2, t3, t4;
300 
301  qc1 = scaled[i ] * Q34 + 0.4054f;
302  qc2 = scaled[i+1] * Q34 + 0.4054f;
303  qc3 = scaled[i+2] * Q34 + 0.4054f;
304  qc4 = scaled[i+3] * Q34 + 0.4054f;
305 
306  __asm__ volatile (
307  ".set push \n\t"
308  ".set noreorder \n\t"
309 
/* limit qcN to 2; start with an empty sign word */
310  "ori %[t4], $zero, 2 \n\t"
311  "ori %[sign], $zero, 0 \n\t"
312  "slt %[t0], %[t4], %[qc1] \n\t"
313  "slt %[t1], %[t4], %[qc2] \n\t"
314  "slt %[t2], %[t4], %[qc3] \n\t"
315  "slt %[t3], %[t4], %[qc4] \n\t"
316  "movn %[qc1], %[t4], %[t0] \n\t"
317  "movn %[qc2], %[t4], %[t1] \n\t"
318  "movn %[qc3], %[t4], %[t2] \n\t"
319  "movn %[qc4], %[t4], %[t3] \n\t"
/* tN = (input word N < 0); for every non-zero qcN shift sign left and append tN; */
/* count = number of non-zero qcN (interleaved with the sign updates) */
320  "lw %[t0], 0(%[in_int]) \n\t"
321  "lw %[t1], 4(%[in_int]) \n\t"
322  "lw %[t2], 8(%[in_int]) \n\t"
323  "lw %[t3], 12(%[in_int]) \n\t"
324  "slt %[t0], %[t0], $zero \n\t"
325  "movn %[sign], %[t0], %[qc1] \n\t"
326  "slt %[t1], %[t1], $zero \n\t"
327  "slt %[t2], %[t2], $zero \n\t"
328  "slt %[t3], %[t3], $zero \n\t"
329  "sll %[t0], %[sign], 1 \n\t"
330  "or %[t0], %[t0], %[t1] \n\t"
331  "movn %[sign], %[t0], %[qc2] \n\t"
332  "slt %[t4], $zero, %[qc1] \n\t"
333  "slt %[t1], $zero, %[qc2] \n\t"
334  "slt %[count], $zero, %[qc3] \n\t"
335  "sll %[t0], %[sign], 1 \n\t"
336  "or %[t0], %[t0], %[t2] \n\t"
337  "movn %[sign], %[t0], %[qc3] \n\t"
338  "slt %[t2], $zero, %[qc4] \n\t"
339  "addu %[count], %[count], %[t4] \n\t"
340  "addu %[count], %[count], %[t1] \n\t"
341  "sll %[t0], %[sign], 1 \n\t"
342  "or %[t0], %[t0], %[t3] \n\t"
343  "movn %[sign], %[t0], %[qc4] \n\t"
344  "addu %[count], %[count], %[t2] \n\t"
345 
346  ".set pop \n\t"
347 
348  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
349  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
350  [sign]"=&r"(sign), [count]"=&r"(count),
351  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
352  [t4]"=&r"(t4)
353  : [in_int]"r"(in_int)
354  : "memory"
355  );
356 
/* base-3 index of the four values in 0..2 */
357  curidx = qc1;
358  curidx *= 3;
359  curidx += qc2;
360  curidx *= 3;
361  curidx += qc3;
362  curidx *= 3;
363  curidx += qc4;
364 
/* codeword in the upper bits, the count sign bits below it */
365  v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
366  v_bits = p_bits[curidx] + count;
367  put_bits(pb, v_bits, v_codes);
368  }
369 }
370 
/**
 * Quantize a band and write it using a pair codebook with signed indices.
 *
 * Coefficients are handled four at a time (two pairs): each value of
 * |in|^(3/4) is multiplied by Q34, offset by 0.4054, truncated to int,
 * limited to 4 and negated when the matching input sample has its sign bit
 * set. Each pair gives the table index 9 * first + second + 40; the two
 * codewords are concatenated and written with a single put_bits() call.
 *
 * @param s         encoder context; s->scoefs receives |in|^(3/4)
 * @param pb        bit writer the codewords are appended to
 * @param in        input coefficients (also read as raw 32-bit words for the sign)
 * @param scaled    value passed in is ignored; replaced by s->scoefs
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the tables of entry cb-1 are used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 */
371 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
372  PutBitContext *pb, const float *in,
373  const float *scaled, int size, int scale_idx,
374  int cb, const float lambda, const float uplim,
375  int *bits)
376 {
377  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
378  int i;
379  int qc1, qc2, qc3, qc4;
380 
381  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
382  uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
383 
384  abs_pow34_v(s->scoefs, in, size);
385  scaled = s->scoefs;
386  for (i = 0; i < size; i += 4) {
387  int curidx, curidx2;
/* the inputs are only accessed as integer words inside the asm below */
388  int *in_int = (int *)&in[i];
389  uint8_t v_bits;
390  unsigned int v_codes;
391  int t0, t1, t2, t3, t4, t5, t6, t7;
392 
393  qc1 = scaled[i ] * Q34 + 0.4054f;
394  qc2 = scaled[i+1] * Q34 + 0.4054f;
395  qc3 = scaled[i+2] * Q34 + 0.4054f;
396  qc4 = scaled[i+3] * Q34 + 0.4054f;
397 
398  __asm__ volatile (
399  ".set push \n\t"
400  ".set noreorder \n\t"
401 
/* limit qcN to 4 */
402  "ori %[t4], $zero, 4 \n\t"
403  "slt %[t0], %[t4], %[qc1] \n\t"
404  "slt %[t1], %[t4], %[qc2] \n\t"
405  "slt %[t2], %[t4], %[qc3] \n\t"
406  "slt %[t3], %[t4], %[qc4] \n\t"
407  "movn %[qc1], %[t4], %[t0] \n\t"
408  "movn %[qc2], %[t4], %[t1] \n\t"
409  "movn %[qc3], %[t4], %[t2] \n\t"
410  "movn %[qc4], %[t4], %[t3] \n\t"
/* t0..t3 = top (sign) bit of the four input words */
411  "lw %[t0], 0(%[in_int]) \n\t"
412  "lw %[t1], 4(%[in_int]) \n\t"
413  "lw %[t2], 8(%[in_int]) \n\t"
414  "lw %[t3], 12(%[in_int]) \n\t"
415  "srl %[t0], %[t0], 31 \n\t"
416  "srl %[t1], %[t1], 31 \n\t"
417  "srl %[t2], %[t2], 31 \n\t"
418  "srl %[t3], %[t3], 31 \n\t"
/* replace qcN by -qcN where the sign bit was set */
419  "subu %[t4], $zero, %[qc1] \n\t"
420  "subu %[t5], $zero, %[qc2] \n\t"
421  "subu %[t6], $zero, %[qc3] \n\t"
422  "subu %[t7], $zero, %[qc4] \n\t"
423  "movn %[qc1], %[t4], %[t0] \n\t"
424  "movn %[qc2], %[t5], %[t1] \n\t"
425  "movn %[qc3], %[t6], %[t2] \n\t"
426  "movn %[qc4], %[t7], %[t3] \n\t"
427 
428  ".set pop \n\t"
429 
430  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
431  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
432  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
433  [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
434  : [in_int]"r"(in_int)
435  : "memory"
436  );
437 
/* base-9 index of a pair in -4..4; 40 = 9 * 4 + 4 maps (0, 0) to the middle */
438  curidx = 9 * qc1;
439  curidx += qc2 + 40;
440 
441  curidx2 = 9 * qc3;
442  curidx2 += qc4 + 40;
443 
/* first pair's codeword above the second pair's codeword, written in one call */
444  v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
445  v_bits = p_bits[curidx] + p_bits[curidx2];
446  put_bits(pb, v_bits, v_codes);
447  }
448 }
449 
450 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
451  PutBitContext *pb, const float *in,
452  const float *scaled, int size, int scale_idx,
453  int cb, const float lambda, const float uplim,
454  int *bits)
455 {
456  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
457  int i;
458  int qc1, qc2, qc3, qc4;
459 
460  uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
461  uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
462 
463  abs_pow34_v(s->scoefs, in, size);
464  scaled = s->scoefs;
465  for (i = 0; i < size; i += 4) {
466  int curidx, sign1, count1, sign2, count2;
467  int *in_int = (int *)&in[i];
468  uint8_t v_bits;
469  unsigned int v_codes;
470  int t0, t1, t2, t3, t4;
471 
472  qc1 = scaled[i ] * Q34 + 0.4054f;
473  qc2 = scaled[i+1] * Q34 + 0.4054f;
474  qc3 = scaled[i+2] * Q34 + 0.4054f;
475  qc4 = scaled[i+3] * Q34 + 0.4054f;
476 
477  __asm__ volatile (
478  ".set push \n\t"
479  ".set noreorder \n\t"
480 
481  "ori %[t4], $zero, 7 \n\t"
482  "ori %[sign1], $zero, 0 \n\t"
483  "ori %[sign2], $zero, 0 \n\t"
484  "slt %[t0], %[t4], %[qc1] \n\t"
485  "slt %[t1], %[t4], %[qc2] \n\t"
486  "slt %[t2], %[t4], %[qc3] \n\t"
487  "slt %[t3], %[t4], %[qc4] \n\t"
488  "movn %[qc1], %[t4], %[t0] \n\t"
489  "movn %[qc2], %[t4], %[t1] \n\t"
490  "movn %[qc3], %[t4], %[t2] \n\t"
491  "movn %[qc4], %[t4], %[t3] \n\t"
492  "lw %[t0], 0(%[in_int]) \n\t"
493  "lw %[t1], 4(%[in_int]) \n\t"
494  "lw %[t2], 8(%[in_int]) \n\t"
495  "lw %[t3], 12(%[in_int]) \n\t"
496  "slt %[t0], %[t0], $zero \n\t"
497  "movn %[sign1], %[t0], %[qc1] \n\t"
498  "slt %[t2], %[t2], $zero \n\t"
499  "movn %[sign2], %[t2], %[qc3] \n\t"
500  "slt %[t1], %[t1], $zero \n\t"
501  "sll %[t0], %[sign1], 1 \n\t"
502  "or %[t0], %[t0], %[t1] \n\t"
503  "movn %[sign1], %[t0], %[qc2] \n\t"
504  "slt %[t3], %[t3], $zero \n\t"
505  "sll %[t0], %[sign2], 1 \n\t"
506  "or %[t0], %[t0], %[t3] \n\t"
507  "movn %[sign2], %[t0], %[qc4] \n\t"
508  "slt %[count1], $zero, %[qc1] \n\t"
509  "slt %[t1], $zero, %[qc2] \n\t"
510  "slt %[count2], $zero, %[qc3] \n\t"
511  "slt %[t2], $zero, %[qc4] \n\t"
512  "addu %[count1], %[count1], %[t1] \n\t"
513  "addu %[count2], %[count2], %[t2] \n\t"
514 
515  ".set pop \n\t"
516 
517  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
518  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
519  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
520  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
521  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
522  [t4]"=&r"(t4)
523  : [in_int]"r"(in_int)
524  : "t0", "t1", "t2", "t3", "t4",
525  "memory"
526  );
527 
528  curidx = 8 * qc1;
529  curidx += qc2;
530 
531  v_codes = (p_codes[curidx] << count1) | sign1;
532  v_bits = p_bits[curidx] + count1;
533  put_bits(pb, v_bits, v_codes);
534 
535  curidx = 8 * qc3;
536  curidx += qc4;
537 
538  v_codes = (p_codes[curidx] << count2) | sign2;
539  v_bits = p_bits[curidx] + count2;
540  put_bits(pb, v_bits, v_codes);
541  }
542 }
543 
/**
 * Quantize a band and write it using a pair codebook with unsigned indices
 * (maximum value 12) followed by explicit sign bits.
 *
 * Coefficients are handled four at a time (two pairs): each value of
 * |in|^(3/4) is multiplied by Q34, offset by 0.4054, truncated to int and
 * limited to 12. Each pair gives the table index 13 * first + second; one
 * sign bit per non-zero value of the pair is appended below its codeword.
 *
 * @param s         encoder context; s->scoefs receives |in|^(3/4)
 * @param pb        bit writer the codewords are appended to
 * @param in        input coefficients (also read as raw 32-bit words for the sign)
 * @param scaled    value passed in is ignored; replaced by s->scoefs
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the tables of entry cb-1 are used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 */
544 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
545  PutBitContext *pb, const float *in,
546  const float *scaled, int size, int scale_idx,
547  int cb, const float lambda, const float uplim,
548  int *bits)
549 {
550  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
551  int i;
552  int qc1, qc2, qc3, qc4;
553 
554  uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
555  uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
556 
557  abs_pow34_v(s->scoefs, in, size);
558  scaled = s->scoefs;
559  for (i = 0; i < size; i += 4) {
560  int curidx, sign1, count1, sign2, count2;
/* the inputs are only accessed as integer words inside the asm below */
561  int *in_int = (int *)&in[i];
562  uint8_t v_bits;
563  unsigned int v_codes;
564  int t0, t1, t2, t3, t4;
565 
566  qc1 = scaled[i ] * Q34 + 0.4054f;
567  qc2 = scaled[i+1] * Q34 + 0.4054f;
568  qc3 = scaled[i+2] * Q34 + 0.4054f;
569  qc4 = scaled[i+3] * Q34 + 0.4054f;
570 
571  __asm__ volatile (
572  ".set push \n\t"
573  ".set noreorder \n\t"
574 
/* limit qcN to 12; start with empty sign words */
575  "ori %[t4], $zero, 12 \n\t"
576  "ori %[sign1], $zero, 0 \n\t"
577  "ori %[sign2], $zero, 0 \n\t"
578  "slt %[t0], %[t4], %[qc1] \n\t"
579  "slt %[t1], %[t4], %[qc2] \n\t"
580  "slt %[t2], %[t4], %[qc3] \n\t"
581  "slt %[t3], %[t4], %[qc4] \n\t"
582  "movn %[qc1], %[t4], %[t0] \n\t"
583  "movn %[qc2], %[t4], %[t1] \n\t"
584  "movn %[qc3], %[t4], %[t2] \n\t"
585  "movn %[qc4], %[t4], %[t3] \n\t"
/* tN = (input word N < 0); per pair, append tN to the sign word for every non-zero qcN */
586  "lw %[t0], 0(%[in_int]) \n\t"
587  "lw %[t1], 4(%[in_int]) \n\t"
588  "lw %[t2], 8(%[in_int]) \n\t"
589  "lw %[t3], 12(%[in_int]) \n\t"
590  "slt %[t0], %[t0], $zero \n\t"
591  "movn %[sign1], %[t0], %[qc1] \n\t"
592  "slt %[t2], %[t2], $zero \n\t"
593  "movn %[sign2], %[t2], %[qc3] \n\t"
594  "slt %[t1], %[t1], $zero \n\t"
595  "sll %[t0], %[sign1], 1 \n\t"
596  "or %[t0], %[t0], %[t1] \n\t"
597  "movn %[sign1], %[t0], %[qc2] \n\t"
598  "slt %[t3], %[t3], $zero \n\t"
599  "sll %[t0], %[sign2], 1 \n\t"
600  "or %[t0], %[t0], %[t3] \n\t"
601  "movn %[sign2], %[t0], %[qc4] \n\t"
/* countN = number of non-zero values in pair N */
602  "slt %[count1], $zero, %[qc1] \n\t"
603  "slt %[t1], $zero, %[qc2] \n\t"
604  "slt %[count2], $zero, %[qc3] \n\t"
605  "slt %[t2], $zero, %[qc4] \n\t"
606  "addu %[count1], %[count1], %[t1] \n\t"
607  "addu %[count2], %[count2], %[t2] \n\t"
608 
609  ".set pop \n\t"
610 
611  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
612  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
613  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
614  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
615  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
616  [t4]"=&r"(t4)
617  : [in_int]"r"(in_int)
618  : "memory"
619  );
620 
/* first pair: codeword in the upper bits, its sign bits below */
621  curidx = 13 * qc1;
622  curidx += qc2;
623 
624  v_codes = (p_codes[curidx] << count1) | sign1;
625  v_bits = p_bits[curidx] + count1;
626  put_bits(pb, v_bits, v_codes);
627 
/* second pair */
628  curidx = 13 * qc3;
629  curidx += qc4;
630 
631  v_codes = (p_codes[curidx] << count2) | sign2;
632  v_bits = p_bits[curidx] + count2;
633  put_bits(pb, v_bits, v_codes);
634  }
635 }
636 
/**
 * Quantize a band and write it using a pair codebook with unsigned indices
 * (maximum value 16), explicit sign bits and, for cb >= 11, escape sequences.
 *
 * Coefficients are handled four at a time (two pairs): each value of
 * |in|^(3/4) is multiplied by Q34, offset by 0.4054, truncated to int and
 * limited to 16. Each pair gives the table index 17 * first + second; one
 * sign bit per non-zero value of the pair is appended below its codeword.
 * In the cb >= 11 branch an escape sequence is written after a pair's
 * codeword for every value whose codebook vector entry equals 64.0f.
 *
 * The cb < 11 branch is not reached through quantize_and_encode_band_cost_arr,
 * which maps only codebook 11 to this function.
 *
 * @param s         encoder context; s->scoefs receives |in|^(3/4)
 * @param pb        bit writer the codewords are appended to
 * @param in        input coefficients (also read as raw 32-bit words for the sign)
 * @param scaled    value passed in is ignored; replaced by s->scoefs
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the tables of entry cb-1 are used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 */
637 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
638  PutBitContext *pb, const float *in,
639  const float *scaled, int size, int scale_idx,
640  int cb, const float lambda, const float uplim,
641  int *bits)
642 {
643  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
644  int i;
645  int qc1, qc2, qc3, qc4;
646 
647  uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1];
648  uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
649  float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1];
650 
651  abs_pow34_v(s->scoefs, in, size);
652  scaled = s->scoefs;
653 
654  if (cb < 11) {
/* no escape sequences in this branch */
655  for (i = 0; i < size; i += 4) {
656  int curidx, curidx2, sign1, count1, sign2, count2;
657  int *in_int = (int *)&in[i];
658  uint8_t v_bits;
659  unsigned int v_codes;
660  int t0, t1, t2, t3, t4;
661 
662  qc1 = scaled[i ] * Q34 + 0.4054f;
663  qc2 = scaled[i+1] * Q34 + 0.4054f;
664  qc3 = scaled[i+2] * Q34 + 0.4054f;
665  qc4 = scaled[i+3] * Q34 + 0.4054f;
666 
667  __asm__ volatile (
668  ".set push \n\t"
669  ".set noreorder \n\t"
670 
/* limit qcN to 16; start with empty sign words */
671  "ori %[t4], $zero, 16 \n\t"
672  "ori %[sign1], $zero, 0 \n\t"
673  "ori %[sign2], $zero, 0 \n\t"
674  "slt %[t0], %[t4], %[qc1] \n\t"
675  "slt %[t1], %[t4], %[qc2] \n\t"
676  "slt %[t2], %[t4], %[qc3] \n\t"
677  "slt %[t3], %[t4], %[qc4] \n\t"
678  "movn %[qc1], %[t4], %[t0] \n\t"
679  "movn %[qc2], %[t4], %[t1] \n\t"
680  "movn %[qc3], %[t4], %[t2] \n\t"
681  "movn %[qc4], %[t4], %[t3] \n\t"
/* tN = (input word N < 0); per pair, append tN to the sign word for every non-zero qcN */
682  "lw %[t0], 0(%[in_int]) \n\t"
683  "lw %[t1], 4(%[in_int]) \n\t"
684  "lw %[t2], 8(%[in_int]) \n\t"
685  "lw %[t3], 12(%[in_int]) \n\t"
686  "slt %[t0], %[t0], $zero \n\t"
687  "movn %[sign1], %[t0], %[qc1] \n\t"
688  "slt %[t2], %[t2], $zero \n\t"
689  "movn %[sign2], %[t2], %[qc3] \n\t"
690  "slt %[t1], %[t1], $zero \n\t"
691  "sll %[t0], %[sign1], 1 \n\t"
692  "or %[t0], %[t0], %[t1] \n\t"
693  "movn %[sign1], %[t0], %[qc2] \n\t"
694  "slt %[t3], %[t3], $zero \n\t"
695  "sll %[t0], %[sign2], 1 \n\t"
696  "or %[t0], %[t0], %[t3] \n\t"
697  "movn %[sign2], %[t0], %[qc4] \n\t"
/* countN = number of non-zero values in pair N */
698  "slt %[count1], $zero, %[qc1] \n\t"
699  "slt %[t1], $zero, %[qc2] \n\t"
700  "slt %[count2], $zero, %[qc3] \n\t"
701  "slt %[t2], $zero, %[qc4] \n\t"
702  "addu %[count1], %[count1], %[t1] \n\t"
703  "addu %[count2], %[count2], %[t2] \n\t"
704 
705  ".set pop \n\t"
706 
707  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
708  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
709  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
710  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
711  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
712  [t4]"=&r"(t4)
713  : [in_int]"r"(in_int)
714  : "memory"
715  );
716 
717  curidx = 17 * qc1;
718  curidx += qc2;
719  curidx2 = 17 * qc3;
720  curidx2 += qc4;
721 
722  v_codes = (p_codes[curidx] << count1) | sign1;
723  v_bits = p_bits[curidx] + count1;
724  put_bits(pb, v_bits, v_codes);
725 
726  v_codes = (p_codes[curidx2] << count2) | sign2;
727  v_bits = p_bits[curidx2] + count2;
728  put_bits(pb, v_bits, v_codes);
729  }
730  } else {
731  for (i = 0; i < size; i += 4) {
732  int curidx, curidx2, sign1, count1, sign2, count2;
733  int *in_int = (int *)&in[i];
734  uint8_t v_bits;
735  unsigned int v_codes;
/* c1..c4 keep the quantized magnitudes before the limit to 16, for the escape sequences */
736  int c1, c2, c3, c4;
737  int t0, t1, t2, t3, t4;
738 
739  qc1 = scaled[i ] * Q34 + 0.4054f;
740  qc2 = scaled[i+1] * Q34 + 0.4054f;
741  qc3 = scaled[i+2] * Q34 + 0.4054f;
742  qc4 = scaled[i+3] * Q34 + 0.4054f;
743 
744  __asm__ volatile (
745  ".set push \n\t"
746  ".set noreorder \n\t"
747 
748  "ori %[t4], $zero, 16 \n\t"
749  "ori %[sign1], $zero, 0 \n\t"
750  "ori %[sign2], $zero, 0 \n\t"
/* cN = qcN shifted left by 18 with shll_s.w, then logically right by 18. */
/* NOTE(review): shll_s.w is presumably the saturating DSP shift, which would cap cN at 8191 (13 bits) — confirm. */
751  "shll_s.w %[c1], %[qc1], 18 \n\t"
752  "shll_s.w %[c2], %[qc2], 18 \n\t"
753  "shll_s.w %[c3], %[qc3], 18 \n\t"
754  "shll_s.w %[c4], %[qc4], 18 \n\t"
755  "srl %[c1], %[c1], 18 \n\t"
756  "srl %[c2], %[c2], 18 \n\t"
757  "srl %[c3], %[c3], 18 \n\t"
758  "srl %[c4], %[c4], 18 \n\t"
/* limit qcN to 16 for the table index */
759  "slt %[t0], %[t4], %[qc1] \n\t"
760  "slt %[t1], %[t4], %[qc2] \n\t"
761  "slt %[t2], %[t4], %[qc3] \n\t"
762  "slt %[t3], %[t4], %[qc4] \n\t"
763  "movn %[qc1], %[t4], %[t0] \n\t"
764  "movn %[qc2], %[t4], %[t1] \n\t"
765  "movn %[qc3], %[t4], %[t2] \n\t"
766  "movn %[qc4], %[t4], %[t3] \n\t"
/* tN = (input word N < 0); per pair, append tN to the sign word for every non-zero qcN */
767  "lw %[t0], 0(%[in_int]) \n\t"
768  "lw %[t1], 4(%[in_int]) \n\t"
769  "lw %[t2], 8(%[in_int]) \n\t"
770  "lw %[t3], 12(%[in_int]) \n\t"
771  "slt %[t0], %[t0], $zero \n\t"
772  "movn %[sign1], %[t0], %[qc1] \n\t"
773  "slt %[t2], %[t2], $zero \n\t"
774  "movn %[sign2], %[t2], %[qc3] \n\t"
775  "slt %[t1], %[t1], $zero \n\t"
776  "sll %[t0], %[sign1], 1 \n\t"
777  "or %[t0], %[t0], %[t1] \n\t"
778  "movn %[sign1], %[t0], %[qc2] \n\t"
779  "slt %[t3], %[t3], $zero \n\t"
780  "sll %[t0], %[sign2], 1 \n\t"
781  "or %[t0], %[t0], %[t3] \n\t"
782  "movn %[sign2], %[t0], %[qc4] \n\t"
/* countN = number of non-zero values in pair N */
783  "slt %[count1], $zero, %[qc1] \n\t"
784  "slt %[t1], $zero, %[qc2] \n\t"
785  "slt %[count2], $zero, %[qc3] \n\t"
786  "slt %[t2], $zero, %[qc4] \n\t"
787  "addu %[count1], %[count1], %[t1] \n\t"
788  "addu %[count2], %[count2], %[t2] \n\t"
789 
790  ".set pop \n\t"
791 
792  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
793  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
794  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
795  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
796  [c1]"=&r"(c1), [c2]"=&r"(c2),
797  [c3]"=&r"(c3), [c4]"=&r"(c4),
798  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
799  [t4]"=&r"(t4)
800  : [in_int]"r"(in_int)
801  : "memory"
802  );
803 
804  curidx = 17 * qc1;
805  curidx += qc2;
806 
807  curidx2 = 17 * qc3;
808  curidx2 += qc4;
809 
/* first pair: codeword plus sign bits, then its escape sequences */
810  v_codes = (p_codes[curidx] << count1) | sign1;
811  v_bits = p_bits[curidx] + count1;
812  put_bits(pb, v_bits, v_codes);
813 
/* a vector entry of 64.0f marks a value that needs an escape sequence: */
/* with len = av_log2(c), a (len - 3)-bit prefix (all ones, then a zero) and the low len bits of c, len * 2 - 3 bits in total */
814  if (p_vectors[curidx*2 ] == 64.0f) {
815  int len = av_log2(c1);
816  v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
817  put_bits(pb, len * 2 - 3, v_codes);
818  }
819  if (p_vectors[curidx*2+1] == 64.0f) {
820  int len = av_log2(c2);
821  v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
822  put_bits(pb, len*2-3, v_codes);
823  }
824 
/* second pair: codeword plus sign bits, then its escape sequences */
825  v_codes = (p_codes[curidx2] << count2) | sign2;
826  v_bits = p_bits[curidx2] + count2;
827  put_bits(pb, v_bits, v_codes);
828 
829  if (p_vectors[curidx2*2 ] == 64.0f) {
830  int len = av_log2(c3);
831  v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
832  put_bits(pb, len* 2 - 3, v_codes);
833  }
834  if (p_vectors[curidx2*2+1] == 64.0f) {
835  int len = av_log2(c4);
836  v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
837  put_bits(pb, len * 2 - 3, v_codes);
838  }
839  }
840  }
841 }
842 
843 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
844  PutBitContext *pb, const float *in,
845  const float *scaled, int size, int scale_idx,
846  int cb, const float lambda, const float uplim,
847  int *bits) = {
848  NULL,
849  quantize_and_encode_band_cost_SQUAD_mips,
850  quantize_and_encode_band_cost_SQUAD_mips,
851  quantize_and_encode_band_cost_UQUAD_mips,
852  quantize_and_encode_band_cost_UQUAD_mips,
853  quantize_and_encode_band_cost_SPAIR_mips,
854  quantize_and_encode_band_cost_SPAIR_mips,
855  quantize_and_encode_band_cost_UPAIR7_mips,
856  quantize_and_encode_band_cost_UPAIR7_mips,
857  quantize_and_encode_band_cost_UPAIR12_mips,
858  quantize_and_encode_band_cost_UPAIR12_mips,
859  quantize_and_encode_band_cost_ESC_mips,
860 };
861 
/*
 * Forward all ten arguments to the routine stored at index cb of
 * quantize_and_encode_band_cost_arr. No check is made here: cb is used
 * directly as the index, and the entry for cb == 0 is NULL.
 */
862 #define quantize_and_encode_band_cost( \
863  s, pb, in, scaled, size, scale_idx, cb, \
864  lambda, uplim, bits) \
865  quantize_and_encode_band_cost_arr[cb]( \
866  s, pb, in, scaled, size, scale_idx, cb, \
867  lambda, uplim, bits)
868 
869 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
870  const float *in, int size, int scale_idx,
871  int cb, const float lambda)
872 {
873  quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
874  INFINITY, NULL);
875 }
876 
877 /**
878  * Functions developed from template function and optimized for getting the number of bits
879  */
880 static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
881  PutBitContext *pb, const float *in,
882  const float *scaled, int size, int scale_idx,
883  int cb, const float lambda, const float uplim,
884  int *bits)
885 {
886  return 0;
887 }
888 
/**
 * Count the bits needed to code a band with a 4-tuple codebook with signed
 * indices, without writing anything.
 *
 * Uses the same quantization and table index as
 * quantize_and_encode_band_cost_SQUAD_mips, but sums the codeword lengths
 * instead of emitting the codewords.
 *
 * @param s         unused
 * @param pb        unused
 * @param in        input coefficients (read as raw 32-bit words for the sign)
 * @param scaled    values that are quantized; read directly as passed in
 *                  (presumably |in|^(3/4) prepared by the caller — confirm)
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the length table of entry cb-1 is used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 * @return total number of bits, converted to float
 */
889 static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
890  PutBitContext *pb, const float *in,
891  const float *scaled, int size, int scale_idx,
892  int cb, const float lambda, const float uplim,
893  int *bits)
894 {
895  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
896  int i;
897  int qc1, qc2, qc3, qc4;
898  int curbits = 0;
899 
900  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
901 
902  for (i = 0; i < size; i += 4) {
903  int curidx;
/* the inputs are only accessed as integer words inside the asm below */
904  int *in_int = (int *)&in[i];
905  int t0, t1, t2, t3, t4, t5, t6, t7;
906 
907  qc1 = scaled[i ] * Q34 + 0.4054f;
908  qc2 = scaled[i+1] * Q34 + 0.4054f;
909  qc3 = scaled[i+2] * Q34 + 0.4054f;
910  qc4 = scaled[i+3] * Q34 + 0.4054f;
911 
912  __asm__ volatile (
913  ".set push \n\t"
914  ".set noreorder \n\t"
915 
/* qcN = (qcN > 0) */
916  "slt %[qc1], $zero, %[qc1] \n\t"
917  "slt %[qc2], $zero, %[qc2] \n\t"
918  "slt %[qc3], $zero, %[qc3] \n\t"
919  "slt %[qc4], $zero, %[qc4] \n\t"
/* t0..t3 = top (sign) bit of the four input words */
920  "lw %[t0], 0(%[in_int]) \n\t"
921  "lw %[t1], 4(%[in_int]) \n\t"
922  "lw %[t2], 8(%[in_int]) \n\t"
923  "lw %[t3], 12(%[in_int]) \n\t"
924  "srl %[t0], %[t0], 31 \n\t"
925  "srl %[t1], %[t1], 31 \n\t"
926  "srl %[t2], %[t2], 31 \n\t"
927  "srl %[t3], %[t3], 31 \n\t"
/* replace qcN by -qcN where the sign bit was set */
928  "subu %[t4], $zero, %[qc1] \n\t"
929  "subu %[t5], $zero, %[qc2] \n\t"
930  "subu %[t6], $zero, %[qc3] \n\t"
931  "subu %[t7], $zero, %[qc4] \n\t"
932  "movn %[qc1], %[t4], %[t0] \n\t"
933  "movn %[qc2], %[t5], %[t1] \n\t"
934  "movn %[qc3], %[t6], %[t2] \n\t"
935  "movn %[qc4], %[t7], %[t3] \n\t"
936 
937  ".set pop \n\t"
938 
939  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
940  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
941  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
942  [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
943  : [in_int]"r"(in_int)
944  : "memory"
945  );
946 
/* base-3 index of the four values in -1..1; 40 = 27 + 9 + 3 + 1 maps all-zero to the middle */
947  curidx = qc1;
948  curidx *= 3;
949  curidx += qc2;
950  curidx *= 3;
951  curidx += qc3;
952  curidx *= 3;
953  curidx += qc4;
954  curidx += 40;
955 
956  curbits += p_bits[curidx];
957  }
958  return curbits;
959 }
960 
/**
 * Count the bits needed to code a band with a 4-tuple codebook with unsigned
 * indices plus sign bits, without writing anything.
 *
 * Each value of scaled is multiplied by Q34, offset by 0.4054, truncated to
 * int and limited to 2; the four results form a base-3 table index. The
 * codeword length and the number of sign bits (from uquad_sign_bits) for that
 * index are added to the total. The input signs are not needed and in is not
 * read.
 *
 * @param s         unused
 * @param pb        unused
 * @param in        unused
 * @param scaled    values that are quantized; read directly as passed in
 *                  (presumably |in|^(3/4) prepared by the caller — confirm)
 * @param size      number of coefficients, stepped through in groups of four
 * @param scale_idx scalefactor index used to look up Q34
 * @param cb        codebook number; the length table of entry cb-1 is used
 * @param lambda    unused
 * @param uplim     unused
 * @param bits      unused
 * @return total number of bits, converted to float
 */
961 static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
962  PutBitContext *pb, const float *in,
963  const float *scaled, int size, int scale_idx,
964  int cb, const float lambda, const float uplim,
965  int *bits)
966 {
967  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
968  int i;
969  int curbits = 0;
970  int qc1, qc2, qc3, qc4;
971 
972  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
973 
974  for (i = 0; i < size; i += 4) {
975  int curidx;
976  int t0, t1, t2, t3, t4;
977 
978  qc1 = scaled[i ] * Q34 + 0.4054f;
979  qc2 = scaled[i+1] * Q34 + 0.4054f;
980  qc3 = scaled[i+2] * Q34 + 0.4054f;
981  qc4 = scaled[i+3] * Q34 + 0.4054f;
982 
/* register-only asm: no memory is accessed, hence no input operands or clobbers */
983  __asm__ volatile (
984  ".set push \n\t"
985  ".set noreorder \n\t"
986 
/* limit qcN to 2 */
987  "ori %[t4], $zero, 2 \n\t"
988  "slt %[t0], %[t4], %[qc1] \n\t"
989  "slt %[t1], %[t4], %[qc2] \n\t"
990  "slt %[t2], %[t4], %[qc3] \n\t"
991  "slt %[t3], %[t4], %[qc4] \n\t"
992  "movn %[qc1], %[t4], %[t0] \n\t"
993  "movn %[qc2], %[t4], %[t1] \n\t"
994  "movn %[qc3], %[t4], %[t2] \n\t"
995  "movn %[qc4], %[t4], %[t3] \n\t"
996 
997  ".set pop \n\t"
998 
999  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1000  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1001  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1002  [t4]"=&r"(t4)
1003  );
1004 
/* base-3 index of the four values in 0..2 */
1005  curidx = qc1;
1006  curidx *= 3;
1007  curidx += qc2;
1008  curidx *= 3;
1009  curidx += qc3;
1010  curidx *= 3;
1011  curidx += qc4;
1012 
/* codeword length plus one sign bit per non-zero value */
1013  curbits += p_bits[curidx];
1014  curbits += uquad_sign_bits[curidx];
1015  }
1016  return curbits;
1017 }
1018 
/**
 * Count the bits needed to code one band with a signed pair codebook.
 *
 * Four coefficients (two pairs) are processed per iteration. Each scaled[]
 * value is multiplied by Q34, offset by 0.4054 and converted to int, limited
 * to at most 4, and then negated when the sign bit of the matching in[]
 * word is set. Each pair indexes the bit-length table of codebook cb.
 *
 * @param in        original coefficients; only their sign bits are read
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34
 * @param cb        codebook number; entry cb-1 of ff_aac_spectral_bits is used
 * @return the accumulated bit count
 *
 * s, pb, lambda, uplim and bits are not used by this routine.
 */
1019 static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1020  PutBitContext *pb, const float *in,
1021  const float *scaled, int size, int scale_idx,
1022  int cb, const float lambda, const float uplim,
1023  int *bits)
1024 {
1025  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1026  int i;
1027  int qc1, qc2, qc3, qc4;
1028  int curbits = 0;
1029 
1030  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1031 
1032  for (i = 0; i < size; i += 4) {
1033  int curidx, curidx2;
/* in[] is handed to the asm as an int pointer so the raw words can be loaded with lw. */
1034  int *in_int = (int *)&in[i];
1035  int t0, t1, t2, t3, t4, t5, t6, t7;
1036 
/* Quantize four values; the float-to-int conversion truncates. */
1037  qc1 = scaled[i ] * Q34 + 0.4054f;
1038  qc2 = scaled[i+1] * Q34 + 0.4054f;
1039  qc3 = scaled[i+2] * Q34 + 0.4054f;
1040  qc4 = scaled[i+3] * Q34 + 0.4054f;
1041 
/*
 * Step 1: upper clamp - t4 = 4, tN = (4 < qcN), movn sets qcN to 4 when
 *         tN is non-zero.
 * Step 2: load the four in[] words and shift right by 31 so t0..t3 hold
 *         the sign bits.
 * Step 3: t4..t7 = 0 - qcN, and movn replaces qcN with the negated value
 *         where the sign bit is set.
 */
1042  __asm__ volatile (
1043  ".set push \n\t"
1044  ".set noreorder \n\t"
1045 
1046  "ori %[t4], $zero, 4 \n\t"
1047  "slt %[t0], %[t4], %[qc1] \n\t"
1048  "slt %[t1], %[t4], %[qc2] \n\t"
1049  "slt %[t2], %[t4], %[qc3] \n\t"
1050  "slt %[t3], %[t4], %[qc4] \n\t"
1051  "movn %[qc1], %[t4], %[t0] \n\t"
1052  "movn %[qc2], %[t4], %[t1] \n\t"
1053  "movn %[qc3], %[t4], %[t2] \n\t"
1054  "movn %[qc4], %[t4], %[t3] \n\t"
1055  "lw %[t0], 0(%[in_int]) \n\t"
1056  "lw %[t1], 4(%[in_int]) \n\t"
1057  "lw %[t2], 8(%[in_int]) \n\t"
1058  "lw %[t3], 12(%[in_int]) \n\t"
1059  "srl %[t0], %[t0], 31 \n\t"
1060  "srl %[t1], %[t1], 31 \n\t"
1061  "srl %[t2], %[t2], 31 \n\t"
1062  "srl %[t3], %[t3], 31 \n\t"
1063  "subu %[t4], $zero, %[qc1] \n\t"
1064  "subu %[t5], $zero, %[qc2] \n\t"
1065  "subu %[t6], $zero, %[qc3] \n\t"
1066  "subu %[t7], $zero, %[qc4] \n\t"
1067  "movn %[qc1], %[t4], %[t0] \n\t"
1068  "movn %[qc2], %[t5], %[t1] \n\t"
1069  "movn %[qc3], %[t6], %[t2] \n\t"
1070  "movn %[qc4], %[t7], %[t3] \n\t"
1071 
1072  ".set pop \n\t"
1073 
1074  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1075  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1076  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1077  [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1078  : [in_int]"r"(in_int)
1079  : "memory"
1080  );
1081 
/*
 * Pair index 9 * first + second + 40. With both values in -4..4 the
 * bias of 40 (= 9 * 4 + 4) keeps the index in 0..80.
 */
1082  curidx = 9 * qc1;
1083  curidx += qc2 + 40;
1084 
1085  curidx2 = 9 * qc3;
1086  curidx2 += qc4 + 40;
1087 
1088  curbits += p_bits[curidx] + p_bits[curidx2];
1089  }
1090  return curbits;
1091 }
1092 
/**
 * Count the bits needed to code one band with an unsigned pair codebook
 * whose quantized values are limited to 7.
 *
 * Four coefficients (two pairs) are processed per iteration. Each scaled[]
 * value is multiplied by Q34, offset by 0.4054, converted to int and limited
 * to at most 7. Each pair indexes the bit-length table of codebook cb and
 * upair7_sign_bits[].
 *
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34
 * @param cb        codebook number; entry cb-1 of ff_aac_spectral_bits is used
 * @return the accumulated bit count
 *
 * s, pb, in, lambda, uplim and bits are not used by this routine.
 */
1093 static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1094  PutBitContext *pb, const float *in,
1095  const float *scaled, int size, int scale_idx,
1096  int cb, const float lambda, const float uplim,
1097  int *bits)
1098 {
1099  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1100  int i;
1101  int qc1, qc2, qc3, qc4;
1102  int curbits = 0;
1103 
1104  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1105 
1106  for (i = 0; i < size; i += 4) {
1107  int curidx, curidx2;
1108  int t0, t1, t2, t3, t4;
1109 
/* Quantize four values; the float-to-int conversion truncates. */
1110  qc1 = scaled[i ] * Q34 + 0.4054f;
1111  qc2 = scaled[i+1] * Q34 + 0.4054f;
1112  qc3 = scaled[i+2] * Q34 + 0.4054f;
1113  qc4 = scaled[i+3] * Q34 + 0.4054f;
1114 
/*
 * Upper clamp: t4 = 7, tN = (7 < qcN), and movn overwrites qcN with 7
 * whenever tN is non-zero.
 * NOTE(review): only the upper bound is enforced, so scaled[] is
 * presumably non-negative - confirm against the callers.
 */
1115  __asm__ volatile (
1116  ".set push \n\t"
1117  ".set noreorder \n\t"
1118 
1119  "ori %[t4], $zero, 7 \n\t"
1120  "slt %[t0], %[t4], %[qc1] \n\t"
1121  "slt %[t1], %[t4], %[qc2] \n\t"
1122  "slt %[t2], %[t4], %[qc3] \n\t"
1123  "slt %[t3], %[t4], %[qc4] \n\t"
1124  "movn %[qc1], %[t4], %[t0] \n\t"
1125  "movn %[qc2], %[t4], %[t1] \n\t"
1126  "movn %[qc3], %[t4], %[t2] \n\t"
1127  "movn %[qc4], %[t4], %[t3] \n\t"
1128 
1129  ".set pop \n\t"
1130 
1131  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1132  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1133  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1134  [t4]"=&r"(t4)
1135  );
1136 
/* Pair index: 8 * first + second. */
1137  curidx = 8 * qc1;
1138  curidx += qc2;
1139 
1140  curidx2 = 8 * qc3;
1141  curidx2 += qc4;
1142 
/* Codeword length plus sign-bit table entry, for both pairs. */
1143  curbits += p_bits[curidx] +
1144  upair7_sign_bits[curidx] +
1145  p_bits[curidx2] +
1146  upair7_sign_bits[curidx2];
1147  }
1148  return curbits;
1149 }
1150 
/**
 * Count the bits needed to code one band with an unsigned pair codebook
 * whose quantized values are limited to 12.
 *
 * Four coefficients (two pairs) are processed per iteration. Each scaled[]
 * value is multiplied by Q34, offset by 0.4054, converted to int and limited
 * to at most 12. Each pair indexes the bit-length table of codebook cb and
 * upair12_sign_bits[].
 *
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34
 * @param cb        codebook number; entry cb-1 of ff_aac_spectral_bits is used
 * @return the accumulated bit count
 *
 * s, pb, in, lambda, uplim and bits are not used by this routine.
 */
1151 static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1152  PutBitContext *pb, const float *in,
1153  const float *scaled, int size, int scale_idx,
1154  int cb, const float lambda, const float uplim,
1155  int *bits)
1156 {
1157  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1158  int i;
1159  int qc1, qc2, qc3, qc4;
1160  int curbits = 0;
1161 
1162  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1163 
1164  for (i = 0; i < size; i += 4) {
1165  int curidx, curidx2;
1166  int t0, t1, t2, t3, t4;
1167 
/* Quantize four values; the float-to-int conversion truncates. */
1168  qc1 = scaled[i ] * Q34 + 0.4054f;
1169  qc2 = scaled[i+1] * Q34 + 0.4054f;
1170  qc3 = scaled[i+2] * Q34 + 0.4054f;
1171  qc4 = scaled[i+3] * Q34 + 0.4054f;
1172 
/*
 * Upper clamp: t4 = 12, tN = (12 < qcN), and movn overwrites qcN with 12
 * whenever tN is non-zero.
 * NOTE(review): only the upper bound is enforced, so scaled[] is
 * presumably non-negative - confirm against the callers.
 */
1173  __asm__ volatile (
1174  ".set push \n\t"
1175  ".set noreorder \n\t"
1176 
1177  "ori %[t4], $zero, 12 \n\t"
1178  "slt %[t0], %[t4], %[qc1] \n\t"
1179  "slt %[t1], %[t4], %[qc2] \n\t"
1180  "slt %[t2], %[t4], %[qc3] \n\t"
1181  "slt %[t3], %[t4], %[qc4] \n\t"
1182  "movn %[qc1], %[t4], %[t0] \n\t"
1183  "movn %[qc2], %[t4], %[t1] \n\t"
1184  "movn %[qc3], %[t4], %[t2] \n\t"
1185  "movn %[qc4], %[t4], %[t3] \n\t"
1186 
1187  ".set pop \n\t"
1188 
1189  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1190  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1191  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1192  [t4]"=&r"(t4)
1193  );
1194 
/* Pair index: 13 * first + second. */
1195  curidx = 13 * qc1;
1196  curidx += qc2;
1197 
1198  curidx2 = 13 * qc3;
1199  curidx2 += qc4;
1200 
/* Codeword length plus sign-bit table entry, for both pairs. */
1201  curbits += p_bits[curidx] +
1202  p_bits[curidx2] +
1203  upair12_sign_bits[curidx] +
1204  upair12_sign_bits[curidx2];
1205  }
1206  return curbits;
1207 }
1208 
/**
 * Count the bits needed to code one band with the escape codebook.
 *
 * Four coefficients (two pairs) are processed per iteration. Each scaled[]
 * value is multiplied by Q34, offset by 0.4054 and converted to int. Values
 * above 15 are replaced by 16 for the table lookup and contribute an extra
 * escape length derived from the bit position of their highest set bit.
 *
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34
 * @param cb        codebook number; entry cb-1 of ff_aac_spectral_bits is used
 * @return the accumulated bit count
 *
 * s, pb, in, lambda, uplim and bits are not used by this routine.
 */
1209 static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1210  PutBitContext *pb, const float *in,
1211  const float *scaled, int size, int scale_idx,
1212  int cb, const float lambda, const float uplim,
1213  int *bits)
1214 {
1215  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1216  int i;
1217  int qc1, qc2, qc3, qc4;
1218  int curbits = 0;
1219 
1220  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1221 
1222  for (i = 0; i < size; i += 4) {
1223  int curidx, curidx2;
1224  int cond0, cond1, cond2, cond3;
1225  int c1, c2, c3, c4;
1226  int t4, t5;
1227 
/* Quantize four values; the float-to-int conversion truncates. */
1228  qc1 = scaled[i ] * Q34 + 0.4054f;
1229  qc2 = scaled[i+1] * Q34 + 0.4054f;
1230  qc3 = scaled[i+2] * Q34 + 0.4054f;
1231  qc4 = scaled[i+3] * Q34 + 0.4054f;
1232 
/*
 * Step 1: cN = (qcN << 18, saturating) >> 18 (logical). shll_s.w is the
 *         DSP ASE saturating shift, so for a non-negative qcN this limits
 *         cN to 13 bits - NOTE(review): confirm against the DSP ASE manual.
 * Step 2: condN = (15 < qcN); movn sets qcN to 16 where condN is non-zero.
 * Step 3: t5 is reloaded with 31 and cN = ((31 - clz(cN)) << 1) - 3.
 * Step 4: condN = 0 - condN (0 or all ones) and cN &= condN, so cN stays
 *         non-zero only for values that exceeded 15.
 */
1233  __asm__ volatile (
1234  ".set push \n\t"
1235  ".set noreorder \n\t"
1236 
1237  "ori %[t4], $zero, 15 \n\t"
1238  "ori %[t5], $zero, 16 \n\t"
1239  "shll_s.w %[c1], %[qc1], 18 \n\t"
1240  "shll_s.w %[c2], %[qc2], 18 \n\t"
1241  "shll_s.w %[c3], %[qc3], 18 \n\t"
1242  "shll_s.w %[c4], %[qc4], 18 \n\t"
1243  "srl %[c1], %[c1], 18 \n\t"
1244  "srl %[c2], %[c2], 18 \n\t"
1245  "srl %[c3], %[c3], 18 \n\t"
1246  "srl %[c4], %[c4], 18 \n\t"
1247  "slt %[cond0], %[t4], %[qc1] \n\t"
1248  "slt %[cond1], %[t4], %[qc2] \n\t"
1249  "slt %[cond2], %[t4], %[qc3] \n\t"
1250  "slt %[cond3], %[t4], %[qc4] \n\t"
1251  "movn %[qc1], %[t5], %[cond0] \n\t"
1252  "movn %[qc2], %[t5], %[cond1] \n\t"
1253  "movn %[qc3], %[t5], %[cond2] \n\t"
1254  "movn %[qc4], %[t5], %[cond3] \n\t"
1255  "ori %[t5], $zero, 31 \n\t"
1256  "clz %[c1], %[c1] \n\t"
1257  "clz %[c2], %[c2] \n\t"
1258  "clz %[c3], %[c3] \n\t"
1259  "clz %[c4], %[c4] \n\t"
1260  "subu %[c1], %[t5], %[c1] \n\t"
1261  "subu %[c2], %[t5], %[c2] \n\t"
1262  "subu %[c3], %[t5], %[c3] \n\t"
1263  "subu %[c4], %[t5], %[c4] \n\t"
1264  "sll %[c1], %[c1], 1 \n\t"
1265  "sll %[c2], %[c2], 1 \n\t"
1266  "sll %[c3], %[c3], 1 \n\t"
1267  "sll %[c4], %[c4], 1 \n\t"
1268  "addiu %[c1], %[c1], -3 \n\t"
1269  "addiu %[c2], %[c2], -3 \n\t"
1270  "addiu %[c3], %[c3], -3 \n\t"
1271  "addiu %[c4], %[c4], -3 \n\t"
1272  "subu %[cond0], $zero, %[cond0] \n\t"
1273  "subu %[cond1], $zero, %[cond1] \n\t"
1274  "subu %[cond2], $zero, %[cond2] \n\t"
1275  "subu %[cond3], $zero, %[cond3] \n\t"
1276  "and %[c1], %[c1], %[cond0] \n\t"
1277  "and %[c2], %[c2], %[cond1] \n\t"
1278  "and %[c3], %[c3], %[cond2] \n\t"
1279  "and %[c4], %[c4], %[cond3] \n\t"
1280 
1281  ".set pop \n\t"
1282 
1283  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1284  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1285  [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1286  [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1287  [c1]"=&r"(c1), [c2]"=&r"(c2),
1288  [c3]"=&r"(c3), [c4]"=&r"(c4),
1289  [t4]"=&r"(t4), [t5]"=&r"(t5)
1290  );
1291 
/* Pair index: 17 * first + second. */
1292  curidx = 17 * qc1;
1293  curidx += qc2;
1294 
1295  curidx2 = 17 * qc3;
1296  curidx2 += qc4;
1297 
/* Codeword length plus sign-bit table entry, for both pairs. */
1298  curbits += p_bits[curidx];
1299  curbits += esc_sign_bits[curidx];
1300  curbits += p_bits[curidx2];
1301  curbits += esc_sign_bits[curidx2];
1302 
/* Escape lengths computed in the asm block; zero for values that were not above 15. */
1303  curbits += c1;
1304  curbits += c2;
1305  curbits += c3;
1306  curbits += c4;
1307  }
1308  return curbits;
1309 }
1310 
1311 static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1312  PutBitContext *pb, const float *in,
1313  const float *scaled, int size, int scale_idx,
1314  int cb, const float lambda, const float uplim,
1315  int *bits) = {
1316  get_band_numbits_ZERO_mips,
1317  get_band_numbits_SQUAD_mips,
1318  get_band_numbits_SQUAD_mips,
1319  get_band_numbits_UQUAD_mips,
1320  get_band_numbits_UQUAD_mips,
1321  get_band_numbits_SPAIR_mips,
1322  get_band_numbits_SPAIR_mips,
1323  get_band_numbits_UPAIR7_mips,
1324  get_band_numbits_UPAIR7_mips,
1325  get_band_numbits_UPAIR12_mips,
1326  get_band_numbits_UPAIR12_mips,
1327  get_band_numbits_ESC_mips,
1328 };
1329 
/*
 * Call the bit-counting routine registered for codebook cb.
 * Note that cb is expanded twice: once as the table index and once as
 * the argument forwarded to the routine.
 */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb,       \
                         lambda, uplim, bits)                          \
    get_band_numbits_arr[cb](s, pb, in, scaled, size, scale_idx, cb,   \
                             lambda, uplim, bits)
1336 
/**
 * Count the bits needed to code one band with codebook cb.
 *
 * Forwards to the per-codebook routine selected by get_band_numbits(),
 * passing NULL as the bitstream writer.
 *
 * @param s         encoder context, forwarded unchanged
 * @param in        original coefficients of the band
 * @param scaled    values to quantize
 * @param size      number of coefficients in the band
 * @param scale_idx scalefactor index
 * @param cb        codebook number, used as the dispatch index
 * @param lambda    forwarded unchanged
 * @param uplim     forwarded unchanged
 * @param bits      if non-NULL, receives the bit count as an int
 * @return the bit count reported by the selected routine
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    const float numbits = get_band_numbits(s, NULL, in, scaled, size, scale_idx,
                                           cb, lambda, uplim, bits);

    /*
     * The get_band_numbits_*_mips() routines visible in this file return
     * the count without storing through bits, whereas the
     * get_band_cost_*_mips() routines do store it. Publish the count here
     * so a non-NULL bits is honoured in both families.
     */
    if (bits)
        *bits = (int)numbits;

    return numbits;
}
1344 
1345 /**
1346  * Functions developed from template function and optimized for getting the band cost
1347  */
1348 #if HAVE_MIPSFPU
1349 static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1350  PutBitContext *pb, const float *in,
1351  const float *scaled, int size, int scale_idx,
1352  int cb, const float lambda, const float uplim,
1353  int *bits)
1354 {
1355  int i;
1356  float cost = 0;
1357 
1358  for (i = 0; i < size; i += 4) {
1359  cost += in[i ] * in[i ];
1360  cost += in[i+1] * in[i+1];
1361  cost += in[i+2] * in[i+2];
1362  cost += in[i+3] * in[i+3];
1363  }
1364  if (bits)
1365  *bits = 0;
1366  return cost * lambda;
1367 }
1368 
/**
 * Rate-distortion cost of coding one band with a signed quad codebook.
 *
 * Four coefficients are processed per iteration. Each scaled[] value is
 * multiplied by Q34, offset by 0.4054 and converted to int, reduced to 0 or
 * 1, and negated when the sign bit of the matching in[] word is set. The
 * four values form a base-3 index (biased by 40) that selects the codeword
 * length and the codebook vector; the distortion is the squared difference
 * between in[] and the vector scaled by IQ.
 *
 * @param in        original coefficients
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34 and IQ
 * @param cb        codebook number; entry cb-1 of the tables is used
 * @param lambda    weight applied to the distortion
 * @param bits      if non-NULL, receives the bit count
 * @return distortion * lambda + bit count
 *
 * s, pb and uplim are not used by this routine.
 */
1369 static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1370  PutBitContext *pb, const float *in,
1371  const float *scaled, int size, int scale_idx,
1372  int cb, const float lambda, const float uplim,
1373  int *bits)
1374 {
1375  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1376  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1377  int i;
1378  float cost = 0;
1379  int qc1, qc2, qc3, qc4;
1380  int curbits = 0;
1381 
1382  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1383  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1384 
1385  for (i = 0; i < size; i += 4) {
1386  const float *vec;
1387  int curidx;
/* The same four inputs are viewed as raw words (sign extraction) and as floats (distortion). */
1388  int *in_int = (int *)&in[i];
1389  float *in_pos = (float *)&in[i];
1390  float di0, di1, di2, di3;
1391  int t0, t1, t2, t3, t4, t5, t6, t7;
1392 
/* Quantize four values; the float-to-int conversion truncates. */
1393  qc1 = scaled[i ] * Q34 + 0.4054f;
1394  qc2 = scaled[i+1] * Q34 + 0.4054f;
1395  qc3 = scaled[i+2] * Q34 + 0.4054f;
1396  qc4 = scaled[i+3] * Q34 + 0.4054f;
1397 
/*
 * Step 1: qcN = (0 < qcN), i.e. 1 for a positive value and 0 otherwise.
 * Step 2: load the four in[] words and shift right by 31 so t0..t3 hold
 *         the sign bits.
 * Step 3: t4..t7 = 0 - qcN, and movn replaces qcN with the negated value
 *         where the sign bit is set.
 */
1398  __asm__ volatile (
1399  ".set push \n\t"
1400  ".set noreorder \n\t"
1401 
1402  "slt %[qc1], $zero, %[qc1] \n\t"
1403  "slt %[qc2], $zero, %[qc2] \n\t"
1404  "slt %[qc3], $zero, %[qc3] \n\t"
1405  "slt %[qc4], $zero, %[qc4] \n\t"
1406  "lw %[t0], 0(%[in_int]) \n\t"
1407  "lw %[t1], 4(%[in_int]) \n\t"
1408  "lw %[t2], 8(%[in_int]) \n\t"
1409  "lw %[t3], 12(%[in_int]) \n\t"
1410  "srl %[t0], %[t0], 31 \n\t"
1411  "srl %[t1], %[t1], 31 \n\t"
1412  "srl %[t2], %[t2], 31 \n\t"
1413  "srl %[t3], %[t3], 31 \n\t"
1414  "subu %[t4], $zero, %[qc1] \n\t"
1415  "subu %[t5], $zero, %[qc2] \n\t"
1416  "subu %[t6], $zero, %[qc3] \n\t"
1417  "subu %[t7], $zero, %[qc4] \n\t"
1418  "movn %[qc1], %[t4], %[t0] \n\t"
1419  "movn %[qc2], %[t5], %[t1] \n\t"
1420  "movn %[qc3], %[t6], %[t2] \n\t"
1421  "movn %[qc4], %[t7], %[t3] \n\t"
1422 
1423  ".set pop \n\t"
1424 
1425  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1426  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1427  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1428  [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1429  : [in_int]"r"(in_int)
1430  : "memory"
1431  );
1432 
/*
 * Base-3 index ((qc1 * 3 + qc2) * 3 + qc3) * 3 + qc4, biased by 40
 * (= 27 + 9 + 3 + 1) so that values in -1..1 give an index in 0..80.
 */
1433  curidx = qc1;
1434  curidx *= 3;
1435  curidx += qc2;
1436  curidx *= 3;
1437  curidx += qc3;
1438  curidx *= 3;
1439  curidx += qc4;
1440  curidx += 40;
1441 
/* Codeword length, and the 4-float codebook vector for this index. */
1442  curbits += p_bits[curidx];
1443  vec = &p_codes[curidx*4];
1444 
/*
 * Load in[k] and vec[k] into $f0..$f7 and form diN = in[k] - vec[k] * IQ
 * (nmsub.s fd, fr, fs, ft yields fr - fs * ft).
 */
1445  __asm__ volatile (
1446  ".set push \n\t"
1447  ".set noreorder \n\t"
1448 
1449  "lwc1 $f0, 0(%[in_pos]) \n\t"
1450  "lwc1 $f1, 0(%[vec]) \n\t"
1451  "lwc1 $f2, 4(%[in_pos]) \n\t"
1452  "lwc1 $f3, 4(%[vec]) \n\t"
1453  "lwc1 $f4, 8(%[in_pos]) \n\t"
1454  "lwc1 $f5, 8(%[vec]) \n\t"
1455  "lwc1 $f6, 12(%[in_pos]) \n\t"
1456  "lwc1 $f7, 12(%[vec]) \n\t"
1457  "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
1458  "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
1459  "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
1460  "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
1461 
1462  ".set pop \n\t"
1463 
1464  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1465  [di2]"=&f"(di2), [di3]"=&f"(di3)
1466  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1467  [IQ]"f"(IQ)
1468  : "$f0", "$f1", "$f2", "$f3",
1469  "$f4", "$f5", "$f6", "$f7",
1470  "memory"
1471  );
1472 
/* Accumulate the squared error of the four coefficients. */
1473  cost += di0 * di0 + di1 * di1
1474  + di2 * di2 + di3 * di3;
1475  }
1476 
1477  if (bits)
1478  *bits = curbits;
1479  return cost * lambda + curbits;
1480 }
1481 
/**
 * Rate-distortion cost of coding one band with an unsigned quad codebook.
 *
 * Four coefficients are processed per iteration. Each scaled[] value is
 * multiplied by Q34, offset by 0.4054, converted to int and limited to at
 * most 2. The four values form a base-3 index that selects the codeword
 * length, the sign-bit count and the codebook vector; the distortion is the
 * squared difference between |in[]| and the vector scaled by IQ.
 *
 * @param in        original coefficients
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34 and IQ
 * @param cb        codebook number; entry cb-1 of the tables is used
 * @param lambda    weight applied to the distortion
 * @param bits      if non-NULL, receives the bit count
 * @return distortion * lambda + bit count
 *
 * s, pb and uplim are not used by this routine.
 */
1482 static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1483  PutBitContext *pb, const float *in,
1484  const float *scaled, int size, int scale_idx,
1485  int cb, const float lambda, const float uplim,
1486  int *bits)
1487 {
1488  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1489  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1490  int i;
1491  float cost = 0;
1492  int curbits = 0;
1493  int qc1, qc2, qc3, qc4;
1494 
1495  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1496  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1497 
1498  for (i = 0; i < size; i += 4) {
1499  const float *vec;
1500  int curidx;
1501  float *in_pos = (float *)&in[i];
1502  float di0, di1, di2, di3;
1503  int t0, t1, t2, t3, t4;
1504 
/* Quantize four values; the float-to-int conversion truncates. */
1505  qc1 = scaled[i ] * Q34 + 0.4054f;
1506  qc2 = scaled[i+1] * Q34 + 0.4054f;
1507  qc3 = scaled[i+2] * Q34 + 0.4054f;
1508  qc4 = scaled[i+3] * Q34 + 0.4054f;
1509 
/*
 * Upper clamp: t4 = 2, tN = (2 < qcN), and movn overwrites qcN with 2
 * whenever tN is non-zero.
 */
1510  __asm__ volatile (
1511  ".set push \n\t"
1512  ".set noreorder \n\t"
1513 
1514  "ori %[t4], $zero, 2 \n\t"
1515  "slt %[t0], %[t4], %[qc1] \n\t"
1516  "slt %[t1], %[t4], %[qc2] \n\t"
1517  "slt %[t2], %[t4], %[qc3] \n\t"
1518  "slt %[t3], %[t4], %[qc4] \n\t"
1519  "movn %[qc1], %[t4], %[t0] \n\t"
1520  "movn %[qc2], %[t4], %[t1] \n\t"
1521  "movn %[qc3], %[t4], %[t2] \n\t"
1522  "movn %[qc4], %[t4], %[t3] \n\t"
1523 
1524  ".set pop \n\t"
1525 
1526  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1527  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1528  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1529  [t4]"=&r"(t4)
1530  );
1531 
/* Base-3 index: ((qc1 * 3 + qc2) * 3 + qc3) * 3 + qc4. */
1532  curidx = qc1;
1533  curidx *= 3;
1534  curidx += qc2;
1535  curidx *= 3;
1536  curidx += qc3;
1537  curidx *= 3;
1538  curidx += qc4;
1539 
/* Codeword length, sign-bit table entry, and the 4-float codebook vector. */
1540  curbits += p_bits[curidx];
1541  curbits += uquad_sign_bits[curidx];
1542  vec = &p_codes[curidx*4];
1543 
/*
 * diN = |in[k]| - vec[k] * IQ: the inputs are loaded straight into the
 * output registers, abs.s strips the sign, and nmsub.s fd, fr, fs, ft
 * yields fr - fs * ft.
 */
1544  __asm__ volatile (
1545  ".set push \n\t"
1546  ".set noreorder \n\t"
1547 
1548  "lwc1 %[di0], 0(%[in_pos]) \n\t"
1549  "lwc1 %[di1], 4(%[in_pos]) \n\t"
1550  "lwc1 %[di2], 8(%[in_pos]) \n\t"
1551  "lwc1 %[di3], 12(%[in_pos]) \n\t"
1552  "abs.s %[di0], %[di0] \n\t"
1553  "abs.s %[di1], %[di1] \n\t"
1554  "abs.s %[di2], %[di2] \n\t"
1555  "abs.s %[di3], %[di3] \n\t"
1556  "lwc1 $f0, 0(%[vec]) \n\t"
1557  "lwc1 $f1, 4(%[vec]) \n\t"
1558  "lwc1 $f2, 8(%[vec]) \n\t"
1559  "lwc1 $f3, 12(%[vec]) \n\t"
1560  "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1561  "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1562  "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1563  "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1564 
1565  ".set pop \n\t"
1566 
1567  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1568  [di2]"=&f"(di2), [di3]"=&f"(di3)
1569  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1570  [IQ]"f"(IQ)
1571  : "$f0", "$f1", "$f2", "$f3",
1572  "memory"
1573  );
1574 
/* Accumulate the squared error of the four coefficients. */
1575  cost += di0 * di0 + di1 * di1
1576  + di2 * di2 + di3 * di3;
1577  }
1578 
1579  if (bits)
1580  *bits = curbits;
1581  return cost * lambda + curbits;
1582 }
1583 
/**
 * Rate-distortion cost of coding one band with a signed pair codebook.
 *
 * Four coefficients (two pairs) are processed per iteration. Each scaled[]
 * value is multiplied by Q34, offset by 0.4054 and converted to int, limited
 * to at most 4, and negated when the sign bit of the matching in[] word is
 * set. Each pair selects a codeword length and a 2-float codebook vector;
 * the distortion is the squared difference between in[] and the vector
 * scaled by IQ.
 *
 * @param in        original coefficients
 * @param scaled    values to quantize, read four at a time
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx scalefactor index used to select Q34 and IQ
 * @param cb        codebook number; entry cb-1 of the tables is used
 * @param lambda    weight applied to the distortion
 * @param bits      if non-NULL, receives the bit count
 * @return distortion * lambda + bit count
 *
 * s, pb and uplim are not used by this routine.
 */
1584 static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1585  PutBitContext *pb, const float *in,
1586  const float *scaled, int size, int scale_idx,
1587  int cb, const float lambda, const float uplim,
1588  int *bits)
1589 {
1590  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1591  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1592  int i;
1593  float cost = 0;
1594  int qc1, qc2, qc3, qc4;
1595  int curbits = 0;
1596 
1597  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1598  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1599 
1600  for (i = 0; i < size; i += 4) {
1601  const float *vec, *vec2;
1602  int curidx, curidx2;
/* The same four inputs are viewed as raw words (sign extraction) and as floats (distortion). */
1603  int *in_int = (int *)&in[i];
1604  float *in_pos = (float *)&in[i];
1605  float di0, di1, di2, di3;
1606  int t0, t1, t2, t3, t4, t5, t6, t7;
1607 
/* Quantize four values; the float-to-int conversion truncates. */
1608  qc1 = scaled[i ] * Q34 + 0.4054f;
1609  qc2 = scaled[i+1] * Q34 + 0.4054f;
1610  qc3 = scaled[i+2] * Q34 + 0.4054f;
1611  qc4 = scaled[i+3] * Q34 + 0.4054f;
1612 
/*
 * Step 1: upper clamp - t4 = 4, tN = (4 < qcN), movn sets qcN to 4 when
 *         tN is non-zero.
 * Step 2: load the four in[] words and shift right by 31 so t0..t3 hold
 *         the sign bits.
 * Step 3: t4..t7 = 0 - qcN, and movn replaces qcN with the negated value
 *         where the sign bit is set.
 */
1613  __asm__ volatile (
1614  ".set push \n\t"
1615  ".set noreorder \n\t"
1616 
1617  "ori %[t4], $zero, 4 \n\t"
1618  "slt %[t0], %[t4], %[qc1] \n\t"
1619  "slt %[t1], %[t4], %[qc2] \n\t"
1620  "slt %[t2], %[t4], %[qc3] \n\t"
1621  "slt %[t3], %[t4], %[qc4] \n\t"
1622  "movn %[qc1], %[t4], %[t0] \n\t"
1623  "movn %[qc2], %[t4], %[t1] \n\t"
1624  "movn %[qc3], %[t4], %[t2] \n\t"
1625  "movn %[qc4], %[t4], %[t3] \n\t"
1626  "lw %[t0], 0(%[in_int]) \n\t"
1627  "lw %[t1], 4(%[in_int]) \n\t"
1628  "lw %[t2], 8(%[in_int]) \n\t"
1629  "lw %[t3], 12(%[in_int]) \n\t"
1630  "srl %[t0], %[t0], 31 \n\t"
1631  "srl %[t1], %[t1], 31 \n\t"
1632  "srl %[t2], %[t2], 31 \n\t"
1633  "srl %[t3], %[t3], 31 \n\t"
1634  "subu %[t4], $zero, %[qc1] \n\t"
1635  "subu %[t5], $zero, %[qc2] \n\t"
1636  "subu %[t6], $zero, %[qc3] \n\t"
1637  "subu %[t7], $zero, %[qc4] \n\t"
1638  "movn %[qc1], %[t4], %[t0] \n\t"
1639  "movn %[qc2], %[t5], %[t1] \n\t"
1640  "movn %[qc3], %[t6], %[t2] \n\t"
1641  "movn %[qc4], %[t7], %[t3] \n\t"
1642 
1643  ".set pop \n\t"
1644 
1645  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1646  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1647  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1648  [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
1649  : [in_int]"r"(in_int)
1650  : "memory"
1651  );
1652 
/*
 * Pair index 9 * first + second + 40. With both values in -4..4 the
 * bias of 40 (= 9 * 4 + 4) keeps the index in 0..80.
 */
1653  curidx = 9 * qc1;
1654  curidx += qc2 + 40;
1655 
1656  curidx2 = 9 * qc3;
1657  curidx2 += qc4 + 40;
1658 
1659  curbits += p_bits[curidx];
1660  curbits += p_bits[curidx2];
1661 
/* Each pair has its own 2-float codebook vector. */
1662  vec = &p_codes[curidx*2];
1663  vec2 = &p_codes[curidx2*2];
1664 
/*
 * diN = in[k] - v * IQ, with v taken from vec for the first pair and
 * from vec2 for the second (nmsub.s fd, fr, fs, ft yields fr - fs * ft).
 */
1665  __asm__ volatile (
1666  ".set push \n\t"
1667  ".set noreorder \n\t"
1668 
1669  "lwc1 $f0, 0(%[in_pos]) \n\t"
1670  "lwc1 $f1, 0(%[vec]) \n\t"
1671  "lwc1 $f2, 4(%[in_pos]) \n\t"
1672  "lwc1 $f3, 4(%[vec]) \n\t"
1673  "lwc1 $f4, 8(%[in_pos]) \n\t"
1674  "lwc1 $f5, 0(%[vec2]) \n\t"
1675  "lwc1 $f6, 12(%[in_pos]) \n\t"
1676  "lwc1 $f7, 4(%[vec2]) \n\t"
1677  "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
1678  "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
1679  "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
1680  "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
1681 
1682  ".set pop \n\t"
1683 
1684  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1685  [di2]"=&f"(di2), [di3]"=&f"(di3)
1686  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1687  [vec2]"r"(vec2), [IQ]"f"(IQ)
1688  : "$f0", "$f1", "$f2", "$f3",
1689  "$f4", "$f5", "$f6", "$f7",
1690  "memory"
1691  );
1692 
/* Accumulate the squared error of the four coefficients. */
1693  cost += di0 * di0 + di1 * di1
1694  + di2 * di2 + di3 * di3;
1695  }
1696 
1697  if (bits)
1698  *bits = curbits;
1699  return cost * lambda + curbits;
1700 }
1701 
1702 static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1703  PutBitContext *pb, const float *in,
1704  const float *scaled, int size, int scale_idx,
1705  int cb, const float lambda, const float uplim,
1706  int *bits)
1707 {
1708  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1709  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1710  int i;
1711  float cost = 0;
1712  int qc1, qc2, qc3, qc4;
1713  int curbits = 0;
1714 
1715  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1716  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1717 
1718  for (i = 0; i < size; i += 4) {
1719  const float *vec, *vec2;
1720  int curidx, curidx2, sign1, count1, sign2, count2;
1721  int *in_int = (int *)&in[i];
1722  float *in_pos = (float *)&in[i];
1723  float di0, di1, di2, di3;
1724  int t0, t1, t2, t3, t4;
1725 
1726  qc1 = scaled[i ] * Q34 + 0.4054f;
1727  qc2 = scaled[i+1] * Q34 + 0.4054f;
1728  qc3 = scaled[i+2] * Q34 + 0.4054f;
1729  qc4 = scaled[i+3] * Q34 + 0.4054f;
1730 
1731  __asm__ volatile (
1732  ".set push \n\t"
1733  ".set noreorder \n\t"
1734 
1735  "ori %[t4], $zero, 7 \n\t"
1736  "ori %[sign1], $zero, 0 \n\t"
1737  "ori %[sign2], $zero, 0 \n\t"
1738  "slt %[t0], %[t4], %[qc1] \n\t"
1739  "slt %[t1], %[t4], %[qc2] \n\t"
1740  "slt %[t2], %[t4], %[qc3] \n\t"
1741  "slt %[t3], %[t4], %[qc4] \n\t"
1742  "movn %[qc1], %[t4], %[t0] \n\t"
1743  "movn %[qc2], %[t4], %[t1] \n\t"
1744  "movn %[qc3], %[t4], %[t2] \n\t"
1745  "movn %[qc4], %[t4], %[t3] \n\t"
1746  "lw %[t0], 0(%[in_int]) \n\t"
1747  "lw %[t1], 4(%[in_int]) \n\t"
1748  "lw %[t2], 8(%[in_int]) \n\t"
1749  "lw %[t3], 12(%[in_int]) \n\t"
1750  "slt %[t0], %[t0], $zero \n\t"
1751  "movn %[sign1], %[t0], %[qc1] \n\t"
1752  "slt %[t2], %[t2], $zero \n\t"
1753  "movn %[sign2], %[t2], %[qc3] \n\t"
1754  "slt %[t1], %[t1], $zero \n\t"
1755  "sll %[t0], %[sign1], 1 \n\t"
1756  "or %[t0], %[t0], %[t1] \n\t"
1757  "movn %[sign1], %[t0], %[qc2] \n\t"
1758  "slt %[t3], %[t3], $zero \n\t"
1759  "sll %[t0], %[sign2], 1 \n\t"
1760  "or %[t0], %[t0], %[t3] \n\t"
1761  "movn %[sign2], %[t0], %[qc4] \n\t"
1762  "slt %[count1], $zero, %[qc1] \n\t"
1763  "slt %[t1], $zero, %[qc2] \n\t"
1764  "slt %[count2], $zero, %[qc3] \n\t"
1765  "slt %[t2], $zero, %[qc4] \n\t"
1766  "addu %[count1], %[count1], %[t1] \n\t"
1767  "addu %[count2], %[count2], %[t2] \n\t"
1768 
1769  ".set pop \n\t"
1770 
1771  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1772  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1773  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1774  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
1775  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1776  [t4]"=&r"(t4)
1777  : [in_int]"r"(in_int)
1778  : "memory"
1779  );
1780 
1781  curidx = 8 * qc1;
1782  curidx += qc2;
1783 
1784  curidx2 = 8 * qc3;
1785  curidx2 += qc4;
1786 
1787  curbits += p_bits[curidx];
1788  curbits += upair7_sign_bits[curidx];
1789  vec = &p_codes[curidx*2];
1790 
1791  curbits += p_bits[curidx2];
1792  curbits += upair7_sign_bits[curidx2];
1793  vec2 = &p_codes[curidx2*2];
1794 
1795  __asm__ volatile (
1796  ".set push \n\t"
1797  ".set noreorder \n\t"
1798 
1799  "lwc1 %[di0], 0(%[in_pos]) \n\t"
1800  "lwc1 %[di1], 4(%[in_pos]) \n\t"
1801  "lwc1 %[di2], 8(%[in_pos]) \n\t"
1802  "lwc1 %[di3], 12(%[in_pos]) \n\t"
1803  "abs.s %[di0], %[di0] \n\t"
1804  "abs.s %[di1], %[di1] \n\t"
1805  "abs.s %[di2], %[di2] \n\t"
1806  "abs.s %[di3], %[di3] \n\t"
1807  "lwc1 $f0, 0(%[vec]) \n\t"
1808  "lwc1 $f1, 4(%[vec]) \n\t"
1809  "lwc1 $f2, 0(%[vec2]) \n\t"
1810  "lwc1 $f3, 4(%[vec2]) \n\t"
1811  "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1812  "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1813  "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1814  "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1815 
1816  ".set pop \n\t"
1817 
1818  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1819  [di2]"=&f"(di2), [di3]"=&f"(di3)
1820  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1821  [vec2]"r"(vec2), [IQ]"f"(IQ)
1822  : "$f0", "$f1", "$f2", "$f3",
1823  "memory"
1824  );
1825 
1826  cost += di0 * di0 + di1 * di1
1827  + di2 * di2 + di3 * di3;
1828  }
1829 
1830  if (bits)
1831  *bits = curbits;
1832  return cost * lambda + curbits;
1833 }
1834 
1835 static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1836  PutBitContext *pb, const float *in,
1837  const float *scaled, int size, int scale_idx,
1838  int cb, const float lambda, const float uplim,
1839  int *bits)
1840 {
1841  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1842  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1843  int i;
1844  float cost = 0;
1845  int qc1, qc2, qc3, qc4;
1846  int curbits = 0;
1847 
1848  uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1849  float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1850 
1851  for (i = 0; i < size; i += 4) {
1852  const float *vec, *vec2;
1853  int curidx, curidx2;
1854  int sign1, count1, sign2, count2;
1855  int *in_int = (int *)&in[i];
1856  float *in_pos = (float *)&in[i];
1857  float di0, di1, di2, di3;
1858  int t0, t1, t2, t3, t4;
1859 
1860  qc1 = scaled[i ] * Q34 + 0.4054f;
1861  qc2 = scaled[i+1] * Q34 + 0.4054f;
1862  qc3 = scaled[i+2] * Q34 + 0.4054f;
1863  qc4 = scaled[i+3] * Q34 + 0.4054f;
1864 
1865  __asm__ volatile (
1866  ".set push \n\t"
1867  ".set noreorder \n\t"
1868 
1869  "ori %[t4], $zero, 12 \n\t"
1870  "ori %[sign1], $zero, 0 \n\t"
1871  "ori %[sign2], $zero, 0 \n\t"
1872  "slt %[t0], %[t4], %[qc1] \n\t"
1873  "slt %[t1], %[t4], %[qc2] \n\t"
1874  "slt %[t2], %[t4], %[qc3] \n\t"
1875  "slt %[t3], %[t4], %[qc4] \n\t"
1876  "movn %[qc1], %[t4], %[t0] \n\t"
1877  "movn %[qc2], %[t4], %[t1] \n\t"
1878  "movn %[qc3], %[t4], %[t2] \n\t"
1879  "movn %[qc4], %[t4], %[t3] \n\t"
1880  "lw %[t0], 0(%[in_int]) \n\t"
1881  "lw %[t1], 4(%[in_int]) \n\t"
1882  "lw %[t2], 8(%[in_int]) \n\t"
1883  "lw %[t3], 12(%[in_int]) \n\t"
1884  "slt %[t0], %[t0], $zero \n\t"
1885  "movn %[sign1], %[t0], %[qc1] \n\t"
1886  "slt %[t2], %[t2], $zero \n\t"
1887  "movn %[sign2], %[t2], %[qc3] \n\t"
1888  "slt %[t1], %[t1], $zero \n\t"
1889  "sll %[t0], %[sign1], 1 \n\t"
1890  "or %[t0], %[t0], %[t1] \n\t"
1891  "movn %[sign1], %[t0], %[qc2] \n\t"
1892  "slt %[t3], %[t3], $zero \n\t"
1893  "sll %[t0], %[sign2], 1 \n\t"
1894  "or %[t0], %[t0], %[t3] \n\t"
1895  "movn %[sign2], %[t0], %[qc4] \n\t"
1896  "slt %[count1], $zero, %[qc1] \n\t"
1897  "slt %[t1], $zero, %[qc2] \n\t"
1898  "slt %[count2], $zero, %[qc3] \n\t"
1899  "slt %[t2], $zero, %[qc4] \n\t"
1900  "addu %[count1], %[count1], %[t1] \n\t"
1901  "addu %[count2], %[count2], %[t2] \n\t"
1902 
1903  ".set pop \n\t"
1904 
1905  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1906  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1907  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1908  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
1909  [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
1910  [t4]"=&r"(t4)
1911  : [in_int]"r"(in_int)
1912  : "memory"
1913  );
1914 
1915  curidx = 13 * qc1;
1916  curidx += qc2;
1917 
1918  curidx2 = 13 * qc3;
1919  curidx2 += qc4;
1920 
1921  curbits += p_bits[curidx];
1922  curbits += p_bits[curidx2];
1923  curbits += upair12_sign_bits[curidx];
1924  curbits += upair12_sign_bits[curidx2];
1925  vec = &p_codes[curidx*2];
1926  vec2 = &p_codes[curidx2*2];
1927 
1928  __asm__ volatile (
1929  ".set push \n\t"
1930  ".set noreorder \n\t"
1931 
1932  "lwc1 %[di0], 0(%[in_pos]) \n\t"
1933  "lwc1 %[di1], 4(%[in_pos]) \n\t"
1934  "lwc1 %[di2], 8(%[in_pos]) \n\t"
1935  "lwc1 %[di3], 12(%[in_pos]) \n\t"
1936  "abs.s %[di0], %[di0] \n\t"
1937  "abs.s %[di1], %[di1] \n\t"
1938  "abs.s %[di2], %[di2] \n\t"
1939  "abs.s %[di3], %[di3] \n\t"
1940  "lwc1 $f0, 0(%[vec]) \n\t"
1941  "lwc1 $f1, 4(%[vec]) \n\t"
1942  "lwc1 $f2, 0(%[vec2]) \n\t"
1943  "lwc1 $f3, 4(%[vec2]) \n\t"
1944  "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1945  "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1946  "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1947  "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1948 
1949  ".set pop \n\t"
1950 
1951  : [di0]"=&f"(di0), [di1]"=&f"(di1),
1952  [di2]"=&f"(di2), [di3]"=&f"(di3)
1953  : [in_pos]"r"(in_pos), [vec]"r"(vec),
1954  [vec2]"r"(vec2), [IQ]"f"(IQ)
1955  : "$f0", "$f1", "$f2", "$f3",
1956  "memory"
1957  );
1958 
1959  cost += di0 * di0 + di1 * di1
1960  + di2 * di2 + di3 * di3;
1961  }
1962 
1963  if (bits)
1964  *bits = curbits;
1965  return cost * lambda + curbits;
1966 }
1967 
1968 static float get_band_cost_ESC_mips(struct AACEncContext *s,
1969  PutBitContext *pb, const float *in,
1970  const float *scaled, int size, int scale_idx,
1971  int cb, const float lambda, const float uplim,
1972  int *bits)
1973 {
1974  const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1975  const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1976  const float CLIPPED_ESCAPE = 165140.0f * IQ;
1977  int i;
1978  float cost = 0;
1979  int qc1, qc2, qc3, qc4;
1980  int curbits = 0;
1981 
1982  uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1983  float *p_codes = (float* )ff_aac_codebook_vectors[cb-1];
1984 
1985  for (i = 0; i < size; i += 4) {
1986  const float *vec, *vec2;
1987  int curidx, curidx2;
1988  float t1, t2, t3, t4;
1989  float di1, di2, di3, di4;
1990  int cond0, cond1, cond2, cond3;
1991  int c1, c2, c3, c4;
1992  int t6, t7;
1993 
1994  qc1 = scaled[i ] * Q34 + 0.4054f;
1995  qc2 = scaled[i+1] * Q34 + 0.4054f;
1996  qc3 = scaled[i+2] * Q34 + 0.4054f;
1997  qc4 = scaled[i+3] * Q34 + 0.4054f;
1998 
1999  __asm__ volatile (
2000  ".set push \n\t"
2001  ".set noreorder \n\t"
2002 
2003  "ori %[t6], $zero, 15 \n\t"
2004  "ori %[t7], $zero, 16 \n\t"
2005  "shll_s.w %[c1], %[qc1], 18 \n\t"
2006  "shll_s.w %[c2], %[qc2], 18 \n\t"
2007  "shll_s.w %[c3], %[qc3], 18 \n\t"
2008  "shll_s.w %[c4], %[qc4], 18 \n\t"
2009  "srl %[c1], %[c1], 18 \n\t"
2010  "srl %[c2], %[c2], 18 \n\t"
2011  "srl %[c3], %[c3], 18 \n\t"
2012  "srl %[c4], %[c4], 18 \n\t"
2013  "slt %[cond0], %[t6], %[qc1] \n\t"
2014  "slt %[cond1], %[t6], %[qc2] \n\t"
2015  "slt %[cond2], %[t6], %[qc3] \n\t"
2016  "slt %[cond3], %[t6], %[qc4] \n\t"
2017  "movn %[qc1], %[t7], %[cond0] \n\t"
2018  "movn %[qc2], %[t7], %[cond1] \n\t"
2019  "movn %[qc3], %[t7], %[cond2] \n\t"
2020  "movn %[qc4], %[t7], %[cond3] \n\t"
2021 
2022  ".set pop \n\t"
2023 
2024  : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
2025  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2026  [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2027  [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2028  [c1]"=&r"(c1), [c2]"=&r"(c2),
2029  [c3]"=&r"(c3), [c4]"=&r"(c4),
2030  [t6]"=&r"(t6), [t7]"=&r"(t7)
2031  );
2032 
2033  curidx = 17 * qc1;
2034  curidx += qc2;
2035 
2036  curidx2 = 17 * qc3;
2037  curidx2 += qc4;
2038 
2039  curbits += p_bits[curidx];
2040  curbits += esc_sign_bits[curidx];
2041  vec = &p_codes[curidx*2];
2042 
2043  curbits += p_bits[curidx2];
2044  curbits += esc_sign_bits[curidx2];
2045  vec2 = &p_codes[curidx2*2];
2046 
2047  curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2048  curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2049  curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2050  curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2051 
2052  t1 = fabsf(in[i ]);
2053  t2 = fabsf(in[i+1]);
2054  t3 = fabsf(in[i+2]);
2055  t4 = fabsf(in[i+3]);
2056 
2057  if (cond0) {
2058  if (t1 >= CLIPPED_ESCAPE) {
2059  di1 = t1 - CLIPPED_ESCAPE;
2060  } else {
2061  di1 = t1 - c1 * cbrtf(c1) * IQ;
2062  }
2063  } else
2064  di1 = t1 - vec[0] * IQ;
2065 
2066  if (cond1) {
2067  if (t2 >= CLIPPED_ESCAPE) {
2068  di2 = t2 - CLIPPED_ESCAPE;
2069  } else {
2070  di2 = t2 - c2 * cbrtf(c2) * IQ;
2071  }
2072  } else
2073  di2 = t2 - vec[1] * IQ;
2074 
2075  if (cond2) {
2076  if (t3 >= CLIPPED_ESCAPE) {
2077  di3 = t3 - CLIPPED_ESCAPE;
2078  } else {
2079  di3 = t3 - c3 * cbrtf(c3) * IQ;
2080  }
2081  } else
2082  di3 = t3 - vec2[0] * IQ;
2083 
2084  if (cond3) {
2085  if (t4 >= CLIPPED_ESCAPE) {
2086  di4 = t4 - CLIPPED_ESCAPE;
2087  } else {
2088  di4 = t4 - c4 * cbrtf(c4) * IQ;
2089  }
2090  } else
2091  di4 = t4 - vec2[1]*IQ;
2092 
2093  cost += di1 * di1 + di2 * di2
2094  + di3 * di3 + di4 * di4;
2095  }
2096 
2097  if (bits)
2098  *bits = curbits;
2099  return cost * lambda + curbits;
2100 }
2101 
2102 static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2103  PutBitContext *pb, const float *in,
2104  const float *scaled, int size, int scale_idx,
2105  int cb, const float lambda, const float uplim,
2106  int *bits) = {
2107  get_band_cost_ZERO_mips,
2108  get_band_cost_SQUAD_mips,
2109  get_band_cost_SQUAD_mips,
2110  get_band_cost_UQUAD_mips,
2111  get_band_cost_UQUAD_mips,
2112  get_band_cost_SPAIR_mips,
2113  get_band_cost_SPAIR_mips,
2114  get_band_cost_UPAIR7_mips,
2115  get_band_cost_UPAIR7_mips,
2116  get_band_cost_UPAIR12_mips,
2117  get_band_cost_UPAIR12_mips,
2118  get_band_cost_ESC_mips,
2119 };
2120 
/* Dispatch to the cost estimator registered for codebook cb; all arguments
 * (including cb itself) are forwarded unchanged. */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb, lambda, uplim, bits) \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb, lambda, uplim, bits)
2127 
2128 static float quantize_band_cost(struct AACEncContext *s, const float *in,
2129  const float *scaled, int size, int scale_idx,
2130  int cb, const float lambda, const float uplim,
2131  int *bits)
2132 {
2133  return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
2134 }
2135 
2136 static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2137  AACEncContext *s,
2138  SingleChannelElement *sce,
2139  const float lambda)
2140 {
2141  int start = 0, i, w, w2, g;
2142  int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2143  float dists[128] = { 0 }, uplims[128];
2144  float maxvals[128];
2145  int fflag, minscaler;
2146  int its = 0;
2147  int allz = 0;
2148  float minthr = INFINITY;
2149 
2150  destbits = FFMIN(destbits, 5800);
2151  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2152  for (g = 0; g < sce->ics.num_swb; g++) {
2153  int nz = 0;
2154  float uplim = 0.0f;
2155  for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2156  FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2157  uplim += band->threshold;
2158  if (band->energy <= band->threshold || band->threshold == 0.0f) {
2159  sce->zeroes[(w+w2)*16+g] = 1;
2160  continue;
2161  }
2162  nz = 1;
2163  }
2164  uplims[w*16+g] = uplim *512;
2165  sce->zeroes[w*16+g] = !nz;
2166  if (nz)
2167  minthr = FFMIN(minthr, uplim);
2168  allz |= nz;
2169  }
2170  }
2171  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2172  for (g = 0; g < sce->ics.num_swb; g++) {
2173  if (sce->zeroes[w*16+g]) {
2174  sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2175  continue;
2176  }
2177  sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2178  }
2179  }
2180 
2181  if (!allz)
2182  return;
2183  abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2184 
2185  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2186  start = w*128;
2187  for (g = 0; g < sce->ics.num_swb; g++) {
2188  const float *scaled = s->scoefs + start;
2189  maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2190  start += sce->ics.swb_sizes[g];
2191  }
2192  }
2193 
2194  do {
2195  int tbits, qstep;
2196  minscaler = sce->sf_idx[0];
2197  qstep = its ? 1 : 32;
2198  do {
2199  int prev = -1;
2200  tbits = 0;
2201  fflag = 0;
2202 
2203  if (qstep > 1) {
2204  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2205  start = w*128;
2206  for (g = 0; g < sce->ics.num_swb; g++) {
2207  const float *coefs = sce->coeffs + start;
2208  const float *scaled = s->scoefs + start;
2209  int bits = 0;
2210  int cb;
2211 
2212  if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2213  start += sce->ics.swb_sizes[g];
2214  continue;
2215  }
2216  minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2217  cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2218  for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2219  int b;
2220  bits += quantize_band_cost_bits(s, coefs + w2*128,
2221  scaled + w2*128,
2222  sce->ics.swb_sizes[g],
2223  sce->sf_idx[w*16+g],
2224  cb,
2225  1.0f,
2226  INFINITY,
2227  &b);
2228  }
2229  if (prev != -1) {
2230  bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2231  }
2232  tbits += bits;
2233  start += sce->ics.swb_sizes[g];
2234  prev = sce->sf_idx[w*16+g];
2235  }
2236  }
2237  }
2238  else {
2239  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2240  start = w*128;
2241  for (g = 0; g < sce->ics.num_swb; g++) {
2242  const float *coefs = sce->coeffs + start;
2243  const float *scaled = s->scoefs + start;
2244  int bits = 0;
2245  int cb;
2246  float dist = 0.0f;
2247 
2248  if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2249  start += sce->ics.swb_sizes[g];
2250  continue;
2251  }
2252  minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2253  cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2254  for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2255  int b;
2256  dist += quantize_band_cost(s, coefs + w2*128,
2257  scaled + w2*128,
2258  sce->ics.swb_sizes[g],
2259  sce->sf_idx[w*16+g],
2260  cb,
2261  1.0f,
2262  INFINITY,
2263  &b);
2264  bits += b;
2265  }
2266  dists[w*16+g] = dist - bits;
2267  if (prev != -1) {
2268  bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2269  }
2270  tbits += bits;
2271  start += sce->ics.swb_sizes[g];
2272  prev = sce->sf_idx[w*16+g];
2273  }
2274  }
2275  }
2276  if (tbits > destbits) {
2277  for (i = 0; i < 128; i++)
2278  if (sce->sf_idx[i] < 218 - qstep)
2279  sce->sf_idx[i] += qstep;
2280  } else {
2281  for (i = 0; i < 128; i++)
2282  if (sce->sf_idx[i] > 60 - qstep)
2283  sce->sf_idx[i] -= qstep;
2284  }
2285  qstep >>= 1;
2286  if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2287  qstep = 1;
2288  } while (qstep);
2289 
2290  fflag = 0;
2291  minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
2292  for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2293  for (g = 0; g < sce->ics.num_swb; g++) {
2294  int prevsc = sce->sf_idx[w*16+g];
2295  if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2296  if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2297  sce->sf_idx[w*16+g]--;
2298  else
2299  sce->sf_idx[w*16+g]-=2;
2300  }
2301  sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2302  sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
2303  if (sce->sf_idx[w*16+g] != prevsc)
2304  fflag = 1;
2305  sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2306  }
2307  }
2308  its++;
2309  } while (fflag && its < 10);
2310 }
2311 
2312 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
2313  const float lambda)
2314 {
2315  int start = 0, i, w, w2, g;
2316  float M[128], S[128];
2317  float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2318  SingleChannelElement *sce0 = &cpe->ch[0];
2319  SingleChannelElement *sce1 = &cpe->ch[1];
2320  if (!cpe->common_window)
2321  return;
2322  for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2323  for (g = 0; g < sce0->ics.num_swb; g++) {
2324  if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
2325  float dist1 = 0.0f, dist2 = 0.0f;
2326  for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2327  FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2328  FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2329  float minthr = FFMIN(band0->threshold, band1->threshold);
2330  float maxthr = FFMAX(band0->threshold, band1->threshold);
2331  for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
2332  M[i ] = (sce0->coeffs[start+w2*128+i ]
2333  + sce1->coeffs[start+w2*128+i ]) * 0.5;
2334  M[i+1] = (sce0->coeffs[start+w2*128+i+1]
2335  + sce1->coeffs[start+w2*128+i+1]) * 0.5;
2336  M[i+2] = (sce0->coeffs[start+w2*128+i+2]
2337  + sce1->coeffs[start+w2*128+i+2]) * 0.5;
2338  M[i+3] = (sce0->coeffs[start+w2*128+i+3]
2339  + sce1->coeffs[start+w2*128+i+3]) * 0.5;
2340 
2341  S[i ] = M[i ]
2342  - sce1->coeffs[start+w2*128+i ];
2343  S[i+1] = M[i+1]
2344  - sce1->coeffs[start+w2*128+i+1];
2345  S[i+2] = M[i+2]
2346  - sce1->coeffs[start+w2*128+i+2];
2347  S[i+3] = M[i+3]
2348  - sce1->coeffs[start+w2*128+i+3];
2349  }
2350  abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2351  abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2352  abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
2353  abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
2354  dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
2355  L34,
2356  sce0->ics.swb_sizes[g],
2357  sce0->sf_idx[(w+w2)*16+g],
2358  sce0->band_type[(w+w2)*16+g],
2359  lambda / band0->threshold, INFINITY, NULL);
2360  dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
2361  R34,
2362  sce1->ics.swb_sizes[g],
2363  sce1->sf_idx[(w+w2)*16+g],
2364  sce1->band_type[(w+w2)*16+g],
2365  lambda / band1->threshold, INFINITY, NULL);
2366  dist2 += quantize_band_cost(s, M,
2367  M34,
2368  sce0->ics.swb_sizes[g],
2369  sce0->sf_idx[(w+w2)*16+g],
2370  sce0->band_type[(w+w2)*16+g],
2371  lambda / maxthr, INFINITY, NULL);
2372  dist2 += quantize_band_cost(s, S,
2373  S34,
2374  sce1->ics.swb_sizes[g],
2375  sce1->sf_idx[(w+w2)*16+g],
2376  sce1->band_type[(w+w2)*16+g],
2377  lambda / minthr, INFINITY, NULL);
2378  }
2379  cpe->ms_mask[w*16+g] = dist2 < dist1;
2380  }
2381  start += sce0->ics.swb_sizes[g];
2382  }
2383  }
2384 }
2385 #endif /*HAVE_MIPSFPU */
2386 
2387 static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2388  int win, int group_len, const float lambda)
2389 {
2390  BandCodingPath path[120][12];
2391  int w, swb, cb, start, size;
2392  int i, j;
2393  const int max_sfb = sce->ics.max_sfb;
2394  const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
2395  const int run_esc = (1 << run_bits) - 1;
2396  int idx, ppos, count;
2397  int stackrun[120], stackcb[120], stack_len;
2398  float next_minbits = INFINITY;
2399  int next_mincb = 0;
2400 
2401  abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2402  start = win*128;
2403  for (cb = 0; cb < 12; cb++) {
2404  path[0][cb].cost = run_bits+4;
2405  path[0][cb].prev_idx = -1;
2406  path[0][cb].run = 0;
2407  }
2408  for (swb = 0; swb < max_sfb; swb++) {
2409  size = sce->ics.swb_sizes[swb];
2410  if (sce->zeroes[win*16 + swb]) {
2411  float cost_stay_here = path[swb][0].cost;
2412  float cost_get_here = next_minbits + run_bits + 4;
2413  if ( run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2414  != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2415  cost_stay_here += run_bits;
2416  if (cost_get_here < cost_stay_here) {
2417  path[swb+1][0].prev_idx = next_mincb;
2418  path[swb+1][0].cost = cost_get_here;
2419  path[swb+1][0].run = 1;
2420  } else {
2421  path[swb+1][0].prev_idx = 0;
2422  path[swb+1][0].cost = cost_stay_here;
2423  path[swb+1][0].run = path[swb][0].run + 1;
2424  }
2425  next_minbits = path[swb+1][0].cost;
2426  next_mincb = 0;
2427  for (cb = 1; cb < 12; cb++) {
2428  path[swb+1][cb].cost = 61450;
2429  path[swb+1][cb].prev_idx = -1;
2430  path[swb+1][cb].run = 0;
2431  }
2432  } else {
2433  float minbits = next_minbits;
2434  int mincb = next_mincb;
2435  int startcb = sce->band_type[win*16+swb];
2436  next_minbits = INFINITY;
2437  next_mincb = 0;
2438  for (cb = 0; cb < startcb; cb++) {
2439  path[swb+1][cb].cost = 61450;
2440  path[swb+1][cb].prev_idx = -1;
2441  path[swb+1][cb].run = 0;
2442  }
2443  for (cb = startcb; cb < 12; cb++) {
2444  float cost_stay_here, cost_get_here;
2445  float bits = 0.0f;
2446  for (w = 0; w < group_len; w++) {
2447  bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2448  s->scoefs + start + w*128, size,
2449  sce->sf_idx[(win+w)*16+swb], cb,
2450  0, INFINITY, NULL);
2451  }
2452  cost_stay_here = path[swb][cb].cost + bits;
2453  cost_get_here = minbits + bits + run_bits + 4;
2454  if ( run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2455  != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2456  cost_stay_here += run_bits;
2457  if (cost_get_here < cost_stay_here) {
2458  path[swb+1][cb].prev_idx = mincb;
2459  path[swb+1][cb].cost = cost_get_here;
2460  path[swb+1][cb].run = 1;
2461  } else {
2462  path[swb+1][cb].prev_idx = cb;
2463  path[swb+1][cb].cost = cost_stay_here;
2464  path[swb+1][cb].run = path[swb][cb].run + 1;
2465  }
2466  if (path[swb+1][cb].cost < next_minbits) {
2467  next_minbits = path[swb+1][cb].cost;
2468  next_mincb = cb;
2469  }
2470  }
2471  }
2472  start += sce->ics.swb_sizes[swb];
2473  }
2474 
2475  stack_len = 0;
2476  idx = 0;
2477  for (cb = 1; cb < 12; cb++)
2478  if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2479  idx = cb;
2480  ppos = max_sfb;
2481  while (ppos > 0) {
2482  av_assert1(idx >= 0);
2483  cb = idx;
2484  stackrun[stack_len] = path[ppos][cb].run;
2485  stackcb [stack_len] = cb;
2486  idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
2487  ppos -= path[ppos][cb].run;
2488  stack_len++;
2489  }
2490 
2491  start = 0;
2492  for (i = stack_len - 1; i >= 0; i--) {
2493  put_bits(&s->pb, 4, stackcb[i]);
2494  count = stackrun[i];
2495  memset(sce->zeroes + win*16 + start, !stackcb[i], count);
2496  for (j = 0; j < count; j++) {
2497  sce->band_type[win*16 + start] = stackcb[i];
2498  start++;
2499  }
2500  while (count >= run_esc) {
2501  put_bits(&s->pb, run_bits, run_esc);
2502  count -= run_esc;
2503  }
2504  put_bits(&s->pb, run_bits, count);
2505  }
2506 }
2507 #endif /* HAVE_INLINE_ASM */
2508 
2510 #if HAVE_INLINE_ASM
2511  AACCoefficientsEncoder *e = c->coder;
2512  int option = c->options.aac_coder;
2513 
2514  if (option == 2) {
2515  e->quantize_and_encode_band = quantize_and_encode_band_mips;
2516  e->encode_window_bands_info = codebook_trellis_rate_mips;
2517 #if HAVE_MIPSFPU
2518  e->search_for_quantizers = search_for_quantizers_twoloop_mips;
2519  e->search_for_ms = search_for_ms_mips;
2520 #endif /* HAVE_MIPSFPU */
2521  }
2522 #endif /* HAVE_INLINE_ASM */
2523 }