vc1dsp_mmi.c
1 /*
2  * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
3  *
4  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/avassert.h"
24 #include "libavcodec/vc1dsp.h"
25 #include "constants.h"
26 #include "vc1dsp_mips.h"
27 #include "hpeldsp_mips.h"
28 #include "libavutil/mips/mmiutils.h"
29 
30 
31 #define VC1_INV_TRANCS_8_STEP1_MMI(fp1, fp2, fp3, fp4, \
32  o1, o2, o3, o4, \
33  t1, t2, t3, t4, \
34  ff_p1, ff_p2, ff_p3, ff_p4) \
35  "pmullh "#t1" , "#fp1" , "#ff_p1" \n\t" \
36  "pmullh "#t2" , "#fp2" , "#ff_p2" \n\t" \
37  "pmullh "#t3" , "#fp3" , "#ff_p3" \n\t" \
38  "pmullh "#t4" , "#fp4" , "#ff_p4" \n\t" \
39  "paddh "#o1" , "#t1" , "#t2" \n\t" \
40  "paddh "#o1" , "#o1" , "#t3" \n\t" \
41  "paddh "#o1" , "#o1" , "#t4" \n\t" \
42  \
43  "pmullh "#t1" , "#fp1" , "#ff_p2" \n\t" \
44  "pmullh "#t2" , "#fp2" , "#ff_p4" \n\t" \
45  "pmullh "#t3" , "#fp3" , "#ff_p1" \n\t" \
46  "pmullh "#t4" , "#fp4" , "#ff_p3" \n\t" \
47  "psubh "#o2" , "#t1" , "#t2" \n\t" \
48  "psubh "#o2" , "#o2" , "#t3" \n\t" \
49  "psubh "#o2" , "#o2" , "#t4" \n\t" \
50  \
51  "pmullh "#t1" , "#fp1" , "#ff_p3" \n\t" \
52  "pmullh "#t2" , "#fp2" , "#ff_p1" \n\t" \
53  "pmullh "#t3" , "#fp3" , "#ff_p4" \n\t" \
54  "pmullh "#t4" , "#fp4" , "#ff_p2" \n\t" \
55  "psubh "#o3" , "#t1" , "#t2" \n\t" \
56  "paddh "#o3" , "#o3" , "#t3" \n\t" \
57  "paddh "#o3" , "#o3" , "#t4" \n\t" \
58  \
59  "pmullh "#t1" , "#fp1" , "#ff_p4" \n\t" \
60  "pmullh "#t2" , "#fp2" , "#ff_p3" \n\t" \
61  "pmullh "#t3" , "#fp3" , "#ff_p2" \n\t" \
62  "pmullh "#t4" , "#fp4" , "#ff_p1" \n\t" \
63  "psubh "#o4" , "#t1" , "#t2" \n\t" \
64  "paddh "#o4" , "#o4" , "#t3" \n\t" \
65  "psubh "#o4" , "#o4" , "#t4" \n\t"
66 
67 
68 #define VC1_INV_TRANCS_8_STEP2_MMI(fp1, fp2, fp3, fp4, \
69  fp5, fp6, fp7, fp8, \
70  o1, o2, o3, o4, \
71  ff_p1, ff_p2, ff_p3, ff_pw) \
72  "paddh "#fp5" , "#fp1" , "#fp2" \n\t" \
73  "psubh "#fp6" , "#fp1" , "#fp2" \n\t" \
74  "pmullh "#fp5" , "#fp5" , "#ff_p1" \n\t" \
75  "pmullh "#fp6" , "#fp6" , "#ff_p1" \n\t" \
76  "paddh "#fp5" , "#fp5" , "#ff_pw" \n\t" \
77  "paddh "#fp6" , "#fp6" , "#ff_pw" \n\t" \
78  \
79  "pmullh "#fp1" , "#fp3" , "#ff_p2" \n\t" \
80  "pmullh "#fp2" , "#fp4" , "#ff_p3" \n\t" \
81  "pmullh "#fp3" , "#fp3" , "#ff_p3" \n\t" \
82  "pmullh "#fp4" , "#fp4" , "#ff_p2" \n\t" \
83  "paddh "#fp7" , "#fp1" , "#fp2" \n\t" \
84  "psubh "#fp8" , "#fp3" , "#fp4" \n\t" \
85  \
86  "paddh "#fp1" , "#fp5" , "#fp7" \n\t" \
87  "paddh "#fp2" , "#fp6" , "#fp8" \n\t" \
88  "psubh "#fp3" , "#fp6" , "#fp8" \n\t" \
89  "psubh "#fp4" , "#fp5" , "#fp7" \n\t" \
90  \
91  "paddh "#fp5" , "#fp1" , "#o1" \n\t" \
92  "paddh "#fp6" , "#fp2" , "#o2" \n\t" \
93  "paddh "#fp7" , "#fp3" , "#o3" \n\t" \
94  "paddh "#fp8" , "#fp4" , "#o4" \n\t" \
95  \
96  "psubh "#fp4" , "#fp4" , "#o4" \n\t" \
97  "psubh "#fp3" , "#fp3" , "#o3" \n\t" \
98  "psubh "#fp2" , "#fp2" , "#o2" \n\t" \
99  "psubh "#fp1" , "#fp1" , "#o1" \n\t"
100 
101 
102 #define VC1_INV_TRANCS_4_STEP1_MMI(fp1, fp2, fp3, fp4, \
103  fp5, fp6, fp7, fp8, \
104  ff_p1, ff_p2, ff_p3, ff_pw) \
105  "paddh "#fp5" , "#fp1" , "#fp2" \n\t" \
106  "psubh "#fp6" , "#fp1" , "#fp2" \n\t" \
107  "pmullh "#fp5" , "#fp5" , "#ff_p1" \n\t" \
108  "pmullh "#fp6" , "#fp6" , "#ff_p1" \n\t" \
109  "paddh "#fp5" , "#fp5" , "#ff_pw" \n\t" \
110  "paddh "#fp6" , "#fp6" , "#ff_pw" \n\t" \
111  \
112  "pmullh "#fp1" , "#fp3" , "#ff_p2" \n\t" \
113  "pmullh "#fp2" , "#fp4" , "#ff_p3" \n\t" \
114  "pmullh "#fp3" , "#fp3" , "#ff_p3" \n\t" \
115  "pmullh "#fp4" , "#fp4" , "#ff_p2" \n\t" \
116  "paddh "#fp7" , "#fp1" , "#fp2" \n\t" \
117  "psubh "#fp8" , "#fp3" , "#fp4" \n\t" \
118  \
119  "paddh "#fp1" , "#fp5" , "#fp7" \n\t" \
120  "psubh "#fp2" , "#fp6" , "#fp8" \n\t" \
121  "paddh "#fp3" , "#fp6" , "#fp8" \n\t" \
122  "psubh "#fp4" , "#fp5" , "#fp7" \n\t"
123 
124 
125 #define VC1_INV_TRANCS_4_STEP2_MMI(fp1, fp2, fp3, fp4, \
126  fp5, fp6, fp7, fp8, zero) \
127  "punpcklbh "#fp5" , "#fp5" , "#zero" \n\t" \
128  "punpcklbh "#fp6" , "#fp6" , "#zero" \n\t" \
129  "punpcklbh "#fp7" , "#fp7" , "#zero" \n\t" \
130  "punpcklbh "#fp8" , "#fp8" , "#zero" \n\t" \
131  \
132  "paddh "#fp1" , "#fp1" , "#fp5" \n\t" \
133  "paddh "#fp2" , "#fp2" , "#fp6" \n\t" \
134  "paddh "#fp3" , "#fp3" , "#fp7" \n\t" \
135  "paddh "#fp4" , "#fp4" , "#fp8" \n\t" \
136  \
137  "packushb "#fp1" , "#fp1" , "#zero" \n\t" \
138  "packushb "#fp2" , "#fp2" , "#zero" \n\t" \
139  "packushb "#fp3" , "#fp3" , "#zero" \n\t" \
140  "packushb "#fp4" , "#fp4" , "#zero" \n\t"
141 
142 
143 /* Do inverse transform on 8x8 block */
144 void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
145 {
146  int dc = block[0];
147  double ftmp[9];
148  mips_reg addr[1];
149  int count;
150 
151  dc = (3 * dc + 1) >> 1;
152  dc = (3 * dc + 16) >> 5;
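    /*
     * DC-only shortcut: with just the DC coefficient set, both transform
     * passes collapse to a scaling of dc.  (3 * dc + 1) >> 1 equals
     * (12 * dc + 4) >> 3 (the row pass) and (3 * dc + 16) >> 5 equals
     * (12 * dc + 64) >> 7 (the column pass); the result is then added to
     * every pixel of the 8x8 destination block.
     */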
153 
154  __asm__ volatile(
155  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
156  "pshufh %[dc], %[dc], %[ftmp0] \n\t"
157  "li %[count], 0x02 \n\t"
158 
159  "1: \n\t"
160  MMI_LDC1(%[ftmp1], %[dest], 0x00)
161  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
162  MMI_LDC1(%[ftmp2], %[addr0], 0x00)
163  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
164  MMI_LDC1(%[ftmp3], %[addr0], 0x00)
165  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
166  MMI_LDC1(%[ftmp4], %[addr0], 0x00)
167 
168  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
169  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
170  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
171  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
172  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
173  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
174  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
175  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
176 
177  "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
178  "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
179  "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
180  "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
181  "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
182  "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
183  "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
184  "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"
185 
186  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
187  "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
188  "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
189  "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
190 
191  MMI_SDC1(%[ftmp1], %[dest], 0x00)
192  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
193  MMI_SDC1(%[ftmp2], %[addr0], 0x00)
194  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
195  MMI_SDC1(%[ftmp3], %[addr0], 0x00)
196  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
197  MMI_SDC1(%[ftmp4], %[addr0], 0x00)
198 
199  "addiu %[count], %[count], -0x01 \n\t"
200  PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
201  "bnez %[count], 1b \n\t"
202  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
203  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
204  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
205  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
206  [ftmp8]"=&f"(ftmp[8]),
207  [addr0]"=&r"(addr[0]),
208  [count]"=&r"(count), [dest]"+&r"(dest)
209  : [linesize]"r"((mips_reg)linesize),
210  [dc]"f"(dc)
211  : "memory"
212  );
213 }
214 
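/*
 * The full 8x8 inverse transform below makes two passes over the data:
 * the first loop runs the 8-point transform along one dimension, four
 * coefficients per iteration, with a >>3 rounding shift, transposing the
 * 4x4 sub-blocks as it writes into temp[]; the second loop runs the same
 * 8-point transform along the other dimension with the +64/+1 rounding
 * and a >>7 shift before writing the result back into block[].
 */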
215 #if _MIPS_SIM != _ABIO32
216 void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
217 {
218  DECLARE_ALIGNED(16, int16_t, temp[64]);
219  int16_t *src = block;
220  int16_t *dst = temp;
221  double ftmp[16];
222  uint32_t count, tmp[1];
223 
224  // 1st loop
225  __asm__ volatile (
226  "li %[tmp0], 0x03 \n\t"
227  "mtc1 %[tmp0], %[ftmp0] \n\t"
228  "li %[count], 0x02 \n\t"
229 
230  "1: \n\t"
231  MMI_LDC1(%[ftmp5], %[src], 0x10)
232  MMI_LDC1(%[ftmp6], %[src], 0x30)
233  MMI_LDC1(%[ftmp7], %[src], 0x50)
234  MMI_LDC1(%[ftmp8], %[src], 0x70)
235 
236  VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
237  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
238  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
239  %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
240  %[ff_pw_4])
241 
242  MMI_LDC1(%[ftmp1], %[src], 0x00)
243  MMI_LDC1(%[ftmp2], %[src], 0x40)
244  MMI_LDC1(%[ftmp3], %[src], 0x20)
245  MMI_LDC1(%[ftmp4], %[src], 0x60)
246 
247  VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
248  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
249  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
250  %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
251  %[ff_pw_4])
252 
253 
254  PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
255  %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
256 
257  TRANSPOSE_4H(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
258  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
259  %[ftmp13], %[tmp0], %[ftmp14], %[ftmp15])
260 
261  MMI_SDC1(%[ftmp5], %[dst], 0x00)
262  MMI_SDC1(%[ftmp6], %[dst], 0x10)
263  MMI_SDC1(%[ftmp7], %[dst], 0x20)
264  MMI_SDC1(%[ftmp8], %[dst], 0x30)
265 
266  TRANSPOSE_4H(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
267  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
268  %[ftmp13], %[tmp0], %[ftmp14], %[ftmp15])
269 
270  MMI_SDC1(%[ftmp4], %[dst], 0x08)
271  MMI_SDC1(%[ftmp3], %[dst], 0x18)
272  MMI_SDC1(%[ftmp2], %[dst], 0x28)
273  MMI_SDC1(%[ftmp1], %[dst], 0x38)
274 
275  "addiu %[count], %[count], -0x01 \n\t"
276  PTR_ADDIU "%[src], %[src], 0x08 \n\t"
277  PTR_ADDIU "%[dst], %[dst], 0x40 \n\t"
278  "bnez %[count], 1b \n\t"
279  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
280  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
281  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
282  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
283  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
284  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
285  [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
286  [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
287  [tmp0]"=&r"(tmp[0]),
288  [count]"=&r"(count),
289  [src]"+&r"(src), [dst]"+&r"(dst)
290  : [ff_pw_4]"f"(ff_pw_4), [ff_pw_6]"f"(ff_pw_6),
291  [ff_pw_9]"f"(ff_pw_9), [ff_pw_12]"f"(ff_pw_12),
292  [ff_pw_15]"f"(ff_pw_15), [ff_pw_16]"f"(ff_pw_16)
293  : "memory"
294  );
295 
296  src = temp;
297  dst = block;
298 
299  // 2nd loop
300  __asm__ volatile (
301  "li %[tmp0], 0x07 \n\t"
302  "mtc1 %[tmp0], %[ftmp0] \n\t"
303  "li %[count], 0x02 \n\t"
304 
305  "1: \n\t"
306  MMI_LDC1(%[ftmp5], %[src], 0x10)
307  MMI_LDC1(%[ftmp6], %[src], 0x30)
308  MMI_LDC1(%[ftmp7], %[src], 0x50)
309  MMI_LDC1(%[ftmp8], %[src], 0x70)
310 
311  VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
312  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
313  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
314  %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
315  %[ff_pw_4])
316 
317  MMI_LDC1(%[ftmp1], %[src], 0x00)
318  MMI_LDC1(%[ftmp2], %[src], 0x40)
319  MMI_LDC1(%[ftmp3], %[src], 0x20)
320  MMI_LDC1(%[ftmp4], %[src], 0x60)
321 
322  VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
323  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
324  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
325  %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
326  %[ff_pw_64])
327 
328  "paddh %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
329  "paddh %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
330  "paddh %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
331  "paddh %[ftmp1], %[ftmp1], %[ff_pw_1] \n\t"
332 
333  PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
334  %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
335 
336  MMI_SDC1(%[ftmp5], %[dst], 0x00)
337  MMI_SDC1(%[ftmp6], %[dst], 0x10)
338  MMI_SDC1(%[ftmp7], %[dst], 0x20)
339  MMI_SDC1(%[ftmp8], %[dst], 0x30)
340 
341  MMI_SDC1(%[ftmp4], %[dst], 0x40)
342  MMI_SDC1(%[ftmp3], %[dst], 0x50)
343  MMI_SDC1(%[ftmp2], %[dst], 0x60)
344  MMI_SDC1(%[ftmp1], %[dst], 0x70)
345 
346  "addiu %[count], %[count], -0x01 \n\t"
347  PTR_ADDIU "%[src], %[src], 0x08 \n\t"
348  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
349  "bnez %[count], 1b \n\t"
350  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
351  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
352  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
353  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
354  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
355  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
356  [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
357  [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
358  [tmp0]"=&r"(tmp[0]),
359  [count]"=&r"(count),
360  [src]"+&r"(src), [dst]"+&r"(dst)
361  : [ff_pw_1]"f"(ff_pw_1), [ff_pw_4]"f"(ff_pw_4),
362  [ff_pw_6]"f"(ff_pw_6), [ff_pw_9]"f"(ff_pw_9),
363  [ff_pw_12]"f"(ff_pw_12), [ff_pw_15]"f"(ff_pw_15),
364  [ff_pw_16]"f"(ff_pw_16), [ff_pw_64]"f"(ff_pw_64)
365  : "memory"
366  );
367 }
368 #endif
369 
370 /* Do inverse transform on 8x4 part of block */
371 void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
372 {
373  int dc = block[0];
374  double ftmp[9];
375 
376  dc = ( 3 * dc + 1) >> 1;
377  dc = (17 * dc + 64) >> 7;
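    /*
     * For the 8x4 case the 8-point pass keeps the DC gain of 12 (folded
     * into (3 * dc + 1) >> 1 as above), while the 4-point pass has a DC
     * gain of 17, hence (17 * dc + 64) >> 7.
     */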
378 
379  __asm__ volatile(
380  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
381  "pshufh %[dc], %[dc], %[ftmp0] \n\t"
382 
383  MMI_LDC1(%[ftmp1], %[dest0], 0x00)
384  MMI_LDC1(%[ftmp2], %[dest1], 0x00)
385  MMI_LDC1(%[ftmp3], %[dest2], 0x00)
386  MMI_LDC1(%[ftmp4], %[dest3], 0x00)
387 
388  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
389  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
390  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
391  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
392  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
393  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
394  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
395  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
396 
397  "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
398  "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
399  "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
400  "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
401  "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
402  "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
403  "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
404  "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"
405 
406  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
407  "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
408  "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
409  "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
410 
411  MMI_SDC1(%[ftmp1], %[dest0], 0x00)
412  MMI_SDC1(%[ftmp2], %[dest1], 0x00)
413  MMI_SDC1(%[ftmp3], %[dest2], 0x00)
414  MMI_SDC1(%[ftmp4], %[dest3], 0x00)
415  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
416  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
417  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
418  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
419  [ftmp8]"=&f"(ftmp[8])
420  : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
421  [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
422  [dc]"f"(dc)
423  : "memory"
424  );
425 }
426 
427 #if _MIPS_SIM != _ABIO32
428 void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
429 {
430  int16_t *src = block;
431  int16_t *dst = block;
432  double ftmp[16];
433  uint32_t tmp[1];
434  mips_reg addr[1];
435  DECLARE_VAR_LOW32;
436 
437  // 1st loop
438  __asm__ volatile (
439  MMI_LDC1(%[ftmp1], %[src], 0x00)
440  MMI_LDC1(%[ftmp2], %[src], 0x08)
441  MMI_LDC1(%[ftmp3], %[src], 0x10)
442  MMI_LDC1(%[ftmp4], %[src], 0x18)
443  MMI_LDC1(%[ftmp5], %[src], 0x20)
444  MMI_LDC1(%[ftmp6], %[src], 0x28)
445  MMI_LDC1(%[ftmp7], %[src], 0x30)
446  MMI_LDC1(%[ftmp8], %[src], 0x38)
447 
448  // a1 b1 a3 b2
449  TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp5], %[ftmp7],
450  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
451  %[ftmp13], %[tmp0], %[ftmp14], %[ftmp15])
452 
453  // a2 b3 a4 b4
454  TRANSPOSE_4H(%[ftmp2], %[ftmp4], %[ftmp6], %[ftmp8],
455  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
456  %[ftmp13], %[tmp0], %[ftmp14], %[ftmp15])
457 
458  // input b1 b2 b3 b4
459  VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
460  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
461  %[ftmp0], %[ftmp13], %[ftmp14], %[ftmp15],
462  %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
463  %[ff_pw_4])
464  // input a1 a2 a3 a4
465  VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp5], %[ftmp6],
466  %[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
467  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
468  %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
469  %[ff_pw_4])
470 
471  "li %[tmp0], 0x03 \n\t"
472  "mtc1 %[tmp0], %[ftmp0] \n\t"
473 
474  PSRAH_8_MMI(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
475  %[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1], %[ftmp0])
476 
477  TRANSPOSE_4H(%[ftmp3], %[ftmp7], %[ftmp4], %[ftmp8],
478  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
479  %[ftmp13], %[tmp0], %[ftmp14], %[ftmp15])
480 
481  MMI_SDC1(%[ftmp3], %[dst], 0x00)
482  MMI_SDC1(%[ftmp7], %[dst], 0x10)
483  MMI_SDC1(%[ftmp4], %[dst], 0x20)
484  MMI_SDC1(%[ftmp8], %[dst], 0x30)
485 
486  TRANSPOSE_4H(%[ftmp6], %[ftmp5], %[ftmp2], %[ftmp1],
487  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
488  %[ftmp13], %[tmp0], %[ftmp14], %[ftmp15])
489 
490  MMI_SDC1(%[ftmp6], %[dst], 0x08)
491  MMI_SDC1(%[ftmp5], %[dst], 0x18)
492  MMI_SDC1(%[ftmp2], %[dst], 0x28)
493  MMI_SDC1(%[ftmp1], %[dst], 0x38)
494  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
495  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
496  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
497  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
498  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
499  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
500  [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
501  [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
502  [tmp0]"=&r"(tmp[0])
503  : [src]"r"(src), [dst]"r"(dst),
504  [ff_pw_4]"f"(ff_pw_4), [ff_pw_6]"f"(ff_pw_6),
505  [ff_pw_9]"f"(ff_pw_9), [ff_pw_12]"f"(ff_pw_12),
506  [ff_pw_15]"f"(ff_pw_15), [ff_pw_16]"f"(ff_pw_16)
507  : "memory"
508  );
509 
510  src = block;
511 
512  // 2nd loop
513  __asm__ volatile (
514  "li %[tmp0], 0x07 \n\t"
515  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
516  "mtc1 %[tmp0], %[ftmp9] \n\t"
517 
518  // dest low 32bit
519  MMI_LDC1(%[ftmp1], %[src], 0x00)
520  MMI_LDC1(%[ftmp2], %[src], 0x20)
521  MMI_LDC1(%[ftmp3], %[src], 0x30)
522  MMI_LDC1(%[ftmp4], %[src], 0x10)
523 
524  VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
525  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
526  %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
527  %[ff_pw_64])
528 
529  PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])
530 
531  MMI_LWC1(%[ftmp5], %[dest], 0x00)
532  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
533  MMI_LWC1(%[ftmp6], %[addr0], 0x00)
534  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
535  MMI_LWC1(%[ftmp7], %[addr0], 0x00)
536  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
537  MMI_LWC1(%[ftmp8], %[addr0], 0x00)
538 
539  VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
540  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
541  %[ftmp0])
542 
543  MMI_SWC1(%[ftmp1], %[dest], 0x00)
544  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
545  MMI_SWC1(%[ftmp2], %[addr0], 0x00)
546  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
547  MMI_SWC1(%[ftmp3], %[addr0], 0x00)
548  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
549  MMI_SWC1(%[ftmp4], %[addr0], 0x00)
550 
551  // dest high 32bit
552  MMI_LDC1(%[ftmp1], %[src], 0x08)
553  MMI_LDC1(%[ftmp2], %[src], 0x28)
554  MMI_LDC1(%[ftmp3], %[src], 0x38)
555  MMI_LDC1(%[ftmp4], %[src], 0x18)
556 
557  VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
558  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
559  %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
560  %[ff_pw_64])
561 
562  PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp9])
563 
564  MMI_LWC1(%[ftmp5], %[dest], 0x04)
565  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
566  MMI_LWC1(%[ftmp6], %[addr0], 0x04)
567  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
568  MMI_LWC1(%[ftmp7], %[addr0], 0x04)
569  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
570  MMI_LWC1(%[ftmp8], %[addr0], 0x04)
571 
572  VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
573  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
574  %[ftmp0])
575 
576  MMI_SWC1(%[ftmp1], %[dest], 0x04)
577  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
578  MMI_SWC1(%[ftmp2], %[addr0], 0x04)
579  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
580  MMI_SWC1(%[ftmp3], %[addr0], 0x04)
581  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
582  MMI_SWC1(%[ftmp4], %[addr0], 0x04)
583 
584  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
585  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
586  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
587  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
588  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
589  [tmp0]"=&r"(tmp[0]),
590  RESTRICT_ASM_LOW32
591  [addr0]"=&r"(addr[0])
592  : [src]"r"(src), [dest]"r"(dest),
593  [linesize]"r"((mips_reg)linesize),
594  [ff_pw_17]"f"(ff_pw_17), [ff_pw_22]"f"(ff_pw_22),
595  [ff_pw_10]"f"(ff_pw_10), [ff_pw_64]"f"(ff_pw_64)
596  : "memory"
597  );
598 }
599 #endif
600 
601 /* Do inverse transform on 4x8 parts of block */
602 void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
603 {
604  int dc = block[0];
605  double ftmp[9];
606  DECLARE_VAR_LOW32;
607 
608  dc = (17 * dc + 4) >> 3;
609  dc = (12 * dc + 64) >> 7;
610 
611  __asm__ volatile(
612  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
613  "pshufh %[dc], %[dc], %[ftmp0] \n\t"
614 
615  MMI_LWC1(%[ftmp1], %[dest0], 0x00)
616  MMI_LWC1(%[ftmp2], %[dest1], 0x00)
617  MMI_LWC1(%[ftmp3], %[dest2], 0x00)
618  MMI_LWC1(%[ftmp4], %[dest3], 0x00)
619  MMI_LWC1(%[ftmp5], %[dest4], 0x00)
620  MMI_LWC1(%[ftmp6], %[dest5], 0x00)
621  MMI_LWC1(%[ftmp7], %[dest6], 0x00)
622  MMI_LWC1(%[ftmp8], %[dest7], 0x00)
623 
624  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
625  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
626  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
627  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
628  "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
629  "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
630  "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
631  "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
632 
633  "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
634  "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
635  "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
636  "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
637  "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
638  "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
639  "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
640  "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"
641 
642  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
643  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
644  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
645  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
646  "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
647  "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
648  "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
649  "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
650 
651  MMI_SWC1(%[ftmp1], %[dest0], 0x00)
652  MMI_SWC1(%[ftmp2], %[dest1], 0x00)
653  MMI_SWC1(%[ftmp3], %[dest2], 0x00)
654  MMI_SWC1(%[ftmp4], %[dest3], 0x00)
655  MMI_SWC1(%[ftmp5], %[dest4], 0x00)
656  MMI_SWC1(%[ftmp6], %[dest5], 0x00)
657  MMI_SWC1(%[ftmp7], %[dest6], 0x00)
658  MMI_SWC1(%[ftmp8], %[dest7], 0x00)
659  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
660  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
661  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
662  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
663  RESTRICT_ASM_LOW32
664  [ftmp8]"=&f"(ftmp[8])
665  : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
666  [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
667  [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
668  [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
669  [dc]"f"(dc)
670  : "memory"
671  );
672 }
673 
674 #if _MIPS_SIM != _ABIO32
675 void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
676 {
677  int16_t *src = block;
678  int16_t *dst = block;
679  double ftmp[16];
680  uint32_t count, tmp[1];
681  mips_reg addr[1];
682  DECLARE_VAR_LOW32;
683 
684  // 1st loop
685  __asm__ volatile (
686  "li %[count], 0x02 \n\t"
687  "li %[tmp0], 0x03 \n\t"
688  "mtc1 %[tmp0], %[ftmp0] \n\t"
689 
690  "1: \n\t"
691  MMI_LDC1(%[ftmp1], %[src], 0x00)
692  MMI_LDC1(%[ftmp2], %[src], 0x10)
693  MMI_LDC1(%[ftmp3], %[src], 0x20)
694  MMI_LDC1(%[ftmp4], %[src], 0x30)
695 
696  TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
697  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
698  %[ftmp9], %[tmp0], %[ftmp10], %[ftmp11])
699 
700  // t1 t2 t3 t4
701  VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
702  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
703  %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
704  %[ff_pw_4])
705 
706  PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])
707 
708  TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
709  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
710  %[ftmp9], %[tmp0], %[ftmp10], %[ftmp11])
711 
712  MMI_SDC1(%[ftmp1], %[dst], 0x00)
713  MMI_SDC1(%[ftmp3], %[dst], 0x10)
714  MMI_SDC1(%[ftmp4], %[dst], 0x20)
715  MMI_SDC1(%[ftmp2], %[dst], 0x30)
716 
717  "addiu %[count], %[count], -0x01 \n\t"
718  PTR_ADDIU "%[src], %[src], 0x40 \n\t"
719  PTR_ADDIU "%[dst], %[dst], 0x40 \n\t"
720  "bnez %[count], 1b \n\t"
721  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
722  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
723  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
724  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
725  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
726  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
727  [tmp0]"=&r"(tmp[0]),
728  [count]"=&r"(count),
729  [src]"+&r"(src), [dst]"+&r"(dst)
730  : [ff_pw_17]"f"(ff_pw_17), [ff_pw_10]"f"(ff_pw_10),
731  [ff_pw_22]"f"(ff_pw_22), [ff_pw_4]"f"(ff_pw_4)
732  : "memory"
733  );
734 
735  src = block;
736 
737  // 2nd loop
738  __asm__ volatile (
739  "li %[tmp0], 0x07 \n\t"
740  "mtc1 %[tmp0], %[ftmp0] \n\t"
741 
742  MMI_LDC1(%[ftmp5], %[src], 0x10)
743  MMI_LDC1(%[ftmp6], %[src], 0x30)
744  MMI_LDC1(%[ftmp7], %[src], 0x50)
745  MMI_LDC1(%[ftmp8], %[src], 0x70)
746 
747  VC1_INV_TRANCS_8_STEP1_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
748  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
749  %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
750  %[ff_pw_16], %[ff_pw_15], %[ff_pw_9],
751  %[ff_pw_4])
752 
753  MMI_LDC1(%[ftmp1], %[src], 0x00)
754  MMI_LDC1(%[ftmp2], %[src], 0x40)
755  MMI_LDC1(%[ftmp3], %[src], 0x20)
756  MMI_LDC1(%[ftmp4], %[src], 0x60)
757 
758  VC1_INV_TRANCS_8_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
759  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
760  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
761  %[ff_pw_12], %[ff_pw_16], %[ff_pw_6],
762  %[ff_pw_64])
763 
764  "paddh %[ftmp4], %[ftmp4], %[ff_pw_1] \n\t"
765  "paddh %[ftmp3], %[ftmp3], %[ff_pw_1] \n\t"
766  "paddh %[ftmp2], %[ftmp2], %[ff_pw_1] \n\t"
767  "paddh %[ftmp1], %[ftmp1], %[ff_pw_1] \n\t"
768 
769  PSRAH_8_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
770  %[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1], %[ftmp0])
771 
772  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
773 
774  // dest low
775  MMI_LWC1(%[ftmp9], %[dest], 0x00)
776  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
777  MMI_LWC1(%[ftmp10], %[addr0], 0x00)
778  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
779  MMI_LWC1(%[ftmp11], %[addr0], 0x00)
780  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
781  MMI_LWC1(%[ftmp12], %[addr0], 0x00)
782  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
783 
784  VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
785  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
786  %[ftmp0])
787 
788  // dest high
789  MMI_LWC1(%[ftmp9], %[addr0], 0x00)
790  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
791  MMI_LWC1(%[ftmp10], %[addr0], 0x00)
792  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
793  MMI_LWC1(%[ftmp11], %[addr0], 0x00)
794  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
795  MMI_LWC1(%[ftmp12], %[addr0], 0x00)
796 
797  VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp4], %[ftmp3], %[ftmp2], %[ftmp1],
798  %[ftmp9], %[ftmp10], %[ftmp11], %[ftmp12],
799  %[ftmp0])
800 
801  // dest low
802  MMI_SWC1(%[ftmp5], %[dest], 0x00)
803  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
804  MMI_SWC1(%[ftmp6], %[addr0], 0x00)
805  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
806  MMI_SWC1(%[ftmp7], %[addr0], 0x00)
807  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
808  MMI_SWC1(%[ftmp8], %[addr0], 0x00)
809  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
810 
811  // dest high
812  MMI_SWC1(%[ftmp4], %[addr0], 0x00)
813  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
814  MMI_SWC1(%[ftmp3], %[addr0], 0x00)
815  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
816  MMI_SWC1(%[ftmp2], %[addr0], 0x00)
817  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
818  MMI_SWC1(%[ftmp1], %[addr0], 0x00)
819  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
820  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
821  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
822  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
823  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
824  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
825  [ftmp12]"=&f"(ftmp[12]),
826  [tmp0]"=&r"(tmp[0]),
827  RESTRICT_ASM_LOW32
828  [addr0]"=&r"(addr[0]),
829  [dest]"+&r"(dest)
830  : [src]"r"(src), [linesize]"r"(linesize),
831  [ff_pw_1]"f"(ff_pw_1), [ff_pw_4]"f"(ff_pw_4),
832  [ff_pw_6]"f"(ff_pw_6), [ff_pw_9]"f"(ff_pw_9),
833  [ff_pw_12]"f"(ff_pw_12), [ff_pw_15]"f"(ff_pw_15),
834  [ff_pw_16]"f"(ff_pw_16), [ff_pw_64]"f"(ff_pw_64)
835  : "memory"
836  );
837 }
838 #endif
839 
840 /* Do inverse transform on 4x4 part of block */
841 void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
842 {
843  int dc = block[0];
844  double ftmp[5];
845  DECLARE_VAR_LOW32;
846 
847  dc = (17 * dc + 4) >> 3;
848  dc = (17 * dc + 64) >> 7;
849 
850  __asm__ volatile(
851  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
852  "pshufh %[dc], %[dc], %[ftmp0] \n\t"
853 
854  MMI_LWC1(%[ftmp1], %[dest0], 0x00)
855  MMI_LWC1(%[ftmp2], %[dest1], 0x00)
856  MMI_LWC1(%[ftmp3], %[dest2], 0x00)
857  MMI_LWC1(%[ftmp4], %[dest3], 0x00)
858 
859  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
860  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
861  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
862  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
863 
864  "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
865  "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
866  "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
867  "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
868 
869  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
870  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
871  "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
872  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
873 
874  MMI_SWC1(%[ftmp1], %[dest0], 0x00)
875  MMI_SWC1(%[ftmp2], %[dest1], 0x00)
876  MMI_SWC1(%[ftmp3], %[dest2], 0x00)
877  MMI_SWC1(%[ftmp4], %[dest3], 0x00)
878  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
879  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
880  RESTRICT_ASM_LOW32
881  [ftmp4]"=&f"(ftmp[4])
882  : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
883  [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
884  [dc]"f"(dc)
885  : "memory"
886  );
887 }
888 
889 void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
890 {
891  int16_t *src = block;
892  int16_t *dst = block;
893  double ftmp[12];
894  uint32_t tmp[1];
895  mips_reg addr[1];
896  DECLARE_VAR_LOW32;
897 
898  // 1st loop
899  __asm__ volatile (
900  "li %[tmp0], 0x03 \n\t"
901  "mtc1 %[tmp0], %[ftmp0] \n\t"
902 
903  MMI_LDC1(%[ftmp1], %[src], 0x00)
904  MMI_LDC1(%[ftmp2], %[src], 0x10)
905  MMI_LDC1(%[ftmp3], %[src], 0x20)
906  MMI_LDC1(%[ftmp4], %[src], 0x30)
907 
908  TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
909  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
910  %[ftmp9], %[tmp0], %[ftmp10], %[ftmp11])
911 
912  // t1 t2 t3 t4
913  VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
914  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
915  %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
916  %[ff_pw_4])
917 
918  PSRAH_4_MMI(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2], %[ftmp0])
919 
920  TRANSPOSE_4H(%[ftmp1], %[ftmp3], %[ftmp4], %[ftmp2],
921  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
922  %[ftmp9], %[tmp0], %[ftmp10], %[ftmp11])
923 
924  MMI_SDC1(%[ftmp1], %[dst], 0x00)
925  MMI_SDC1(%[ftmp3], %[dst], 0x10)
926  MMI_SDC1(%[ftmp4], %[dst], 0x20)
927  MMI_SDC1(%[ftmp2], %[dst], 0x30)
928  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
929  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
930  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
931  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
932  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
933  [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
934  [tmp0]"=&r"(tmp[0]),
935  [src]"+&r"(src), [dst]"+&r"(dst)
936  : [ff_pw_17]"f"(ff_pw_17), [ff_pw_10]"f"(ff_pw_10),
937  [ff_pw_22]"f"(ff_pw_22), [ff_pw_4]"f"(ff_pw_4)
938  : "memory"
939  );
940 
941  src = block;
942 
943  // 2nd loop
944  __asm__ volatile (
945  "li %[tmp0], 0x07 \n\t"
946  "mtc1 %[tmp0], %[ftmp0] \n\t"
947 
948  // dest low 32bit
949  MMI_LDC1(%[ftmp1], %[src], 0x00)
950  MMI_LDC1(%[ftmp2], %[src], 0x20)
951  MMI_LDC1(%[ftmp3], %[src], 0x30)
952  MMI_LDC1(%[ftmp4], %[src], 0x10)
953 
954  VC1_INV_TRANCS_4_STEP1_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
955  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
956  %[ff_pw_17], %[ff_pw_10], %[ff_pw_22],
957  %[ff_pw_64])
958 
959  PSRAH_4_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp0])
960 
961  MMI_LWC1(%[ftmp5], %[dest], 0x00)
962  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
963  MMI_LWC1(%[ftmp6], %[addr0], 0x00)
964  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
965  MMI_LWC1(%[ftmp7], %[addr0], 0x00)
966  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
967  MMI_LWC1(%[ftmp8], %[addr0], 0x00)
968 
969  "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
970 
971  VC1_INV_TRANCS_4_STEP2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
972  %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8],
973  %[ftmp9])
974 
975  MMI_SWC1(%[ftmp1], %[dest], 0x00)
976  PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
977  MMI_SWC1(%[ftmp2], %[addr0], 0x00)
978  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
979  MMI_SWC1(%[ftmp3], %[addr0], 0x00)
980  PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
981  MMI_SWC1(%[ftmp4], %[addr0], 0x00)
982  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
983  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
984  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
985  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
986  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
987  [tmp0]"=&r"(tmp[0]),
988  RESTRICT_ASM_LOW32
989  [addr0]"=&r"(addr[0])
990  : [src]"r"(src), [dest]"r"(dest),
991  [linesize]"r"((mips_reg)linesize),
992  [ff_pw_17]"f"(ff_pw_17), [ff_pw_22]"f"(ff_pw_22),
993  [ff_pw_10]"f"(ff_pw_10), [ff_pw_64]"f"(ff_pw_64)
994  : "memory"
995  );
996 }
997 
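/*
 * Overlap smoothing helpers (plain C): ff_vc1_h_overlap_mmi() and
 * ff_vc1_v_overlap_mmi() filter the two reconstructed pixels on each side
 * of a block edge in place, clamping the inner pair, while the _s_ variants
 * below work on 16-bit coefficients before reconstruction and therefore
 * need no clamping.
 */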
998 /* Apply overlap transform to horizontal edge */
999 void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
1000 {
1001  int i;
1002  int a, b, c, d;
1003  int d1, d2;
1004  int rnd = 1;
1005  for (i = 0; i < 8; i++) {
1006  a = src[-2];
1007  b = src[-1];
1008  c = src[0];
1009  d = src[1];
1010  d1 = (a - d + 3 + rnd) >> 3;
1011  d2 = (a - d + b - c + 4 - rnd) >> 3;
1012 
1013  src[-2] = a - d1;
1014  src[-1] = av_clip_uint8(b - d2);
1015  src[0] = av_clip_uint8(c + d2);
1016  src[1] = d + d1;
1017  src += stride;
1018  rnd = !rnd;
1019  }
1020 }
1021 
1022 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right)
1023 {
1024  int i;
1025  int a, b, c, d;
1026  int d1, d2;
1027  int rnd1 = 4, rnd2 = 3;
1028  for (i = 0; i < 8; i++) {
1029  a = left[6];
1030  b = left[7];
1031  c = right[0];
1032  d = right[1];
1033  d1 = a - d;
1034  d2 = a - d + b - c;
1035 
1036  left[6] = ((a << 3) - d1 + rnd1) >> 3;
1037  left[7] = ((b << 3) - d2 + rnd2) >> 3;
1038  right[0] = ((c << 3) + d2 + rnd1) >> 3;
1039  right[1] = ((d << 3) + d1 + rnd2) >> 3;
1040 
1041  right += 8;
1042  left += 8;
1043  rnd2 = 7 - rnd2;
1044  rnd1 = 7 - rnd1;
1045  }
1046 }
1047 
1048 /* Apply overlap transform to vertical edge */
1049 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1050 {
1051  int i;
1052  int a, b, c, d;
1053  int d1, d2;
1054  int rnd = 1;
1055  for (i = 0; i < 8; i++) {
1056  a = src[-2 * stride];
1057  b = src[-stride];
1058  c = src[0];
1059  d = src[stride];
1060  d1 = (a - d + 3 + rnd) >> 3;
1061  d2 = (a - d + b - c + 4 - rnd) >> 3;
1062 
1063  src[-2 * stride] = a - d1;
1064  src[-stride] = av_clip_uint8(b - d2);
1065  src[0] = av_clip_uint8(c + d2);
1066  src[stride] = d + d1;
1067  src++;
1068  rnd = !rnd;
1069  }
1070 }
1071 
1072 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1073 {
1074  int i;
1075  int a, b, c, d;
1076  int d1, d2;
1077  int rnd1 = 4, rnd2 = 3;
1078  for (i = 0; i < 8; i++) {
1079  a = top[48];
1080  b = top[56];
1081  c = bottom[0];
1082  d = bottom[8];
1083  d1 = a - d;
1084  d2 = a - d + b - c;
1085 
1086  top[48] = ((a << 3) - d1 + rnd1) >> 3;
1087  top[56] = ((b << 3) - d2 + rnd2) >> 3;
1088  bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1089  bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1090 
1091  bottom++;
1092  top++;
1093  rnd2 = 7 - rnd2;
1094  rnd1 = 7 - rnd1;
1095  }
1096 }
1097 
1098 /**
1099  * VC-1 in-loop deblocking filter for one line
1100  * @param src pointer to the source block
1101  * @param stride block stride
1102  * @param pq block quantizer
1103  * @return whether other 3 pairs should be filtered or not
1104  * @see 8.6
1105  */
1106 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1107 {
1108  int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1109  5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1110  int a0_sign = a0 >> 31; /* Store sign */
1111 
1112  a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1113  if (a0 < pq) {
1114  int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1115  5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1116  int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1117  5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1118  if (a1 < a0 || a2 < a0) {
1119  int clip = src[-1 * stride] - src[0 * stride];
1120  int clip_sign = clip >> 31;
1121 
1122  clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1123  if (clip) {
1124  int a3 = FFMIN(a1, a2);
1125  int d = 5 * (a3 - a0);
1126  int d_sign = (d >> 31);
1127 
1128  d = ((d ^ d_sign) - d_sign) >> 3;
1129  d_sign ^= a0_sign;
1130 
1131  if (d_sign ^ clip_sign)
1132  d = 0;
1133  else {
1134  d = FFMIN(d, clip);
1135  d = (d ^ d_sign) - d_sign; /* Restore sign */
1136  src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1137  src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1138  }
1139  return 1;
1140  }
1141  }
1142  }
1143  return 0;
1144 }
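/*
 * vc1_filter_line() computes the filter decision for a single pixel pair:
 * a0, a1 and a2 are three overlapping 4-tap edge metrics, the shift-by-31
 * trick is a branchless FFABS()/sign-restore, and when the decision fires
 * the two pixels straddling the edge are pulled towards each other by at
 * most half their difference (the clip value).
 */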
1145 
1146 /**
1147  * VC-1 in-loop deblocking filter
1148  * @param src pointer to the source block
1149  * @param step distance between horizontally adjacent elements
1150  * @param stride distance between vertically adjacent elements
1151  * @param len edge length to filter (4 or 8 pixels)
1152  * @param pq block quantizer
1153  * @see 8.6
1154  */
1155 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1156  int len, int pq)
1157 {
1158  int i;
1159  int filt3;
1160 
1161  for (i = 0; i < len; i += 4) {
1162  filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1163  if (filt3) {
1164  vc1_filter_line(src + 0 * step, stride, pq);
1165  vc1_filter_line(src + 1 * step, stride, pq);
1166  vc1_filter_line(src + 3 * step, stride, pq);
1167  }
1168  src += step * 4;
1169  }
1170 }
1171 
1172 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1173 {
1174  vc1_loop_filter(src, 1, stride, 4, pq);
1175 }
1176 
1177 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1178 {
1179  vc1_loop_filter(src, stride, 1, 4, pq);
1180 }
1181 
1182 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1183 {
1184  vc1_loop_filter(src, 1, stride, 8, pq);
1185 }
1186 
1187 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1188 {
1189  vc1_loop_filter(src, stride, 1, 8, pq);
1190 }
1191 
1192 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1193 {
1194  vc1_loop_filter(src, 1, stride, 16, pq);
1195 }
1196 
1197 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1198 {
1199  vc1_loop_filter(src, stride, 1, 16, pq);
1200 }
1201 
1202 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1203                                ptrdiff_t stride, int rnd)
1204 {
1205  ff_put_pixels8_8_mmi(dst, src, stride, 8);
1206 }
1207 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1208                                   ptrdiff_t stride, int rnd)
1209 {
1210  ff_put_pixels16_8_mmi(dst, src, stride, 16);
1211 }
1212 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1213                                ptrdiff_t stride, int rnd)
1214 {
1215  ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1216 }
1217 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1218                                   ptrdiff_t stride, int rnd)
1219 {
1220  ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1221 }
1222 
1223 #define OP_PUT(S, D)
1224 #define OP_AVG(S, D) \
1225  "ldc1 $f16, "#S" \n\t" \
1226  "pavgb "#D", "#D", $f16 \n\t"
1227 
1228 /** Add the rounder from $f14 to $f6/$f8 and shift both right by SHIFT */
1229 #define NORMALIZE_MMI(SHIFT) \
1230  "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1231  "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1232  "psrah $f6, $f6, "SHIFT" \n\t" \
1233  "psrah $f8, $f8, "SHIFT" \n\t"
1234 
1235 #define TRANSFER_DO_PACK(OP) \
1236  "packushb $f6, $f6, $f8 \n\t" \
1237  OP((%[dst]), $f6) \
1238  "sdc1 $f6, 0x00(%[dst]) \n\t"
1239 
1240 #define TRANSFER_DONT_PACK(OP) \
1241  OP(0(%[dst]), $f6) \
1242  OP(8(%[dst]), $f8) \
1243  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1244  "sdc1 $f8, 0x08(%[dst]) \n\t"
1245 
1246 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1247 #define DO_UNPACK(reg) \
1248  "punpcklbh "reg", "reg", $f0 \n\t"
1249 #define DONT_UNPACK(reg)
1250 
1251 /** Load the rounder 32-r or 8-r and unpack it to all lanes of $f14 */
1252 #define LOAD_ROUNDER_MMI(ROUND) \
1253  "lwc1 $f14, "ROUND" \n\t" \
1254  "punpcklhw $f14, $f14, $f14 \n\t" \
1255  "punpcklwd $f14, $f14, $f14 \n\t"
1256 
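/*
 * LOAD_ROUNDER_MMI() broadcasts the 16-bit rounding constant into all four
 * halfword lanes of $f14 (lwc1 followed by punpcklhw/punpcklwd), so
 * NORMALIZE_MMI() can add it to both accumulators $f6/$f8 before the
 * arithmetic right shift.
 */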
1257 
1258 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1259  "paddh "#R1", "#R1", "#R2" \n\t" \
1260  PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1261  MMI_ULWC1(R0, $9, 0x00) \
1262  "pmullh "#R1", "#R1", $f6 \n\t" \
1263  "punpcklbh "#R0", "#R0", $f0 \n\t" \
1264  PTR_ADDU "$9, %[src], %[stride] \n\t" \
1265  MMI_ULWC1(R3, $9, 0x00) \
1266  "psubh "#R1", "#R1", "#R0" \n\t" \
1267  "punpcklbh "#R3", "#R3", $f0 \n\t" \
1268  "paddh "#R1", "#R1", $f14 \n\t" \
1269  "psubh "#R1", "#R1", "#R3" \n\t" \
1270  "psrah "#R1", "#R1", %[shift] \n\t" \
1271  MMI_SDC1(R1, %[dst], OFF) \
1272  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1273 
1274 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1275 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1276  const uint8_t *src, mips_reg stride,
1277  int rnd, int64_t shift)
1278 {
1279  DECLARE_VAR_LOW32;
1280  DECLARE_VAR_ADDRT;
1281 
1282  __asm__ volatile(
1283  "xor $f0, $f0, $f0 \n\t"
1284  "li $8, 0x03 \n\t"
1285  LOAD_ROUNDER_MMI("%[rnd]")
1286  "ldc1 $f12, %[ff_pw_9] \n\t"
1287  "1: \n\t"
1288  MMI_ULWC1($f4, %[src], 0x00)
1289  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1290  MMI_ULWC1($f6, %[src], 0x00)
1291  "punpcklbh $f4, $f4, $f0 \n\t"
1292  "punpcklbh $f6, $f6, $f0 \n\t"
1293  SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1294  SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1295  SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1296  SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1297  SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1298  SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1299  SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1300  SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1301  PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1302  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1303  "addiu $8, $8, -0x01 \n\t"
1304  "bnez $8, 1b \n\t"
1305  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1306  [src]"+r"(src), [dst]"+r"(dst)
1307  : [stride]"r"(stride), [stride1]"r"(-2*stride),
1308  [shift]"f"(shift), [rnd]"m"(rnd),
1309  [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
1310  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1311  "$f14", "$f16", "memory"
1312  );
1313 }
1314 
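/*
 * In vc1_put_ver_16b_shift2_mmi() above, each SHIFT2_LINE(OFF, R0, R1, R2,
 * R3) emits one row of the vertical (-1, 9, 9, -1) filter: R1/R2 hold the
 * two centre rows, the outer taps are loaded into R0 and R3, and the result
 * overwrites R1 before being stored at dst+OFF; successive calls rotate the
 * register roles so the loads overlap with the arithmetic.
 */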
1315 /**
1316  * Data is already unpacked, so some operations can directly be made from
1317  * memory.
1318  */
1319 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1320 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1321  const int16_t *src, int rnd) \
1322 { \
1323  int h = 8; \
1324  DECLARE_VAR_ALL64; \
1325  DECLARE_VAR_ADDRT; \
1326  \
1327  src -= 1; \
1328  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1329  \
1330  __asm__ volatile( \
1331  LOAD_ROUNDER_MMI("%[rnd]") \
1332  "ldc1 $f12, %[ff_pw_128] \n\t" \
1333  "ldc1 $f10, %[ff_pw_9] \n\t" \
1334  "1: \n\t" \
1335  MMI_ULDC1($f2, %[src], 0x00) \
1336  MMI_ULDC1($f4, %[src], 0x08) \
1337  MMI_ULDC1($f6, %[src], 0x02) \
1338  MMI_ULDC1($f8, %[src], 0x0a) \
1339  MMI_ULDC1($f0, %[src], 0x06) \
1340  "paddh $f2, $f2, $f0 \n\t" \
1341  MMI_ULDC1($f0, %[src], 0x0e) \
1342  "paddh $f4, $f4, $f0 \n\t" \
1343  MMI_ULDC1($f0, %[src], 0x04) \
1344  "paddh $f6, $f6, $f0 \n\t" \
1345  MMI_ULDC1($f0, %[src], 0x0b) \
1346  "paddh $f8, $f8, $f0 \n\t" \
1347  "pmullh $f6, $f6, $f10 \n\t" \
1348  "pmullh $f8, $f8, $f10 \n\t" \
1349  "psubh $f6, $f6, $f2 \n\t" \
1350  "psubh $f8, $f8, $f4 \n\t" \
1351  "li $8, 0x07 \n\t" \
1352  "mtc1 $8, $f16 \n\t" \
1353  NORMALIZE_MMI("$f16") \
1354  /* Remove bias */ \
1355  "paddh $f6, $f6, $f12 \n\t" \
1356  "paddh $f8, $f8, $f12 \n\t" \
1357  TRANSFER_DO_PACK(OP) \
1358  "addiu %[h], %[h], -0x01 \n\t" \
1359  PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1360  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1361  "bnez %[h], 1b \n\t" \
1362  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1363  [h]"+r"(h), \
1364  [src]"+r"(src), [dst]"+r"(dst) \
1365  : [stride]"r"(stride), [rnd]"m"(rnd), \
1366  [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \
1367  : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
1368  "$f16", "memory" \
1369  ); \
1370 }
1371 
1372 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1373 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1374 
1375 /**
1376  * Purely vertical or horizontal 1/2 shift interpolation.
1377  * Sacrifice $f12 for the *9 factor.
1378  */
1379 #define VC1_SHIFT2(OP, OPNAME)\
1380 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1381  mips_reg stride, int rnd, \
1382  mips_reg offset) \
1383 { \
1384  DECLARE_VAR_LOW32; \
1385  DECLARE_VAR_ADDRT; \
1386  \
1387  rnd = 8 - rnd; \
1388  \
1389  __asm__ volatile( \
1390  "xor $f0, $f0, $f0 \n\t" \
1391  "li $10, 0x08 \n\t" \
1392  LOAD_ROUNDER_MMI("%[rnd]") \
1393  "ldc1 $f12, %[ff_pw_9] \n\t" \
1394  "1: \n\t" \
1395  MMI_ULWC1($f6, %[src], 0x00) \
1396  MMI_ULWC1($f8, %[src], 0x04) \
1397  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1398  MMI_ULWC1($f2, $9, 0x00) \
1399  MMI_ULWC1($f4, $9, 0x04) \
1400  PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1401  "punpcklbh $f6, $f6, $f0 \n\t" \
1402  "punpcklbh $f8, $f8, $f0 \n\t" \
1403  "punpcklbh $f2, $f2, $f0 \n\t" \
1404  "punpcklbh $f4, $f4, $f0 \n\t" \
1405  "paddh $f6, $f6, $f2 \n\t" \
1406  "paddh $f8, $f8, $f4 \n\t" \
1407  PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1408  MMI_ULWC1($f2, $9, 0x00) \
1409  MMI_ULWC1($f4, $9, 0x04) \
1410  "pmullh $f6, $f6, $f12 \n\t" /* 0,9,9,0*/ \
1411  "pmullh $f8, $f8, $f12 \n\t" /* 0,9,9,0*/ \
1412  "punpcklbh $f2, $f2, $f0 \n\t" \
1413  "punpcklbh $f4, $f4, $f0 \n\t" \
1414  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1415  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1416  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1417  MMI_ULWC1($f2, $9, 0x00) \
1418  MMI_ULWC1($f4, $9, 0x04) \
1419  "punpcklbh $f2, $f2, $f0 \n\t" \
1420  "punpcklbh $f4, $f4, $f0 \n\t" \
1421  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1422  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1423  "li $8, 0x04 \n\t" \
1424  "mtc1 $8, $f16 \n\t" \
1425  NORMALIZE_MMI("$f16") \
1426  "packushb $f6, $f6, $f8 \n\t" \
1427  OP((%[dst]), $f6) \
1428  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1429  "addiu $10, $10, -0x01 \n\t" \
1430  PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1431  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1432  "bnez $10, 1b \n\t" \
1433  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1434  [src]"+r"(src), [dst]"+r"(dst) \
1435  : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1436  [stride]"g"(stride), [rnd]"m"(rnd), \
1437  [stride1]"g"(stride-offset), \
1438  [ff_pw_9]"m"(ff_pw_9) \
1439  : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1440  "$f12", "$f14", "$f16", "memory" \
1441  ); \
1442 }
1443 
1444 VC1_SHIFT2(OP_PUT, put_)
1445 VC1_SHIFT2(OP_AVG, avg_)
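/*
 * The same VC1_SHIFT2 body serves both directions: callers pass offset = 1
 * for a horizontal 1/2-pel shift and offset = stride for a vertical one, so
 * the (-1, 9, 9, -1) taps end up at src - offset, src, src + offset and
 * src + 2*offset.
 */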
1446 
1447 /**
1448  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1449  *
1450  * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
1451  * @param LOAD "MMI_ULWC1" or "MMI_ULDC1", if data read is already unpacked.
1452  * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1453  * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1454  * @param A2 Stride address of 2nd tap
1455  * @param A3 Stride address of 3rd tap
1456  * @param A4 Stride address of 4th tap
1457  */
1458 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1459  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1460  LOAD($f2, $9, M*0) \
1461  LOAD($f4, $9, M*4) \
1462  UNPACK("$f2") \
1463  UNPACK("$f4") \
1464  "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1465  "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1466  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1467  LOAD($f6, $9, M*0) \
1468  LOAD($f8, $9, M*4) \
1469  UNPACK("$f6") \
1470  UNPACK("$f8") \
1471  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1472  "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
1473  "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1474  "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1475  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1476  LOAD($f2, $9, M*0) \
1477  LOAD($f4, $9, M*4) \
1478  UNPACK("$f2") \
1479  UNPACK("$f4") \
1480  "li $8, 0x02 \n\t" \
1481  "mtc1 $8, $f16 \n\t" \
1482  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1483  "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1484  "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1485  "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1486  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1487  LOAD($f2, $9, M*0) \
1488  LOAD($f4, $9, M*4) \
1489  UNPACK("$f2") \
1490  UNPACK("$f4") \
1491  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1492  "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
1493  "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1494  "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1495 
1496 /**
1497  * Macro to build the vertical 16bits version of vc1_put_shift[13].
1498  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1499  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1500  *
1501  * @param NAME Either 1 or 3
1502  * @see MSPEL_FILTER13_CORE for information on A1->A4
1503  */
1504 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1505 static void \
1506 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1507  mips_reg src_stride, \
1508  int rnd, int64_t shift) \
1509 { \
1510  int h = 8; \
1511  DECLARE_VAR_LOW32; \
1512  DECLARE_VAR_ADDRT; \
1513  \
1514  src -= src_stride; \
1515  \
1516  __asm__ volatile( \
1517  "xor $f0, $f0, $f0 \n\t" \
1518  LOAD_ROUNDER_MMI("%[rnd]") \
1519  "ldc1 $f10, %[ff_pw_53] \n\t" \
1520  "ldc1 $f12, %[ff_pw_18] \n\t" \
1521  ".p2align 3 \n\t" \
1522  "1: \n\t" \
1523  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1524  NORMALIZE_MMI("%[shift]") \
1525  TRANSFER_DONT_PACK(OP_PUT) \
1526  /* Last 3 (in fact 4) bytes on the line */ \
1527  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1528  MMI_ULWC1($f2, $9, 0x08) \
1529  DO_UNPACK("$f2") \
1530  "mov.d $f6, $f2 \n\t" \
1531  "paddh $f2, $f2, $f2 \n\t" \
1532  "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1533  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1534  MMI_ULWC1($f6, $9, 0x08) \
1535  DO_UNPACK("$f6") \
1536  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1537  "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1538  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1539  MMI_ULWC1($f2, $9, 0x08) \
1540  DO_UNPACK("$f2") \
1541  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1542  "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1543  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1544  MMI_ULWC1($f2, $9, 0x08) \
1545  DO_UNPACK("$f2") \
1546  "li $8, 0x02 \n\t" \
1547  "mtc1 $8, $f16 \n\t" \
1548  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1549  "psubh $f6, $f6, $f2 \n\t" \
1550  "paddh $f6, $f6, $f14 \n\t" \
1551  "li $8, 0x06 \n\t" \
1552  "mtc1 $8, $f16 \n\t" \
1553  "psrah $f6, $f6, $f16 \n\t" \
1554  "sdc1 $f6, 0x10(%[dst]) \n\t" \
1555  "addiu %[h], %[h], -0x01 \n\t" \
1556  PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1557  PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1558  "bnez %[h], 1b \n\t" \
1559  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1560  [h]"+r"(h), \
1561  [src]"+r"(src), [dst]"+r"(dst) \
1562  : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1563  [stride_x3]"r"(3*src_stride), \
1564  [rnd]"m"(rnd), [shift]"f"(shift), \
1565  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1566  [ff_pw_3]"f"(ff_pw_3) \
1567  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1568  "$f14", "$f16", "memory" \
1569  ); \
1570 }
1571 
1572 /**
1573  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1574  * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
1575  *
1576  * @param NAME Either 1 or 3
1577  * @see MSPEL_FILTER13_CORE for information on A1->A4
1578  */
1579 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1580 static void \
1581 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1582  const int16_t *src, int rnd) \
1583 { \
1584  int h = 8; \
1585  DECLARE_VAR_ALL64; \
1586  DECLARE_VAR_ADDRT; \
1587  \
1588  src -= 1; \
1589  rnd -= (-4+53+18-3)*256; /* Add -256 bias */ \
1590  \
1591  __asm__ volatile( \
1592  "xor $f0, $f0, $f0 \n\t" \
1593  LOAD_ROUNDER_MMI("%[rnd]") \
1594  "ldc1 $f10, %[ff_pw_53] \n\t" \
1595  "ldc1 $f12, %[ff_pw_18] \n\t" \
1596  ".p2align 3 \n\t" \
1597  "1: \n\t" \
1598  MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1599  "li $8, 0x07 \n\t" \
1600  "mtc1 $8, $f16 \n\t" \
1601  NORMALIZE_MMI("$f16") \
1602  /* Remove bias */ \
1603  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1604  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1605  TRANSFER_DO_PACK(OP) \
1606  "addiu %[h], %[h], -0x01 \n\t" \
1607  PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1608  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1609  "bnez %[h], 1b \n\t" \
1610  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1611  [h]"+r"(h), \
1612  [src]"+r"(src), [dst]"+r"(dst) \
1613  : [stride]"r"(stride), [rnd]"m"(rnd), \
1614  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1615  [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
1616  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1617  "$f14", "$f16", "memory" \
1618  ); \
1619 }
1620 
1621 /**
1622  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
1623  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1624  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
1625  *
1626  * @param NAME Either 1 or 3
1627  * @see MSPEL_FILTER13_CORE for information on A1->A4
1628  */
1629 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1630 static void \
1631 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
1632  mips_reg stride, int rnd, mips_reg offset) \
1633 { \
1634  int h = 8; \
1635  DECLARE_VAR_LOW32; \
1636  DECLARE_VAR_ADDRT; \
1637  \
1638  src -= offset; \
1639  rnd = 32-rnd; \
1640  \
1641  __asm__ volatile ( \
1642  "xor $f0, $f0, $f0 \n\t" \
1643  LOAD_ROUNDER_MMI("%[rnd]") \
1644  "ldc1 $f10, %[ff_pw_53] \n\t" \
1645  "ldc1 $f12, %[ff_pw_18] \n\t" \
1646  ".p2align 3 \n\t" \
1647  "1: \n\t" \
1648  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1649  "li $8, 0x06 \n\t" \
1650  "mtc1 $8, $f16 \n\t" \
1651  NORMALIZE_MMI("$f16") \
1652  TRANSFER_DO_PACK(OP) \
1653  "addiu %[h], %[h], -0x01 \n\t" \
1654  PTR_ADDU "%[src], %[src], %[stride] \n\t" \
1655  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1656  "bnez %[h], 1b \n\t" \
1657  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1658  [h]"+r"(h), \
1659  [src]"+r"(src), [dst]"+r"(dst) \
1660  : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
1661  [offset_x3]"r"(3*offset), [stride]"g"(stride), \
1662  [rnd]"m"(rnd), \
1663  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1664  [ff_pw_3]"f"(ff_pw_3) \
1665  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1666  "$f14", "$f16", "memory" \
1667  ); \
1668 }
1669 
1670 
1671 /** 1/4 shift bicubic interpolation */
1672 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
1673 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
1674 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
1675 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
1676 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
1677 
1678 /** 3/4 shift bicubic interpolation */
1679 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
1680 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
1681 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
1682 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
1683 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
1684 
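The shift1 (1/4-pel) and shift3 (3/4-pel) instantiations above differ only in the order the four source samples are fed to the core, which mirrors the taps. The sketch below shows the implied weighting, based on the generic vc1dsp.c reference rather than on this file; the helper name is hypothetical.

    /* Mirrored tap orders selected by the instantiations above (sketch):
     *   shift1 (1/4-pel): -4, 53, 18, -3
     *   shift3 (3/4-pel): -3, 18, 53, -4 */
    static inline int vc1_shift3_taps_sketch(const uint8_t *p, int d)
    {
        return -3 * p[-d] + 18 * p[0] + 53 * p[d] - 4 * p[2 * d];
    }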
1685 typedef void (*vc1_mspel_mc_filter_ver_16bits)
1686  (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
1687  int64_t shift);
1688 typedef void (*vc1_mspel_mc_filter_hor_16bits)
1689  (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
1690 typedef void (*vc1_mspel_mc_filter_8bits)
1691  (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
1692  mips_reg offset);
1693 
1694 /**
1695  * Interpolate fractional pel values by applying proper vertical then
1696  * horizontal filter.
1697  *
1698  * @param dst Destination buffer for interpolated pels.
1699  * @param src Source buffer.
1700  * @param stride Stride for both src and dst buffers.
1701  * @param hmode Horizontal filter (expressed in quarter pixels shift).
1702  * @param vmode Vertical filter.
1703  * @param rnd Rounding bias.
1704  */
1705 #define VC1_MSPEL_MC(OP) \
1706 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
1707  int hmode, int vmode, int rnd) \
1708 { \
1709  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
1710  { NULL, vc1_put_ver_16b_shift1_mmi, \
1711  vc1_put_ver_16b_shift2_mmi, \
1712  vc1_put_ver_16b_shift3_mmi }; \
1713  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
1714  { NULL, OP ## vc1_hor_16b_shift1_mmi, \
1715  OP ## vc1_hor_16b_shift2_mmi, \
1716  OP ## vc1_hor_16b_shift3_mmi }; \
1717  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
1718  { NULL, OP ## vc1_shift1_mmi, \
1719  OP ## vc1_shift2_mmi, \
1720  OP ## vc1_shift3_mmi }; \
1721  \
1722  if (vmode) { /* Vertical filter to apply */ \
1723  if (hmode) { /* Horizontal filter to apply, output to tmp */ \
1724  static const int shift_value[] = { 0, 5, 1, 5 }; \
1725  int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
1726  int r; \
1727  LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
1728  \
1729  r = (1<<(shift-1)) + rnd-1; \
1730  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
1731  \
1732  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
1733  return; \
1734  } \
1735  else { /* No horizontal filter, output 8 lines to dst */ \
1736  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
1737  return; \
1738  } \
1739  } \
1740  \
1741  /* Horizontal mode with no vertical mode */ \
1742  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
1743 } \
1744 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
1745  int stride, int hmode, int vmode, int rnd)\
1746 { \
1747  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
1748  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
1749  dst += 8*stride; src += 8*stride; \
1750  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
1751  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
1752 }
1753 
1754 VC1_MSPEL_MC(put_)
1755 VC1_MSPEL_MC(avg_)
1756 
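A usage sketch of the static dispatchers instantiated just above: hmode and vmode are the horizontal and vertical quarter-pel shifts and index the filter tables directly. The call below is purely illustrative; dst, src, stride and rnd stand for caller-provided values.

    /* hmode = 2 (half-pel horizontally), vmode = 1 (quarter-pel vertically):
     * vc1_put_ver_16b_shift1_mmi fills the 16-bit scratch buffer, then
     * put_vc1_hor_16b_shift2_mmi filters it horizontally and packs to dst. */
    put_vc1_mspel_mc(dst, src, stride, 2, 1, rnd);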
1757 /** Macro to ease declaring the bicubic filter interpolation functions */
1758 #define DECLARE_FUNCTION(a, b) \
1759 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
1760  const uint8_t *src, \
1761  ptrdiff_t stride, \
1762  int rnd) \
1763 { \
1764  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
1765 } \
1766 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
1767  const uint8_t *src, \
1768  ptrdiff_t stride, \
1769  int rnd) \
1770 { \
1771  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
1772 } \
1773 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
1774  const uint8_t *src, \
1775  ptrdiff_t stride, \
1776  int rnd) \
1777 { \
1778  put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
1779 } \
1780 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
1781  const uint8_t *src, \
1782  ptrdiff_t stride, \
1783  int rnd) \
1784 { \
1785  avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
1786 }
1787 
1788 DECLARE_FUNCTION(0, 1)
1789 DECLARE_FUNCTION(0, 2)
1790 DECLARE_FUNCTION(0, 3)
1791 
1792 DECLARE_FUNCTION(1, 0)
1793 DECLARE_FUNCTION(1, 1)
1794 DECLARE_FUNCTION(1, 2)
1795 DECLARE_FUNCTION(1, 3)
1796 
1797 DECLARE_FUNCTION(2, 0)
1798 DECLARE_FUNCTION(2, 1)
1799 DECLARE_FUNCTION(2, 2)
1800 DECLARE_FUNCTION(2, 3)
1801 
1802 DECLARE_FUNCTION(3, 0)
1803 DECLARE_FUNCTION(3, 1)
1804 DECLARE_FUNCTION(3, 2)
1805 DECLARE_FUNCTION(3, 3)
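As a concrete illustration, the DECLARE_FUNCTION(2, 3) instantiation above expands roughly to the wrapper below (sketch only; the macro also emits the avg_ and *_16_mmi counterparts):

    /* Rough expansion sketch of DECLARE_FUNCTION(2, 3). */
    void ff_put_vc1_mspel_mc23_mmi(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
    {
        put_vc1_mspel_mc(dst, src, stride, 2, 3, rnd);
    }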
1806 
1807 #define CHROMA_MC_8_MMI \
1808  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
1809  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
1810  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
1811  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
1812  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
1813  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
1814  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
1815  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
1816  \
1817  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
1818  "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
1819  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
1820  "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
1821  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
1822  "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
1823  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
1824  "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
1825  \
1826  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
1827  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
1828  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
1829  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
1830  \
1831  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
1832  "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
1833  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
1834  "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
1835  \
1836  "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
1837  "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
1838  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1839 
1840 
1841 #define CHROMA_MC_4_MMI \
1842  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
1843  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
1844  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
1845  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
1846  \
1847  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
1848  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
1849  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
1850  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
1851  \
1852  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
1853  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
1854  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
1855  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
1856  \
1857  "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
1858  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1859 
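Both chroma kernels above implement the usual "no rounding" bilinear chroma interpolation; CHROMA_MC_8_MMI handles eight pixels per row (low and high halves separately), CHROMA_MC_4_MMI four. A scalar sketch of one output pixel, using the A/B/C/D weights computed by the callers below; the helper name is hypothetical and the code is illustrative only.

    /* ff_pw_28 and the psrlh-by-6 in the macros above correspond to the
     * +28 and >>6 here; A+B+C+D == 64, so no clipping is needed. */
    static inline uint8_t chroma_no_rnd_pixel_sketch(const uint8_t *s, int stride,
                                                     int A, int B, int C, int D)
    {
        return (A * s[0] + B * s[1] + C * s[stride] + D * s[stride + 1] + 28) >> 6;
    }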
1860 
1861 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1862  uint8_t *src /* align 1 */,
1863  int stride, int h, int x, int y)
1864 {
1865  const int A = (8 - x) * (8 - y);
1866  const int B = (x) * (8 - y);
1867  const int C = (8 - x) * (y);
1868  const int D = (x) * (y);
1869  double ftmp[10];
1870  uint32_t tmp[1];
1871  DECLARE_VAR_ALL64;
1872  DECLARE_VAR_ADDRT;
1873 
1874  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1875 
1876  __asm__ volatile(
1877  "li %[tmp0], 0x06 \n\t"
1878  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1879  "mtc1 %[tmp0], %[ftmp9] \n\t"
1880  "pshufh %[A], %[A], %[ftmp0] \n\t"
1881  "pshufh %[B], %[B], %[ftmp0] \n\t"
1882  "pshufh %[C], %[C], %[ftmp0] \n\t"
1883  "pshufh %[D], %[D], %[ftmp0] \n\t"
1884 
1885  "1: \n\t"
1886  MMI_ULDC1(%[ftmp1], %[src], 0x00)
1887  MMI_ULDC1(%[ftmp2], %[src], 0x01)
1888  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1889  MMI_ULDC1(%[ftmp3], %[src], 0x00)
1890  MMI_ULDC1(%[ftmp4], %[src], 0x01)
1891 
1892  CHROMA_MC_8_MMI
1893 
1894  MMI_SDC1(%[ftmp1], %[dst], 0x00)
1895  "addiu %[h], %[h], -0x01 \n\t"
1896  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
1897  "bnez %[h], 1b \n\t"
1898  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1899  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1900  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1901  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1902  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1903  RESTRICT_ASM_ALL64
1904  RESTRICT_ASM_ADDRT
1905  [tmp0]"=&r"(tmp[0]),
1906  [src]"+&r"(src), [dst]"+&r"(dst),
1907  [h]"+&r"(h)
1908  : [stride]"r"((mips_reg)stride),
1909  [A]"f"(A), [B]"f"(B),
1910  [C]"f"(C), [D]"f"(D),
1911  [ff_pw_28]"f"(ff_pw_28)
1912  : "memory"
1913  );
1914 }
1915 
1916 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
1917  uint8_t *src /* align 1 */,
1918  int stride, int h, int x, int y)
1919 {
1920  const int A = (8 - x) * (8 - y);
1921  const int B = (x) * (8 - y);
1922  const int C = (8 - x) * (y);
1923  const int D = (x) * (y);
1924  double ftmp[6];
1925  uint32_t tmp[1];
1926  DECLARE_VAR_LOW32;
1927  DECLARE_VAR_ADDRT;
1928 
1929  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1930 
1931  __asm__ volatile(
1932  "li %[tmp0], 0x06 \n\t"
1933  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1934  "mtc1 %[tmp0], %[ftmp5] \n\t"
1935  "pshufh %[A], %[A], %[ftmp0] \n\t"
1936  "pshufh %[B], %[B], %[ftmp0] \n\t"
1937  "pshufh %[C], %[C], %[ftmp0] \n\t"
1938  "pshufh %[D], %[D], %[ftmp0] \n\t"
1939 
1940  "1: \n\t"
1941  MMI_ULWC1(%[ftmp1], %[src], 0x00)
1942  MMI_ULWC1(%[ftmp2], %[src], 0x01)
1943  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1944  MMI_ULWC1(%[ftmp3], %[src], 0x00)
1945  MMI_ULWC1(%[ftmp4], %[src], 0x01)
1946 
1947  CHROMA_MC_4_MMI
1948 
1949  MMI_SWC1(%[ftmp1], %[dst], 0x00)
1950  "addiu %[h], %[h], -0x01 \n\t"
1951  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
1952  "bnez %[h], 1b \n\t"
1953  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1954  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1955  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1956  [tmp0]"=&r"(tmp[0]),
1957  RESTRICT_ASM_LOW32
1958  RESTRICT_ASM_ADDRT
1959  [src]"+&r"(src), [dst]"+&r"(dst),
1960  [h]"+&r"(h)
1961  : [stride]"r"((mips_reg)stride),
1962  [A]"f"(A), [B]"f"(B),
1963  [C]"f"(C), [D]"f"(D),
1964  [ff_pw_28]"f"(ff_pw_28)
1965  : "memory"
1966  );
1967 }
1968 
1969 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
1970  uint8_t *src /* align 1 */,
1971  int stride, int h, int x, int y)
1972 {
1973  const int A = (8 - x) * (8 - y);
1974  const int B = (x) * (8 - y);
1975  const int C = (8 - x) * (y);
1976  const int D = (x) * (y);
1977  double ftmp[10];
1978  uint32_t tmp[1];
1979  DECLARE_VAR_ALL64;
1980  DECLARE_VAR_ADDRT;
1981 
1982  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1983 
1984  __asm__ volatile(
1985  "li %[tmp0], 0x06 \n\t"
1986  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1987  "mtc1 %[tmp0], %[ftmp9] \n\t"
1988  "pshufh %[A], %[A], %[ftmp0] \n\t"
1989  "pshufh %[B], %[B], %[ftmp0] \n\t"
1990  "pshufh %[C], %[C], %[ftmp0] \n\t"
1991  "pshufh %[D], %[D], %[ftmp0] \n\t"
1992 
1993  "1: \n\t"
1994  MMI_ULDC1(%[ftmp1], %[src], 0x00)
1995  MMI_ULDC1(%[ftmp2], %[src], 0x01)
1996  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1997  MMI_ULDC1(%[ftmp3], %[src], 0x00)
1998  MMI_ULDC1(%[ftmp4], %[src], 0x01)
1999 
2000  CHROMA_MC_8_MMI
2001 
2002  MMI_LDC1(%[ftmp2], %[dst], 0x00)
2003  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2004 
2005  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2006  "addiu %[h], %[h], -0x01 \n\t"
2007  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2008  "bnez %[h], 1b \n\t"
2009  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2010  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2011  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2012  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2013  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2014  [tmp0]"=&r"(tmp[0]),
2015  RESTRICT_ASM_ALL64
2016  RESTRICT_ASM_ADDRT
2017  [src]"+&r"(src), [dst]"+&r"(dst),
2018  [h]"+&r"(h)
2019  : [stride]"r"((mips_reg)stride),
2020  [A]"f"(A), [B]"f"(B),
2021  [C]"f"(C), [D]"f"(D),
2022  [ff_pw_28]"f"(ff_pw_28)
2023  : "memory"
2024  );
2025 }
2026 
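The avg_ chroma variants (the 8-pixel one above and the 4-pixel one below) differ from the put_ versions only in the store path: after the bilinear result is computed, pavgb folds it into the bytes already in dst with upward rounding. A scalar sketch of that final step, with a hypothetical helper name, illustrative only:

    /* Scalar model of the pavgb step used by the avg_ chroma functions:
     * average with the existing destination, rounding up. */
    static void avg_store_row_sketch(uint8_t *dst, const uint8_t *interp, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = (dst[i] + interp[i] + 1) >> 1;
    }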
2027 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2028  uint8_t *src /* align 1 */,
2029  int stride, int h, int x, int y)
2030 {
2031  const int A = (8 - x) * (8 - y);
2032  const int B = ( x) * (8 - y);
2033  const int C = (8 - x) * ( y);
2034  const int D = ( x) * ( y);
2035  double ftmp[6];
2036  uint32_t tmp[1];
2037  DECLARE_VAR_LOW32;
2038  DECLARE_VAR_ADDRT;
2039 
2040  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2041 
2042  __asm__ volatile(
2043  "li %[tmp0], 0x06 \n\t"
2044  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2045  "mtc1 %[tmp0], %[ftmp5] \n\t"
2046  "pshufh %[A], %[A], %[ftmp0] \n\t"
2047  "pshufh %[B], %[B], %[ftmp0] \n\t"
2048  "pshufh %[C], %[C], %[ftmp0] \n\t"
2049  "pshufh %[D], %[D], %[ftmp0] \n\t"
2050 
2051  "1: \n\t"
2052  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2053  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2054  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2055  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2056  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2057 
2058  CHROMA_MC_4_MMI
2059 
2060  MMI_LWC1(%[ftmp2], %[dst], 0x00)
2061  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2062 
2063  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2064  "addiu %[h], %[h], -0x01 \n\t"
2065  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2066  "bnez %[h], 1b \n\t"
2067  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2068  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2069  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2070  [tmp0]"=&r"(tmp[0]),
2071  RESTRICT_ASM_LOW32
2072  RESTRICT_ASM_ADDRT
2073  [src]"+&r"(src), [dst]"+&r"(dst),
2074  [h]"+&r"(h)
2075  : [stride]"r"((mips_reg)stride),
2076  [A]"f"(A), [B]"f"(B),
2077  [C]"f"(C), [D]"f"(D),
2078  [ff_pw_28]"f"(ff_pw_28)
2079  : "memory"
2080  );
2081 }