vc1dsp_mmi.c
/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem_internal.h"

#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "mmiutils.h"

#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \
    "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

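/*
 * Reference note: the TYPE1/TYPE2 macros above vectorize the VC-1 8-point
 * 1-D inverse transform four columns at a time with pmaddhw on packed
 * coefficient pairs.  A scalar sketch of one row pass follows (rounder 4,
 * shift 3; TYPE2 is the column pass with rounder 64, shift 7, and an extra
 * +1 folded into outputs 4..7 via ff_pw_1).  The helper name is ours and
 * the block is illustrative only, kept out of the build:
 */
#if 0
static void vc1_inv8_row_sketch(const int16_t src[8], int16_t dst[8])
{
    int t1 = 12 * (src[0] + src[4]) + 4;
    int t2 = 12 * (src[0] - src[4]) + 4;
    int t3 = 16 * src[2] +  6 * src[6];
    int t4 =  6 * src[2] - 16 * src[6];
    int t5 = t1 + t3, t6 = t2 + t4, t7 = t2 - t4, t8 = t1 - t3;
    int u1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
    int u2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
    int u3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
    int u4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];
    dst[0] = (t5 + u1) >> 3;  dst[7] = (t5 - u1) >> 3;
    dst[1] = (t6 + u2) >> 3;  dst[6] = (t6 - u2) >> 3;
    dst[2] = (t7 + u3) >> 3;  dst[5] = (t7 - u3) >> 3;
    dst[3] = (t8 + u4) >> 3;  dst[4] = (t8 - u4) >> 3;
}
#endif
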
/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;
    union mmi_intfloat64 dc_u;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    dc_u.i = dc;

    __asm__ volatile(
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"
        "li %[count], 0x02 \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu %[count], %[count], -0x01 \n\t"
        PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}
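
/*
 * Scalar equivalent of the DC path above (illustrative sketch with a name
 * of our own, not part of the build): scale the DC coefficient through
 * both transform passes, then add it to all 64 pixels with unsigned
 * saturation, which is what pshufh + paddsh + packushb implement.
 */
#if 0
static void vc1_inv_trans_8x8_dc_sketch(uint8_t *dest, ptrdiff_t linesize, int dc)
{
    dc = (3 * dc +  1) >> 1;  /* row pass */
    dc = (3 * dc + 16) >> 5;  /* column pass */
    for (int y = 0; y < 8; y++, dest += linesize)
        for (int x = 0; x < 8; x++)
            dest[x] = av_clip_uint8(dest[x] + dc);
}
#endif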

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    double ftmp[23];
    uint64_t tmp[1];

    __asm__ volatile (
        /* 1st loop: start */
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t"
        "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t"
        "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_32_1.f), [ff_pw_64]"f"(ff_pw_32_64.f),
          [ff_pw_4]"f"(ff_pw_32_4.f), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif
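
/*
 * Each 32-bit immediate passed to the macros packs two int16 coefficients,
 * low halfword first; mtc1 + punpcklwd replicate the pair across the
 * 64-bit register so pmaddhw multiplies (lo,hi,lo,hi) against interleaved
 * sample pairs.  Sketch (illustrative only, name is ours):
 */
#if 0
static uint32_t pack_coeff_pair(int16_t lo, int16_t hi)
{
    return (uint16_t)lo | ((uint32_t)(uint16_t)hi << 16);
}
/* pack_coeff_pair(12, 16) == 0x0010000c and pack_coeff_pair(12, -16) ==
 * 0xfff0000c, matching the first arguments of the macro invocations. */
#endif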

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    union mmi_intfloat64 dc_u;

    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    dc_u.i = dc;

    __asm__ volatile(
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    int16_t coeff[64] = {12,  16,  16,  15,  12,   9,   6,   4,
                         12,  15,   6,  -4, -12, -16, -16,  -9,
                         12,   9,  -6, -16, -12,   4,  16,  15,
                         12,   4, -16,  -9,  12,  15,  -6, -16,
                         12,  -4, -16,   9,  12, -15,  -6,  16,
                         12,  -9,  -6,  16, -12,  -4,  16, -15,
                         12, -15,   6,   4, -12,  16, -16,   9,
                         12, -16,  16, -15,  12,  -9,   6,  -4};

    // 1st loop
    __asm__ volatile (
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
        "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
        "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
        "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
        "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
        "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        // 1st part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_32_64.f),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif
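
/*
 * The second asm block above is the 4-point column transform.  Scalar
 * sketch of one column (illustrative only, name is ours) with the VC-1
 * 4-point coefficients 17, 22, 10; e.g. the packed pair 0x00160011 is
 * (17, 22):
 */
#if 0
static void vc1_inv4_col_sketch(const int16_t src[4], int dst[4])
{
    int t1 = 17 * (src[0] + src[2]) + 64;
    int t2 = 17 * (src[0] - src[2]) + 64;
    int t3 = 22 * src[1] + 10 * src[3];
    int t4 = 22 * src[3] - 10 * src[1];
    dst[0] = (t1 + t3) >> 7;
    dst[1] = (t2 - t4) >> 7;
    dst[2] = (t2 + t4) >> 7;
    dst[3] = (t1 - t3) >> 7;
}
#endif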

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    union mmi_intfloat64 dc_u;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    dc_u.i = dc;

    __asm__ volatile(
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint64_t count = 8, tmp[1];
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t"
        "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t"
        "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t"
        "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_32_1.f), [ff_pw_64]"f"(ff_pw_32_64.f),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    union mmi_intfloat64 dc_u;

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    dc_u.i = dc;

    __asm__ volatile(
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc_u.f)
        : "memory"
    );
}
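
/*
 * DC scale chains used by the four *_dc kernels above (row pass, then
 * column pass).  The 8-point DC gain is 12 and the 4-point DC gain is 17;
 * the 8-point chains just fold a common factor of 4 into the shift, e.g.
 * (3*dc + 1) >> 1 == (12*dc + 4) >> 3:
 *   8x8: (3*dc + 1) >> 1,    (3*dc + 16) >> 5
 *   8x4: (3*dc + 1) >> 1,   (17*dc + 64) >> 7
 *   4x8: (17*dc + 4) >> 3,  (12*dc + 64) >> 7
 *   4x4: (17*dc + 4) >> 3,  (17*dc + 64) >> 7
 */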

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
    int16_t coeff[16] = {17,  22,  17,  10,
                         17,  10, -17, -22,
                         17, -10, -17,  22,
                         17, -22,  17, -10};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_32_4.f), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_32_64.f),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
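
/*
 * None of these kernels is called directly; they are installed into the
 * VC1DSPContext function-pointer table by the MIPS init code.  A minimal
 * sketch of that wiring, assuming the usual FFmpeg init pattern (the
 * function name here is ours; illustrative only):
 */
#if 0
av_cold void ff_vc1dsp_init_mmi_sketch(VC1DSPContext *dsp)
{
    dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmi;
    dsp->vc1_inv_trans_4x4    = ff_vc1_inv_trans_4x4_mmi;
    dsp->vc1_h_overlap        = ff_vc1_h_overlap_mmi;
    /* ... and so on for the remaining kernels in this file */
}
#endif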
1376 
1377 /* Apply overlap transform to horizontal edge */
1378 void ff_vc1_h_overlap_mmi(uint8_t *src, ptrdiff_t stride)
1379 {
1380  int i;
1381  int a, b, c, d;
1382  int d1, d2;
1383  int rnd = 1;
1384  for (i = 0; i < 8; i++) {
1385  a = src[-2];
1386  b = src[-1];
1387  c = src[0];
1388  d = src[1];
1389  d1 = (a - d + 3 + rnd) >> 3;
1390  d2 = (a - d + b - c + 4 - rnd) >> 3;
1391 
1392  src[-2] = a - d1;
1393  src[-1] = av_clip_uint8(b - d2);
1394  src[0] = av_clip_uint8(c + d2);
1395  src[1] = d + d1;
1396  src += stride;
1397  rnd = !rnd;
1398  }
1399 }
1400 
1401 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags)
1402 {
1403  int i;
1404  int a, b, c, d;
1405  int d1, d2;
1406  int rnd1 = flags & 2 ? 3 : 4;
1407  int rnd2 = 7 - rnd1;
1408  for (i = 0; i < 8; i++) {
1409  a = left[6];
1410  b = left[7];
1411  c = right[0];
1412  d = right[1];
1413  d1 = a - d;
1414  d2 = a - d + b - c;
1415 
1416  left[6] = ((a << 3) - d1 + rnd1) >> 3;
1417  left[7] = ((b << 3) - d2 + rnd2) >> 3;
1418  right[0] = ((c << 3) + d2 + rnd1) >> 3;
1419  right[1] = ((d << 3) + d1 + rnd2) >> 3;
1420 
1421  right += right_stride;
1422  left += left_stride;
1423  if (flags & 1) {
1424  rnd2 = 7 - rnd2;
1425  rnd1 = 7 - rnd1;
1426  }
1427  }
1428 }
1429 
1430 /* Apply overlap transform to vertical edge */
1431 void ff_vc1_v_overlap_mmi(uint8_t *src, ptrdiff_t stride)
1432 {
1433  int i;
1434  int a, b, c, d;
1435  int d1, d2;
1436  int rnd = 1;
1437  for (i = 0; i < 8; i++) {
1438  a = src[-2 * stride];
1439  b = src[-stride];
1440  c = src[0];
1441  d = src[stride];
1442  d1 = (a - d + 3 + rnd) >> 3;
1443  d2 = (a - d + b - c + 4 - rnd) >> 3;
1444 
1445  src[-2 * stride] = a - d1;
1446  src[-stride] = av_clip_uint8(b - d2);
1447  src[0] = av_clip_uint8(c + d2);
1448  src[stride] = d + d1;
1449  src++;
1450  rnd = !rnd;
1451  }
1452 }
1453 
1454 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1455 {
1456  int i;
1457  int a, b, c, d;
1458  int d1, d2;
1459  int rnd1 = 4, rnd2 = 3;
1460  for (i = 0; i < 8; i++) {
1461  a = top[48];
1462  b = top[56];
1463  c = bottom[0];
1464  d = bottom[8];
1465  d1 = a - d;
1466  d2 = a - d + b - c;
1467 
1468  top[48] = ((a << 3) - d1 + rnd1) >> 3;
1469  top[56] = ((b << 3) - d2 + rnd2) >> 3;
1470  bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1471  bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1472 
1473  bottom++;
1474  top++;
1475  rnd2 = 7 - rnd2;
1476  rnd1 = 7 - rnd1;
1477  }
1478 }
1479 
1480 /**
1481  * VC-1 in-loop deblocking filter for one line
1482  * @param src pointer to the pixel just past the edge to filter
1483  * @param stride block stride
1484  * @param pq block quantizer
1485  * @return whether other 3 pairs should be filtered or not
1486  * @see 8.6
1487  */
1488 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1489 {
1490  int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1491  5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1492  int a0_sign = a0 >> 31; /* Store sign */
1493 
1494  a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1495  if (a0 < pq) {
1496  int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1497  5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1498  int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1499  5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1500  if (a1 < a0 || a2 < a0) {
1501  int clip = src[-1 * stride] - src[0 * stride];
1502  int clip_sign = clip >> 31;
1503 
1504  clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1505  if (clip) {
1506  int a3 = FFMIN(a1, a2);
1507  int d = 5 * (a3 - a0);
1508  int d_sign = (d >> 31);
1509 
1510  d = ((d ^ d_sign) - d_sign) >> 3;
1511  d_sign ^= a0_sign;
1512 
1513  if (d_sign ^ clip_sign)
1514  d = 0;
1515  else {
1516  d = FFMIN(d, clip);
1517  d = (d ^ d_sign) - d_sign; /* Restore sign */
1518  src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1519  src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1520  }
1521  return 1;
1522  }
1523  }
1524  }
1525  return 0;
1526 }
1527 
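The a0/a0_sign dance above is a branchless absolute value. A minimal standalone sketch (fold_abs is a hypothetical name; it assumes the usual arithmetic right shift of a negative int, exactly as the code above does):

#include <assert.h>

static int fold_abs(int v)
{
    int s = v >> 31;      /* 0 when v >= 0, all ones when v < 0 */
    return (v ^ s) - s;   /* equivalent to FFABS(v) for 32-bit int */
}

int main(void)
{
    assert(fold_abs(-5) == 5 && fold_abs(7) == 7 && fold_abs(0) == 0);
    return 0;
}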
1528 /**
1529  * VC-1 in-loop deblocking filter
1530  * @param src pointer to the first pixel of the edge to filter
1531  * @param step distance between horizontally adjacent elements
1532  * @param stride distance between vertically adjacent elements
1533  * @param len edge length to filter (4 or 8 pixels)
1534  * @param pq block quantizer
1535  * @see 8.6
1536  */
1537 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1538  int len, int pq)
1539 {
1540  int i;
1541  int filt3;
1542 
1543  for (i = 0; i < len; i += 4) {
1544  filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1545  if (filt3) {
1546  vc1_filter_line(src + 0 * step, stride, pq);
1547  vc1_filter_line(src + 1 * step, stride, pq);
1548  vc1_filter_line(src + 3 * step, stride, pq);
1549  }
1550  src += step * 4;
1551  }
1552 }
1553 
1554 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1555 {
1556  vc1_loop_filter(src, 1, stride, 4, pq);
1557 }
1558 
1559 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1560 {
1561  vc1_loop_filter(src, stride, 1, 4, pq);
1562 }
1563 
1564 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1565 {
1566  vc1_loop_filter(src, 1, stride, 8, pq);
1567 }
1568 
1569 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1570 {
1571  vc1_loop_filter(src, stride, 1, 8, pq);
1572 }
1573 
1574 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1575 {
1576  vc1_loop_filter(src, 1, stride, 16, pq);
1577 }
1578 
1579 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
1580 {
1581  vc1_loop_filter(src, stride, 1, 16, pq);
1582 }
1583 
1584 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1585  ptrdiff_t stride, int rnd)
1586 {
1587  ff_put_pixels8_8_mmi(dst, src, stride, 8);
1588 }
1589 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1590  ptrdiff_t stride, int rnd)
1591 {
1592  ff_put_pixels16_8_mmi(dst, src, stride, 16);
1593 }
1594 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1595  ptrdiff_t stride, int rnd)
1596 {
1597  ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1598 }
1599 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1600  ptrdiff_t stride, int rnd)
1601 {
1602  ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1603 }
1604 
1605 #define OP_PUT(S, D)
1606 #define OP_AVG(S, D) \
1607  "ldc1 $f16, "#S" \n\t" \
1608  "pavgb "#D", "#D", $f16 \n\t"
1609 
1610 /** Add the rounder in $f14 to $f6/$f8 and shift the results right */
1611 #define NORMALIZE_MMI(SHIFT) \
1612  "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1613  "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1614  "psrah $f6, $f6, "SHIFT" \n\t" \
1615  "psrah $f8, $f8, "SHIFT" \n\t"
1616 
1617 #define TRANSFER_DO_PACK(OP) \
1618  "packushb $f6, $f6, $f8 \n\t" \
1619  OP((%[dst]), $f6) \
1620  "sdc1 $f6, 0x00(%[dst]) \n\t"
1621 
1622 #define TRANSFER_DONT_PACK(OP) \
1623  OP(0(%[dst]), $f6) \
1624  OP(8(%[dst]), $f8) \
1625  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1626  "sdc1 $f8, 0x08(%[dst]) \n\t"
1627 
1628 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1629 #define DO_UNPACK(reg) \
1630  "punpcklbh "reg", "reg", $f0 \n\t"
1631 #define DONT_UNPACK(reg)
1632 
1633 /** Compute the rounder 32-r or 8-r and unpack it to $f14 */
1634 #define LOAD_ROUNDER_MMI(ROUND) \
1635  "lwc1 $f14, "ROUND" \n\t" \
1636  "punpcklhw $f14, $f14, $f14 \n\t" \
1637  "punpcklwd $f14, $f14, $f14 \n\t"
1638 
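The lwc1 plus the two unpacks above leave the 16-bit rounder replicated into all four halfword lanes of $f14. A portable model of that splat (splat_h4 is a hypothetical helper; assumes the rounder sits in the low halfword of the loaded word):

#include <stdint.h>
#include <assert.h>

static uint64_t splat_h4(uint16_t r)
{
    uint32_t w = (uint32_t)r | ((uint32_t)r << 16); /* punpcklhw */
    return (uint64_t)w | ((uint64_t)w << 32);       /* punpcklwd */
}

int main(void)
{
    assert(splat_h4(31) == 0x001F001F001F001FULL);
    return 0;
}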
1639 
1640 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1641  "paddh "#R1", "#R1", "#R2" \n\t" \
1642  PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1643  MMI_ULWC1(R0, $9, 0x00) \
1644  "pmullh "#R1", "#R1", $f6 \n\t" \
1645  "punpcklbh "#R0", "#R0", $f0 \n\t" \
1646  PTR_ADDU "$9, %[src], %[stride] \n\t" \
1647  MMI_ULWC1(R3, $9, 0x00) \
1648  "psubh "#R1", "#R1", "#R0" \n\t" \
1649  "punpcklbh "#R3", "#R3", $f0 \n\t" \
1650  "paddh "#R1", "#R1", $f14 \n\t" \
1651  "psubh "#R1", "#R1", "#R3" \n\t" \
1652  "psrah "#R1", "#R1", %[shift] \n\t" \
1653  MMI_SDC1(R1, %[dst], OFF) \
1654  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1655 
1656 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1657 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1658  const uint8_t *src, mips_reg stride,
1659  int rnd, int64_t shift)
1660 {
1661  union mmi_intfloat64 shift_u;
1662  DECLARE_VAR_LOW32;
1663  DECLARE_VAR_ADDRT;
1664  shift_u.i = shift;
1665 
1666  __asm__ volatile(
1667  "pxor $f0, $f0, $f0 \n\t"
1668  "li $8, 0x03 \n\t"
1669  LOAD_ROUNDER_MMI("%[rnd]")
1670  "1: \n\t"
1671  MMI_ULWC1($f4, %[src], 0x00)
1672  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1673  MMI_ULWC1($f6, %[src], 0x00)
1674  "punpcklbh $f4, $f4, $f0 \n\t"
1675  "punpcklbh $f6, $f6, $f0 \n\t"
1676  SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1677  SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1678  SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1679  SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1680  SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1681  SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1682  SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1683  SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1684  PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1685  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1686  "addiu $8, $8, -0x01 \n\t"
1687  "bnez $8, 1b \n\t"
1688  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1689  [src]"+r"(src), [dst]"+r"(dst)
1690  : [stride]"r"(stride), [stride1]"r"(-2*stride),
1691  [shift]"f"(shift_u.f), [rnd]"m"(rnd),
1692  [stride2]"r"(9*stride-4)
1693  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
1694  "$f14", "$f16", "memory"
1695  );
1696 }
1697 
1698 /**
1699  * Data is already unpacked, so some operations can be made directly from
1700  * memory.
1701  */
1702 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1703 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1704  const int16_t *src, int rnd) \
1705 { \
1706  int h = 8; \
1707  DECLARE_VAR_ALL64; \
1708  DECLARE_VAR_ADDRT; \
1709  \
1710  src -= 1; \
1711  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1712  \
1713  __asm__ volatile( \
1714  LOAD_ROUNDER_MMI("%[rnd]") \
1715  "1: \n\t" \
1716  MMI_ULDC1($f2, %[src], 0x00) \
1717  MMI_ULDC1($f4, %[src], 0x08) \
1718  MMI_ULDC1($f6, %[src], 0x02) \
1719  MMI_ULDC1($f8, %[src], 0x0a) \
1720  MMI_ULDC1($f0, %[src], 0x06) \
1721  "paddh $f2, $f2, $f0 \n\t" \
1722  MMI_ULDC1($f0, %[src], 0x0e) \
1723  "paddh $f4, $f4, $f0 \n\t" \
1724  MMI_ULDC1($f0, %[src], 0x04) \
1725  "paddh $f6, $f6, $f0 \n\t" \
1726  MMI_ULDC1($f0, %[src], 0x0c) \
1727  "paddh $f8, $f8, $f0 \n\t" \
1728  "pmullh $f6, $f6, %[ff_pw_9] \n\t" \
1729  "pmullh $f8, $f8, %[ff_pw_9] \n\t" \
1730  "psubh $f6, $f6, $f2 \n\t" \
1731  "psubh $f8, $f8, $f4 \n\t" \
1732  "li $8, 0x07 \n\t" \
1733  "mtc1 $8, $f16 \n\t" \
1734  NORMALIZE_MMI("$f16") \
1735  /* Remove bias */ \
1736  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1737  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1738  TRANSFER_DO_PACK(OP) \
1739  "addiu %[h], %[h], -0x01 \n\t" \
1740  PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1741  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1742  "bnez %[h], 1b \n\t" \
1743  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1744  [h]"+r"(h), \
1745  [src]"+r"(src), [dst]"+r"(dst) \
1746  : [stride]"r"(stride), [rnd]"m"(rnd), \
1747  [ff_pw_9]"f"(ff_pw_9.f), [ff_pw_128]"f"(ff_pw_128.f) \
1748  : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f14", \
1749  "$f16", "memory" \
1750  ); \
1751 }
1752 
1753 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755 
1756 /**
1757  * Purely vertical or horizontal 1/2 shift interpolation.
1758  * Sacrifice $f12 for the *9 factor.
1759  */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1762  mips_reg stride, int rnd, \
1763  mips_reg offset) \
1764 { \
1765  DECLARE_VAR_LOW32; \
1766  DECLARE_VAR_ADDRT; \
1767  \
1768  rnd = 8 - rnd; \
1769  \
1770  __asm__ volatile( \
1771  "pxor $f0, $f0, $f0 \n\t" \
1772  "li $10, 0x08 \n\t" \
1773  LOAD_ROUNDER_MMI("%[rnd]") \
1774  "1: \n\t" \
1775  MMI_ULWC1($f6, %[src], 0x00) \
1776  MMI_ULWC1($f8, %[src], 0x04) \
1777  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1778  MMI_ULWC1($f2, $9, 0x00) \
1779  MMI_ULWC1($f4, $9, 0x04) \
1780  PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1781  "punpcklbh $f6, $f6, $f0 \n\t" \
1782  "punpcklbh $f8, $f8, $f0 \n\t" \
1783  "punpcklbh $f2, $f2, $f0 \n\t" \
1784  "punpcklbh $f4, $f4, $f0 \n\t" \
1785  "paddh $f6, $f6, $f2 \n\t" \
1786  "paddh $f8, $f8, $f4 \n\t" \
1787  PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1788  MMI_ULWC1($f2, $9, 0x00) \
1789  MMI_ULWC1($f4, $9, 0x04) \
1790  "pmullh $f6, $f6, %[ff_pw_9] \n\t" /* 0,9,9,0*/ \
1791  "pmullh $f8, $f8, %[ff_pw_9] \n\t" /* 0,9,9,0*/ \
1792  "punpcklbh $f2, $f2, $f0 \n\t" \
1793  "punpcklbh $f4, $f4, $f0 \n\t" \
1794  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1795  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1796  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1797  MMI_ULWC1($f2, $9, 0x00) \
1798  MMI_ULWC1($f4, $9, 0x04) \
1799  "punpcklbh $f2, $f2, $f0 \n\t" \
1800  "punpcklbh $f4, $f4, $f0 \n\t" \
1801  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1802  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1803  "li $8, 0x04 \n\t" \
1804  "mtc1 $8, $f16 \n\t" \
1805  NORMALIZE_MMI("$f16") \
1806  "packushb $f6, $f6, $f8 \n\t" \
1807  OP((%[dst]), $f6) \
1808  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1809  "addiu $10, $10, -0x01 \n\t" \
1810  PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1811  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1812  "bnez $10, 1b \n\t" \
1813  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1814  [src]"+r"(src), [dst]"+r"(dst) \
1815  : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1816  [stride]"r"(stride), [rnd]"m"(rnd), \
1817  [stride1]"r"(stride-offset), \
1818  [ff_pw_9]"f"(ff_pw_9.f) \
1819  : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1820  "$f14", "$f16", "memory" \
1821  ); \
1822 }
1823 
1824 VC1_SHIFT2(OP_PUT, put_)
1825 VC1_SHIFT2(OP_AVG, avg_)
1826 
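Per sample, the vc1_shift2 functions above evaluate the 4-tap -1,9,9,-1 half-pel kernel normalized by 16. A scalar reference sketch (mspel_half is a hypothetical name; rnd is 0 or 1 and p0..p3 are the four neighboring samples):

static inline int mspel_half(int p0, int p1, int p2, int p3, int rnd)
{
    /* weights -1, 9, 9, -1 sum to 16, hence the >> 4 */
    return (-p0 + 9 * p1 + 9 * p2 - p3 + 8 - rnd) >> 4;
}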
1827 /**
1828  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1829  *
1830  * @param UNPACK Macro unpacking the data from 8 to 16 bits (can be empty).
1831  * @param LOAD "MMI_ULWC1" for packed 8-bit data, or "MMI_ULDC1" if the data read is already unpacked.
1832  * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1833  * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1834  * @param A2 Stride address of 2nd tap
1835  * @param A3 Stride address of 3rd tap
1836  * @param A4 Stride address of 4th tap
1837  */
1838 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1839  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1840  LOAD($f2, $9, M*0) \
1841  LOAD($f4, $9, M*4) \
1842  UNPACK("$f2") \
1843  UNPACK("$f4") \
1844  "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1845  "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1846  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1847  LOAD($f6, $9, M*0) \
1848  LOAD($f8, $9, M*4) \
1849  UNPACK("$f6") \
1850  UNPACK("$f8") \
1851  "pmullh $f6, $f6, %[ff_pw_18] \n\t" /* *18 */ \
1852  "pmullh $f8, $f8, %[ff_pw_18] \n\t" /* *18 */ \
1853  "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1854  "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1855  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1856  LOAD($f2, $9, M*0) \
1857  LOAD($f4, $9, M*4) \
1858  UNPACK("$f2") \
1859  UNPACK("$f4") \
1860  "li $8, 0x02 \n\t" \
1861  "mtc1 $8, $f16 \n\t" \
1862  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1863  "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1864  "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1865  "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1866  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1867  LOAD($f2, $9, M*0) \
1868  LOAD($f4, $9, M*4) \
1869  UNPACK("$f2") \
1870  UNPACK("$f4") \
1871  "pmullh $f2, $f2, %[ff_pw_53] \n\t" /* *53 */ \
1872  "pmullh $f4, $f4, %[ff_pw_53] \n\t" /* *53 */ \
1873  "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1874  "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1875 
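Per sample, the tap combination assembled by MSPEL_FILTER13_CORE is the VC-1 bicubic quarter-pel kernel with weights -4, 53, 18, -3. A scalar sketch for the 8-bit case, where the shift and rounding are applied by the NORMALIZE step in the callers (mspel_quarter is a hypothetical name; p0..p3 are the taps loaded at A4, A3, A2, A1):

static inline int mspel_quarter(int p0, int p1, int p2, int p3, int rnd)
{
    /* weights -4, 53, 18, -3 sum to 64, hence the >> 6 */
    return (-4 * p0 + 53 * p1 + 18 * p2 - 3 * p3 + 32 - rnd) >> 6;
}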
1876 /**
1877  * Macro to build the vertical 16-bit version of vc1_put_shift[13].
1878  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1879  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1880  *
1881  * @param NAME Either 1 or 3
1882  * @see MSPEL_FILTER13_CORE for information on A1->A4
1883  */
1884 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1885 static void \
1886 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1887  mips_reg src_stride, \
1888  int rnd, int64_t shift) \
1889 { \
1890  int h = 8; \
1891  union mmi_intfloat64 shift_u; \
1892  DECLARE_VAR_LOW32; \
1893  DECLARE_VAR_ADDRT; \
1894  shift_u.i = shift; \
1895  \
1896  src -= src_stride; \
1897  \
1898  __asm__ volatile( \
1899  "pxor $f0, $f0, $f0 \n\t" \
1900  LOAD_ROUNDER_MMI("%[rnd]") \
1901  ".p2align 3 \n\t" \
1902  "1: \n\t" \
1903  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1904  NORMALIZE_MMI("%[shift]") \
1905  TRANSFER_DONT_PACK(OP_PUT) \
1906  /* Last 3 (in fact 4) bytes on the line */ \
1907  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1908  MMI_ULWC1($f2, $9, 0x08) \
1909  DO_UNPACK("$f2") \
1910  "mov.d $f6, $f2 \n\t" \
1911  "paddh $f2, $f2, $f2 \n\t" \
1912  "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1913  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1914  MMI_ULWC1($f6, $9, 0x08) \
1915  DO_UNPACK("$f6") \
1916  "pmullh $f6, $f6, %[ff_pw_18] \n\t" /* *18 */ \
1917  "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1918  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1919  MMI_ULWC1($f2, $9, 0x08) \
1920  DO_UNPACK("$f2") \
1921  "pmullh $f2, $f2, %[ff_pw_53] \n\t" /* *53 */ \
1922  "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1923  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1924  MMI_ULWC1($f2, $9, 0x08) \
1925  DO_UNPACK("$f2") \
1926  "li $8, 0x02 \n\t" \
1927  "mtc1 $8, $f16 \n\t" \
1928  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1929  "psubh $f6, $f6, $f2 \n\t" \
1930  "paddh $f6, $f6, $f14 \n\t" \
1931  "li $8, 0x06 \n\t" \
1932  "mtc1 $8, $f16 \n\t" \
1933  "psrah $f6, $f6, $f16 \n\t" \
1934  "sdc1 $f6, 0x10(%[dst]) \n\t" \
1935  "addiu %[h], %[h], -0x01 \n\t" \
1936  PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1937  PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1938  "bnez %[h], 1b \n\t" \
1939  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1940  [h]"+r"(h), \
1941  [src]"+r"(src), [dst]"+r"(dst) \
1942  : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1943  [stride_x3]"r"(3*src_stride), \
1944  [rnd]"m"(rnd), [shift]"f"(shift_u.f), \
1945  [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
1946  [ff_pw_3]"f"(ff_pw_3.f) \
1947  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
1948  "$f14", "$f16", "memory" \
1949  ); \
1950 }
1951 
1952 /**
1953  * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
1954  * Here the source holds 16-bit data, so the parameters passed as A1 to A4 are plain constant offsets.
1955  *
1956  * @param NAME Either 1 or 3
1957  * @see MSPEL_FILTER13_CORE for information on A1->A4
1958  */
1959 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1960 static void \
1961 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1962  const int16_t *src, int rnd) \
1963 { \
1964  int h = 8; \
1965  DECLARE_VAR_ALL64; \
1966  DECLARE_VAR_ADDRT; \
1967  \
1968  src -= 1; \
1969  rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1970  \
1971  __asm__ volatile( \
1972  "pxor $f0, $f0, $f0 \n\t" \
1973  LOAD_ROUNDER_MMI("%[rnd]") \
1974  ".p2align 3 \n\t" \
1975  "1: \n\t" \
1976  MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1977  "li $8, 0x07 \n\t" \
1978  "mtc1 $8, $f16 \n\t" \
1979  NORMALIZE_MMI("$f16") \
1980  /* Remove bias */ \
1981  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1982  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1983  TRANSFER_DO_PACK(OP) \
1984  "addiu %[h], %[h], -0x01 \n\t" \
1985  PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1986  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1987  "bnez %[h], 1b \n\t" \
1988  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1989  [h]"+r"(h), \
1990  [src]"+r"(src), [dst]"+r"(dst) \
1991  : [stride]"r"(stride), [rnd]"m"(rnd), \
1992  [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
1993  [ff_pw_3]"f"(ff_pw_3.f), [ff_pw_128]"f"(ff_pw_128.f) \
1994  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
1995  "$f14", "$f16", "memory" \
1996  ); \
1997 }
1998 
1999 /**
2000  * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
2001  * Here, offset=src_stride. Parameters passed A1 to A4 must use
2002  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2003  *
2004  * @param NAME Either 1 or 3
2005  * @see MSPEL_FILTER13_CORE for information on A1->A4
2006  */
2007 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2008 static void \
2009 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2010  mips_reg stride, int rnd, mips_reg offset) \
2011 { \
2012  int h = 8; \
2013  DECLARE_VAR_LOW32; \
2014  DECLARE_VAR_ADDRT; \
2015  \
2016  src -= offset; \
2017  rnd = 32-rnd; \
2018  \
2019  __asm__ volatile ( \
2020  "pxor $f0, $f0, $f0 \n\t" \
2021  LOAD_ROUNDER_MMI("%[rnd]") \
2022  ".p2align 3 \n\t" \
2023  "1: \n\t" \
2024  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2025  "li $8, 0x06 \n\t" \
2026  "mtc1 $8, $f16 \n\t" \
2027  NORMALIZE_MMI("$f16") \
2028  TRANSFER_DO_PACK(OP) \
2029  "addiu %[h], %[h], -0x01 \n\t" \
2030  PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2031  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2032  "bnez %[h], 1b \n\t" \
2033  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2034  [h]"+r"(h), \
2035  [src]"+r"(src), [dst]"+r"(dst) \
2036  : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2037  [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2038  [rnd]"m"(rnd), \
2039  [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
2040  [ff_pw_3]"f"(ff_pw_3.f) \
2041  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
2042  "$f14", "$f16", "memory" \
2043  ); \
2044 }
2045 
2046 
2047 /** 1/4 shift bicubic interpolation */
2048 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2049 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2050 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2051 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2052 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2053 
2054 /** 3/4 shift bicubic interpolation */
2055 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2056 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2057 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2058 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2059 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2060 
2061 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2062  (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2063  int64_t shift);
2064 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2065  (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2066 typedef void (*vc1_mspel_mc_filter_8bits)
2067  (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2068  mips_reg offset);
2069 
2070 /**
2071  * Interpolate fractional pel values by applying proper vertical then
2072  * horizontal filter.
2073  *
2074  * @param dst Destination buffer for interpolated pels.
2075  * @param src Source buffer.
2076  * @param stride Stride for both src and dst buffers.
2077  * @param hmode Horizontal filter (expressed in quarter pixels shift).
2078  * @param vmode Vertical filter (expressed in quarter pixels shift).
2079  * @param rnd Rounding bias.
2080  */
2081 #define VC1_MSPEL_MC(OP) \
2082 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2083  int hmode, int vmode, int rnd) \
2084 { \
2085  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2086  { NULL, vc1_put_ver_16b_shift1_mmi, \
2087  vc1_put_ver_16b_shift2_mmi, \
2088  vc1_put_ver_16b_shift3_mmi }; \
2089  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2090  { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2091  OP ## vc1_hor_16b_shift2_mmi, \
2092  OP ## vc1_hor_16b_shift3_mmi }; \
2093  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2094  { NULL, OP ## vc1_shift1_mmi, \
2095  OP ## vc1_shift2_mmi, \
2096  OP ## vc1_shift3_mmi }; \
2097  \
2098  if (vmode) { /* Vertical filter to apply */ \
2099  if (hmode) { /* Horizontal filter to apply, output to tmp */ \
2100  static const int shift_value[] = { 0, 5, 1, 5 }; \
2101  int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2102  int r; \
2103  LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2104  \
2105  r = (1<<(shift-1)) + rnd-1; \
2106  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2107  \
2108  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2109  return; \
2110  } \
2111  else { /* No horizontal filter, output 8 lines to dst */ \
2112  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2113  return; \
2114  } \
2115  } \
2116  \
2117  /* Horizontal mode with no vertical mode */ \
2118  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2119 } \
2120 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2121  int stride, int hmode, int vmode, int rnd)\
2122 { \
2123  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2124  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2125  dst += 8*stride; src += 8*stride; \
2126  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2127  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2128 }
2129 
2130 VC1_MSPEL_MC(put_)
2131 VC1_MSPEL_MC(avg_)
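A worked example of the two-pass dispatch above, assuming hmode = 1 and vmode = 2 (values taken from shift_value[] in the macro):

/*
 * shift = (shift_value[1] + shift_value[2]) >> 1 = (5 + 1) >> 1 = 3
 * r     = (1 << (shift - 1)) + rnd - 1           = 4 + rnd - 1
 * pass 1: vc1_put_ver_16b_shift2_mmi(tmp, src - 1, stride, r, 3)
 * pass 2: put_vc1_hor_16b_shift1_mmi(dst, stride, tmp + 1, 64 - rnd)
 */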
2132 
2133 /** Macro to ease bicubic filter interpolation function declarations */
2134 #define DECLARE_FUNCTION(a, b) \
2135 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2136  const uint8_t *src, \
2137  ptrdiff_t stride, \
2138  int rnd) \
2139 { \
2140  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2141 } \
2142 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2143  const uint8_t *src, \
2144  ptrdiff_t stride, \
2145  int rnd) \
2146 { \
2147  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2148 } \
2149 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2150  const uint8_t *src, \
2151  ptrdiff_t stride, \
2152  int rnd) \
2153 { \
2154  put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2155 } \
2156 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2157  const uint8_t *src, \
2158  ptrdiff_t stride, \
2159  int rnd) \
2160 { \
2161  avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2162 }
2163 
2164 DECLARE_FUNCTION(0, 1)
2165 DECLARE_FUNCTION(0, 2)
2166 DECLARE_FUNCTION(0, 3)
2167 
2168 DECLARE_FUNCTION(1, 0)
2169 DECLARE_FUNCTION(1, 1)
2170 DECLARE_FUNCTION(1, 2)
2171 DECLARE_FUNCTION(1, 3)
2172 
2173 DECLARE_FUNCTION(2, 0)
2174 DECLARE_FUNCTION(2, 1)
2175 DECLARE_FUNCTION(2, 2)
2176 DECLARE_FUNCTION(2, 3)
2177 
2178 DECLARE_FUNCTION(3, 0)
2179 DECLARE_FUNCTION(3, 1)
2180 DECLARE_FUNCTION(3, 2)
2181 DECLARE_FUNCTION(3, 3)
2182 
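For instance, DECLARE_FUNCTION(1, 2) expands the put case above to:

void ff_put_vc1_mspel_mc12_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    put_vc1_mspel_mc(dst, src, stride, 1, 2, rnd);
}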
2183 #define CHROMA_MC_8_MMI \
2184  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2185  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2186  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2187  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2188  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2189  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2190  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2191  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2192  \
2193  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2194  "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2195  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2196  "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2197  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2198  "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2199  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2200  "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2201  \
2202  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2203  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2204  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2205  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2206  \
2207  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2208  "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2209  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2210  "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2211  \
2212  "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2213  "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2214  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2215 
2216 
2217 #define CHROMA_MC_4_MMI \
2218  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2219  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2220  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2221  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2222  \
2223  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2224  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2225  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2226  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2227  \
2228  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2229  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2230  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2231  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2232  \
2233  "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2234  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2235 
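Per output pixel, both chroma macros evaluate the no-rounding bilinear blend with the A..D weights set up by the callers below. A scalar sketch (chroma_px is a hypothetical helper; 28 is the no-rounding bias and the >> 6 matches the shift count loaded into the shift register):

#include <stddef.h>
#include <stdint.h>

static uint8_t chroma_px(const uint8_t *s, ptrdiff_t stride, int x, int y)
{
    int A = (8 - x) * (8 - y), B = x * (8 - y);
    int C = (8 - x) * y,       D = x * y;          /* A+B+C+D == 64 */
    return (A * s[0] + B * s[1] +
            C * s[stride] + D * s[stride + 1] + 28) >> 6;
}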
2236 
2237 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2238  const uint8_t *src /* align 1 */,
2239  ptrdiff_t stride, int h, int x, int y)
2240 {
2241  union mmi_intfloat64 A, B, C, D;
2242  double ftmp[10];
2243  uint32_t tmp[1];
2244  DECLARE_VAR_ALL64;
2245  DECLARE_VAR_ADDRT;
2246  A.i = (8 - x) * (8 - y);
2247  B.i = (x) * (8 - y);
2248  C.i = (8 - x) * (y);
2249  D.i = (x) * (y);
2250 
2251  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2252 
2253  __asm__ volatile(
2254  "li %[tmp0], 0x06 \n\t"
2255  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2256  "mtc1 %[tmp0], %[ftmp9] \n\t"
2257  "pshufh %[A], %[A], %[ftmp0] \n\t"
2258  "pshufh %[B], %[B], %[ftmp0] \n\t"
2259  "pshufh %[C], %[C], %[ftmp0] \n\t"
2260  "pshufh %[D], %[D], %[ftmp0] \n\t"
2261 
2262  "1: \n\t"
2263  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2264  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2265  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2266  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2267  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2268 
2269  CHROMA_MC_8_MMI
2270 
2271  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2272  "addiu %[h], %[h], -0x01 \n\t"
2273  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2274  "bnez %[h], 1b \n\t"
2275  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2276  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2277  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2278  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2279  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2280  RESTRICT_ASM_ALL64
2281  RESTRICT_ASM_ADDRT
2282  [tmp0]"=&r"(tmp[0]),
2283  [src]"+&r"(src), [dst]"+&r"(dst),
2284  [h]"+&r"(h)
2285  : [stride]"r"((mips_reg)stride),
2286  [A]"f"(A.f), [B]"f"(B.f),
2287  [C]"f"(C.f), [D]"f"(D.f),
2288  [ff_pw_28]"f"(ff_pw_28.f)
2289  : "memory"
2290  );
2291 }
2292 
2293 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2294  const uint8_t *src /* align 1 */,
2295  ptrdiff_t stride, int h, int x, int y)
2296 {
2297  union mmi_intfloat64 A, B, C, D;
2298  double ftmp[6];
2299  uint32_t tmp[1];
2300  DECLARE_VAR_LOW32;
2301  DECLARE_VAR_ADDRT;
2302  A.i = (8 - x) * (8 - y);
2303  B.i = (x) * (8 - y);
2304  C.i = (8 - x) * (y);
2305  D.i = (x) * (y);
2306 
2307  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2308 
2309  __asm__ volatile(
2310  "li %[tmp0], 0x06 \n\t"
2311  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2312  "mtc1 %[tmp0], %[ftmp5] \n\t"
2313  "pshufh %[A], %[A], %[ftmp0] \n\t"
2314  "pshufh %[B], %[B], %[ftmp0] \n\t"
2315  "pshufh %[C], %[C], %[ftmp0] \n\t"
2316  "pshufh %[D], %[D], %[ftmp0] \n\t"
2317 
2318  "1: \n\t"
2319  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2320  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2321  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2322  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2323  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2324 
2326 
2327  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2328  "addiu %[h], %[h], -0x01 \n\t"
2329  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2330  "bnez %[h], 1b \n\t"
2331  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2332  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2333  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2334  [tmp0]"=&r"(tmp[0]),
2335  RESTRICT_ASM_LOW32
2336  RESTRICT_ASM_ADDRT
2337  [src]"+&r"(src), [dst]"+&r"(dst),
2338  [h]"+&r"(h)
2339  : [stride]"r"((mips_reg)stride),
2340  [A]"f"(A.f), [B]"f"(B.f),
2341  [C]"f"(C.f), [D]"f"(D.f),
2342  [ff_pw_28]"f"(ff_pw_28.f)
2343  : "memory"
2344  );
2345 }
2346 
2347 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2348  const uint8_t *src /* align 1 */,
2349  ptrdiff_t stride, int h, int x, int y)
2350 {
2351  union mmi_intfloat64 A, B, C, D;
2352  double ftmp[10];
2353  uint32_t tmp[1];
2354  DECLARE_VAR_ALL64;
2355  DECLARE_VAR_ADDRT;
2356  A.i = (8 - x) * (8 - y);
2357  B.i = (x) * (8 - y);
2358  C.i = (8 - x) * (y);
2359  D.i = (x) * (y);
2360 
2361  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2362 
2363  __asm__ volatile(
2364  "li %[tmp0], 0x06 \n\t"
2365  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2366  "mtc1 %[tmp0], %[ftmp9] \n\t"
2367  "pshufh %[A], %[A], %[ftmp0] \n\t"
2368  "pshufh %[B], %[B], %[ftmp0] \n\t"
2369  "pshufh %[C], %[C], %[ftmp0] \n\t"
2370  "pshufh %[D], %[D], %[ftmp0] \n\t"
2371 
2372  "1: \n\t"
2373  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2374  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2375  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2376  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2377  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2378 
2380 
2381  MMI_LDC1(%[ftmp2], %[dst], 0x00)
2382  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2383 
2384  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2385  "addiu %[h], %[h], -0x01 \n\t"
2386  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2387  "bnez %[h], 1b \n\t"
2388  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2389  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2390  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2391  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2392  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2393  [tmp0]"=&r"(tmp[0]),
2394  RESTRICT_ASM_ALL64
2395  RESTRICT_ASM_ADDRT
2396  [src]"+&r"(src), [dst]"+&r"(dst),
2397  [h]"+&r"(h)
2398  : [stride]"r"((mips_reg)stride),
2399  [A]"f"(A.f), [B]"f"(B.f),
2400  [C]"f"(C.f), [D]"f"(D.f),
2401  [ff_pw_28]"f"(ff_pw_28.f)
2402  : "memory"
2403  );
2404 }
2405 
2406 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2407  const uint8_t *src /* align 1 */,
2408  ptrdiff_t stride, int h, int x, int y)
2409 {
2410  union mmi_intfloat64 A, B, C, D;
2411  double ftmp[6];
2412  uint32_t tmp[1];
2413  DECLARE_VAR_LOW32;
2414  DECLARE_VAR_ADDRT;
2415  A.i = (8 - x) * (8 - y);
2416  B.i = (x) * (8 - y);
2417  C.i = (8 - x) * (y);
2418  D.i = (x) * (y);
2419 
2420  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2421 
2422  __asm__ volatile(
2423  "li %[tmp0], 0x06 \n\t"
2424  "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2425  "mtc1 %[tmp0], %[ftmp5] \n\t"
2426  "pshufh %[A], %[A], %[ftmp0] \n\t"
2427  "pshufh %[B], %[B], %[ftmp0] \n\t"
2428  "pshufh %[C], %[C], %[ftmp0] \n\t"
2429  "pshufh %[D], %[D], %[ftmp0] \n\t"
2430 
2431  "1: \n\t"
2432  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2433  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2434  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2435  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2436  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2437 
2439 
2440  MMI_LWC1(%[ftmp2], %[dst], 0x00)
2441  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2442 
2443  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2444  "addiu %[h], %[h], -0x01 \n\t"
2445  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2446  "bnez %[h], 1b \n\t"
2447  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2448  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2449  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2450  [tmp0]"=&r"(tmp[0]),
2451  RESTRICT_ASM_LOW32
2452  RESTRICT_ASM_ADDRT
2453  [src]"+&r"(src), [dst]"+&r"(dst),
2454  [h]"+&r"(h)
2455  : [stride]"r"((mips_reg)stride),
2456  [A]"f"(A.f), [B]"f"(B.f),
2457  [C]"f"(C.f), [D]"f"(D.f),
2458  [ff_pw_28]"f"(ff_pw_28.f)
2459  : "memory"
2460  );
2461 }