/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \
    "li %[tmp0], "#r1" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r2" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
    "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
    "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
    "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
    \
    "li %[tmp0], "#r3" \n\t" \
    "mtc1 %[tmp0], %[ftmp13] \n\t" \
    "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
    "li %[tmp0], "#r4" \n\t" \
    "mtc1 %[tmp0], %[ftmp14] \n\t" \
    "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
    "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
    "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
    "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
    "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
    \
    "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
    "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \
    "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
    "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \
    "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
    "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
    "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
    "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
    "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
    "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
    "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
    "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
    "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
    "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
    "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
    "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"

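/*
 * Rough reference (comment only, not compiled; this is an editor's reading
 * of the packed immediates, e.g. 0x0010000c = 16 and 12 in the two
 * halves): each macro above evaluates one half-row of the VC-1 8-point
 * inverse transform with pmaddhw dot products, adds the rounder c0,
 * butterflies the even/odd sums, shifts by the count held in ftmp0, and
 * interleaves the results into o1/o2.  TYPE1 is the row pass (c0 =
 * ff_pw_4, shift 3); TYPE2 is the column pass (c0 = ff_pw_64, shift 7),
 * where the extra c1 = ff_pw_1 rounder is applied only to the subtracted
 * (psubw) outputs, matching the "+ 1" on rows 4..7 of the scalar 8x8
 * transform in libavcodec/vc1dsp.c.
 */
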
/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"
        "li %[count], 0x02 \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu %[count], %[count], -0x01 \n\t"
        PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}
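
/*
 * Scalar sketch (comment only, not compiled) of the DC-only path above:
 * both transform passes collapse to one scaling of the single DC
 * coefficient, after which the block is a saturated add:
 *
 *     dc = (3 * dc +  1) >> 1;
 *     dc = (3 * dc + 16) >> 5;
 *     for (i = 0; i < 8; i++, dest += linesize)
 *         for (j = 0; j < 8; j++)
 *             dest[j] = av_clip_uint8(dest[j] + dc);
 *
 * The asm does the same eight pixels at a time, with a pshufh-splatted
 * dc and packushb providing the unsigned clip.
 */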

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local)  = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local)  = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local) = {0x0000004000000040ULL};
    double ftmp[23];
    uint64_t tmp[1];

    __asm__ volatile (
        /* 1st loop: start */
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t"
        "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t"
        "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}
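
/*
 * Note on the DC scaling used by the _dc functions in this file: each
 * transform pass collapses to one multiply by its DC coefficient, 12 for
 * the 8-point pass and 17 for the 4-point one, with the rounding folded
 * in.  So 8x8 uses (3*dc+1)>>1 then (3*dc+16)>>5 (both reduced from
 * 12/8 and 12/128), this 8x4 case uses the reduced 8-point row pass
 * followed by the 4-point column pass (17*dc+64)>>7, and the 4x8/4x4
 * cases below start from the 4-point row pass (17*dc+4)>>3.
 */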

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    int16_t coeff[64] = {12, 16, 16, 15, 12,  9,  6,  4,
                         12, 15,  6, -4,-12,-16,-16, -9,
                         12,  9, -6,-16,-12,  4, 16, 15,
                         12,  4,-16, -9, 12, 15, -6,-16,
                         12, -4,-16,  9, 12,-15, -6, 16,
                         12, -9, -6, 16,-12, -4, 16,-15,
                         12,-15,  6,  4,-12, 16,-16,  9,
                         12,-16, 16,-15, 12, -9,  6, -4};
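    /* Each row of coeff[] is one basis row of the 8-point VC-1 inverse
     * transform (the same constants the packed immediates of the macros
     * above encode); pmaddhw below forms the dot products row by row. */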

    // 1st loop
    __asm__ volatile (
        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        "1: \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
        "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
        "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
        "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
        "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
        "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
        "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
        "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
        "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
        "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
        "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
        "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
        "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
        "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
        "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        // 1st part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        // 2nd part
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x08)
        MMI_LDC1(%[ftmp2], %[src], 0x18)
        MMI_LDC1(%[ftmp3], %[src], 0x28)
        MMI_LDC1(%[ftmp4], %[src], 0x38)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        MMI_SWC1(%[ftmp1], %[dest], 0x04)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x04)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif
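
/*
 * Reference sketch (comment only, not compiled) of the 4-point column
 * pass driven by the inline 17/22/10 immediates above (0x00160011 packs
 * 22 and 17, 0x000a0011 packs 10 and 17), written as scalars roughly in
 * the style of the C model in libavcodec/vc1dsp.c:
 *
 *     t1 = 17 * (s0 + s2) + 64;
 *     t2 = 17 * (s0 - s2) + 64;
 *     t3 = 22 * s1 + 10 * s3;
 *     t4 = 22 * s3 - 10 * s1;
 *     dst0 = (t1 + t3) >> 7;    dst1 = (t2 - t4) >> 7;
 *     dst2 = (t2 + t4) >> 7;    dst3 = (t1 - t3) >> 7;
 *
 * followed by adding the result to the prediction with unsigned
 * saturation (the MMI_LWC1/paddh/packushb tail of the loop above).
 */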

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
        "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
        "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
        "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
        "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint32_t count = 8, tmp[1];
    int16_t coeff[16] = {17, 22, 17, 10,
                         17, 10,-17,-22,
                         17,-10,-17, 22,
                         17,-22, 17,-10};
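    /* The four rows of coeff[] are the 4-point VC-1 transform matrix
     * (constants 17, 22, 10); the row pass below applies one row per
     * pmaddhw. */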
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local)  = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local)  = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local) = {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t"
        "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t"
        "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t"
        "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t"
        "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
        "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
        "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
        "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "pshufh %[dc], %[dc], %[ftmp0] \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
        "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
        "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
        "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"

        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
    int16_t coeff[16] = {17, 22, 17, 10,
                         17, 10,-17,-22,
                         17,-10,-17, 22,
                         17,-22, 17,-10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local)  = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local) = {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li %[tmp0], 0x03 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1: \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
        "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
        "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
        "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
        "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
        "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
        "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
        "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
        "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
        "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
        "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
        "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU "%[src], %[src], 0x10 \n\t"
        PTR_ADDIU "%[dst], %[dst], 0x10 \n\t"
        "addiu %[count], %[count], -0x01 \n\t"
        "bnez %[count], 1b \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li %[tmp0], 0x07 \n\t"
        "mtc1 %[tmp0], %[ftmp0] \n\t"
        "li %[tmp0], 0x44 \n\t"
        "mtc1 %[tmp0], %[ftmp15] \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
        "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
        "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
        "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li %[tmp0], 0x00160011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li %[tmp0], 0x000a0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xffeaffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0x0016ffef \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li %[tmp0], 0xffea0011 \n\t"
        "mtc1 %[tmp0], %[ftmp3] \n\t"
        "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
        "li %[tmp0], 0xfff60011 \n\t"
        "mtc1 %[tmp0], %[ftmp4] \n\t"
        "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
        "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
        "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
        "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
        "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
        "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
        "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
        "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
        "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
        "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
        "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
        "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
        "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
        "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
        "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
        "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
        "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
        "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
        "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
        "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
        "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
        "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
1379  int i;
1380  int a, b, c, d;
1381  int d1, d2;
1382  int rnd = 1;
1383  for (i = 0; i < 8; i++) {
1384  a = src[-2];
1385  b = src[-1];
1386  c = src[0];
1387  d = src[1];
1388  d1 = (a - d + 3 + rnd) >> 3;
1389  d2 = (a - d + b - c + 4 - rnd) >> 3;
1390 
1391  src[-2] = a - d1;
1392  src[-1] = av_clip_uint8(b - d2);
1393  src[0] = av_clip_uint8(c + d2);
1394  src[1] = d + d1;
1395  src += stride;
1396  rnd = !rnd;
1397  }
1398 }
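A quick scalar check (not part of this file) of the smoothing above, assuming the quartet (a,b,c,d) = (80,74,20,18) and rnd = 1:

    int a = 80, b = 74, c = 20, d = 18, rnd = 1;
    int d1 = (a - d + 3 + rnd) >> 3;          /* (62 + 4) >> 3  = 8  */
    int d2 = (a - d + b - c + 4 - rnd) >> 3;  /* (62 + 54 + 3) >> 3 = 14 */
    /* a - d1 = 72, b - d2 = 60, c + d2 = 34, d + d1 = 26:
     * the hard 74|20 edge is softened to 60|34. */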
1399 
1400 void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
1401 {
1402  int i;
1403  int a, b, c, d;
1404  int d1, d2;
1405  int rnd1 = flags & 2 ? 3 : 4;
1406  int rnd2 = 7 - rnd1;
1407  for (i = 0; i < 8; i++) {
1408  a = left[6];
1409  b = left[7];
1410  c = right[0];
1411  d = right[1];
1412  d1 = a - d;
1413  d2 = a - d + b - c;
1414 
1415  left[6] = ((a << 3) - d1 + rnd1) >> 3;
1416  left[7] = ((b << 3) - d2 + rnd2) >> 3;
1417  right[0] = ((c << 3) + d2 + rnd1) >> 3;
1418  right[1] = ((d << 3) + d1 + rnd2) >> 3;
1419 
1420  right += right_stride;
1421  left += left_stride;
1422  if (flags & 1) {
1423  rnd2 = 7 - rnd2;
1424  rnd1 = 7 - rnd1;
1425  }
1426  }
1427 }
1428 
1429 /* Apply overlap transform to vertical edge */
1430 void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
1431 {
1432  int i;
1433  int a, b, c, d;
1434  int d1, d2;
1435  int rnd = 1;
1436  for (i = 0; i < 8; i++) {
1437  a = src[-2 * stride];
1438  b = src[-stride];
1439  c = src[0];
1440  d = src[stride];
1441  d1 = (a - d + 3 + rnd) >> 3;
1442  d2 = (a - d + b - c + 4 - rnd) >> 3;
1443 
1444  src[-2 * stride] = a - d1;
1445  src[-stride] = av_clip_uint8(b - d2);
1446  src[0] = av_clip_uint8(c + d2);
1447  src[stride] = d + d1;
1448  src++;
1449  rnd = !rnd;
1450  }
1451 }
1452 
1453 void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
1454 {
1455  int i;
1456  int a, b, c, d;
1457  int d1, d2;
1458  int rnd1 = 4, rnd2 = 3;
1459  for (i = 0; i < 8; i++) {
1460  a = top[48];
1461  b = top[56];
1462  c = bottom[0];
1463  d = bottom[8];
1464  d1 = a - d;
1465  d2 = a - d + b - c;
1466 
1467  top[48] = ((a << 3) - d1 + rnd1) >> 3;
1468  top[56] = ((b << 3) - d2 + rnd2) >> 3;
1469  bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1470  bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1471 
1472  bottom++;
1473  top++;
1474  rnd2 = 7 - rnd2;
1475  rnd1 = 7 - rnd1;
1476  }
1477 }
1478 
1479 /**
1480  * VC-1 in-loop deblocking filter for one line
1481  * @param src pointer to the first pixel past the block edge
1482  * @param stride block stride
1483  * @param pq block quantizer
1484  * @return whether other 3 pairs should be filtered or not
1485  * @see 8.6
1486  */
1487 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1488 {
1489  int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1490  5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1491  int a0_sign = a0 >> 31; /* Store sign */
1492 
1493  a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1494  if (a0 < pq) {
1495  int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1496  5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1497  int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1498  5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1499  if (a1 < a0 || a2 < a0) {
1500  int clip = src[-1 * stride] - src[0 * stride];
1501  int clip_sign = clip >> 31;
1502 
1503  clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1504  if (clip) {
1505  int a3 = FFMIN(a1, a2);
1506  int d = 5 * (a3 - a0);
1507  int d_sign = (d >> 31);
1508 
1509  d = ((d ^ d_sign) - d_sign) >> 3;
1510  d_sign ^= a0_sign;
1511 
1512  if (d_sign ^ clip_sign)
1513  d = 0;
1514  else {
1515  d = FFMIN(d, clip);
1516  d = (d ^ d_sign) - d_sign; /* Restore sign */
1517  src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1518  src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1519  }
1520  return 1;
1521  }
1522  }
1523  }
1524  return 0;
1525 }
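vc1_filter_line keeps a0 branch-free with the classic two's-complement sign trick; a minimal demonstration (not from the file, relying on arithmetic right shift of negative ints as FFmpeg does elsewhere):

    static int branchless_abs(int v)
    {
        int sign = v >> 31;        /* 0 when v >= 0, -1 (all ones) when v < 0 */
        return (v ^ sign) - sign;  /* bit flip plus 1 negates when v < 0 */
    }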
1526 
1527 /**
1528  * VC-1 in-loop deblocking filter
1529  * @param src pointer to the first pixel past the block edge
1530  * @param step distance between horizontally adjacent elements
1531  * @param stride distance between vertically adjacent elements
1532  * @param len edge length to filter (4 or 8 pixels)
1533  * @param pq block quantizer
1534  * @see 8.6
1535  */
1536 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
1537  int len, int pq)
1538 {
1539  int i;
1540  int filt3;
1541 
1542  for (i = 0; i < len; i += 4) {
1543  filt3 = vc1_filter_line(src + 2 * step, stride, pq);
1544  if (filt3) {
1545  vc1_filter_line(src + 0 * step, stride, pq);
1546  vc1_filter_line(src + 1 * step, stride, pq);
1547  vc1_filter_line(src + 3 * step, stride, pq);
1548  }
1549  src += step * 4;
1550  }
1551 }
1552 
1553 void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1554 {
1555  vc1_loop_filter(src, 1, stride, 4, pq);
1556 }
1557 
1558 void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
1559 {
1560  vc1_loop_filter(src, stride, 1, 4, pq);
1561 }
1562 
1563 void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1564 {
1565  vc1_loop_filter(src, 1, stride, 8, pq);
1566 }
1567 
1568 void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
1569 {
1570  vc1_loop_filter(src, stride, 1, 8, pq);
1571 }
1572 
1573 void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1574 {
1575  vc1_loop_filter(src, 1, stride, 16, pq);
1576 }
1577 
1578 void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
1579 {
1580  vc1_loop_filter(src, stride, 1, 16, pq);
1581 }
1582 
1583 void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1584  ptrdiff_t stride, int rnd)
1585 {
1586  ff_put_pixels8_8_mmi(dst, src, stride, 8);
1587 }
1588 void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1589  ptrdiff_t stride, int rnd)
1590 {
1591  ff_put_pixels16_8_mmi(dst, src, stride, 16);
1592 }
1593 void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
1594  ptrdiff_t stride, int rnd)
1595 {
1596  ff_avg_pixels8_8_mmi(dst, src, stride, 8);
1597 }
1598 void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
1599  ptrdiff_t stride, int rnd)
1600 {
1601  ff_avg_pixels16_8_mmi(dst, src, stride, 16);
1602 }
1603 
1604 #define OP_PUT(S, D)
1605 #define OP_AVG(S, D) \
1606  "ldc1 $f16, "#S" \n\t" \
1607  "pavgb "#D", "#D", $f16 \n\t"
1608 
1609 /** Add rounder from $f14 to $f6 and $f8, then shift both right by SHIFT */
1610 #define NORMALIZE_MMI(SHIFT) \
1611  "paddh $f6, $f6, $f14 \n\t" /* +bias-r */ \
1612  "paddh $f8, $f8, $f14 \n\t" /* +bias-r */ \
1613  "psrah $f6, $f6, "SHIFT" \n\t" \
1614  "psrah $f8, $f8, "SHIFT" \n\t"
1615 
1616 #define TRANSFER_DO_PACK(OP) \
1617  "packushb $f6, $f6, $f8 \n\t" \
1618  OP((%[dst]), $f6) \
1619  "sdc1 $f6, 0x00(%[dst]) \n\t"
1620 
1621 #define TRANSFER_DONT_PACK(OP) \
1622  OP(0(%[dst]), $f6) \
1623  OP(8(%[dst]), $f8) \
1624  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1625  "sdc1 $f8, 0x08(%[dst]) \n\t"
1626 
1627 /** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
1628 #define DO_UNPACK(reg) \
1629  "punpcklbh "reg", "reg", $f0 \n\t"
1630 #define DONT_UNPACK(reg)
1631 
1632 /** Load the rounder 32-r or 8-r and unpack it to $f14 */
1633 #define LOAD_ROUNDER_MMI(ROUND) \
1634  "lwc1 $f14, "ROUND" \n\t" \
1635  "punpcklhw $f14, $f14, $f14 \n\t" \
1636  "punpcklwd $f14, $f14, $f14 \n\t"
1637 
1638 
1639 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1640  "paddh "#R1", "#R1", "#R2" \n\t" \
1641  PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1642  MMI_ULWC1(R0, $9, 0x00) \
1643  "pmullh "#R1", "#R1", $f6 \n\t" \
1644  "punpcklbh "#R0", "#R0", $f0 \n\t" \
1645  PTR_ADDU "$9, %[src], %[stride] \n\t" \
1646  MMI_ULWC1(R3, $9, 0x00) \
1647  "psubh "#R1", "#R1", "#R0" \n\t" \
1648  "punpcklbh "#R3", "#R3", $f0 \n\t" \
1649  "paddh "#R1", "#R1", $f14 \n\t" \
1650  "psubh "#R1", "#R1", "#R3" \n\t" \
1651  "psrah "#R1", "#R1", %[shift] \n\t" \
1652  MMI_SDC1(R1, %[dst], OFF) \
1653  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1654 
1655 /** Sacrificing $f12 makes it possible to pipeline loads from src */
1656 static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
1657  const uint8_t *src, mips_reg stride,
1658  int rnd, int64_t shift)
1659 {
1660  DECLARE_VAR_LOW32;
1661  DECLARE_VAR_ADDRT;
1662 
1663  __asm__ volatile(
1664  "xor $f0, $f0, $f0 \n\t"
1665  "li $8, 0x03 \n\t"
1666  LOAD_ROUNDER_MMI("%[rnd]")
1667  "ldc1 $f12, %[ff_pw_9] \n\t"
1668  "1: \n\t"
1669  MMI_ULWC1($f4, %[src], 0x00)
1670  PTR_ADDU "%[src], %[src], %[stride] \n\t"
1671  MMI_ULWC1($f6, %[src], 0x00)
1672  "punpcklbh $f4, $f4, $f0 \n\t"
1673  "punpcklbh $f6, $f6, $f0 \n\t"
1674  SHIFT2_LINE( 0, $f2, $f4, $f6, $f8)
1675  SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
1676  SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
1677  SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
1678  SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
1679  SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
1680  SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
1681  SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
1682  PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1683  PTR_ADDIU "%[dst], %[dst], 0x08 \n\t"
1684  "addiu $8, $8, -0x01 \n\t"
1685  "bnez $8, 1b \n\t"
1686  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1687  [src]"+r"(src), [dst]"+r"(dst)
1688  : [stride]"r"(stride), [stride1]"r"(-2*stride),
1689  [shift]"f"(shift), [rnd]"m"(rnd),
1690  [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
1691  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
1692  "$f14", "$f16", "memory"
1693  );
1694 }
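Per output sample, the loop above evaluates the 1/2-pel taps (-1, 9, 9, -1) vertically and keeps 16-bit precision for the later horizontal pass. An illustrative scalar equivalent (ver_shift2_ref is not a function in the file):

    static int16_t ver_shift2_ref(const uint8_t *p, int stride, int r, int shift)
    {
        int a = p[-stride], b = p[0], c = p[stride], d = p[2 * stride];
        return (int16_t)((9 * (b + c) - a - d + r) >> shift);
    }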
1695 
1696 /**
1697  * Data is already unpacked, so some operations can directly be made from
1698  * memory.
1699  */
1700 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1701 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1702  const int16_t *src, int rnd) \
1703 { \
1704  int h = 8; \
1705  DECLARE_VAR_ALL64; \
1706  DECLARE_VAR_ADDRT; \
1707  \
1708  src -= 1; \
1709  rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ \
1710  \
1711  __asm__ volatile( \
1712  LOAD_ROUNDER_MMI("%[rnd]") \
1713  "ldc1 $f12, %[ff_pw_128] \n\t" \
1714  "ldc1 $f10, %[ff_pw_9] \n\t" \
1715  "1: \n\t" \
1716  MMI_ULDC1($f2, %[src], 0x00) \
1717  MMI_ULDC1($f4, %[src], 0x08) \
1718  MMI_ULDC1($f6, %[src], 0x02) \
1719  MMI_ULDC1($f8, %[src], 0x0a) \
1720  MMI_ULDC1($f0, %[src], 0x06) \
1721  "paddh $f2, $f2, $f0 \n\t" \
1722  MMI_ULDC1($f0, %[src], 0x0e) \
1723  "paddh $f4, $f4, $f0 \n\t" \
1724  MMI_ULDC1($f0, %[src], 0x04) \
1725  "paddh $f6, $f6, $f0 \n\t" \
1727  MMI_ULDC1($f0, %[src], 0x0c) \
1727  "paddh $f8, $f8, $f0 \n\t" \
1728  "pmullh $f6, $f6, $f10 \n\t" \
1729  "pmullh $f8, $f8, $f10 \n\t" \
1730  "psubh $f6, $f6, $f2 \n\t" \
1731  "psubh $f8, $f8, $f4 \n\t" \
1732  "li $8, 0x07 \n\t" \
1733  "mtc1 $8, $f16 \n\t" \
1734  NORMALIZE_MMI("$f16") \
1735  /* Remove bias */ \
1736  "paddh $f6, $f6, $f12 \n\t" \
1737  "paddh $f8, $f8, $f12 \n\t" \
1738  TRANSFER_DO_PACK(OP) \
1739  "addiu %[h], %[h], -0x01 \n\t" \
1740  PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1741  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1742  "bnez %[h], 1b \n\t" \
1743  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1744  [h]"+r"(h), \
1745  [src]"+r"(src), [dst]"+r"(dst) \
1746  : [stride]"r"(stride), [rnd]"m"(rnd), \
1747  [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \
1748  : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
1749  "$f16", "memory" \
1750  ); \
1751 }
1752 
1753 VC1_HOR_16B_SHIFT2(OP_PUT, put_)
1754 VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
1755 
1756 /**
1757  * Purely vertical or horizontal 1/2 shift interpolation.
1758  * Sacrifice $f12 for the *9 factor.
1759  */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1762  mips_reg stride, int rnd, \
1763  mips_reg offset) \
1764 { \
1765  DECLARE_VAR_LOW32; \
1766  DECLARE_VAR_ADDRT; \
1767  \
1768  rnd = 8 - rnd; \
1769  \
1770  __asm__ volatile( \
1771  "xor $f0, $f0, $f0 \n\t" \
1772  "li $10, 0x08 \n\t" \
1773  LOAD_ROUNDER_MMI("%[rnd]") \
1774  "ldc1 $f12, %[ff_pw_9] \n\t" \
1775  "1: \n\t" \
1776  MMI_ULWC1($f6, %[src], 0x00) \
1777  MMI_ULWC1($f8, %[src], 0x04) \
1778  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1779  MMI_ULWC1($f2, $9, 0x00) \
1780  MMI_ULWC1($f4, $9, 0x04) \
1781  PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1782  "punpcklbh $f6, $f6, $f0 \n\t" \
1783  "punpcklbh $f8, $f8, $f0 \n\t" \
1784  "punpcklbh $f2, $f2, $f0 \n\t" \
1785  "punpcklbh $f4, $f4, $f0 \n\t" \
1786  "paddh $f6, $f6, $f2 \n\t" \
1787  "paddh $f8, $f8, $f4 \n\t" \
1788  PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1789  MMI_ULWC1($f2, $9, 0x00) \
1790  MMI_ULWC1($f4, $9, 0x04) \
1791  "pmullh $f6, $f6, $f12 \n\t" /* 0,9,9,0*/ \
1792  "pmullh $f8, $f8, $f12 \n\t" /* 0,9,9,0*/ \
1793  "punpcklbh $f2, $f2, $f0 \n\t" \
1794  "punpcklbh $f4, $f4, $f0 \n\t" \
1795  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,0*/ \
1796  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,0*/ \
1797  PTR_ADDU "$9, %[src], %[offset] \n\t" \
1798  MMI_ULWC1($f2, $9, 0x00) \
1799  MMI_ULWC1($f4, $9, 0x04) \
1800  "punpcklbh $f2, $f2, $f0 \n\t" \
1801  "punpcklbh $f4, $f4, $f0 \n\t" \
1802  "psubh $f6, $f6, $f2 \n\t" /*-1,9,9,-1*/ \
1803  "psubh $f8, $f8, $f4 \n\t" /*-1,9,9,-1*/ \
1804  "li $8, 0x04 \n\t" \
1805  "mtc1 $8, $f16 \n\t" \
1806  NORMALIZE_MMI("$f16") \
1807  "packushb $f6, $f6, $f8 \n\t" \
1808  OP((%[dst]), $f6) \
1809  "sdc1 $f6, 0x00(%[dst]) \n\t" \
1810  "addiu $10, $10, -0x01 \n\t" \
1811  PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1812  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1813  "bnez $10, 1b \n\t" \
1814  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1815  [src]"+r"(src), [dst]"+r"(dst) \
1816  : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1817  [stride]"r"(stride), [rnd]"m"(rnd), \
1818  [stride1]"r"(stride-offset), \
1819  [ff_pw_9]"m"(ff_pw_9) \
1820  : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1821  "$f12", "$f14", "$f16", "memory" \
1822  ); \
1823 }
1824 
1825 VC1_SHIFT2(OP_PUT, put_)
1826 VC1_SHIFT2(OP_AVG, avg_)
1827 
1828 /**
1829  * Core of the 1/4 and 3/4 shift bicubic interpolation.
1830  *
1831  * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
1832  * @param LOAD "MMI_ULWC1", or "MMI_ULDC1" if the data read is already unpacked.
1833  * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
1834  * @param A1 Stride address of 1st tap (beware of unpacked/packed).
1835  * @param A2 Stride address of 2nd tap
1836  * @param A3 Stride address of 3rd tap
1837  * @param A4 Stride address of 4th tap
1838  */
1839 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1840  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1841  LOAD($f2, $9, M*0) \
1842  LOAD($f4, $9, M*4) \
1843  UNPACK("$f2") \
1844  UNPACK("$f4") \
1845  "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1846  "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1847  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1848  LOAD($f6, $9, M*0) \
1849  LOAD($f8, $9, M*4) \
1850  UNPACK("$f6") \
1851  UNPACK("$f8") \
1852  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1853  "pmullh $f8, $f8, $f12 \n\t" /* *18 */ \
1854  "psubh $f6, $f6, $f2 \n\t" /* *18, -3 */ \
1855  "psubh $f8, $f8, $f4 \n\t" /* *18, -3 */ \
1856  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1857  LOAD($f2, $9, M*0) \
1858  LOAD($f4, $9, M*4) \
1859  UNPACK("$f2") \
1860  UNPACK("$f4") \
1861  "li $8, 0x02 \n\t" \
1862  "mtc1 $8, $f16 \n\t" \
1863  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1864  "psllh $f4, $f4, $f16 \n\t" /* 4* */ \
1865  "psubh $f6, $f6, $f2 \n\t" /* -4,18,-3 */ \
1866  "psubh $f8, $f8, $f4 \n\t" /* -4,18,-3 */ \
1867  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1868  LOAD($f2, $9, M*0) \
1869  LOAD($f4, $9, M*4) \
1870  UNPACK("$f2") \
1871  UNPACK("$f4") \
1872  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1873  "pmullh $f4, $f4, $f10 \n\t" /* *53 */ \
1874  "paddh $f6, $f6, $f2 \n\t" /* 4,53,18,-3 */ \
1875  "paddh $f8, $f8, $f4 \n\t" /* 4,53,18,-3 */
1876 
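In scalar terms the core accumulates -3*A1 + 18*A2 + 53*A3 - 4*A4 (the taps sum to 64). A hedged sketch of the 8-bit shift1 case, where MSPEL_FILTER13_8B below passes A1..A4 = 3*offset, 2*offset, offset, 0 and pre-decrements src by offset (bicubic_shift1_ref is illustrative, not part of the file):

    static uint8_t bicubic_shift1_ref(const uint8_t *p, int offset, int rnd)
    {
        /* p is the caller's src, before the "src -= offset" adjustment */
        int v = -4 * p[-offset] + 53 * p[0] + 18 * p[offset] - 3 * p[2 * offset];
        v = (v + 32 - rnd) >> 6;              /* rounder loaded as 32 - rnd */
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }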
1877 /**
1878  * Macro to build the vertical 16bits version of vc1_put_shift[13].
1879  * Here, offset=src_stride. Parameters passed A1 to A4 must use
1880  * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and %[stride_x3] (3*src_stride).
1881  *
1882  * @param NAME Either 1 or 3
1883  * @see MSPEL_FILTER13_CORE for information on A1->A4
1884  */
1885 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1886 static void \
1887 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1888  mips_reg src_stride, \
1889  int rnd, int64_t shift) \
1890 { \
1891  int h = 8; \
1892  DECLARE_VAR_LOW32; \
1893  DECLARE_VAR_ADDRT; \
1894  \
1895  src -= src_stride; \
1896  \
1897  __asm__ volatile( \
1898  "xor $f0, $f0, $f0 \n\t" \
1899  LOAD_ROUNDER_MMI("%[rnd]") \
1900  "ldc1 $f10, %[ff_pw_53] \n\t" \
1901  "ldc1 $f12, %[ff_pw_18] \n\t" \
1902  ".p2align 3 \n\t" \
1903  "1: \n\t" \
1904  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1905  NORMALIZE_MMI("%[shift]") \
1906  TRANSFER_DONT_PACK(OP_PUT) \
1907  /* Last 3 (in fact 4) bytes on the line */ \
1908  PTR_ADDU "$9, %[src], "#A1" \n\t" \
1909  MMI_ULWC1($f2, $9, 0x08) \
1910  DO_UNPACK("$f2") \
1911  "mov.d $f6, $f2 \n\t" \
1912  "paddh $f2, $f2, $f2 \n\t" \
1913  "paddh $f2, $f2, $f6 \n\t" /* 3* */ \
1914  PTR_ADDU "$9, %[src], "#A2" \n\t" \
1915  MMI_ULWC1($f6, $9, 0x08) \
1916  DO_UNPACK("$f6") \
1917  "pmullh $f6, $f6, $f12 \n\t" /* *18 */ \
1918  "psubh $f6, $f6, $f2 \n\t" /* *18,-3 */ \
1919  PTR_ADDU "$9, %[src], "#A3" \n\t" \
1920  MMI_ULWC1($f2, $9, 0x08) \
1921  DO_UNPACK("$f2") \
1922  "pmullh $f2, $f2, $f10 \n\t" /* *53 */ \
1923  "paddh $f6, $f6, $f2 \n\t" /* *53,18,-3 */ \
1924  PTR_ADDU "$9, %[src], "#A4" \n\t" \
1925  MMI_ULWC1($f2, $9, 0x08) \
1926  DO_UNPACK("$f2") \
1927  "li $8, 0x02 \n\t" \
1928  "mtc1 $8, $f16 \n\t" \
1929  "psllh $f2, $f2, $f16 \n\t" /* 4* */ \
1930  "psubh $f6, $f6, $f2 \n\t" \
1931  "paddh $f6, $f6, $f14 \n\t" \
1932  "li $8, 0x06 \n\t" \
1933  "mtc1 $8, $f16 \n\t" \
1934  "psrah $f6, $f6, $f16 \n\t" \
1935  "sdc1 $f6, 0x10(%[dst]) \n\t" \
1936  "addiu %[h], %[h], -0x01 \n\t" \
1937  PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1938  PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1939  "bnez %[h], 1b \n\t" \
1940  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1941  [h]"+r"(h), \
1942  [src]"+r"(src), [dst]"+r"(dst) \
1943  : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1944  [stride_x3]"r"(3*src_stride), \
1945  [rnd]"m"(rnd), [shift]"f"(shift), \
1946  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1947  [ff_pw_3]"f"(ff_pw_3) \
1948  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1949  "$f14", "$f16", "memory" \
1950  ); \
1951 }
1952 
1953 /**
1954  * Macro to build the horizontal 16bits version of vc1_put_shift[13].
1955  * Here the data is 16 bits wide, so the parameters A1 to A4 are plain constant offsets.
1956  *
1957  * @param NAME Either 1 or 3
1958  * @see MSPEL_FILTER13_CORE for information on A1->A4
1959  */
1960 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1961 static void \
1962 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1963  const int16_t *src, int rnd) \
1964 { \
1965  int h = 8; \
1966  DECLARE_VAR_ALL64; \
1967  DECLARE_VAR_ADDRT; \
1968  \
1969  src -= 1; \
1970  rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
1971  \
1972  __asm__ volatile( \
1973  "xor $f0, $f0, $f0 \n\t" \
1974  LOAD_ROUNDER_MMI("%[rnd]") \
1975  "ldc1 $f10, %[ff_pw_53] \n\t" \
1976  "ldc1 $f12, %[ff_pw_18] \n\t" \
1977  ".p2align 3 \n\t" \
1978  "1: \n\t" \
1979  MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1980  "li $8, 0x07 \n\t" \
1981  "mtc1 $8, $f16 \n\t" \
1982  NORMALIZE_MMI("$f16") \
1983  /* Remove bias */ \
1984  "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1985  "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1986  TRANSFER_DO_PACK(OP) \
1987  "addiu %[h], %[h], -0x01 \n\t" \
1988  PTR_ADDU "%[src], %[src], 0x18 \n\t" \
1989  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1990  "bnez %[h], 1b \n\t" \
1991  : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1992  [h]"+r"(h), \
1993  [src]"+r"(src), [dst]"+r"(dst) \
1994  : [stride]"r"(stride), [rnd]"m"(rnd), \
1995  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
1996  [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \
1997  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
1998  "$f14", "$f16", "memory" \
1999  ); \
2000 }
2001 
2002 /**
2003  * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
2004  * Here, offset=src_stride. Parameters passed A1 to A4 must use
2005  * %[offset_x1] (offset), %[offset_x2] (2*offset) and %[offset_x3] (3*offset).
2006  *
2007  * @param NAME Either 1 or 3
2008  * @see MSPEL_FILTER13_CORE for information on A1->A4
2009  */
2010 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2011 static void \
2012 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2013  mips_reg stride, int rnd, mips_reg offset) \
2014 { \
2015  int h = 8; \
2016  DECLARE_VAR_LOW32; \
2017  DECLARE_VAR_ADDRT; \
2018  \
2019  src -= offset; \
2020  rnd = 32-rnd; \
2021  \
2022  __asm__ volatile ( \
2023  "xor $f0, $f0, $f0 \n\t" \
2024  LOAD_ROUNDER_MMI("%[rnd]") \
2025  "ldc1 $f10, %[ff_pw_53] \n\t" \
2026  "ldc1 $f12, %[ff_pw_18] \n\t" \
2027  ".p2align 3 \n\t" \
2028  "1: \n\t" \
2029  MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2030  "li $8, 0x06 \n\t" \
2031  "mtc1 $8, $f16 \n\t" \
2032  NORMALIZE_MMI("$f16") \
2033  TRANSFER_DO_PACK(OP) \
2034  "addiu %[h], %[h], -0x01 \n\t" \
2035  PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2036  PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2037  "bnez %[h], 1b \n\t" \
2038  : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2039  [h]"+r"(h), \
2040  [src]"+r"(src), [dst]"+r"(dst) \
2041  : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2042  [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2043  [rnd]"m"(rnd), \
2044  [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \
2045  [ff_pw_3]"f"(ff_pw_3) \
2046  : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
2047  "$f14", "$f16", "memory" \
2048  ); \
2049 }
2050 
2051 
2052 /** 1/4 shift bicubic interpolation */
2053 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
2054 MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
2055 MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
2056 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
2057 MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)
2058 
2059 /** 3/4 shift bicubic interpolation */
2060 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2061 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2062 MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
2063 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
2064 MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)
2065 
2066 typedef void (*vc1_mspel_mc_filter_ver_16bits)
2067  (int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
2068  int64_t shift);
2069 typedef void (*vc1_mspel_mc_filter_hor_16bits)
2070  (uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
2071 typedef void (*vc1_mspel_mc_filter_8bits)
2072  (uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
2073  mips_reg offset);
2074 
2075 /**
2076  * Interpolate fractional pel values by applying proper vertical then
2077  * horizontal filter.
2078  *
2079  * @param dst Destination buffer for interpolated pels.
2080  * @param src Source buffer.
2081  * @param stride Stride for both src and dst buffers.
2082  * @param hmode Horizontal filter (expressed in quarter pixels shift).
2083  * @param vmode Vertical filter (expressed in quarter pixels shift).
2084  * @param rnd Rounding bias.
2085  */
2086 #define VC1_MSPEL_MC(OP) \
2087 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2088  int hmode, int vmode, int rnd) \
2089 { \
2090  static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2091  { NULL, vc1_put_ver_16b_shift1_mmi, \
2092  vc1_put_ver_16b_shift2_mmi, \
2093  vc1_put_ver_16b_shift3_mmi }; \
2094  static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2095  { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2096  OP ## vc1_hor_16b_shift2_mmi, \
2097  OP ## vc1_hor_16b_shift3_mmi }; \
2098  static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2099  { NULL, OP ## vc1_shift1_mmi, \
2100  OP ## vc1_shift2_mmi, \
2101  OP ## vc1_shift3_mmi }; \
2102  \
2103  if (vmode) { /* Vertical filter to apply */ \
2104  if (hmode) { /* Horizontal filter to apply, output to tmp */ \
2105  static const int shift_value[] = { 0, 5, 1, 5 }; \
2106  int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2107  int r; \
2108  LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2109  \
2110  r = (1<<(shift-1)) + rnd-1; \
2111  vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2112  \
2113  vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2114  return; \
2115  } \
2116  else { /* No horizontal filter, output 8 lines to dst */ \
2117  vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2118  return; \
2119  } \
2120  } \
2121  \
2122  /* Horizontal mode with no vertical mode */ \
2123  vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2124 } \
2125 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2126  int stride, int hmode, int vmode, int rnd)\
2127 { \
2128  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2129  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2130  dst += 8*stride; src += 8*stride; \
2131  OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2132  OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2133 }
2134 
2135 VC1_MSPEL_MC(put_)
2136 VC1_MSPEL_MC(avg_)
2137 
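As a concrete trace (illustration only, not code from the file): with hmode = 1 and vmode = 2, put_vc1_mspel_mc effectively runs

    LOCAL_ALIGNED(16, int16_t, tmp, [12 * 8]);
    int shift = (5 + 1) >> 1;               /* shift_value[1] + shift_value[2], i.e. 3 */
    int r = (1 << (shift - 1)) + rnd - 1;
    vc1_put_ver_16b_shift2_mmi(tmp, src - 1, stride, r, shift);  /* vertical 1/2 pel */
    put_vc1_hor_16b_shift1_mmi(dst, stride, tmp + 1, 64 - rnd);  /* horizontal 1/4 pel */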
2138 /** Macro to ease declaring the bicubic filter interpolation functions */
2139 #define DECLARE_FUNCTION(a, b) \
2140 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2141  const uint8_t *src, \
2142  ptrdiff_t stride, \
2143  int rnd) \
2144 { \
2145  put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2146 } \
2147 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2148  const uint8_t *src, \
2149  ptrdiff_t stride, \
2150  int rnd) \
2151 { \
2152  avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2153 } \
2154 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2155  const uint8_t *src, \
2156  ptrdiff_t stride, \
2157  int rnd) \
2158 { \
2159  put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2160 } \
2161 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2162  const uint8_t *src, \
2163  ptrdiff_t stride, \
2164  int rnd) \
2165 { \
2166  avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2167 }
2168 
2169 DECLARE_FUNCTION(0, 1)
2170 DECLARE_FUNCTION(0, 2)
2171 DECLARE_FUNCTION(0, 3)
2172 
2173 DECLARE_FUNCTION(1, 0)
2174 DECLARE_FUNCTION(1, 1)
2175 DECLARE_FUNCTION(1, 2)
2176 DECLARE_FUNCTION(1, 3)
2177 
2178 DECLARE_FUNCTION(2, 0)
2179 DECLARE_FUNCTION(2, 1)
2180 DECLARE_FUNCTION(2, 2)
2181 DECLARE_FUNCTION(2, 3)
2182 
2183 DECLARE_FUNCTION(3, 0)
2184 DECLARE_FUNCTION(3, 1)
2185 DECLARE_FUNCTION(3, 2)
2186 DECLARE_FUNCTION(3, 3)
2187 
2188 #define CHROMA_MC_8_MMI \
2189  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2190  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2191  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2192  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2193  "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2194  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2195  "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2196  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2197  \
2198  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2199  "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2200  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2201  "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2202  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2203  "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2204  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2205  "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2206  \
2207  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2208  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2209  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2210  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2211  \
2212  "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2213  "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2214  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2215  "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2216  \
2217  "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2218  "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2219  "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
2220 
2221 
2222 #define CHROMA_MC_4_MMI \
2223  "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2224  "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2225  "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2226  "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2227  \
2228  "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2229  "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2230  "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2231  "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2232  \
2233  "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2234  "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2235  "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2236  "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2237  \
2238  "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2239  "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2240 
2241 
2242 void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2243  uint8_t *src /* align 1 */,
2244  ptrdiff_t stride, int h, int x, int y)
2245 {
2246  const int A = (8 - x) * (8 - y);
2247  const int B = (x) * (8 - y);
2248  const int C = (8 - x) * (y);
2249  const int D = (x) * (y);
2250  double ftmp[10];
2251  uint32_t tmp[1];
2252  DECLARE_VAR_ALL64;
2253  DECLARE_VAR_ADDRT;
2254 
2255  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2256 
2257  __asm__ volatile(
2258  "li %[tmp0], 0x06 \n\t"
2259  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2260  "mtc1 %[tmp0], %[ftmp9] \n\t"
2261  "pshufh %[A], %[A], %[ftmp0] \n\t"
2262  "pshufh %[B], %[B], %[ftmp0] \n\t"
2263  "pshufh %[C], %[C], %[ftmp0] \n\t"
2264  "pshufh %[D], %[D], %[ftmp0] \n\t"
2265 
2266  "1: \n\t"
2267  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2268  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2269  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2270  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2271  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2272 
2273  CHROMA_MC_8_MMI
2274 
2275  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2276  "addiu %[h], %[h], -0x01 \n\t"
2277  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2278  "bnez %[h], 1b \n\t"
2279  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2280  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2281  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2282  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2283  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2284  RESTRICT_ASM_ALL64
2285  RESTRICT_ASM_ADDRT
2286  [tmp0]"=&r"(tmp[0]),
2287  [src]"+&r"(src), [dst]"+&r"(dst),
2288  [h]"+&r"(h)
2289  : [stride]"r"((mips_reg)stride),
2290  [A]"f"(A), [B]"f"(B),
2291  [C]"f"(C), [D]"f"(D),
2292  [ff_pw_28]"f"(ff_pw_28)
2293  : "memory"
2294  );
2295 }
2296 
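A scalar reference for the 8-wide no-rounding chroma MC above (chroma_mc8_ref is a hypothetical helper, not in the file): the bilinear weights sum to 64, and adding 28 before the >> 6 reproduces the no_rnd bias; no clipping is needed because the result cannot exceed 255.

    static void chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y), B = x * (8 - y);
        const int C = (8 - x) * y,       D = x * y;
        for (int j = 0; j < h; j++) {
            for (int i = 0; i < 8; i++)
                dst[i] = (A * src[i] + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] + 28) >> 6;
            dst += stride;
            src += stride;
        }
    }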
2297 void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2298  uint8_t *src /* align 1 */,
2299  ptrdiff_t stride, int h, int x, int y)
2300 {
2301  const int A = (8 - x) * (8 - y);
2302  const int B = (x) * (8 - y);
2303  const int C = (8 - x) * (y);
2304  const int D = (x) * (y);
2305  double ftmp[6];
2306  uint32_t tmp[1];
2307  DECLARE_VAR_LOW32;
2308  DECLARE_VAR_ADDRT;
2309 
2310  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2311 
2312  __asm__ volatile(
2313  "li %[tmp0], 0x06 \n\t"
2314  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2315  "mtc1 %[tmp0], %[ftmp5] \n\t"
2316  "pshufh %[A], %[A], %[ftmp0] \n\t"
2317  "pshufh %[B], %[B], %[ftmp0] \n\t"
2318  "pshufh %[C], %[C], %[ftmp0] \n\t"
2319  "pshufh %[D], %[D], %[ftmp0] \n\t"
2320 
2321  "1: \n\t"
2322  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2323  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2324  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2325  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2326  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2327 
2328  CHROMA_MC_4_MMI
2329 
2330  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2331  "addiu %[h], %[h], -0x01 \n\t"
2332  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2333  "bnez %[h], 1b \n\t"
2334  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2335  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2336  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2337  [tmp0]"=&r"(tmp[0]),
2338  RESTRICT_ASM_LOW32
2339  RESTRICT_ASM_ADDRT
2340  [src]"+&r"(src), [dst]"+&r"(dst),
2341  [h]"+&r"(h)
2342  : [stride]"r"((mips_reg)stride),
2343  [A]"f"(A), [B]"f"(B),
2344  [C]"f"(C), [D]"f"(D),
2345  [ff_pw_28]"f"(ff_pw_28)
2346  : "memory"
2347  );
2348 }
2349 
2350 void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
2351  uint8_t *src /* align 1 */,
2352  ptrdiff_t stride, int h, int x, int y)
2353 {
2354  const int A = (8 - x) * (8 - y);
2355  const int B = (x) * (8 - y);
2356  const int C = (8 - x) * (y);
2357  const int D = (x) * (y);
2358  double ftmp[10];
2359  uint32_t tmp[1];
2360  DECLARE_VAR_ALL64;
2361  DECLARE_VAR_ADDRT;
2362 
2363  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2364 
2365  __asm__ volatile(
2366  "li %[tmp0], 0x06 \n\t"
2367  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2368  "mtc1 %[tmp0], %[ftmp9] \n\t"
2369  "pshufh %[A], %[A], %[ftmp0] \n\t"
2370  "pshufh %[B], %[B], %[ftmp0] \n\t"
2371  "pshufh %[C], %[C], %[ftmp0] \n\t"
2372  "pshufh %[D], %[D], %[ftmp0] \n\t"
2373 
2374  "1: \n\t"
2375  MMI_ULDC1(%[ftmp1], %[src], 0x00)
2376  MMI_ULDC1(%[ftmp2], %[src], 0x01)
2377  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2378  MMI_ULDC1(%[ftmp3], %[src], 0x00)
2379  MMI_ULDC1(%[ftmp4], %[src], 0x01)
2380 
2381  CHROMA_MC_8_MMI
2382 
2383  MMI_LDC1(%[ftmp2], %[dst], 0x00)
2384  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2385 
2386  MMI_SDC1(%[ftmp1], %[dst], 0x00)
2387  "addiu %[h], %[h], -0x01 \n\t"
2388  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2389  "bnez %[h], 1b \n\t"
2390  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2391  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2392  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2393  [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2394  [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
2395  [tmp0]"=&r"(tmp[0]),
2396  RESTRICT_ASM_ALL64
2397  RESTRICT_ASM_ADDRT
2398  [src]"+&r"(src), [dst]"+&r"(dst),
2399  [h]"+&r"(h)
2400  : [stride]"r"((mips_reg)stride),
2401  [A]"f"(A), [B]"f"(B),
2402  [C]"f"(C), [D]"f"(D),
2403  [ff_pw_28]"f"(ff_pw_28)
2404  : "memory"
2405  );
2406 }
2407 
2408 void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
2409  uint8_t *src /* align 1 */,
2410  ptrdiff_t stride, int h, int x, int y)
2411 {
2412  const int A = (8 - x) * (8 - y);
2413  const int B = ( x) * (8 - y);
2414  const int C = (8 - x) * ( y);
2415  const int D = ( x) * ( y);
2416  double ftmp[6];
2417  uint32_t tmp[1];
2418  DECLARE_VAR_LOW32;
2419  DECLARE_VAR_ADDRT;
2420 
2421  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2422 
2423  __asm__ volatile(
2424  "li %[tmp0], 0x06 \n\t"
2425  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2426  "mtc1 %[tmp0], %[ftmp5] \n\t"
2427  "pshufh %[A], %[A], %[ftmp0] \n\t"
2428  "pshufh %[B], %[B], %[ftmp0] \n\t"
2429  "pshufh %[C], %[C], %[ftmp0] \n\t"
2430  "pshufh %[D], %[D], %[ftmp0] \n\t"
2431 
2432  "1: \n\t"
2433  MMI_ULWC1(%[ftmp1], %[src], 0x00)
2434  MMI_ULWC1(%[ftmp2], %[src], 0x01)
2435  PTR_ADDU "%[src], %[src], %[stride] \n\t"
2436  MMI_ULWC1(%[ftmp3], %[src], 0x00)
2437  MMI_ULWC1(%[ftmp4], %[src], 0x01)
2438 
2439  CHROMA_MC_4_MMI
2440 
2441  MMI_LWC1(%[ftmp2], %[dst], 0x00)
2442  "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2443 
2444  MMI_SWC1(%[ftmp1], %[dst], 0x00)
2445  "addiu %[h], %[h], -0x01 \n\t"
2446  PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2447  "bnez %[h], 1b \n\t"
2448  : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2449  [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2450  [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2451  [tmp0]"=&r"(tmp[0]),
2452  RESTRICT_ASM_LOW32
2453  RESTRICT_ASM_ADDRT
2454  [src]"+&r"(src), [dst]"+&r"(dst),
2455  [h]"+&r"(h)
2456  : [stride]"r"((mips_reg)stride),
2457  [A]"f"(A), [B]"f"(B),
2458  [C]"f"(C), [D]"f"(D),
2459  [ff_pw_28]"f"(ff_pw_28)
2460  : "memory"
2461  );
2462 }