Go to the documentation of this file.
/*
 * VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)
 *
 * Expands to a run of inline-asm string literals; it must be pasted inside an
 * asm statement that provides the named operands tmp0 and ftmp0..ftmp14.
 *
 *   o1, o2 - stringified operands that receive the two packed results.
 *   r1..r4 - stringified immediates. Each is loaded with "li", moved into an
 *            FP register with "mtc1" and then run through punpcklwd with
 *            itself (presumably duplicating the low 32 bits), after which it
 *            is used as the second source of pmaddhw.
 *   c0     - stringified operand added (paddw) to the two r1/r2 sums before
 *            the add/subtract step.
 *
 * Steps, in order:
 *   1. ftmp1 = pmaddhw(ftmp5, r1) + pmaddhw(ftmp7,  r2)
 *      ftmp2 = pmaddhw(ftmp6, r1) + pmaddhw(ftmp8,  r2)
 *   2. ftmp3 = pmaddhw(ftmp9, r3) + pmaddhw(ftmp11, r4)
 *      ftmp4 = pmaddhw(ftmp10, r3) + pmaddhw(ftmp12, r4)
 *   3. ftmp1 += c0; ftmp2 += c0
 *   4. ftmp13 = ftmp1 + ftmp3, ftmp14 = ftmp1 - ftmp3,
 *      ftmp1  = ftmp2 + ftmp4, ftmp3  = ftmp2 - ftmp4
 *   5. all four results are shifted right (psraw) by the count in ftmp0
 *   6. o1 is built from the two sums (ftmp13, ftmp1) and o2 from the two
 *      differences (ftmp14, ftmp3) with punpcklhw/punpckhhw/punpcklhw;
 *      presumably this keeps the low halfword of every 32-bit result.
 *
 * Reads:      ftmp5..ftmp12 and ftmp0 (the caller must set the shift count).
 * Overwrites: tmp0, ftmp1..ftmp4, ftmp13, ftmp14, o1, o2.
 */
33 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \
34 "li %[tmp0], "#r1" \n\t" \
35 "mtc1 %[tmp0], %[ftmp13] \n\t" \
36 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
37 "li %[tmp0], "#r2" \n\t" \
38 "mtc1 %[tmp0], %[ftmp14] \n\t" \
39 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
40 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
41 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
42 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
43 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
44 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
45 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
47 "li %[tmp0], "#r3" \n\t" \
48 "mtc1 %[tmp0], %[ftmp13] \n\t" \
49 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
50 "li %[tmp0], "#r4" \n\t" \
51 "mtc1 %[tmp0], %[ftmp14] \n\t" \
52 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
53 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
54 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
55 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
56 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
57 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
58 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
60 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
61 "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \
62 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
63 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
64 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
65 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
66 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
67 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
68 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
69 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
70 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
71 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
72 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
73 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
74 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
75 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"
/*
 * VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)
 *
 * Expands to a run of inline-asm string literals; it must be pasted inside an
 * asm statement that provides the named operands tmp0 and ftmp0..ftmp14.
 *
 *   o1, o2 - stringified operands that receive the two packed results.
 *   r1..r4 - stringified immediates, loaded with "li"/"mtc1", run through
 *            punpcklwd with themselves (presumably duplicating the low
 *            32 bits) and used as the second source of pmaddhw.
 *   c0     - stringified operand added to all four results of the
 *            add/subtract step, just before the shift.
 *   c1     - stringified operand added only to the two difference results
 *            (ftmp14 and ftmp3).
 *
 * Steps, in order:
 *   1. ftmp1 = pmaddhw(ftmp5, r1) + pmaddhw(ftmp7,  r2)
 *      ftmp2 = pmaddhw(ftmp6, r1) + pmaddhw(ftmp8,  r2)
 *   2. ftmp3 = pmaddhw(ftmp9, r3) + pmaddhw(ftmp11, r4)
 *      ftmp4 = pmaddhw(ftmp10, r3) + pmaddhw(ftmp12, r4)
 *   3. ftmp13 = ftmp1 + ftmp3;  ftmp14 = ftmp1 - ftmp3 + c1
 *      ftmp1  = ftmp2 + ftmp4;  ftmp3  = ftmp2 - ftmp4 + c1
 *   4. c0 is added to ftmp13, ftmp14, ftmp1 and ftmp3
 *   5. all four results are shifted right (psraw) by the count in ftmp0
 *   6. o1 is built from the two sums (ftmp13, ftmp1) and o2 from the two
 *      differences (ftmp14, ftmp3) with punpcklhw/punpckhhw/punpcklhw;
 *      presumably this keeps the low halfword of every 32-bit result.
 *
 * Reads:      ftmp5..ftmp12 and ftmp0 (the caller must set the shift count).
 * Overwrites: tmp0, ftmp1..ftmp4, ftmp13, ftmp14, o1, o2.
 */
77 #define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \
78 "li %[tmp0], "#r1" \n\t" \
79 "mtc1 %[tmp0], %[ftmp13] \n\t" \
80 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
81 "li %[tmp0], "#r2" \n\t" \
82 "mtc1 %[tmp0], %[ftmp14] \n\t" \
83 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
84 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \
85 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \
86 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
87 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \
88 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \
89 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
91 "li %[tmp0], "#r3" \n\t" \
92 "mtc1 %[tmp0], %[ftmp13] \n\t" \
93 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \
94 "li %[tmp0], "#r4" \n\t" \
95 "mtc1 %[tmp0], %[ftmp14] \n\t" \
96 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \
97 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \
98 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \
99 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
100 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \
101 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \
102 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \
104 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \
105 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \
106 "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \
107 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \
108 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
109 "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \
110 "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \
111 "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \
112 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \
113 "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \
114 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \
115 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
116 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \
117 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
118 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \
119 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \
120 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \
121 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \
122 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \
123 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t"
/*
 * DC-only path: add one scaled DC value to rows of 8 bytes at dest.
 * NOTE(review): the enclosing function header, the asm statement opener and
 * the local label "1:" are not visible in this excerpt.
 */
/* Scale dc in two rounding steps: (3*dc + 1) >> 1, then (3*dc + 16) >> 5. */
134 dc = (3 *
dc + 1) >> 1;
135 dc = (3 *
dc + 16) >> 5;
/* ftmp0 = 0. It is the pshufh selector here (presumably replicating halfword
 * 0 of dc into every lane) and the zero source for the unpacks below. */
139 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
140 "pshufh %[dc], %[dc], %[ftmp0] \n\t"
/* Two loop iterations, four rows each. */
141 "li %[count], 0x02 \n\t"
/* Load four consecutive rows; addr0 walks forward by linesize per row. */
144 MMI_LDC1(%[ftmp1], %[dest], 0x00)
145 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
146 MMI_LDC1(%[ftmp2], %[addr0], 0x00)
147 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
148 MMI_LDC1(%[ftmp3], %[addr0], 0x00)
149 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
150 MMI_LDC1(%[ftmp4], %[addr0], 0x00)
/* Interleave each row's bytes with zero: high half into ftmp5..8, low half
 * kept in ftmp1..4 (presumably widening the bytes to halfwords). */
152 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
153 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
154 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
155 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
156 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
157 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
158 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
159 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
/* Add dc to every widened value (paddsh). */
161 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
162 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
163 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
164 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
165 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
166 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
167 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
168 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"
/* Re-pack low/high halves of each row into bytes (packushb). */
170 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
171 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
172 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
173 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
/* Store the four rows back to the addresses they were loaded from. */
175 MMI_SDC1(%[ftmp1], %[dest], 0x00)
176 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t"
177 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
178 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
179 MMI_SDC1(%[ftmp3], %[addr0], 0x00)
180 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t"
181 MMI_SDC1(%[ftmp4], %[addr0], 0x00)
/* Decrement count, advance dest to the row after the fourth one, and branch
 * back to local label 1 while count != 0. */
183 "addiu %[count], %[count], -0x01 \n\t"
184 PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t"
185 "bnez %[count], 1b \n\t"
/* Outputs: scratch registers are early-clobber; dest is read and written. */
186 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
187 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
188 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
189 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
190 [ftmp8]
"=&f"(ftmp[8]),
191 [addr0]
"=&r"(addr[0]),
192 [count]
"=&r"(count), [dest]
"+&r"(dest)
/* Inputs (the rest of this list is not visible in this excerpt). */
193 : [linesize]
"r"((
mips_reg)linesize),
/*
 * Two-pass transform over "block", using "temp" as intermediate storage.
 * NOTE(review): this excerpt is incomplete. The function header, the asm
 * statement opener, the heads of the transform-macro invocations (only their
 * trailing arguments are visible) and the input operand list are missing.
 */
199 #if _MIPS_SIM != _ABIO32
/* Pass 1: ftmp0 = 3, presumably the psraw shift count consumed by the
 * transform macros. */
208 "li %[tmp0], 0x03 \n\t"
209 "mtc1 %[tmp0], %[ftmp0] \n\t"
/* Pass 1, first half: load from block at offsets 0x00..0x70 in steps of 0x10.
 * Offsets 0x00/0x20/0x40/0x60 go to ftmp1..4, 0x10/0x30/0x50/0x70 to
 * ftmp11..14. */
212 MMI_LDC1(%[ftmp1], %[
block], 0x00)
213 MMI_LDC1(%[ftmp11], %[
block], 0x10)
214 MMI_LDC1(%[ftmp2], %[
block], 0x20)
215 MMI_LDC1(%[ftmp12], %[
block], 0x30)
216 MMI_LDC1(%[ftmp3], %[
block], 0x40)
217 MMI_LDC1(%[ftmp13], %[
block], 0x50)
218 MMI_LDC1(%[ftmp4], %[
block], 0x60)
219 MMI_LDC1(%[ftmp14], %[
block], 0x70)
/* Interleave halfwords pairwise into ftmp5..ftmp12, the registers that the
 * VC1_INV_TRANCS_8_* macros read as pmaddhw sources. */
220 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
221 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
222 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
223 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
225 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
226 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
227 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
228 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"
/* Tails of four macro invocations (heads not visible): two immediates and
 * the ff_pw_4 operand each. */
232 0x000f0010, 0x00040009, %[
ff_pw_4])
236 0xfffc000f, 0xfff7fff0, %[
ff_pw_4])
240 0xfff00009, 0x000f0004, %[
ff_pw_4])
244 0xfff70004, 0xfff0000f, %[
ff_pw_4])
/* TRANSPOSE_4H is defined outside this excerpt; presumably it transposes the
 * four registers named first, using ftmp1..4 as scratch. */
246 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
247 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
249 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
250 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
/* Spill all eight first-half results to temp at offsets 0x00..0x38. */
252 MMI_SDC1(%[ftmp15], %[
temp], 0x00)
253 MMI_SDC1(%[ftmp19], %[
temp], 0x08)
254 MMI_SDC1(%[ftmp16], %[
temp], 0x10)
255 MMI_SDC1(%[ftmp20], %[
temp], 0x18)
256 MMI_SDC1(%[ftmp17], %[
temp], 0x20)
257 MMI_SDC1(%[ftmp21], %[
temp], 0x28)
258 MMI_SDC1(%[ftmp18], %[
temp], 0x30)
259 MMI_SDC1(%[ftmp22], %[
temp], 0x38)
/* Pass 1, second half: same sequence on block offsets 0x08..0x78. */
262 MMI_LDC1(%[ftmp1], %[
block], 0x08)
263 MMI_LDC1(%[ftmp11], %[
block], 0x18)
264 MMI_LDC1(%[ftmp2], %[
block], 0x28)
265 MMI_LDC1(%[ftmp12], %[
block], 0x38)
266 MMI_LDC1(%[ftmp3], %[
block], 0x48)
267 MMI_LDC1(%[ftmp13], %[
block], 0x58)
268 MMI_LDC1(%[ftmp4], %[
block], 0x68)
269 MMI_LDC1(%[ftmp14], %[
block], 0x78)
270 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
271 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
272 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
273 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
275 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
276 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
277 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
278 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"
282 0x000f0010, 0x00040009, %[
ff_pw_4])
286 0xfffc000f, 0xfff7fff0, %[
ff_pw_4])
290 0xfff00009, 0x000f0004, %[
ff_pw_4])
294 0xfff70004, 0xfff0000f, %[
ff_pw_4])
296 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
297 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
299 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
300 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
/* Only ftmp19..22 are spilled here (temp 0x48/0x58/0x68/0x78). ftmp15..18
 * are not stored; pass 2 below reads them straight from the registers. */
302 MMI_SDC1(%[ftmp19], %[
temp], 0x48)
303 MMI_SDC1(%[ftmp20], %[
temp], 0x58)
304 MMI_SDC1(%[ftmp21], %[
temp], 0x68)
305 MMI_SDC1(%[ftmp22], %[
temp], 0x78)
/* Pass 2: ftmp0 = 7 is the new shift count. */
309 "li %[tmp0], 0x07 \n\t"
310 "mtc1 %[tmp0], %[ftmp0] \n\t"
/* Pass 2, first half: four inputs are reloaded from temp 0x00..0x30, the
 * other four are ftmp15..18, still live from the end of pass 1. */
313 MMI_LDC1(%[ftmp1], %[
temp], 0x00)
314 MMI_LDC1(%[ftmp11], %[
temp], 0x10)
315 MMI_LDC1(%[ftmp2], %[
temp], 0x20)
316 MMI_LDC1(%[ftmp12], %[
temp], 0x30)
317 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
318 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
319 "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t"
320 "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t"
322 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
323 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
324 "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t"
325 "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t"
/* NOTE(review): the lines that compute ftmp15..22 for this half are not
 * visible in this excerpt. The results go to block offsets 0x00..0x70. */
343 MMI_SDC1(%[ftmp15], %[
block], 0x00)
344 MMI_SDC1(%[ftmp16], %[
block], 0x10)
345 MMI_SDC1(%[ftmp17], %[
block], 0x20)
346 MMI_SDC1(%[ftmp18], %[
block], 0x30)
347 MMI_SDC1(%[ftmp19], %[
block], 0x40)
348 MMI_SDC1(%[ftmp20], %[
block], 0x50)
349 MMI_SDC1(%[ftmp21], %[
block], 0x60)
350 MMI_SDC1(%[ftmp22], %[
block], 0x70)
/* Pass 2, second half: all eight inputs come from temp 0x08..0x78. */
353 MMI_LDC1(%[ftmp1], %[
temp], 0x08)
354 MMI_LDC1(%[ftmp11], %[
temp], 0x18)
355 MMI_LDC1(%[ftmp2], %[
temp], 0x28)
356 MMI_LDC1(%[ftmp12], %[
temp], 0x38)
357 MMI_LDC1(%[ftmp3], %[
temp], 0x48)
358 MMI_LDC1(%[ftmp13], %[
temp], 0x58)
359 MMI_LDC1(%[ftmp4], %[
temp], 0x68)
360 MMI_LDC1(%[ftmp14], %[
temp], 0x78)
361 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
362 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
363 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
364 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
366 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t"
367 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t"
368 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t"
369 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t"
/* Results of the second half go to block offsets 0x08..0x78. */
387 MMI_SDC1(%[ftmp15], %[
block], 0x08)
388 MMI_SDC1(%[ftmp16], %[
block], 0x18)
389 MMI_SDC1(%[ftmp17], %[
block], 0x28)
390 MMI_SDC1(%[ftmp18], %[
block], 0x38)
391 MMI_SDC1(%[ftmp19], %[
block], 0x48)
392 MMI_SDC1(%[ftmp20], %[
block], 0x58)
393 MMI_SDC1(%[ftmp21], %[
block], 0x68)
394 MMI_SDC1(%[ftmp22], %[
block], 0x78)
/* Outputs: 23 early-clobber FP scratch registers (the remainder of the
 * operand list is not visible in this excerpt). */
396 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
397 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
398 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
399 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
400 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
401 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
402 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
403 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
404 [ftmp16]
"=&f"(ftmp[16]), [ftmp17]
"=&f"(ftmp[17]),
405 [ftmp18]
"=&f"(ftmp[18]), [ftmp19]
"=&f"(ftmp[19]),
406 [ftmp20]
"=&f"(ftmp[20]), [ftmp21]
"=&f"(ftmp[21]),
407 [ftmp22]
"=&f"(ftmp[22]),
/*
 * DC-only path: add one scaled DC value to four rows of 8 bytes at dest.
 * NOTE(review): the enclosing function header and the asm statement opener
 * are not visible in this excerpt.
 */
/* Scale dc in two rounding steps: (3*dc + 1) >> 1, then (17*dc + 64) >> 7. */
424 dc = ( 3 *
dc + 1) >> 1;
425 dc = (17 *
dc + 64) >> 7;
/* ftmp0 = 0. It is the pshufh selector here (presumably replicating halfword
 * 0 of dc into every lane) and the zero source for the unpacks below. */
429 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
430 "pshufh %[dc], %[dc], %[ftmp0] \n\t"
/* Load the four rows; their addresses are precomputed as input operands
 * dest0..dest3 (dest + k*linesize). */
432 MMI_LDC1(%[ftmp1], %[dest0], 0x00)
433 MMI_LDC1(%[ftmp2], %[dest1], 0x00)
434 MMI_LDC1(%[ftmp3], %[dest2], 0x00)
435 MMI_LDC1(%[ftmp4], %[dest3], 0x00)
/* Interleave each row's bytes with zero: high half into ftmp5..8, low half
 * kept in ftmp1..4. */
437 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t"
438 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
439 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t"
440 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
441 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t"
442 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
443 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t"
444 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
/* Add dc to every widened value (paddsh). */
446 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
447 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
448 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
449 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
450 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
451 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
452 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
453 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"
/* Re-pack low/high halves of each row into bytes (packushb). */
455 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
456 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
457 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
458 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
/* Store the four rows back in place. */
460 MMI_SDC1(%[ftmp1], %[dest0], 0x00)
461 MMI_SDC1(%[ftmp2], %[dest1], 0x00)
462 MMI_SDC1(%[ftmp3], %[dest2], 0x00)
463 MMI_SDC1(%[ftmp4], %[dest3], 0x00)
/* Outputs: nine early-clobber FP scratch registers. */
464 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
465 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
466 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
467 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
468 [ftmp8]
"=&f"(ftmp[8])
/* Inputs: one address per row (the rest of this list is not visible in this
 * excerpt). */
469 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
470 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
/*
 * Row pass driven by a coefficient table: each loop iteration reads eight
 * halfwords from src and writes eight halfwords to dst.
 * NOTE(review): the function header, some declarations, the asm statement
 * opener, the local label "1:" and the src/dst pointer increments are not
 * visible in this excerpt.
 */
476 #if _MIPS_SIM != _ABIO32
/* Output is written back into block. */
480 int16_t *dst =
block;
/* 8x8 table: each row of eight coefficients is multiplied against the eight
 * input halfwords to produce one output value. */
484 int16_t
coeff[64] = {12, 16, 16, 15, 12, 9, 6, 4,
485 12, 15, 6, -4, -12, -16, -16, -9,
486 12, 9, -6, -16, -12, 4, 16, 15,
487 12, 4, -16, -9, 12, 15, -6, -16,
488 12, -4, -16, 9, 12, -15, -6, 16,
489 12, -9, -6, 16, -12, -4, 16, -15,
490 12, -15, 6, 4, -12, 16, -16, 9,
491 12, -16, 16, -15, 12, -9, 6, -4};
/* ftmp0 = 3: shift count for the psraw instructions near the end. */
495 "li %[tmp0], 0x03 \n\t"
496 "mtc1 %[tmp0], %[ftmp0] \n\t"
/* Eight input halfwords: ftmp1 = src[0..3], ftmp2 = src[4..7]. */
499 MMI_LDC1(%[ftmp1], %[
src], 0x00)
500 MMI_LDC1(%[ftmp2], %[
src], 0x08)
/* Table rows 0 and 1. pmaddhw + paddw build partial sums per row; the
 * punpcklwd/punpckhwd/paddw step presumably folds them so that ftmp11 holds
 * the two full 32-bit sums. ff_pw_4 is then added. */
503 MMI_LDC1(%[ftmp3], %[
coeff], 0x00)
504 MMI_LDC1(%[ftmp4], %[
coeff], 0x08)
505 MMI_LDC1(%[ftmp5], %[
coeff], 0x10)
506 MMI_LDC1(%[ftmp6], %[
coeff], 0x18)
507 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
508 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
509 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
510 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
511 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
512 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
513 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
514 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
515 "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t"
516 "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t"
/* Table rows 2 and 3 -> ftmp12 (same sequence). */
519 MMI_LDC1(%[ftmp3], %[
coeff], 0x20)
520 MMI_LDC1(%[ftmp4], %[
coeff], 0x28)
521 MMI_LDC1(%[ftmp5], %[
coeff], 0x30)
522 MMI_LDC1(%[ftmp6], %[
coeff], 0x38)
523 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
524 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
525 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
526 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
527 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
528 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
529 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
530 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
531 "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t"
532 "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t"
/* Table rows 4 and 5 -> ftmp13. */
535 MMI_LDC1(%[ftmp3], %[
coeff], 0x40)
536 MMI_LDC1(%[ftmp4], %[
coeff], 0x48)
537 MMI_LDC1(%[ftmp5], %[
coeff], 0x50)
538 MMI_LDC1(%[ftmp6], %[
coeff], 0x58)
539 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
540 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
541 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
542 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
543 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
544 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
545 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
546 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
547 "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t"
548 "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t"
/* Table rows 6 and 7 -> ftmp14. */
551 MMI_LDC1(%[ftmp3], %[
coeff], 0x60)
552 MMI_LDC1(%[ftmp4], %[
coeff], 0x68)
553 MMI_LDC1(%[ftmp5], %[
coeff], 0x70)
554 MMI_LDC1(%[ftmp6], %[
coeff], 0x78)
555 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t"
556 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t"
557 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
558 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t"
559 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t"
560 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
561 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t"
562 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
563 "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t"
564 "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t"
/* Shift all eight sums right by ftmp0, then narrow them with the halfword
 * interleaves (presumably keeping the low 16 bits of each) into ftmp9 and
 * ftmp10. */
567 "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
568 "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
569 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t"
570 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t"
571 "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t"
572 "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t"
573 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
574 "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t"
575 "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t"
576 "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t"
/* Write the eight results of this iteration. */
577 MMI_SDC1(%[ftmp9], %[dst], 0x00)
578 MMI_SDC1(%[ftmp10], %[dst], 0x08)
/* Loop until count reaches zero (branch target "1:" is not visible here). */
582 "addiu %[count], %[count], -0x01 \n\t"
583 "bnez %[count], 1b \n\t"
/* Outputs: FP scratch registers and tmp0 are early-clobber; src, dst and
 * count are read-write. */
584 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
585 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
586 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
587 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
588 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
589 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
590 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
591 [ftmp14]
"=&f"(ftmp[14]), [tmp0]
"=&r"(
tmp[0]),
592 [
src]
"+&r"(
src), [dst]
"+&r"(dst), [count]
"+&r"(count)
/*
 * Column pass: four output rows are computed from four 16-byte-strided input
 * rows of src, using coefficient pairs given as immediates, and are added
 * into 4-byte rows of dest. The sequence runs twice: first for src offset
 * 0x00 / dest offset 0x00, then for src offset 0x08 / dest offset 0x04.
 * NOTE(review): the asm statement opener and part of the operand list are
 * not visible in this excerpt.
 *
 * Coefficient immediates pack two int16 values: 0x0011 = 17, 0x0016 = 22,
 * 0x000a = 10, 0xffef = -17, 0xffea = -22, 0xfff6 = -10.
 */
/* ftmp15 = 0x44: pshufh selector used on every coefficient immediate;
 * presumably it repeats the halfword pair (0,1) into lanes (2,3). */
601 "li %[tmp0], 0x44 \n\t"
602 "mtc1 %[tmp0], %[ftmp15] \n\t"
/* ---- first half: src offsets 0x00/0x10/0x20/0x30 ---- */
/* ftmp0 = 7: psraw shift count. */
605 "li %[tmp0], 0x07 \n\t"
606 "mtc1 %[tmp0], %[ftmp0] \n\t"
607 MMI_LDC1(%[ftmp1], %[
src], 0x00)
608 MMI_LDC1(%[ftmp2], %[
src], 0x10)
609 MMI_LDC1(%[ftmp3], %[
src], 0x20)
610 MMI_LDC1(%[ftmp4], %[
src], 0x30)
/* Interleave rows 0/1 into ftmp5, ftmp6 and rows 2/3 into ftmp7, ftmp8 so
 * pmaddhw can combine two rows at a time. */
611 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
612 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
613 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
614 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
/* Output row 0 -> ftmp11: coefficients (17, 22) and (17, 10); ff_pw_64 is
 * added before the shift by ftmp0. */
617 "li %[tmp0], 0x00160011 \n\t"
618 "mtc1 %[tmp0], %[ftmp3] \n\t"
619 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
620 "li %[tmp0], 0x000a0011 \n\t"
621 "mtc1 %[tmp0], %[ftmp4] \n\t"
622 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
623 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
624 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
625 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
626 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
627 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
628 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
629 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
630 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
631 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
632 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
633 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
634 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
635 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
/* Output row 1 -> ftmp12: coefficients (17, 10) and (-17, -22). */
638 "li %[tmp0], 0x000a0011 \n\t"
639 "mtc1 %[tmp0], %[ftmp3] \n\t"
640 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
641 "li %[tmp0], 0xffeaffef \n\t"
642 "mtc1 %[tmp0], %[ftmp4] \n\t"
643 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
644 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
645 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
646 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
647 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
648 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
649 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
650 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
651 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
652 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
653 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
654 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
655 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
656 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
/* Output row 2 -> ftmp13: coefficients (17, -10) and (-17, 22). */
659 "li %[tmp0], 0xfff60011 \n\t"
660 "mtc1 %[tmp0], %[ftmp3] \n\t"
661 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
662 "li %[tmp0], 0x0016ffef \n\t"
663 "mtc1 %[tmp0], %[ftmp4] \n\t"
664 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
665 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
666 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
667 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
668 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
669 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
670 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
671 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
672 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
673 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
674 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
675 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
676 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
677 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
/* Output row 3 -> ftmp14: coefficients (17, -22) and (17, -10). */
680 "li %[tmp0], 0xffea0011 \n\t"
681 "mtc1 %[tmp0], %[ftmp3] \n\t"
682 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
683 "li %[tmp0], 0xfff60011 \n\t"
684 "mtc1 %[tmp0], %[ftmp4] \n\t"
685 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
686 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
687 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
688 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
689 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
690 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
691 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
692 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
693 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
694 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
695 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
696 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
697 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
698 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
/* Load four dest rows at offset 0x00; tmp0 is reused as the row pointer. */
700 MMI_LWC1(%[ftmp1], %[dest], 0x00)
701 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
702 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
703 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
704 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
705 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
706 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
/* ftmp0 is zeroed here (the shift count is gone until it is reloaded below),
 * then used to widen the pixels, which get the residual rows added. */
707 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
708 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
709 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
710 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
711 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
712 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
713 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
714 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
715 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
716 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
717 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
718 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
719 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
/* Store the four updated rows back to dest offset 0x00. */
720 MMI_SWC1(%[ftmp1], %[dest], 0x00)
721 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
722 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
723 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
724 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
725 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
726 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
/* ---- second half: src offsets 0x08/0x18/0x28/0x38, dest offset 0x04 ---- */
/* Reload the shift count, which the pxor above cleared. */
729 "li %[tmp0], 0x07 \n\t"
730 "mtc1 %[tmp0], %[ftmp0] \n\t"
731 MMI_LDC1(%[ftmp1], %[
src], 0x08)
732 MMI_LDC1(%[ftmp2], %[
src], 0x18)
733 MMI_LDC1(%[ftmp3], %[
src], 0x28)
734 MMI_LDC1(%[ftmp4], %[
src], 0x38)
735 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
736 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
737 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
738 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
/* Output row 0 -> ftmp11. */
741 "li %[tmp0], 0x00160011 \n\t"
742 "mtc1 %[tmp0], %[ftmp3] \n\t"
743 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
744 "li %[tmp0], 0x000a0011 \n\t"
745 "mtc1 %[tmp0], %[ftmp4] \n\t"
746 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
747 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
748 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
749 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
750 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
751 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
752 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
753 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
754 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
755 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
756 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
757 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
758 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
759 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
/* Output row 1 -> ftmp12. */
762 "li %[tmp0], 0x000a0011 \n\t"
763 "mtc1 %[tmp0], %[ftmp3] \n\t"
764 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
765 "li %[tmp0], 0xffeaffef \n\t"
766 "mtc1 %[tmp0], %[ftmp4] \n\t"
767 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
768 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
769 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
770 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
771 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
772 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
773 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
774 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
775 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
776 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
777 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
778 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
779 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
780 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
/* Output row 2 -> ftmp13. */
783 "li %[tmp0], 0xfff60011 \n\t"
784 "mtc1 %[tmp0], %[ftmp3] \n\t"
785 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
786 "li %[tmp0], 0x0016ffef \n\t"
787 "mtc1 %[tmp0], %[ftmp4] \n\t"
788 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
789 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
790 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
791 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
792 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
793 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
794 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
795 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
796 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
797 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
798 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
799 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
800 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
801 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
/* Output row 3 -> ftmp14. */
804 "li %[tmp0], 0xffea0011 \n\t"
805 "mtc1 %[tmp0], %[ftmp3] \n\t"
806 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
807 "li %[tmp0], 0xfff60011 \n\t"
808 "mtc1 %[tmp0], %[ftmp4] \n\t"
809 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
810 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
811 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
812 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
813 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
814 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
815 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
816 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
817 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
818 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
819 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
820 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
821 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
822 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
/* Same load / widen / add / pack / store sequence on dest offset 0x04. */
824 MMI_LWC1(%[ftmp1], %[dest], 0x04)
825 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
826 MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
827 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
828 MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
829 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
830 MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
831 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
832 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
833 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
834 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
835 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
836 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
837 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
838 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
839 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
840 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
841 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
842 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
843 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
844 MMI_SWC1(%[ftmp1], %[dest], 0x04)
845 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
846 MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
847 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
848 MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
849 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
850 MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
/* Outputs: sixteen early-clobber FP scratch registers (some operand lines
 * between here and the inputs are not visible in this excerpt). */
852 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
853 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
854 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
855 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
856 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
857 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
858 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
859 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
/* Inputs: src, dest and linesize are read-only here. */
862 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
/*
 * DC-only path: add one scaled DC value to eight rows of 4 bytes at dest.
 * NOTE(review): the enclosing function header and the asm statement opener
 * are not visible in this excerpt.
 */
/* Scale dc in two rounding steps: (17*dc + 4) >> 3, then (12*dc + 64) >> 7. */
876 dc = (17 *
dc + 4) >> 3;
877 dc = (12 *
dc + 64) >> 7;
/* ftmp0 = 0. It is the pshufh selector here (presumably replicating halfword
 * 0 of dc into every lane) and the zero source for the unpack/pack below. */
881 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
882 "pshufh %[dc], %[dc], %[ftmp0] \n\t"
/* Load the eight rows; their addresses are precomputed as input operands
 * dest0..dest7 (dest + k*linesize). */
884 MMI_LWC1(%[ftmp1], %[dest0], 0x00)
885 MMI_LWC1(%[ftmp2], %[dest1], 0x00)
886 MMI_LWC1(%[ftmp3], %[dest2], 0x00)
887 MMI_LWC1(%[ftmp4], %[dest3], 0x00)
888 MMI_LWC1(%[ftmp5], %[dest4], 0x00)
889 MMI_LWC1(%[ftmp6], %[dest5], 0x00)
890 MMI_LWC1(%[ftmp7], %[dest6], 0x00)
891 MMI_LWC1(%[ftmp8], %[dest7], 0x00)
/* Interleave each row's low bytes with zero (presumably widening them to
 * halfwords). */
893 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
894 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
895 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
896 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
897 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
898 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
899 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
900 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
/* Add dc to every widened value (paddsh). */
902 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
903 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
904 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
905 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
906 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t"
907 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t"
908 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t"
909 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t"
/* Pack back to bytes (packushb) with zero as the second source. */
911 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
912 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
913 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
914 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
915 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
916 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
917 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
918 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
/* Store the eight rows back in place. */
920 MMI_SWC1(%[ftmp1], %[dest0], 0x00)
921 MMI_SWC1(%[ftmp2], %[dest1], 0x00)
922 MMI_SWC1(%[ftmp3], %[dest2], 0x00)
923 MMI_SWC1(%[ftmp4], %[dest3], 0x00)
924 MMI_SWC1(%[ftmp5], %[dest4], 0x00)
925 MMI_SWC1(%[ftmp6], %[dest5], 0x00)
926 MMI_SWC1(%[ftmp7], %[dest6], 0x00)
927 MMI_SWC1(%[ftmp8], %[dest7], 0x00)
/* Outputs: nine early-clobber FP scratch registers. */
928 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
929 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
930 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
931 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
933 [ftmp8]
"=&f"(ftmp[8])
/* Inputs: one address per row (the rest of this list is not visible in this
 * excerpt). */
934 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
935 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
936 [dest4]
"r"(dest+4*linesize), [dest5]
"r"(dest+5*linesize),
937 [dest6]
"r"(dest+6*linesize), [dest7]
"r"(dest+7*linesize),
943 #if _MIPS_SIM != _ABIO32
947 int16_t *dst =
block;
949 uint64_t count = 8,
tmp[1];
950 int16_t
coeff[16] = {17, 22, 17, 10,
958 "li %[tmp0], 0x03 \n\t"
959 "mtc1 %[tmp0], %[ftmp0] \n\t"
961 MMI_LDC1(%[ftmp2], %[
coeff], 0x00)
962 MMI_LDC1(%[ftmp3], %[
coeff], 0x08)
963 MMI_LDC1(%[ftmp4], %[
coeff], 0x10)
964 MMI_LDC1(%[ftmp5], %[
coeff], 0x18)
967 MMI_LDC1(%[ftmp1], %[
src], 0x00)
968 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
969 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
970 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
971 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
972 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
973 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
974 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
975 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
976 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
977 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
978 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
979 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
980 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
981 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
982 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
983 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
984 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
985 MMI_SDC1(%[ftmp8], %[dst], 0x00)
989 "addiu %[count], %[count], -0x01 \n\t"
990 "bnez %[count], 1b \n\t"
991 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
992 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
993 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
994 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
995 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
996 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
997 [tmp0]
"=&r"(
tmp[0]), [count]
"+&r"(count),
998 [
src]
"+&r"(
src), [dst]
"+&r"(dst)
1007 "li %[tmp0], 0x07 \n\t"
1008 "mtc1 %[tmp0], %[ftmp0] \n\t"
1010 MMI_LDC1(%[ftmp1], %[
src], 0x00)
1011 MMI_LDC1(%[ftmp2], %[
src], 0x20)
1012 MMI_LDC1(%[ftmp3], %[
src], 0x40)
1013 MMI_LDC1(%[ftmp4], %[
src], 0x60)
1014 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1015 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
1016 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1017 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
1019 MMI_LDC1(%[ftmp1], %[
src], 0x10)
1020 MMI_LDC1(%[ftmp2], %[
src], 0x30)
1021 MMI_LDC1(%[ftmp3], %[
src], 0x50)
1022 MMI_LDC1(%[ftmp4], %[
src], 0x70)
1023 "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
1024 "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
1025 "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t"
1026 "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t"
1044 MMI_LWC1(%[ftmp1], %[dest], 0x00)
1045 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
1046 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1047 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1048 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1049 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1050 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1051 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1052 MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1053 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1054 MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1055 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1056 MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1057 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1058 MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1059 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1060 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1061 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1062 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1063 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1064 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1065 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1066 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1067 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1069 "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t"
1070 "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t"
1071 "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t"
1072 "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t"
1073 "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t"
1074 "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t"
1075 "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t"
1076 "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t"
1078 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1079 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1080 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1081 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1082 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1083 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1084 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1085 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1087 MMI_SWC1(%[ftmp1], %[dest], 0x00)
1088 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
1089 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1090 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1091 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1092 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1093 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1094 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1095 MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1096 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1097 MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1098 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1099 MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1100 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1101 MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1103 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1104 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1105 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1106 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1107 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1108 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1109 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1110 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1111 [ftmp16]
"=&f"(ftmp[16]), [ftmp17]
"=&f"(ftmp[17]),
1112 [ftmp18]
"=&f"(ftmp[18]), [ftmp19]
"=&f"(ftmp[19]),
1113 [ftmp20]
"=&f"(ftmp[20]), [ftmp21]
"=&f"(ftmp[21]),
1114 [ftmp22]
"=&f"(ftmp[22]),
1117 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
1131 dc = (17 *
dc + 4) >> 3;
1132 dc = (17 *
dc + 64) >> 7;
1136 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1137 "pshufh %[dc], %[dc], %[ftmp0] \n\t"
1139 MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1140 MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1141 MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1142 MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1144 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1145 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1146 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1147 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1149 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t"
1150 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t"
1151 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t"
1152 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t"
1154 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1155 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1156 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1157 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1159 MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1160 MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1161 MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1162 MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1163 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1164 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1166 [ftmp4]
"=&f"(ftmp[4])
1167 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
1168 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
1177 int16_t *dst =
block;
1179 uint32_t count = 4,
tmp[1];
1180 int16_t
coeff[16] = {17, 22, 17, 10,
1187 "li %[tmp0], 0x03 \n\t"
1188 "mtc1 %[tmp0], %[ftmp0] \n\t"
1189 MMI_LDC1(%[ftmp2], %[
coeff], 0x00)
1190 MMI_LDC1(%[ftmp3], %[
coeff], 0x08)
1191 MMI_LDC1(%[ftmp4], %[
coeff], 0x10)
1192 MMI_LDC1(%[ftmp5], %[
coeff], 0x18)
1195 MMI_LDC1(%[ftmp1], %[
src], 0x00)
1196 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t"
1197 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t"
1198 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t"
1199 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t"
1200 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t"
1201 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t"
1202 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
1203 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
1204 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t"
1205 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t"
1206 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t"
1207 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t"
1208 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1209 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
1210 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t"
1211 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t"
1212 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t"
1213 MMI_SDC1(%[ftmp8], %[dst], 0x00)
1217 "addiu %[count], %[count], -0x01 \n\t"
1218 "bnez %[count], 1b \n\t"
1219 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1220 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1221 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1222 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1223 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1224 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1225 [tmp0]
"=&r"(
tmp[0]), [count]
"+&r"(count),
1226 [
src]
"+&r"(
src), [dst]
"+&r"(dst)
1235 "li %[tmp0], 0x07 \n\t"
1236 "mtc1 %[tmp0], %[ftmp0] \n\t"
1237 "li %[tmp0], 0x44 \n\t"
1238 "mtc1 %[tmp0], %[ftmp15] \n\t"
1240 MMI_LDC1(%[ftmp1], %[
src], 0x00)
1241 MMI_LDC1(%[ftmp2], %[
src], 0x10)
1242 MMI_LDC1(%[ftmp3], %[
src], 0x20)
1243 MMI_LDC1(%[ftmp4], %[
src], 0x30)
1244 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
1245 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
1246 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
1247 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
1250 "li %[tmp0], 0x00160011 \n\t"
1251 "mtc1 %[tmp0], %[ftmp3] \n\t"
1252 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
1253 "li %[tmp0], 0x000a0011 \n\t"
1254 "mtc1 %[tmp0], %[ftmp4] \n\t"
1255 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
1256 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
1257 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
1258 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
1259 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
1260 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
1261 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
1262 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
1263 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
1264 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
1265 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
1266 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
1267 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
1268 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
1271 "li %[tmp0], 0x000a0011 \n\t"
1272 "mtc1 %[tmp0], %[ftmp3] \n\t"
1273 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
1274 "li %[tmp0], 0xffeaffef \n\t"
1275 "mtc1 %[tmp0], %[ftmp4] \n\t"
1276 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
1277 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
1278 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
1279 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
1280 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
1281 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
1282 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
1283 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
1284 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
1285 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
1286 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
1287 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
1288 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
1289 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
1292 "li %[tmp0], 0xfff60011 \n\t"
1293 "mtc1 %[tmp0], %[ftmp3] \n\t"
1294 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
1295 "li %[tmp0], 0x0016ffef \n\t"
1296 "mtc1 %[tmp0], %[ftmp4] \n\t"
1297 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
1298 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
1299 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
1300 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
1301 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
1302 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
1303 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
1304 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
1305 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
1306 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
1307 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
1308 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
1309 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
1310 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
1313 "li %[tmp0], 0xffea0011 \n\t"
1314 "mtc1 %[tmp0], %[ftmp3] \n\t"
1315 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
1316 "li %[tmp0], 0xfff60011 \n\t"
1317 "mtc1 %[tmp0], %[ftmp4] \n\t"
1318 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
1319 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
1320 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
1321 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
1322 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
1323 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
1324 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
1325 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
1326 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
1327 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
1328 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
1329 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
1330 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
1331 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
1333 MMI_LWC1(%[ftmp1], %[dest], 0x00)
1334 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
1335 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1336 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1337 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1338 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1339 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1340 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1341 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1342 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1343 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1344 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1345 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
1346 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
1347 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
1348 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
1349 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1350 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1351 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1352 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1354 MMI_SWC1(%[ftmp1], %[dest], 0x00)
1355 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
1356 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1357 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1358 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1359 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
1360 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1362 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1363 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1364 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1365 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1366 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1367 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1368 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1369 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1372 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
1384 for (
i = 0;
i < 8;
i++) {
1389 d1 = (
a -
d + 3 +
rnd) >> 3;
1390 d2 = (
a -
d +
b -
c + 4 -
rnd) >> 3;
1406 int rnd1 =
flags & 2 ? 3 : 4;
1407 int rnd2 = 7 - rnd1;
1408 for (
i = 0;
i < 8;
i++) {
1416 left[6] = ((
a << 3) - d1 + rnd1) >> 3;
1417 left[7] = ((
b << 3) - d2 + rnd2) >> 3;
1418 right[0] = ((c << 3) + d2 + rnd1) >> 3;
1419 right[1] = ((d << 3) + d1 + rnd2) >> 3;
1421 right += right_stride;
1422 left += left_stride;
1437 for (
i = 0;
i < 8;
i++) {
1442 d1 = (
a -
d + 3 +
rnd) >> 3;
1443 d2 = (
a -
d +
b -
c + 4 -
rnd) >> 3;
1459 int rnd1 = 4, rnd2 = 3;
1460 for (
i = 0;
i < 8;
i++) {
1468 top[48] = ((
a << 3) - d1 + rnd1) >> 3;
1469 top[56] = ((
b << 3) - d2 + rnd2) >> 3;
1470 bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1471 bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1492 int a0_sign =
a0 >> 31;
1494 a0 = (
a0 ^ a0_sign) - a0_sign;
1502 int clip_sign =
clip >> 31;
1504 clip = ((
clip ^ clip_sign) - clip_sign) >> 1;
1507 int d = 5 * (
a3 -
a0);
1508 int d_sign = (
d >> 31);
1510 d = ((
d ^ d_sign) - d_sign) >> 3;
1513 if (d_sign ^ clip_sign)
1517 d = (
d ^ d_sign) - d_sign;
1543 for (
i = 0;
i <
len;
i += 4) {
/* Store hook for the "put" variants: expands to nothing, so the result in D
 * is written as-is, without being combined with the memory operand S. */
1605 #define OP_PUT(S, D)
/* Store hook for the "avg" variants: loads 64 bits from memory operand S
 * into $f16 and replaces register D with the packed byte average (pavgb) of
 * D and that loaded value. Clobbers $f16. */
1606 #define OP_AVG(S, D) \
1607 "ldc1 $f16, "#S" \n\t" \
1608 "pavgb "#D", "#D", $f16 \n\t"
/* Adds the rounder held in $f14 to both accumulators ($f6 and $f8), then
 * arithmetically shifts every halfword right by SHIFT. SHIFT is a string
 * naming the FP register that holds the shift count. */
1611 #define NORMALIZE_MMI(SHIFT) \
1612 "paddh $f6, $f6, $f14 \n\t" \
1613 "paddh $f8, $f8, $f14 \n\t" \
1614 "psrah $f6, $f6, "SHIFT" \n\t" \
1615 "psrah $f8, $f8, "SHIFT" \n\t"
/* Packs the eight 16-bit results in $f6/$f8 to unsigned-saturated bytes and
 * stores the 8 bytes at dst.
 * NOTE(review): this listing skips source line 1619, which sits between the
 * pack and the store; OP is not referenced in the lines visible here, so it
 * is presumably applied on that missing line -- confirm against the real
 * source. */
1617 #define TRANSFER_DO_PACK(OP) \
1618 "packushb $f6, $f6, $f8 \n\t" \
1620 "sdc1 $f6, 0x00(%[dst]) \n\t"
/* Writes the eight 16-bit results without narrowing them: applies OP to
 * ($f6, dst+0) and ($f8, dst+8), then stores $f6 and $f8 as two 64-bit
 * words at dst+0 and dst+8. */
1622 #define TRANSFER_DONT_PACK(OP) \
1623 OP(0(%[dst]), $f6) \
1624 OP(8(%[dst]), $f8) \
1625 "sdc1 $f6, 0x00(%[dst]) \n\t" \
1626 "sdc1 $f8, 0x08(%[dst]) \n\t"
/* Interleaves the low bytes of reg with the bytes of $f0. The asm blocks in
 * this file clear $f0 with pxor first, which makes this a zero-extension of
 * four bytes to four halfwords. reg is a string naming the register. */
1629 #define DO_UNPACK(reg) \
1630 "punpcklbh "reg", "reg", $f0 \n\t"
/* No-op counterpart of DO_UNPACK for data that is already 16-bit. */
1631 #define DONT_UNPACK(reg)
/* Loads 32 bits from memory operand ROUND into $f14 and replicates its low
 * halfword into all four halfword lanes (halfword interleave followed by a
 * word interleave of the register with itself). $f14 is the rounder that
 * NORMALIZE_MMI and SHIFT2_LINE add before shifting. */
1634 #define LOAD_ROUNDER_MMI(ROUND) \
1635 "lwc1 $f14, "ROUND" \n\t" \
1636 "punpcklhw $f14, $f14, $f14 \n\t" \
1637 "punpcklwd $f14, $f14, $f14 \n\t"
/* Produces one output row of four 16-bit values for a vertical 4-tap filter
 * and advances src by one row.
 *
 * On entry R1 and R2 hold the two (already unpacked) inner rows. The macro
 * computes, per halfword:
 *     R1 = ((R1 + R2) * $f6 - row(src + stride1) - row(src + stride) + $f14)
 *          >> shift
 * and stores the 8-byte result at dst + OFF. R0 and R3 receive the two outer
 * rows (4 bytes each, unpacked against $f0), so they are overwritten.
 *
 * Expects $f0 == 0 and $f14 == rounder; $f6 is the inner-tap multiplier and
 * is set outside this macro (not visible here). Uses $9 as address scratch. */
1640 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \
1641 "paddh "#R1", "#R1", "#R2" \n\t" \
1642 PTR_ADDU "$9, %[src], %[stride1] \n\t" \
1643 MMI_ULWC1(R0, $9, 0x00) \
1644 "pmullh "#R1", "#R1", $f6 \n\t" \
1645 "punpcklbh "#R0", "#R0", $f0 \n\t" \
1646 PTR_ADDU "$9, %[src], %[stride] \n\t" \
1647 MMI_ULWC1(R3, $9, 0x00) \
1648 "psubh "#R1", "#R1", "#R0" \n\t" \
1649 "punpcklbh "#R3", "#R3", $f0 \n\t" \
1650 "paddh "#R1", "#R1", $f14 \n\t" \
1651 "psubh "#R1", "#R1", "#R3" \n\t" \
1652 "psrah "#R1", "#R1", %[shift] \n\t" \
1653 MMI_SDC1(R1, %[dst], OFF) \
1654 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1667 "pxor $f0, $f0, $f0 \n\t"
1671 MMI_ULWC1($f4, %[
src], 0x00)
1672 PTR_ADDU "%[src], %[src], %[stride] \n\t"
1673 MMI_ULWC1($f6, %[
src], 0x00)
1674 "punpcklbh $f4, $f4, $f0 \n\t"
1675 "punpcklbh $f6, $f6, $f0 \n\t"
1684 PTR_SUBU "%[src], %[src], %[stride2] \n\t"
1686 "addiu $8, $8, -0x01 \n\t"
1689 [
src]
"+r"(
src), [dst]
"+r"(dst)
1693 :
"$8",
"$9",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
1694 "$f14",
"$f16",
"memory"
/* Defines OPNAME##vc1_hor_16b_shift2_mmi(): the horizontal pass over a
 * 16-bit intermediate buffer.
 *
 * For each of the 8 outputs i of a row it evaluates
 *     (-src[i] + 9*src[i+1] + 9*src[i+2] - src[i+3] + rounder) >> 7
 * adds 128 to every lane, packs to unsigned bytes and stores 8 bytes at dst.
 * rnd is biased by -(-1+9+9-1)*1024 beforehand. Each iteration advances src
 * by 0x18 bytes (12 int16) and dst by stride, looping until h reaches zero.
 *
 * Tap loads (byte offsets into the int16 row):
 *     $f2/$f4 : src[0..3]/src[4..7]  + src[3..6]/src[7..10]   (outer taps)
 *     $f6/$f8 : src[1..4]/src[5..8]  + src[2..5]/src[6..9]    (inner taps)
 *
 * Fix: the last inner-tap load used byte offset 0x0b, which straddles two
 * int16 samples; src[6..9] lives at byte offset 2*6 = 0x0c.
 *
 * NOTE: this listing omits several original lines of the macro (braces, the
 * declaration of h, the asm statement header and the loop label). */
1702 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \
1703 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
1704 const int16_t *src, int rnd) \
1707 DECLARE_VAR_ALL64; \
1708 DECLARE_VAR_ADDRT; \
1711 rnd -= (-1+9+9-1)*1024; \
1714 LOAD_ROUNDER_MMI("%[rnd]") \
1716 MMI_ULDC1($f2, %[src], 0x00) \
1717 MMI_ULDC1($f4, %[src], 0x08) \
1718 MMI_ULDC1($f6, %[src], 0x02) \
1719 MMI_ULDC1($f8, %[src], 0x0a) \
1720 MMI_ULDC1($f0, %[src], 0x06) \
1721 "paddh $f2, $f2, $f0 \n\t" \
1722 MMI_ULDC1($f0, %[src], 0x0e) \
1723 "paddh $f4, $f4, $f0 \n\t" \
1724 MMI_ULDC1($f0, %[src], 0x04) \
1725 "paddh $f6, $f6, $f0 \n\t" \
1726 MMI_ULDC1($f0, %[src], 0x0c) /* src[6..9]; was 0x0b (odd offset) */ \
1727 "paddh $f8, $f8, $f0 \n\t" \
1728 "pmullh $f6, $f6, %[ff_pw_9] \n\t" \
1729 "pmullh $f8, $f8, %[ff_pw_9] \n\t" \
1730 "psubh $f6, $f6, $f2 \n\t" \
1731 "psubh $f8, $f8, $f4 \n\t" \
1732 "li $8, 0x07 \n\t" \
1733 "mtc1 $8, $f16 \n\t" \
1734 NORMALIZE_MMI("$f16") \
1736 "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1737 "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1738 TRANSFER_DO_PACK(OP) \
1739 "addiu %[h], %[h], -0x01 \n\t" \
1740 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1741 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1742 "bnez %[h], 1b \n\t" \
1743 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1745 [src]"+r"(src), [dst]"+r"(dst) \
1746 : [stride]"r"(stride), [rnd]"m"(rnd), \
1747 [ff_pw_9]"f"(ff_pw_9.f), [ff_pw_128]"f"(ff_pw_128.f) \
1748 : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f14", \
/* Defines OPNAME##vc1_shift2_mmi(): a single-pass 8-bit 4-tap filter over
 * 8 rows ($10 is the row counter, initialised to 8) of 8 pixels.
 *
 * Per row, with p = src on entry to the iteration:
 *     out = (9 * (p[0] + p[offset]) - p[-offset] - p[2*offset] + rounder) >> 4
 * packed to unsigned bytes and stored as 8 bytes at dst. src is advanced by
 * offset in the middle of the iteration and by stride - offset at the end,
 * i.e. by stride per row; dst advances by stride.
 *
 * NOTE(review): the listing skips the end of the parameter list (the
 * `offset` parameter is used but its declaration is not visible) and source
 * line 1807 between the pack and the store; OP is not referenced in the
 * visible lines and is presumably applied there -- confirm. */
1760 #define VC1_SHIFT2(OP, OPNAME)\
1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \
1762 mips_reg stride, int rnd, \
1765 DECLARE_VAR_LOW32; \
1766 DECLARE_VAR_ADDRT; \
1771 "pxor $f0, $f0, $f0 \n\t" \
1772 "li $10, 0x08 \n\t" \
1773 LOAD_ROUNDER_MMI("%[rnd]") \
1775 MMI_ULWC1($f6, %[src], 0x00) \
1776 MMI_ULWC1($f8, %[src], 0x04) \
1777 PTR_ADDU "$9, %[src], %[offset] \n\t" \
1778 MMI_ULWC1($f2, $9, 0x00) \
1779 MMI_ULWC1($f4, $9, 0x04) \
1780 PTR_ADDU "%[src], %[src], %[offset] \n\t" \
1781 "punpcklbh $f6, $f6, $f0 \n\t" \
1782 "punpcklbh $f8, $f8, $f0 \n\t" \
1783 "punpcklbh $f2, $f2, $f0 \n\t" \
1784 "punpcklbh $f4, $f4, $f0 \n\t" \
1785 "paddh $f6, $f6, $f2 \n\t" \
1786 "paddh $f8, $f8, $f4 \n\t" \
1787 PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \
1788 MMI_ULWC1($f2, $9, 0x00) \
1789 MMI_ULWC1($f4, $9, 0x04) \
1790 "pmullh $f6, $f6, %[ff_pw_9] \n\t" \
1791 "pmullh $f8, $f8, %[ff_pw_9] \n\t" \
1792 "punpcklbh $f2, $f2, $f0 \n\t" \
1793 "punpcklbh $f4, $f4, $f0 \n\t" \
1794 "psubh $f6, $f6, $f2 \n\t" \
1795 "psubh $f8, $f8, $f4 \n\t" \
1796 PTR_ADDU "$9, %[src], %[offset] \n\t" \
1797 MMI_ULWC1($f2, $9, 0x00) \
1798 MMI_ULWC1($f4, $9, 0x04) \
1799 "punpcklbh $f2, $f2, $f0 \n\t" \
1800 "punpcklbh $f4, $f4, $f0 \n\t" \
1801 "psubh $f6, $f6, $f2 \n\t" \
1802 "psubh $f8, $f8, $f4 \n\t" \
1803 "li $8, 0x04 \n\t" \
1804 "mtc1 $8, $f16 \n\t" \
1805 NORMALIZE_MMI("$f16") \
1806 "packushb $f6, $f6, $f8 \n\t" \
1808 "sdc1 $f6, 0x00(%[dst]) \n\t" \
1809 "addiu $10, $10, -0x01 \n\t" \
1810 PTR_ADDU "%[src], %[src], %[stride1] \n\t" \
1811 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1812 "bnez $10, 1b \n\t" \
1813 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1814 [src]"+r"(src), [dst]"+r"(dst) \
1815 : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \
1816 [stride]"r"(stride), [rnd]"m"(rnd), \
1817 [stride1]"r"(stride-offset), \
1818 [ff_pw_9]"f"(ff_pw_9.f) \
1819 : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
1820 "$f14", "$f16", "memory" \
/* Core of the 4-tap (-3, 18, 53, -4) filter. Leaves the unnormalised sums
 * in $f6 (first four lanes) and $f8 (next four lanes):
 *     acc = 18 * [src + A2] - 3 * [src + A1] - 4 * [src + A4] + 53 * [src + A3]
 *
 * LOAD is the load macro to use and M scales the second load's displacement
 * (M*0 and M*4), so M = 1 reads two 4-byte groups and M = 2 two 8-byte
 * groups. A1..A4 are operand names added to src to address each tap.
 * Scratch: $8, $9, $f2, $f4, $f16.
 *
 * NOTE(review): the listing skips two source lines after every pair of
 * loads; UNPACK is not referenced in the visible lines and is presumably
 * applied to the freshly loaded registers there -- confirm. */
1838 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \
1839 PTR_ADDU "$9, %[src], "#A1" \n\t" \
1840 LOAD($f2, $9, M*0) \
1841 LOAD($f4, $9, M*4) \
1844 "pmullh $f2, $f2, %[ff_pw_3] \n\t" \
1845 "pmullh $f4, $f4, %[ff_pw_3] \n\t" \
1846 PTR_ADDU "$9, %[src], "#A2" \n\t" \
1847 LOAD($f6, $9, M*0) \
1848 LOAD($f8, $9, M*4) \
1851 "pmullh $f6, $f6, %[ff_pw_18] \n\t" \
1852 "pmullh $f8, $f8, %[ff_pw_18] \n\t" \
1853 "psubh $f6, $f6, $f2 \n\t" \
1854 "psubh $f8, $f8, $f4 \n\t" \
1855 PTR_ADDU "$9, %[src], "#A4" \n\t" \
1856 LOAD($f2, $9, M*0) \
1857 LOAD($f4, $9, M*4) \
1860 "li $8, 0x02 \n\t" \
1861 "mtc1 $8, $f16 \n\t" \
1862 "psllh $f2, $f2, $f16 \n\t" \
1863 "psllh $f4, $f4, $f16 \n\t" \
1864 "psubh $f6, $f6, $f2 \n\t" \
1865 "psubh $f8, $f8, $f4 \n\t" \
1866 PTR_ADDU "$9, %[src], "#A3" \n\t" \
1867 LOAD($f2, $9, M*0) \
1868 LOAD($f4, $9, M*4) \
1871 "pmullh $f2, $f2, %[ff_pw_53] \n\t" \
1872 "pmullh $f4, $f4, %[ff_pw_53] \n\t" \
1873 "paddh $f6, $f6, $f2 \n\t" \
1874 "paddh $f8, $f8, $f4 \n\t"
/* Defines vc1_put_ver_16b_<NAME>_mmi(): the vertical pass that turns 8-bit
 * pixels into a 16-bit intermediate buffer.
 *
 * src is moved up by one row first. Every loop iteration then writes 12
 * int16 values (0x18 bytes) to dst:
 *   - columns 0..7 through MSPEL_FILTER13_CORE, normalised with the rounder
 *     and %[shift], stored unpacked at dst+0 / dst+8;
 *   - columns 8..11 with the same (-3, 18, 53, -4) taps, computed by hand on
 *     the 4 bytes at displacement 8 and stored at dst+0x10.
 * src advances by src_stride and dst by 0x18 bytes until h reaches zero.
 *
 * Fix: the tail columns were shifted by a hard-coded 6 while the first
 * eight use %[shift] and the rounder passed in rnd is derived from that
 * same shift (the caller in this file passes 3 or 5). The tail now uses
 * %[shift] as well, so the whole row is scaled consistently.
 *
 * NOTE: this listing omits several original lines of the macro (return
 * type, braces, the declaration of h, the asm statement header, the loop
 * label and the unpack steps after each tail load). */
1884 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
1886 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \
1887 mips_reg src_stride, \
1888 int rnd, int64_t shift) \
1891 union mmi_intfloat64 shift_u; \
1892 DECLARE_VAR_LOW32; \
1893 DECLARE_VAR_ADDRT; \
1894 shift_u.i = shift; \
1896 src -= src_stride; \
1899 "pxor $f0, $f0, $f0 \n\t" \
1900 LOAD_ROUNDER_MMI("%[rnd]") \
1903 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
1904 NORMALIZE_MMI("%[shift]") \
1905 TRANSFER_DONT_PACK(OP_PUT) \
1907 PTR_ADDU "$9, %[src], "#A1" \n\t" \
1908 MMI_ULWC1($f2, $9, 0x08) \
1910 "mov.d $f6, $f2 \n\t" \
1911 "paddh $f2, $f2, $f2 \n\t" \
1912 "paddh $f2, $f2, $f6 \n\t" \
1913 PTR_ADDU "$9, %[src], "#A2" \n\t" \
1914 MMI_ULWC1($f6, $9, 0x08) \
1916 "pmullh $f6, $f6, %[ff_pw_18] \n\t" \
1917 "psubh $f6, $f6, $f2 \n\t" \
1918 PTR_ADDU "$9, %[src], "#A3" \n\t" \
1919 MMI_ULWC1($f2, $9, 0x08) \
1921 "pmullh $f2, $f2, %[ff_pw_53] \n\t" \
1922 "paddh $f6, $f6, $f2 \n\t" \
1923 PTR_ADDU "$9, %[src], "#A4" \n\t" \
1924 MMI_ULWC1($f2, $9, 0x08) \
1926 "li $8, 0x02 \n\t" \
1927 "mtc1 $8, $f16 \n\t" \
1928 "psllh $f2, $f2, $f16 \n\t" \
1929 "psubh $f6, $f6, $f2 \n\t" \
1930 "paddh $f6, $f6, $f14 \n\t" \
1933 "psrah $f6, $f6, %[shift] \n\t" /* same shift as columns 0..7 */ \
1934 "sdc1 $f6, 0x10(%[dst]) \n\t" \
1935 "addiu %[h], %[h], -0x01 \n\t" \
1936 PTR_ADDU "%[src], %[src], %[stride_x1] \n\t" \
1937 PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \
1938 "bnez %[h], 1b \n\t" \
1939 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
1941 [src]"+r"(src), [dst]"+r"(dst) \
1942 : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \
1943 [stride_x3]"r"(3*src_stride), \
1944 [rnd]"m"(rnd), [shift]"f"(shift_u.f), \
1945 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
1946 [ff_pw_3]"f"(ff_pw_3.f) \
1947 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
1948 "$f14", "$f16", "memory" \
/* Defines OPNAME##vc1_hor_16b_<NAME>_mmi(): the horizontal pass that reads
 * the 16-bit intermediate buffer and writes 8-bit pixels.
 *
 * rnd is biased by -(-4+58+13-3)*256 beforehand. Per row the macro runs
 * MSPEL_FILTER13_CORE on already-16-bit data (DONT_UNPACK, 8-byte loads),
 * adds the rounder and shifts right by 7, adds 128 to every lane, then
 * packs to unsigned bytes and stores 8 bytes at dst. src advances by 0x18
 * bytes (12 int16) and dst by stride until h reaches zero.
 *
 * Change: the constant src increment now uses PTR_ADDIU, the immediate form
 * already used for the same 0x18 step by the sibling macros in this file,
 * instead of relying on the assembler to accept an immediate for PTR_ADDU.
 *
 * NOTE: this listing omits several original lines of the macro (return
 * type, braces, the declaration of h, the asm statement header and the loop
 * label). */
1959 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
1961 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \
1962 const int16_t *src, int rnd) \
1965 DECLARE_VAR_ALL64; \
1966 DECLARE_VAR_ADDRT; \
1969 rnd -= (-4+58+13-3)*256; \
1972 "pxor $f0, $f0, $f0 \n\t" \
1973 LOAD_ROUNDER_MMI("%[rnd]") \
1976 MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \
1977 "li $8, 0x07 \n\t" \
1978 "mtc1 $8, $f16 \n\t" \
1979 NORMALIZE_MMI("$f16") \
1981 "paddh $f6, $f6, %[ff_pw_128] \n\t" \
1982 "paddh $f8, $f8, %[ff_pw_128] \n\t" \
1983 TRANSFER_DO_PACK(OP) \
1984 "addiu %[h], %[h], -0x01 \n\t" \
1985 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \
1986 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
1987 "bnez %[h], 1b \n\t" \
1988 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \
1990 [src]"+r"(src), [dst]"+r"(dst) \
1991 : [stride]"r"(stride), [rnd]"m"(rnd), \
1992 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
1993 [ff_pw_3]"f"(ff_pw_3.f), [ff_pw_128]"f"(ff_pw_128.f) \
1994 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
1995 "$f14", "$f16", "memory" \
/* Defines OPNAME##vc1_<NAME>_mmi(): a single-pass 8-bit (-3, 18, 53, -4)
 * filter along one direction, selected by `offset` (the distance between
 * neighbouring taps; multiples x1, x2, x3 are passed as asm operands for
 * A1..A4 to name).
 *
 * Per row: MSPEL_FILTER13_CORE on 8 unpacked pixels, add the rounder and
 * shift right by 6, pack to unsigned bytes and store 8 bytes at dst. src and
 * dst both advance by stride until h reaches zero.
 *
 * NOTE(review): the listing skips several original lines (return type,
 * braces, the declaration of h, the loop label and source line 2038, where
 * the %[rnd] operand used by LOAD_ROUNDER_MMI is presumably bound) --
 * confirm against the real source. */
2007 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
2009 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \
2010 mips_reg stride, int rnd, mips_reg offset) \
2013 DECLARE_VAR_LOW32; \
2014 DECLARE_VAR_ADDRT; \
2019 __asm__ volatile ( \
2020 "pxor $f0, $f0, $f0 \n\t" \
2021 LOAD_ROUNDER_MMI("%[rnd]") \
2024 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \
2025 "li $8, 0x06 \n\t" \
2026 "mtc1 $8, $f16 \n\t" \
2027 NORMALIZE_MMI("$f16") \
2028 TRANSFER_DO_PACK(OP) \
2029 "addiu %[h], %[h], -0x01 \n\t" \
2030 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
2031 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
2032 "bnez %[h], 1b \n\t" \
2033 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \
2035 [src]"+r"(src), [dst]"+r"(dst) \
2036 : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \
2037 [offset_x3]"r"(3*offset), [stride]"r"(stride), \
2039 [ff_pw_53]"f"(ff_pw_53.f), [ff_pw_18]"f"(ff_pw_18.f), \
2040 [ff_pw_3]"f"(ff_pw_3.f) \
2041 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", \
2042 "$f14", "$f16", "memory" \
/* Defines OP##vc1_mspel_mc() (one 8x8 block) and OP##vc1_mspel_mc_16()
 * (a 16x16 block built from four 8x8 calls).
 *
 * The 8x8 routine dispatches through three tables indexed by the sub-pel
 * mode (index 0 is NULL, 1..3 are the shift1/shift2/shift3 kernels):
 *   - two-pass case: the vertical 16-bit kernel for vmode writes a 12x8
 *     int16 scratch buffer from src-1, using
 *         shift = (shift_value[hmode] + shift_value[vmode]) >> 1
 *         r     = (1 << (shift - 1)) + rnd - 1
 *     and the horizontal 16-bit kernel for hmode then reads tmp+1 with
 *     rounder 64 - rnd;
 *   - vertical only: the 8-bit kernel for vmode with rounder 1 - rnd and
 *     tap distance stride;
 *   - horizontal only: the 8-bit kernel for hmode with rounder rnd and tap
 *     distance 1.
 * The vertical 16-bit table always holds the put kernels; OP selects the
 * horizontal and 8-bit kernels.
 *
 * NOTE(review): the listing skips the braces and the conditions that select
 * between the three cases; the case labels above are inferred from the
 * arguments of the visible calls -- confirm against the real source. */
2081 #define VC1_MSPEL_MC(OP) \
2082 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
2083 int hmode, int vmode, int rnd) \
2085 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
2086 { NULL, vc1_put_ver_16b_shift1_mmi, \
2087 vc1_put_ver_16b_shift2_mmi, \
2088 vc1_put_ver_16b_shift3_mmi }; \
2089 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
2090 { NULL, OP ## vc1_hor_16b_shift1_mmi, \
2091 OP ## vc1_hor_16b_shift2_mmi, \
2092 OP ## vc1_hor_16b_shift3_mmi }; \
2093 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
2094 { NULL, OP ## vc1_shift1_mmi, \
2095 OP ## vc1_shift2_mmi, \
2096 OP ## vc1_shift3_mmi }; \
2100 static const int shift_value[] = { 0, 5, 1, 5 }; \
2101 int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
2103 LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
2105 r = (1<<(shift-1)) + rnd-1; \
2106 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
2108 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
2112 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
2118 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
2120 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
2121 int stride, int hmode, int vmode, int rnd)\
2123 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2124 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
2125 dst += 8*stride; src += 8*stride; \
2126 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
2127 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
/* Emits the four public entry points for one sub-pel position:
 * ff_{put,avg}_vc1_mspel_mc<a><b>_mmi and their _16 variants. Each forwards
 * to the matching {put,avg}_vc1_mspel_mc[_16]() helper with a as hmode and
 * b as vmode.
 * NOTE: the listing skips the remaining parameter lines and the braces of
 * each wrapper. */
2134 #define DECLARE_FUNCTION(a, b) \
2135 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2136 const uint8_t *src, \
2140 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2142 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \
2143 const uint8_t *src, \
2147 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
2149 void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2150 const uint8_t *src, \
2154 put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
2156 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \
2157 const uint8_t *src, \
2161 avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
/* Bilinear blend of eight pixels. On entry %[ftmp1]..%[ftmp4] hold the four
 * neighbouring 8-byte rows and %[A]..%[D] their weights replicated across
 * the halfword lanes.
 *
 * Each input is split against %[ftmp0] (cleared by the asm blocks that use
 * this macro) into low halves (%[ftmp1]..%[ftmp4]) and high halves
 * (%[ftmp5]..%[ftmp8]), then per lane:
 *     out = (A*p1 + B*p2 + C*p3 + D*p4 + ff_pw_28) >> ftmp9   (logical shift)
 * The two halves are packed back to unsigned bytes in %[ftmp1]. */
2183 #define CHROMA_MC_8_MMI \
2184 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
2185 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2186 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
2187 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2188 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
2189 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2190 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
2191 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2193 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2194 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \
2195 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2196 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \
2197 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2198 "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \
2199 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2200 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \
2202 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2203 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2204 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2205 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2207 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
2208 "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \
2209 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
2210 "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \
2212 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \
2213 "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
2214 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
/* Four-pixel variant of CHROMA_MC_8_MMI: only the low four bytes of
 * %[ftmp1]..%[ftmp4] are unpacked against %[ftmp0], weighted by
 * %[A]..%[D], summed with ff_pw_28 and shifted right (logically) by
 * %[ftmp5]. The result is packed with %[ftmp0] into the low four bytes of
 * %[ftmp1]. */
2217 #define CHROMA_MC_4_MMI \
2218 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
2219 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
2220 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
2221 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
2223 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \
2224 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \
2225 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \
2226 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \
2228 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
2229 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
2230 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \
2231 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \
2233 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
2234 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
2238 const uint8_t *
src ,
2239 ptrdiff_t
stride,
int h,
int x,
int y)
2246 A.i = (8 - x) * (8 - y);
2247 B.i = (x) * (8 - y);
2248 C.i = (8 - x) * (y);
2251 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2254 "li %[tmp0], 0x06 \n\t"
2255 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2256 "mtc1 %[tmp0], %[ftmp9] \n\t"
2257 "pshufh %[A], %[A], %[ftmp0] \n\t"
2258 "pshufh %[B], %[B], %[ftmp0] \n\t"
2259 "pshufh %[C], %[C], %[ftmp0] \n\t"
2260 "pshufh %[D], %[D], %[ftmp0] \n\t"
2263 MMI_ULDC1(%[ftmp1], %[
src], 0x00)
2264 MMI_ULDC1(%[ftmp2], %[
src], 0x01)
2265 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2266 MMI_ULDC1(%[ftmp3], %[
src], 0x00)
2267 MMI_ULDC1(%[ftmp4], %[
src], 0x01)
2271 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2272 "addiu %[h], %[h], -0x01 \n\t"
2273 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2274 "bnez %[h], 1b \n\t"
2275 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2276 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2277 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2278 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2279 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2282 [tmp0]
"=&r"(
tmp[0]),
2283 [
src]
"+&r"(
src), [dst]
"+&r"(dst),
2286 [
A]
"f"(
A.f), [
B]
"f"(
B.f),
2287 [
C]
"f"(
C.f), [
D]
"f"(
D.f),
2294 const uint8_t *
src ,
2295 ptrdiff_t
stride,
int h,
int x,
int y)
2302 A.i = (8 - x) * (8 - y);
2303 B.i = (x) * (8 - y);
2304 C.i = (8 - x) * (y);
2307 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2310 "li %[tmp0], 0x06 \n\t"
2311 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2312 "mtc1 %[tmp0], %[ftmp5] \n\t"
2313 "pshufh %[A], %[A], %[ftmp0] \n\t"
2314 "pshufh %[B], %[B], %[ftmp0] \n\t"
2315 "pshufh %[C], %[C], %[ftmp0] \n\t"
2316 "pshufh %[D], %[D], %[ftmp0] \n\t"
2319 MMI_ULWC1(%[ftmp1], %[
src], 0x00)
2320 MMI_ULWC1(%[ftmp2], %[
src], 0x01)
2321 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2322 MMI_ULWC1(%[ftmp3], %[
src], 0x00)
2323 MMI_ULWC1(%[ftmp4], %[
src], 0x01)
2327 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2328 "addiu %[h], %[h], -0x01 \n\t"
2329 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2330 "bnez %[h], 1b \n\t"
2331 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2332 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2333 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2334 [tmp0]
"=&r"(
tmp[0]),
2337 [
src]
"+&r"(
src), [dst]
"+&r"(dst),
2340 [
A]
"f"(
A.f), [
B]
"f"(
B.f),
2341 [
C]
"f"(
C.f), [
D]
"f"(
D.f),
2348 const uint8_t *
src ,
2349 ptrdiff_t
stride,
int h,
int x,
int y)
2356 A.i = (8 - x) * (8 - y);
2357 B.i = (x) * (8 - y);
2358 C.i = (8 - x) * (y);
2361 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2364 "li %[tmp0], 0x06 \n\t"
2365 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2366 "mtc1 %[tmp0], %[ftmp9] \n\t"
2367 "pshufh %[A], %[A], %[ftmp0] \n\t"
2368 "pshufh %[B], %[B], %[ftmp0] \n\t"
2369 "pshufh %[C], %[C], %[ftmp0] \n\t"
2370 "pshufh %[D], %[D], %[ftmp0] \n\t"
2373 MMI_ULDC1(%[ftmp1], %[
src], 0x00)
2374 MMI_ULDC1(%[ftmp2], %[
src], 0x01)
2375 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2376 MMI_ULDC1(%[ftmp3], %[
src], 0x00)
2377 MMI_ULDC1(%[ftmp4], %[
src], 0x01)
2381 MMI_LDC1(%[ftmp2], %[dst], 0x00)
2382 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2384 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2385 "addiu %[h], %[h], -0x01 \n\t"
2386 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2387 "bnez %[h], 1b \n\t"
2388 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2389 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2390 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2391 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2392 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2393 [tmp0]
"=&r"(
tmp[0]),
2396 [
src]
"+&r"(
src), [dst]
"+&r"(dst),
2399 [
A]
"f"(
A.f), [
B]
"f"(
B.f),
2400 [
C]
"f"(
C.f), [
D]
"f"(
D.f),
2407 const uint8_t *
src ,
2408 ptrdiff_t
stride,
int h,
int x,
int y)
2415 A.i = (8 - x) * (8 - y);
2416 B.i = (x) * (8 - y);
2417 C.i = (8 - x) * (y);
2420 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2423 "li %[tmp0], 0x06 \n\t"
2424 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2425 "mtc1 %[tmp0], %[ftmp5] \n\t"
2426 "pshufh %[A], %[A], %[ftmp0] \n\t"
2427 "pshufh %[B], %[B], %[ftmp0] \n\t"
2428 "pshufh %[C], %[C], %[ftmp0] \n\t"
2429 "pshufh %[D], %[D], %[ftmp0] \n\t"
2432 MMI_ULWC1(%[ftmp1], %[
src], 0x00)
2433 MMI_ULWC1(%[ftmp2], %[
src], 0x01)
2434 PTR_ADDU "%[src], %[src], %[stride] \n\t"
2435 MMI_ULWC1(%[ftmp3], %[
src], 0x00)
2436 MMI_ULWC1(%[ftmp4], %[
src], 0x01)
2440 MMI_LWC1(%[ftmp2], %[dst], 0x00)
2441 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
2443 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2444 "addiu %[h], %[h], -0x01 \n\t"
2445 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
2446 "bnez %[h], 1b \n\t"
2447 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2448 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2449 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2450 [tmp0]
"=&r"(
tmp[0]),
2453 [
src]
"+&r"(
src), [dst]
"+&r"(dst),
2456 [
A]
"f"(
A.f), [
B]
"f"(
B.f),
2457 [
C]
"f"(
C.f), [
D]
"f"(
D.f),
static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
VC-1 in-loop deblocking filter for one line.
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
const union av_intfloat64 ff_pw_4
#define DECLARE_VAR_LOW32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
const union av_intfloat64 ff_pw_1
void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
#define DECLARE_FUNCTION(a, b)
Macro to ease bicubic filter interpolation functions declarations.
const union av_intfloat64 ff_pw_64
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)
Macro to build the vertical 16-bit version of vc1_put_shift[13].
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)
Macro to build the horizontal 16-bit version of vc1_put_shift[13].
#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, fr_t0, fr_t1, fr_t2, fr_t3)
Brief: transpose 4x4 packed halfword data.
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
void(* vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd, int64_t shift)
1/4 shift bicubic interpolation
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, ptrdiff_t stride, int pq)
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
void(* vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd, mips_reg offset)
void(* vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd)
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, ptrdiff_t left_stride, ptrdiff_t right_stride, int flags)
const union av_intfloat64 ff_pw_32_1
#define FFABS(a)
Absolute value. Note: INT_MIN / INT64_MIN result in undefined behavior, as their absolute values are not representable ...
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
void ff_vc1_h_overlap_mmi(uint8_t *src, ptrdiff_t stride)
#define SHIFT2_LINE(OFF, R0, R1, R2, R3)
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)
Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define VC1_MSPEL_MC(OP)
Interpolate fractional pel values by applying proper vertical then horizontal filter.
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
#define DECLARE_ALIGNED(n, t, v)
static int shift(int a, int b)
void ff_vc1_v_overlap_mmi(uint8_t *src, ptrdiff_t stride)
const union av_intfloat64 ff_pw_32_64
const union av_intfloat64 ff_pw_32_4
#define VC1_HOR_16B_SHIFT2(OP, OPNAME)
Data is already unpacked, so some operations can directly be made from memory.
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
#define DECLARE_VAR_ALL64
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
const union av_intfloat64 ff_pw_28
#define av_assert2(cond)
assert() equivalent for checks that lie in speed-critical code.
#define i(width, name, range_min, range_max)
void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, ptrdiff_t stride, int pq)
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
static void vc1_loop_filter(uint8_t *src, int step, int stride, int len, int pq)
VC-1 in-loop deblocking filter.
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)
#define VC1_SHIFT2(OP, OPNAME)
Purely vertical or horizontal 1/2 shift interpolation.
void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h, int x, int y)
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define RESTRICT_ASM_LOW32
#define DECLARE_VAR_ADDRT
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
#define LOAD_ROUNDER_MMI(ROUND)
Computes the rounder (32-r or 8-r) and unpacks it to $f14.
void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
#define flags(name, subs,...)
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, ptrdiff_t stride, int pq)
static const double coeff[2][5]
The exact code depends on how similar the blocks are and how related they are to the block
#define RESTRICT_ASM_ADDRT
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
static const uint8_t shift1[6]
#define RESTRICT_ASM_ALL64
static void vc1_put_ver_16b_shift2_mmi(int16_t *dst, const uint8_t *src, mips_reg stride, int rnd, int64_t shift)
Sacrificing $f12 makes it possible to pipeline loads from src.