/*
 * VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)
 * VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)
 *
 * Inline-asm string fragments meant to be pasted inside an extended-asm
 * statement. Each expansion produces one pair of output registers
 * (o1, o2) from a multiply-accumulate "even part / odd part" butterfly.
 * NOTE(review): judging by the names these are the row/column steps of the
 * 8-point VC-1 inverse transform -- confirm against the call sites.
 *
 * Arguments (all pasted into the asm text with the # operator):
 *   o1, o2    destination operands; o1 receives the sums, o2 the differences.
 *   r1..r4    32-bit immediates loaded with "li"; each is moved into an FP
 *             register and run through punpcklwd with itself before being
 *             used as a pmaddhw multiplier. Presumably each immediate packs
 *             two 16-bit coefficients -- verify against the callers.
 *   c0        operand added as a rounding/bias term (see ordering below).
 *   c1        TYPE2 only: extra term added to the difference results.
 *
 * Registers the expansion expects to be set up by the surrounding code:
 *   ftmp5..ftmp12  pmaddhw data inputs (read only here).
 *   ftmp0          shift count used by every psraw.
 * Registers overwritten as scratch: tmp0, ftmp1..ftmp4, ftmp13, ftmp14.
 *
 * Sequence:
 *   1. r1/r2 -> ftmp13/ftmp14;
 *        ftmp1 = pmaddhw(ftmp5, r1) + pmaddhw(ftmp7, r2)
 *        ftmp2 = pmaddhw(ftmp6, r1) + pmaddhw(ftmp8, r2)
 *   2. r3/r4 -> ftmp13/ftmp14;
 *        ftmp3 = pmaddhw(ftmp9,  r3) + pmaddhw(ftmp11, r4)
 *        ftmp4 = pmaddhw(ftmp10, r3) + pmaddhw(ftmp12, r4)
 *      (ftmp13 is reused as a temporary for the last product, so the r3
 *      multiplier is gone after this step.)
 *   3. TYPE1: c0 is added to ftmp1 and ftmp2 BEFORE the butterfly, so it
 *      ends up in both the sums and the differences.
 *      TYPE2: the butterfly is formed first; c1 is then added to the two
 *      difference registers only, and c0 to all four results.
 *   4. Sums (ftmp13, ftmp1) and differences (ftmp14, ftmp3) are shifted
 *      right arithmetically by ftmp0.
 *   5. A punpcklhw / punpckhhw / punpcklhw sequence combines the two sum
 *      registers into o1 and the two difference registers into o2.
 *
 * o1 is written before the differences are packed, so o1 must not name
 * ftmp14, ftmp3, ftmp2 or ftmp4, otherwise o2 is computed from clobbered
 * data.
 */
30 #define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0) \ 31 "li %[tmp0], "#r1" \n\t" \ 32 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 33 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 34 "li %[tmp0], "#r2" \n\t" \ 35 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 36 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 37 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \ 38 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \ 39 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 40 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \ 41 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \ 42 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 44 "li %[tmp0], "#r3" \n\t" \ 45 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 46 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 47 "li %[tmp0], "#r4" \n\t" \ 48 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 49 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 50 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \ 51 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \ 52 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 53 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \ 54 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \ 55 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \ 57 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \ 58 "paddw %[ftmp2], %[ftmp2], "#c0" \n\t" \ 59 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \ 60 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \ 61 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \ 62 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 63 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \ 64 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 65 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \ 66 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 67 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \ 68 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \ 69 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \ 70 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \ 71 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \ 72 "punpcklhw "#o2", %[ftmp2], %[ftmp4] \n\t" 74 #define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1) \ 75 "li %[tmp0], "#r1" 
\n\t" \ 76 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 77 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 78 "li %[tmp0], "#r2" \n\t" \ 79 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 80 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 81 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp13] \n\t" \ 82 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp14] \n\t" \ 83 "paddw %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 84 "pmaddhw %[ftmp2], %[ftmp6], %[ftmp13] \n\t" \ 85 "pmaddhw %[ftmp3], %[ftmp8], %[ftmp14] \n\t" \ 86 "paddw %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ 88 "li %[tmp0], "#r3" \n\t" \ 89 "mtc1 %[tmp0], %[ftmp13] \n\t" \ 90 "punpcklwd %[ftmp13], %[ftmp13], %[ftmp13] \n\t" \ 91 "li %[tmp0], "#r4" \n\t" \ 92 "mtc1 %[tmp0], %[ftmp14] \n\t" \ 93 "punpcklwd %[ftmp14], %[ftmp14], %[ftmp14] \n\t" \ 94 "pmaddhw %[ftmp3], %[ftmp9], %[ftmp13] \n\t" \ 95 "pmaddhw %[ftmp4], %[ftmp11], %[ftmp14] \n\t" \ 96 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 97 "pmaddhw %[ftmp4], %[ftmp10], %[ftmp13] \n\t" \ 98 "pmaddhw %[ftmp13], %[ftmp12], %[ftmp14] \n\t" \ 99 "paddw %[ftmp4], %[ftmp4], %[ftmp13] \n\t" \ 101 "paddw %[ftmp13], %[ftmp1], %[ftmp3] \n\t" \ 102 "psubw %[ftmp14], %[ftmp1], %[ftmp3] \n\t" \ 103 "paddw %[ftmp14], %[ftmp14], "#c1" \n\t" \ 104 "paddw %[ftmp1], %[ftmp2], %[ftmp4] \n\t" \ 105 "psubw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \ 106 "paddw %[ftmp3], %[ftmp3], "#c1" \n\t" \ 107 "paddw %[ftmp13], %[ftmp13], "#c0" \n\t" \ 108 "paddw %[ftmp14], %[ftmp14], "#c0" \n\t" \ 109 "paddw %[ftmp1], %[ftmp1], "#c0" \n\t" \ 110 "paddw %[ftmp3], %[ftmp3], "#c0" \n\t" \ 111 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" \ 112 "psraw %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 113 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" \ 114 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 115 "punpcklhw %[ftmp2], %[ftmp13], %[ftmp1] \n\t" \ 116 "punpckhhw %[ftmp4], %[ftmp13], %[ftmp1] \n\t" \ 117 "punpcklhw "#o1", %[ftmp2], %[ftmp4] \n\t" \ 118 "punpcklhw %[ftmp2], %[ftmp14], %[ftmp3] \n\t" \ 119 "punpckhhw %[ftmp4], %[ftmp14], %[ftmp3] \n\t" \ 120 "punpcklhw "#o2", 
%[ftmp2], %[ftmp4] \n\t" 130 dc = (3 * dc + 1) >> 1;
131 dc = (3 * dc + 16) >> 5;
134 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 135 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 136 "li %[count], 0x02 \n\t" 139 MMI_LDC1(%[ftmp1], %[dest], 0x00)
140 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t" 141 MMI_LDC1(%[ftmp2], %[addr0], 0x00)
142 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 143 MMI_LDC1(%[ftmp3], %[addr0], 0x00)
144 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 145 MMI_LDC1(%[ftmp4], %[addr0], 0x00)
147 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" 148 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 149 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" 150 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 151 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" 152 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 153 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" 154 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 156 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 157 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 158 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 159 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 160 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 161 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 162 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 163 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 165 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 166 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 167 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 168 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 170 MMI_SDC1(%[ftmp1], %[dest], 0x00)
171 PTR_ADDU "%[addr0], %[dest], %[linesize] \n\t" 172 MMI_SDC1(%[ftmp2], %[addr0], 0x00)
173 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 174 MMI_SDC1(%[ftmp3], %[addr0], 0x00)
175 PTR_ADDU "%[addr0], %[addr0], %[linesize] \n\t" 176 MMI_SDC1(%[ftmp4], %[addr0], 0x00)
178 "addiu %[count], %[count], -0x01 \n\t" 179 PTR_ADDU "%[dest], %[addr0], %[linesize] \n\t" 180 "bnez %[count], 1b \n\t" 181 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
182 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
183 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
184 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
185 [ftmp8]
"=&f"(ftmp[8]),
186 [addr0]
"=&r"(addr[0]),
188 : [linesize]
"r"((
mips_reg)linesize),
194 #if _MIPS_SIM != _ABIO32 198 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
199 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
200 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
206 "li %[tmp0], 0x03 \n\t" 207 "mtc1 %[tmp0], %[ftmp0] \n\t" 210 MMI_LDC1(%[ftmp1], %[block], 0x00)
211 MMI_LDC1(%[ftmp11], %[block], 0x10)
212 MMI_LDC1(%[ftmp2], %[block], 0x20)
213 MMI_LDC1(%[ftmp12], %[block], 0x30)
214 MMI_LDC1(%[ftmp3], %[block], 0x40)
215 MMI_LDC1(%[ftmp13], %[block], 0x50)
216 MMI_LDC1(%[ftmp4], %[block], 0x60)
217 MMI_LDC1(%[ftmp14], %[block], 0x70)
218 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 219 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 220 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 221 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 223 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 224 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 225 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 226 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 230 0x000f0010, 0x00040009, %[
ff_pw_4])
234 0xfffc000f, 0xfff7fff0, %[
ff_pw_4])
238 0xfff00009, 0x000f0004, %[
ff_pw_4])
242 0xfff70004, 0xfff0000f, %[
ff_pw_4])
244 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
245 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
247 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
248 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
250 MMI_SDC1(%[ftmp15], %[
temp], 0x00)
251 MMI_SDC1(%[ftmp19], %[
temp], 0x08)
252 MMI_SDC1(%[ftmp16], %[
temp], 0x10)
253 MMI_SDC1(%[ftmp20], %[
temp], 0x18)
254 MMI_SDC1(%[ftmp17], %[
temp], 0x20)
255 MMI_SDC1(%[ftmp21], %[
temp], 0x28)
256 MMI_SDC1(%[ftmp18], %[
temp], 0x30)
257 MMI_SDC1(%[ftmp22], %[
temp], 0x38)
260 MMI_LDC1(%[ftmp1], %[block], 0x08)
261 MMI_LDC1(%[ftmp11], %[block], 0x18)
262 MMI_LDC1(%[ftmp2], %[block], 0x28)
263 MMI_LDC1(%[ftmp12], %[block], 0x38)
264 MMI_LDC1(%[ftmp3], %[block], 0x48)
265 MMI_LDC1(%[ftmp13], %[block], 0x58)
266 MMI_LDC1(%[ftmp4], %[block], 0x68)
267 MMI_LDC1(%[ftmp14], %[block], 0x78)
268 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 269 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 270 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 271 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 273 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 274 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 275 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 276 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 280 0x000f0010, 0x00040009, %[
ff_pw_4])
284 0xfffc000f, 0xfff7fff0, %[
ff_pw_4])
288 0xfff00009, 0x000f0004, %[
ff_pw_4])
292 0xfff70004, 0xfff0000f, %[
ff_pw_4])
294 TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
295 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
297 TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
298 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
300 MMI_SDC1(%[ftmp19], %[
temp], 0x48)
301 MMI_SDC1(%[ftmp20], %[
temp], 0x58)
302 MMI_SDC1(%[ftmp21], %[
temp], 0x68)
303 MMI_SDC1(%[ftmp22], %[
temp], 0x78)
307 "li %[tmp0], 0x07 \n\t" 308 "mtc1 %[tmp0], %[ftmp0] \n\t" 311 MMI_LDC1(%[ftmp1], %[
temp], 0x00)
312 MMI_LDC1(%[ftmp11], %[
temp], 0x10)
313 MMI_LDC1(%[ftmp2], %[
temp], 0x20)
314 MMI_LDC1(%[ftmp12], %[
temp], 0x30)
315 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 316 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 317 "punpcklhw %[ftmp7], %[ftmp15], %[ftmp17] \n\t" 318 "punpckhhw %[ftmp8], %[ftmp15], %[ftmp17] \n\t" 320 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 321 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 322 "punpcklhw %[ftmp11], %[ftmp16], %[ftmp18] \n\t" 323 "punpckhhw %[ftmp12], %[ftmp16], %[ftmp18] \n\t" 341 MMI_SDC1(%[ftmp15], %[block], 0x00)
342 MMI_SDC1(%[ftmp16], %[block], 0x10)
343 MMI_SDC1(%[ftmp17], %[block], 0x20)
344 MMI_SDC1(%[ftmp18], %[block], 0x30)
345 MMI_SDC1(%[ftmp19], %[block], 0x40)
346 MMI_SDC1(%[ftmp20], %[block], 0x50)
347 MMI_SDC1(%[ftmp21], %[block], 0x60)
348 MMI_SDC1(%[ftmp22], %[block], 0x70)
351 MMI_LDC1(%[ftmp1], %[
temp], 0x08)
352 MMI_LDC1(%[ftmp11], %[
temp], 0x18)
353 MMI_LDC1(%[ftmp2], %[
temp], 0x28)
354 MMI_LDC1(%[ftmp12], %[
temp], 0x38)
355 MMI_LDC1(%[ftmp3], %[
temp], 0x48)
356 MMI_LDC1(%[ftmp13], %[
temp], 0x58)
357 MMI_LDC1(%[ftmp4], %[
temp], 0x68)
358 MMI_LDC1(%[ftmp14], %[
temp], 0x78)
359 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 360 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 361 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 362 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 364 "punpcklhw %[ftmp9], %[ftmp11], %[ftmp12] \n\t" 365 "punpckhhw %[ftmp10], %[ftmp11], %[ftmp12] \n\t" 366 "punpcklhw %[ftmp11], %[ftmp13], %[ftmp14] \n\t" 367 "punpckhhw %[ftmp12], %[ftmp13], %[ftmp14] \n\t" 385 MMI_SDC1(%[ftmp15], %[block], 0x08)
386 MMI_SDC1(%[ftmp16], %[block], 0x18)
387 MMI_SDC1(%[ftmp17], %[block], 0x28)
388 MMI_SDC1(%[ftmp18], %[block], 0x38)
389 MMI_SDC1(%[ftmp19], %[block], 0x48)
390 MMI_SDC1(%[ftmp20], %[block], 0x58)
391 MMI_SDC1(%[ftmp21], %[block], 0x68)
392 MMI_SDC1(%[ftmp22], %[block], 0x78)
394 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
395 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
396 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
397 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
398 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
399 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
400 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
401 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
402 [ftmp16]
"=&f"(ftmp[16]), [ftmp17]
"=&f"(ftmp[17]),
403 [ftmp18]
"=&f"(ftmp[18]), [ftmp19]
"=&f"(ftmp[19]),
404 [ftmp20]
"=&f"(ftmp[20]), [ftmp21]
"=&f"(ftmp[21]),
405 [ftmp22]
"=&f"(ftmp[22]),
408 [
ff_pw_4]
"f"(ff_pw_4_local), [block]
"r"(block),
421 dc = ( 3 * dc + 1) >> 1;
422 dc = (17 * dc + 64) >> 7;
425 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 426 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 428 MMI_LDC1(%[ftmp1], %[dest0], 0x00)
429 MMI_LDC1(%[ftmp2], %[dest1], 0x00)
430 MMI_LDC1(%[ftmp3], %[dest2], 0x00)
431 MMI_LDC1(%[ftmp4], %[dest3], 0x00)
433 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" 434 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 435 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" 436 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 437 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" 438 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 439 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" 440 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 442 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 443 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 444 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 445 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 446 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 447 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 448 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 449 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 451 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 452 "packushb %[ftmp2], %[ftmp2], %[ftmp6] \n\t" 453 "packushb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" 454 "packushb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" 456 MMI_SDC1(%[ftmp1], %[dest0], 0x00)
457 MMI_SDC1(%[ftmp2], %[dest1], 0x00)
458 MMI_SDC1(%[ftmp3], %[dest2], 0x00)
459 MMI_SDC1(%[ftmp4], %[dest3], 0x00)
460 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
461 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
462 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
463 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
464 [ftmp8]
"=&f"(ftmp[8])
465 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
466 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
472 #if _MIPS_SIM != _ABIO32 476 int16_t *dst =
block;
480 DECLARE_ALIGNED(16,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
481 DECLARE_ALIGNED(16,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
482 int16_t
coeff[64] = {12, 16, 16, 15, 12, 9, 6, 4,
483 12, 15, 6, -4, -12, -16, -16, -9,
484 12, 9, -6, -16, -12, 4, 16, 15,
485 12, 4, -16, -9, 12, 15, -6, -16,
486 12, -4, -16, 9, 12, -15, -6, 16,
487 12, -9, -6, 16, -12, -4, 16, -15,
488 12, -15, 6, 4, -12, 16, -16, 9,
489 12, -16, 16, -15, 12, -9, 6, -4};
493 "li %[tmp0], 0x03 \n\t" 494 "mtc1 %[tmp0], %[ftmp0] \n\t" 497 MMI_LDC1(%[ftmp1], %[src], 0x00)
498 MMI_LDC1(%[ftmp2], %[src], 0x08)
501 MMI_LDC1(%[ftmp3], %[coeff], 0x00)
502 MMI_LDC1(%[ftmp4], %[coeff], 0x08)
503 MMI_LDC1(%[ftmp5], %[coeff], 0x10)
504 MMI_LDC1(%[ftmp6], %[coeff], 0x18)
505 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 506 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 507 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 508 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 509 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 510 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 511 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 512 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 513 "paddw %[ftmp11], %[ftmp7], %[ftmp8] \n\t" 514 "paddw %[ftmp11], %[ftmp11], %[ff_pw_4] \n\t" 517 MMI_LDC1(%[ftmp3], %[coeff], 0x20)
518 MMI_LDC1(%[ftmp4], %[coeff], 0x28)
519 MMI_LDC1(%[ftmp5], %[coeff], 0x30)
520 MMI_LDC1(%[ftmp6], %[coeff], 0x38)
521 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 522 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 523 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 524 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 525 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 526 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 527 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 528 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 529 "paddw %[ftmp12], %[ftmp7], %[ftmp8] \n\t" 530 "paddw %[ftmp12], %[ftmp12], %[ff_pw_4] \n\t" 533 MMI_LDC1(%[ftmp3], %[coeff], 0x40)
534 MMI_LDC1(%[ftmp4], %[coeff], 0x48)
535 MMI_LDC1(%[ftmp5], %[coeff], 0x50)
536 MMI_LDC1(%[ftmp6], %[coeff], 0x58)
537 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 538 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 539 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 540 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 541 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 542 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 543 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 544 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 545 "paddw %[ftmp13], %[ftmp7], %[ftmp8] \n\t" 546 "paddw %[ftmp13], %[ftmp13], %[ff_pw_4] \n\t" 549 MMI_LDC1(%[ftmp3], %[coeff], 0x60)
550 MMI_LDC1(%[ftmp4], %[coeff], 0x68)
551 MMI_LDC1(%[ftmp5], %[coeff], 0x70)
552 MMI_LDC1(%[ftmp6], %[coeff], 0x78)
553 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" 554 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp4] \n\t" 555 "paddw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 556 "pmaddhw %[ftmp7], %[ftmp1], %[ftmp5] \n\t" 557 "pmaddhw %[ftmp8], %[ftmp2], %[ftmp6] \n\t" 558 "paddw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 559 "punpcklwd %[ftmp7], %[ftmp9], %[ftmp10] \n\t" 560 "punpckhwd %[ftmp8], %[ftmp9], %[ftmp10] \n\t" 561 "paddw %[ftmp14], %[ftmp7], %[ftmp8] \n\t" 562 "paddw %[ftmp14], %[ftmp14], %[ff_pw_4] \n\t" 565 "psraw %[ftmp11], %[ftmp11], %[ftmp0] \n\t" 566 "psraw %[ftmp12], %[ftmp12], %[ftmp0] \n\t" 567 "psraw %[ftmp13], %[ftmp13], %[ftmp0] \n\t" 568 "psraw %[ftmp14], %[ftmp14], %[ftmp0] \n\t" 569 "punpcklhw %[ftmp7], %[ftmp11], %[ftmp12] \n\t" 570 "punpckhhw %[ftmp8], %[ftmp11], %[ftmp12] \n\t" 571 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" 572 "punpcklhw %[ftmp7], %[ftmp13], %[ftmp14] \n\t" 573 "punpckhhw %[ftmp8], %[ftmp13], %[ftmp14] \n\t" 574 "punpcklhw %[ftmp10], %[ftmp7], %[ftmp8] \n\t" 575 MMI_SDC1(%[ftmp9], %[dst], 0x00)
576 MMI_SDC1(%[ftmp10], %[dst], 0x08)
580 "addiu %[count], %[count], -0x01 \n\t" 581 "bnez %[count], 1b \n\t" 582 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
583 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
584 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
585 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
586 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
587 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
588 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
589 [ftmp14]
"=&f"(ftmp[14]), [tmp0]
"=&r"(tmp[0]),
599 "li %[tmp0], 0x44 \n\t" 600 "mtc1 %[tmp0], %[ftmp15] \n\t" 603 "li %[tmp0], 0x07 \n\t" 604 "mtc1 %[tmp0], %[ftmp0] \n\t" 605 MMI_LDC1(%[ftmp1], %[src], 0x00)
606 MMI_LDC1(%[ftmp2], %[src], 0x10)
607 MMI_LDC1(%[ftmp3], %[src], 0x20)
608 MMI_LDC1(%[ftmp4], %[src], 0x30)
609 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 610 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 611 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 612 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 615 "li %[tmp0], 0x00160011 \n\t" 616 "mtc1 %[tmp0], %[ftmp3] \n\t" 617 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 618 "li %[tmp0], 0x000a0011 \n\t" 619 "mtc1 %[tmp0], %[ftmp4] \n\t" 620 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 621 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 622 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 623 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 624 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 625 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 626 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 627 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 628 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 629 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 630 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 631 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 632 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 633 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 636 "li %[tmp0], 0x000a0011 \n\t" 637 "mtc1 %[tmp0], %[ftmp3] \n\t" 638 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 639 "li %[tmp0], 0xffeaffef \n\t" 640 "mtc1 %[tmp0], %[ftmp4] \n\t" 641 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 642 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 643 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 644 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 645 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 646 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 647 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 648 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 649 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 650 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 651 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 652 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 653 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 654 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 657 "li %[tmp0], 0xfff60011 \n\t" 658 "mtc1 %[tmp0], %[ftmp3] \n\t" 659 
"pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 660 "li %[tmp0], 0x0016ffef \n\t" 661 "mtc1 %[tmp0], %[ftmp4] \n\t" 662 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 663 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 664 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 665 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 666 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 667 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 668 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 669 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 670 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 671 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 672 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 673 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 674 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 675 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 678 "li %[tmp0], 0xffea0011 \n\t" 679 "mtc1 %[tmp0], %[ftmp3] \n\t" 680 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 681 "li %[tmp0], 0xfff60011 \n\t" 682 "mtc1 %[tmp0], %[ftmp4] \n\t" 683 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 684 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 685 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 686 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 687 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 688 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 689 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 690 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 691 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 692 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 693 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 694 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 695 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 696 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 698 MMI_LWC1(%[ftmp1], %[dest], 0x00)
699 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 700 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
701 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 702 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
703 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 704 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
705 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 706 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 707 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 708 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 709 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 710 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 711 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 712 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 713 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 714 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 715 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 716 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 717 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 718 MMI_SWC1(%[ftmp1], %[dest], 0x00)
719 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 720 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
721 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 722 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
723 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 724 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
727 "li %[tmp0], 0x07 \n\t" 728 "mtc1 %[tmp0], %[ftmp0] \n\t" 729 MMI_LDC1(%[ftmp1], %[src], 0x08)
730 MMI_LDC1(%[ftmp2], %[src], 0x18)
731 MMI_LDC1(%[ftmp3], %[src], 0x28)
732 MMI_LDC1(%[ftmp4], %[src], 0x38)
733 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 734 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 735 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 736 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 739 "li %[tmp0], 0x00160011 \n\t" 740 "mtc1 %[tmp0], %[ftmp3] \n\t" 741 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 742 "li %[tmp0], 0x000a0011 \n\t" 743 "mtc1 %[tmp0], %[ftmp4] \n\t" 744 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 745 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 746 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 747 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 748 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 749 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 750 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 751 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 752 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 753 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 754 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 755 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 756 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 757 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 760 "li %[tmp0], 0x000a0011 \n\t" 761 "mtc1 %[tmp0], %[ftmp3] \n\t" 762 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 763 "li %[tmp0], 0xffeaffef \n\t" 764 "mtc1 %[tmp0], %[ftmp4] \n\t" 765 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 766 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 767 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 768 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 769 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 770 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 771 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 772 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 773 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 774 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 775 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 776 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 777 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 778 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 781 "li %[tmp0], 0xfff60011 \n\t" 782 "mtc1 %[tmp0], %[ftmp3] \n\t" 783 
"pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 784 "li %[tmp0], 0x0016ffef \n\t" 785 "mtc1 %[tmp0], %[ftmp4] \n\t" 786 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 787 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 788 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 789 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 790 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 791 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 792 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 793 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 794 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 795 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 796 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 797 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 798 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 799 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 802 "li %[tmp0], 0xffea0011 \n\t" 803 "mtc1 %[tmp0], %[ftmp3] \n\t" 804 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 805 "li %[tmp0], 0xfff60011 \n\t" 806 "mtc1 %[tmp0], %[ftmp4] \n\t" 807 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 808 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 809 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 810 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 811 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 812 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 813 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 814 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 815 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 816 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 817 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 818 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 819 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 820 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 822 MMI_LWC1(%[ftmp1], %[dest], 0x04)
823 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 824 MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
825 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 826 MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
827 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 828 MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
829 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 830 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 831 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 832 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 833 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 834 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 835 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 836 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 837 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 838 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 839 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 840 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 841 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 842 MMI_SWC1(%[ftmp1], %[dest], 0x04)
843 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 844 MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
845 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 846 MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
847 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 848 MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
850 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
851 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
852 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
853 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
854 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
855 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
856 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
857 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
860 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
873 dc = (17 * dc + 4) >> 3;
874 dc = (12 * dc + 64) >> 7;
877 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 878 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 880 MMI_LWC1(%[ftmp1], %[dest0], 0x00)
881 MMI_LWC1(%[ftmp2], %[dest1], 0x00)
882 MMI_LWC1(%[ftmp3], %[dest2], 0x00)
883 MMI_LWC1(%[ftmp4], %[dest3], 0x00)
884 MMI_LWC1(%[ftmp5], %[dest4], 0x00)
885 MMI_LWC1(%[ftmp6], %[dest5], 0x00)
886 MMI_LWC1(%[ftmp7], %[dest6], 0x00)
887 MMI_LWC1(%[ftmp8], %[dest7], 0x00)
889 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 890 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 891 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 892 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 893 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 894 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 895 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 896 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 898 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 899 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 900 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 901 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 902 "paddsh %[ftmp5], %[ftmp5], %[dc] \n\t" 903 "paddsh %[ftmp6], %[ftmp6], %[dc] \n\t" 904 "paddsh %[ftmp7], %[ftmp7], %[dc] \n\t" 905 "paddsh %[ftmp8], %[ftmp8], %[dc] \n\t" 907 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 908 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 909 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 910 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 911 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 912 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 913 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 914 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 916 MMI_SWC1(%[ftmp1], %[dest0], 0x00)
917 MMI_SWC1(%[ftmp2], %[dest1], 0x00)
918 MMI_SWC1(%[ftmp3], %[dest2], 0x00)
919 MMI_SWC1(%[ftmp4], %[dest3], 0x00)
920 MMI_SWC1(%[ftmp5], %[dest4], 0x00)
921 MMI_SWC1(%[ftmp6], %[dest5], 0x00)
922 MMI_SWC1(%[ftmp7], %[dest6], 0x00)
923 MMI_SWC1(%[ftmp8], %[dest7], 0x00)
924 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
925 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
926 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
927 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
929 [ftmp8]
"=&f"(ftmp[8])
930 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
931 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
932 [dest4]
"r"(dest+4*linesize), [dest5]
"r"(dest+5*linesize),
933 [dest6]
"r"(dest+6*linesize), [dest7]
"r"(dest+7*linesize),
939 #if _MIPS_SIM != _ABIO32 943 int16_t *dst =
block;
946 int16_t
coeff[16] = {17, 22, 17, 10,
950 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
951 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
952 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
957 "li %[tmp0], 0x03 \n\t" 958 "mtc1 %[tmp0], %[ftmp0] \n\t" 960 MMI_LDC1(%[ftmp2], %[coeff], 0x00)
961 MMI_LDC1(%[ftmp3], %[coeff], 0x08)
962 MMI_LDC1(%[ftmp4], %[coeff], 0x10)
963 MMI_LDC1(%[ftmp5], %[coeff], 0x18)
966 MMI_LDC1(%[ftmp1], %[src], 0x00)
967 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t" 968 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" 969 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" 970 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t" 971 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t" 972 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t" 973 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 974 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 975 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t" 976 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t" 977 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t" 978 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t" 979 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 980 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 981 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 982 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 983 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t" 984 MMI_SDC1(%[ftmp8], %[dst], 0x00)
988 "addiu %[count], %[count], -0x01 \n\t" 989 "bnez %[count], 1b \n\t" 990 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
991 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
992 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
993 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
994 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
995 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
996 [tmp0]
"=&r"(
tmp[0]), [count]
"+&r"(count),
997 [
src]
"+&r"(
src), [dst]
"+&r"(dst)
998 : [
ff_pw_4]
"f"(ff_pw_4_local), [coeff]
"r"(coeff)
1006 "li %[tmp0], 0x07 \n\t" 1007 "mtc1 %[tmp0], %[ftmp0] \n\t" 1009 MMI_LDC1(%[ftmp1], %[src], 0x00)
1010 MMI_LDC1(%[ftmp2], %[src], 0x20)
1011 MMI_LDC1(%[ftmp3], %[src], 0x40)
1012 MMI_LDC1(%[ftmp4], %[src], 0x60)
1013 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 1014 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 1015 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 1016 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 1018 MMI_LDC1(%[ftmp1], %[src], 0x10)
1019 MMI_LDC1(%[ftmp2], %[src], 0x30)
1020 MMI_LDC1(%[ftmp3], %[src], 0x50)
1021 MMI_LDC1(%[ftmp4], %[src], 0x70)
1022 "punpcklhw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1023 "punpckhhw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1024 "punpcklhw %[ftmp11], %[ftmp3], %[ftmp4] \n\t" 1025 "punpckhhw %[ftmp12], %[ftmp3], %[ftmp4] \n\t" 1043 MMI_LWC1(%[ftmp1], %[dest], 0x00)
1044 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1045 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1046 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1047 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1048 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1049 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1050 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1051 MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
1052 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1053 MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
1054 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1055 MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
1056 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1057 MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
1058 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1059 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1060 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1061 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1062 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1063 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1064 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1065 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1066 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1068 "paddh %[ftmp1], %[ftmp1], %[ftmp15] \n\t" 1069 "paddh %[ftmp2], %[ftmp2], %[ftmp16] \n\t" 1070 "paddh %[ftmp3], %[ftmp3], %[ftmp17] \n\t" 1071 "paddh %[ftmp4], %[ftmp4], %[ftmp18] \n\t" 1072 "paddh %[ftmp5], %[ftmp5], %[ftmp19] \n\t" 1073 "paddh %[ftmp6], %[ftmp6], %[ftmp20] \n\t" 1074 "paddh %[ftmp7], %[ftmp7], %[ftmp21] \n\t" 1075 "paddh %[ftmp8], %[ftmp8], %[ftmp22] \n\t" 1077 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1078 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1079 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1080 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1081 "packushb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" 1082 "packushb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" 1083 "packushb %[ftmp7], %[ftmp7], %[ftmp0] \n\t" 1084 "packushb %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1086 MMI_SWC1(%[ftmp1], %[dest], 0x00)
1087 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1088 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1089 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1090 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1091 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1092 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1093 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1094 MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
1095 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1096 MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
1097 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1098 MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
1099 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1100 MMI_SWC1(%[ftmp8], %[tmp0], 0x00)
1102 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1103 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1104 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1105 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1106 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1107 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1108 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1109 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1110 [ftmp16]
"=&f"(ftmp[16]), [ftmp17]
"=&f"(ftmp[17]),
1111 [ftmp18]
"=&f"(ftmp[18]), [ftmp19]
"=&f"(ftmp[19]),
1112 [ftmp20]
"=&f"(ftmp[20]), [ftmp21]
"=&f"(ftmp[21]),
1113 [ftmp22]
"=&f"(ftmp[22]),
1116 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
1129 dc = (17 * dc + 4) >> 3;
1130 dc = (17 * dc + 64) >> 7;
1133 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1134 "pshufh %[dc], %[dc], %[ftmp0] \n\t" 1136 MMI_LWC1(%[ftmp1], %[dest0], 0x00)
1137 MMI_LWC1(%[ftmp2], %[dest1], 0x00)
1138 MMI_LWC1(%[ftmp3], %[dest2], 0x00)
1139 MMI_LWC1(%[ftmp4], %[dest3], 0x00)
1141 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1142 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1143 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1144 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1146 "paddsh %[ftmp1], %[ftmp1], %[dc] \n\t" 1147 "paddsh %[ftmp2], %[ftmp2], %[dc] \n\t" 1148 "paddsh %[ftmp3], %[ftmp3], %[dc] \n\t" 1149 "paddsh %[ftmp4], %[ftmp4], %[dc] \n\t" 1151 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1152 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1153 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1154 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1156 MMI_SWC1(%[ftmp1], %[dest0], 0x00)
1157 MMI_SWC1(%[ftmp2], %[dest1], 0x00)
1158 MMI_SWC1(%[ftmp3], %[dest2], 0x00)
1159 MMI_SWC1(%[ftmp4], %[dest3], 0x00)
1160 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1161 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1163 [ftmp4]
"=&f"(ftmp[4])
1164 : [dest0]
"r"(dest+0*linesize), [dest1]
"r"(dest+1*linesize),
1165 [dest2]
"r"(dest+2*linesize), [dest3]
"r"(dest+3*linesize),
1174 int16_t *dst =
block;
1177 int16_t
coeff[16] = {17, 22, 17, 10,
1181 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
1182 DECLARE_ALIGNED(8,
const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
1186 "li %[tmp0], 0x03 \n\t" 1187 "mtc1 %[tmp0], %[ftmp0] \n\t" 1188 MMI_LDC1(%[ftmp2], %[coeff], 0x00)
1189 MMI_LDC1(%[ftmp3], %[coeff], 0x08)
1190 MMI_LDC1(%[ftmp4], %[coeff], 0x10)
1191 MMI_LDC1(%[ftmp5], %[coeff], 0x18)
1194 MMI_LDC1(%[ftmp1], %[src], 0x00)
1195 "pmaddhw %[ftmp6], %[ftmp2], %[ftmp1] \n\t" 1196 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" 1197 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" 1198 "pmaddhw %[ftmp9], %[ftmp5], %[ftmp1] \n\t" 1199 "punpcklwd %[ftmp10], %[ftmp6], %[ftmp7] \n\t" 1200 "punpckhwd %[ftmp11], %[ftmp6], %[ftmp7] \n\t" 1201 "punpcklwd %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 1202 "punpckhwd %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 1203 "paddw %[ftmp8], %[ftmp10], %[ftmp11] \n\t" 1204 "paddw %[ftmp9], %[ftmp6], %[ftmp7] \n\t" 1205 "paddw %[ftmp8], %[ftmp8], %[ff_pw_4] \n\t" 1206 "paddw %[ftmp9], %[ftmp9], %[ff_pw_4] \n\t" 1207 "psraw %[ftmp8], %[ftmp8], %[ftmp0] \n\t" 1208 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1209 "punpcklhw %[ftmp6], %[ftmp8], %[ftmp9] \n\t" 1210 "punpckhhw %[ftmp7], %[ftmp8], %[ftmp9] \n\t" 1211 "punpcklhw %[ftmp8], %[ftmp6], %[ftmp7] \n\t" 1212 MMI_SDC1(%[ftmp8], %[dst], 0x00)
1216 "addiu %[count], %[count], -0x01 \n\t" 1217 "bnez %[count], 1b \n\t" 1218 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1219 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1220 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1221 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1222 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1223 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1224 [tmp0]
"=&r"(
tmp[0]), [count]
"+&r"(count),
1225 [
src]
"+&r"(
src), [dst]
"+&r"(dst)
1226 : [
ff_pw_4]
"f"(ff_pw_4_local), [coeff]
"r"(coeff)
1234 "li %[tmp0], 0x07 \n\t" 1235 "mtc1 %[tmp0], %[ftmp0] \n\t" 1236 "li %[tmp0], 0x44 \n\t" 1237 "mtc1 %[tmp0], %[ftmp15] \n\t" 1239 MMI_LDC1(%[ftmp1], %[src], 0x00)
1240 MMI_LDC1(%[ftmp2], %[src], 0x10)
1241 MMI_LDC1(%[ftmp3], %[src], 0x20)
1242 MMI_LDC1(%[ftmp4], %[src], 0x30)
1243 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t" 1244 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t" 1245 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t" 1246 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t" 1249 "li %[tmp0], 0x00160011 \n\t" 1250 "mtc1 %[tmp0], %[ftmp3] \n\t" 1251 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1252 "li %[tmp0], 0x000a0011 \n\t" 1253 "mtc1 %[tmp0], %[ftmp4] \n\t" 1254 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1255 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1256 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1257 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1258 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1259 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1260 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1261 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1262 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1263 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1264 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1265 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1266 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1267 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t" 1270 "li %[tmp0], 0x000a0011 \n\t" 1271 "mtc1 %[tmp0], %[ftmp3] \n\t" 1272 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1273 "li %[tmp0], 0xffeaffef \n\t" 1274 "mtc1 %[tmp0], %[ftmp4] \n\t" 1275 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1276 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1277 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1278 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1279 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1280 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1281 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1282 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1283 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1284 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1285 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1286 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1287 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1288 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t" 1291 "li %[tmp0], 0xfff60011 
\n\t" 1292 "mtc1 %[tmp0], %[ftmp3] \n\t" 1293 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1294 "li %[tmp0], 0x0016ffef \n\t" 1295 "mtc1 %[tmp0], %[ftmp4] \n\t" 1296 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1297 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1298 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1299 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1300 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1301 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1302 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1303 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1304 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1305 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1306 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1307 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1308 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1309 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t" 1312 "li %[tmp0], 0xffea0011 \n\t" 1313 "mtc1 %[tmp0], %[ftmp3] \n\t" 1314 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t" 1315 "li %[tmp0], 0xfff60011 \n\t" 1316 "mtc1 %[tmp0], %[ftmp4] \n\t" 1317 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t" 1318 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t" 1319 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t" 1320 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t" 1321 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t" 1322 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t" 1323 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t" 1324 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t" 1325 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t" 1326 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t" 1327 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t" 1328 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" 1329 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t" 1330 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t" 1332 MMI_LWC1(%[ftmp1], %[dest], 0x00)
1333 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1334 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
1335 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1336 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
1337 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1338 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
1339 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 1340 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1341 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1342 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1343 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1344 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" 1345 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t" 1346 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t" 1347 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" 1348 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 1349 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" 1350 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" 1351 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" 1353 MMI_SWC1(%[ftmp1], %[dest], 0x00)
1354 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t" 1355 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
1356 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1357 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
1358 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t" 1359 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
1361 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
1362 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
1363 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
1364 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
1365 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
1366 [ftmp10]
"=&f"(ftmp[10]), [ftmp11]
"=&f"(ftmp[11]),
1367 [ftmp12]
"=&f"(ftmp[12]), [ftmp13]
"=&f"(ftmp[13]),
1368 [ftmp14]
"=&f"(ftmp[14]), [ftmp15]
"=&f"(ftmp[15]),
1371 [
src]
"r"(
src), [dest]
"r"(dest), [linesize]
"r"(linesize)
1383 for (i = 0; i < 8; i++) {
1388 d1 = (a - d + 3 +
rnd) >> 3;
1389 d2 = (a - d + b - c + 4 -
rnd) >> 3;
1392 src[-1] = av_clip_uint8(b - d2);
1393 src[0] = av_clip_uint8(c + d2);
1405 int rnd1 = flags & 2 ? 3 : 4;
1406 int rnd2 = 7 - rnd1;
1407 for (i = 0; i < 8; i++) {
1415 left[6] = ((a << 3) - d1 + rnd1) >> 3;
1416 left[7] = ((b << 3) - d2 + rnd2) >> 3;
1417 right[0] = ((c << 3) + d2 + rnd1) >> 3;
1418 right[1] = ((d << 3) + d1 + rnd2) >> 3;
1420 right += right_stride;
1421 left += left_stride;
1436 for (i = 0; i < 8; i++) {
1441 d1 = (a - d + 3 +
rnd) >> 3;
1442 d2 = (a - d + b - c + 4 -
rnd) >> 3;
1444 src[-2 *
stride] = a - d1;
1445 src[-
stride] = av_clip_uint8(b - d2);
1446 src[0] = av_clip_uint8(c + d2);
1458 int rnd1 = 4, rnd2 = 3;
1459 for (i = 0; i < 8; i++) {
1467 top[48] = ((a << 3) - d1 + rnd1) >> 3;
1468 top[56] = ((b << 3) - d2 + rnd2) >> 3;
1469 bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
1470 bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
1490 5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1491 int a0_sign = a0 >> 31;
1493 a0 = (a0 ^ a0_sign) - a0_sign;
1495 int a1 =
FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1496 5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1497 int a2 =
FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1498 5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1499 if (a1 < a0 || a2 < a0) {
1501 int clip_sign = clip >> 31;
1503 clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1506 int d = 5 * (a3 -
a0);
1507 int d_sign = (d >> 31);
1509 d = ((d ^ d_sign) - d_sign) >> 3;
1512 if (d_sign ^ clip_sign)
1516 d = (d ^ d_sign) - d_sign;
1517 src[-1 *
stride] = av_clip_uint8(src[-1 * stride] - d);
1518 src[ 0 *
stride] = av_clip_uint8(src[ 0 * stride] + d);
1542 for (i = 0; i <
len; i += 4) {
1604 #define OP_PUT(S, D) 1605 #define OP_AVG(S, D) \ 1606 "ldc1 $f16, "#S" \n\t" \ 1607 "pavgb "#D", "#D", $f16 \n\t" 1610 #define NORMALIZE_MMI(SHIFT) \ 1611 "paddh $f6, $f6, $f14 \n\t" \ 1612 "paddh $f8, $f8, $f14 \n\t" \ 1613 "psrah $f6, $f6, "SHIFT" \n\t" \ 1614 "psrah $f8, $f8, "SHIFT" \n\t" 1616 #define TRANSFER_DO_PACK(OP) \ 1617 "packushb $f6, $f6, $f8 \n\t" \ 1619 "sdc1 $f6, 0x00(%[dst]) \n\t" 1621 #define TRANSFER_DONT_PACK(OP) \ 1622 OP(0(%[dst]), $f6) \ 1623 OP(8(%[dst]), $f8) \ 1624 "sdc1 $f6, 0x00(%[dst]) \n\t" \ 1625 "sdc1 $f8, 0x08(%[dst]) \n\t" 1628 #define DO_UNPACK(reg) \ 1629 "punpcklbh "reg", "reg", $f0 \n\t" 1630 #define DONT_UNPACK(reg) 1633 #define LOAD_ROUNDER_MMI(ROUND) \ 1634 "lwc1 $f14, "ROUND" \n\t" \ 1635 "punpcklhw $f14, $f14, $f14 \n\t" \ 1636 "punpcklwd $f14, $f14, $f14 \n\t" 1639 #define SHIFT2_LINE(OFF, R0, R1, R2, R3) \ 1640 "paddh "#R1", "#R1", "#R2" \n\t" \ 1641 PTR_ADDU "$9, %[src], %[stride1] \n\t" \ 1642 MMI_ULWC1(R0, $9, 0x00) \ 1643 "pmullh "#R1", "#R1", $f6 \n\t" \ 1644 "punpcklbh "#R0", "#R0", $f0 \n\t" \ 1645 PTR_ADDU "$9, %[src], %[stride] \n\t" \ 1646 MMI_ULWC1(R3, $9, 0x00) \ 1647 "psubh "#R1", "#R1", "#R0" \n\t" \ 1648 "punpcklbh "#R3", "#R3", $f0 \n\t" \ 1649 "paddh "#R1", "#R1", $f14 \n\t" \ 1650 "psubh "#R1", "#R1", "#R3" \n\t" \ 1651 "psrah "#R1", "#R1", %[shift] \n\t" \ 1652 MMI_SDC1(R1, %[dst], OFF) \ 1653 PTR_ADDU "%[src], %[src], %[stride] \n\t" 1664 "xor $f0, $f0, $f0 \n\t" 1667 "ldc1 $f12, %[ff_pw_9] \n\t" 1669 MMI_ULWC1($f4, %[src], 0x00)
1670 PTR_ADDU "%[src], %[src], %[stride] \n\t" 1671 MMI_ULWC1($f6, %[src], 0x00)
1672 "punpcklbh $f4, $f4, $f0 \n\t" 1673 "punpcklbh $f6, $f6, $f0 \n\t" 1682 PTR_SUBU "%[src], %[src], %[stride2] \n\t" 1684 "addiu $8, $8, -0x01 \n\t" 1686 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
1687 [
src]
"+r"(
src), [dst]
"+r"(dst)
1691 :
"$8",
"$9",
"$f0",
"$f2",
"$f4",
"$f6",
"$f8",
"$f10",
"$f12",
1692 "$f14",
"$f16",
"memory" 1700 #define VC1_HOR_16B_SHIFT2(OP, OPNAME) \ 1701 static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \ 1702 const int16_t *src, int rnd) \ 1705 DECLARE_VAR_ALL64; \ 1706 DECLARE_VAR_ADDRT; \ 1709 rnd -= (-1+9+9-1)*1024; \ 1712 LOAD_ROUNDER_MMI("%[rnd]") \ 1713 "ldc1 $f12, %[ff_pw_128] \n\t" \ 1714 "ldc1 $f10, %[ff_pw_9] \n\t" \ 1716 MMI_ULDC1($f2, %[src], 0x00) \ 1717 MMI_ULDC1($f4, %[src], 0x08) \ 1718 MMI_ULDC1($f6, %[src], 0x02) \ 1719 MMI_ULDC1($f8, %[src], 0x0a) \ 1720 MMI_ULDC1($f0, %[src], 0x06) \ 1721 "paddh $f2, $f2, $f0 \n\t" \ 1722 MMI_ULDC1($f0, %[src], 0x0e) \ 1723 "paddh $f4, $f4, $f0 \n\t" \ 1724 MMI_ULDC1($f0, %[src], 0x04) \ 1725 "paddh $f6, $f6, $f0 \n\t" \ 1726 MMI_ULDC1($f0, %[src], 0x0b) \ 1727 "paddh $f8, $f8, $f0 \n\t" \ 1728 "pmullh $f6, $f6, $f10 \n\t" \ 1729 "pmullh $f8, $f8, $f10 \n\t" \ 1730 "psubh $f6, $f6, $f2 \n\t" \ 1731 "psubh $f8, $f8, $f4 \n\t" \ 1732 "li $8, 0x07 \n\t" \ 1733 "mtc1 $8, $f16 \n\t" \ 1734 NORMALIZE_MMI("$f16") \ 1736 "paddh $f6, $f6, $f12 \n\t" \ 1737 "paddh $f8, $f8, $f12 \n\t" \ 1738 TRANSFER_DO_PACK(OP) \ 1739 "addiu %[h], %[h], -0x01 \n\t" \ 1740 PTR_ADDIU "%[src], %[src], 0x18 \n\t" \ 1741 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1742 "bnez %[h], 1b \n\t" \ 1743 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \ 1745 [src]"+r"(src), [dst]"+r"(dst) \ 1746 : [stride]"r"(stride), [rnd]"m"(rnd), \ 1747 [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128) \ 1748 : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \ 1760 #define VC1_SHIFT2(OP, OPNAME)\ 1761 static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src, \ 1762 mips_reg stride, int rnd, \ 1765 DECLARE_VAR_LOW32; \ 1766 DECLARE_VAR_ADDRT; \ 1771 "xor $f0, $f0, $f0 \n\t" \ 1772 "li $10, 0x08 \n\t" \ 1773 LOAD_ROUNDER_MMI("%[rnd]") \ 1774 "ldc1 $f12, %[ff_pw_9] \n\t" \ 1776 MMI_ULWC1($f6, %[src], 0x00) \ 1777 MMI_ULWC1($f8, %[src], 0x04) \ 1778 PTR_ADDU "$9, %[src], %[offset] \n\t" \ 1779 MMI_ULWC1($f2, $9, 
0x00) \ 1780 MMI_ULWC1($f4, $9, 0x04) \ 1781 PTR_ADDU "%[src], %[src], %[offset] \n\t" \ 1782 "punpcklbh $f6, $f6, $f0 \n\t" \ 1783 "punpcklbh $f8, $f8, $f0 \n\t" \ 1784 "punpcklbh $f2, $f2, $f0 \n\t" \ 1785 "punpcklbh $f4, $f4, $f0 \n\t" \ 1786 "paddh $f6, $f6, $f2 \n\t" \ 1787 "paddh $f8, $f8, $f4 \n\t" \ 1788 PTR_ADDU "$9, %[src], %[offset_x2n] \n\t" \ 1789 MMI_ULWC1($f2, $9, 0x00) \ 1790 MMI_ULWC1($f4, $9, 0x04) \ 1791 "pmullh $f6, $f6, $f12 \n\t" \ 1792 "pmullh $f8, $f8, $f12 \n\t" \ 1793 "punpcklbh $f2, $f2, $f0 \n\t" \ 1794 "punpcklbh $f4, $f4, $f0 \n\t" \ 1795 "psubh $f6, $f6, $f2 \n\t" \ 1796 "psubh $f8, $f8, $f4 \n\t" \ 1797 PTR_ADDU "$9, %[src], %[offset] \n\t" \ 1798 MMI_ULWC1($f2, $9, 0x00) \ 1799 MMI_ULWC1($f4, $9, 0x04) \ 1800 "punpcklbh $f2, $f2, $f0 \n\t" \ 1801 "punpcklbh $f4, $f4, $f0 \n\t" \ 1802 "psubh $f6, $f6, $f2 \n\t" \ 1803 "psubh $f8, $f8, $f4 \n\t" \ 1804 "li $8, 0x04 \n\t" \ 1805 "mtc1 $8, $f16 \n\t" \ 1806 NORMALIZE_MMI("$f16") \ 1807 "packushb $f6, $f6, $f8 \n\t" \ 1809 "sdc1 $f6, 0x00(%[dst]) \n\t" \ 1810 "addiu $10, $10, -0x01 \n\t" \ 1811 PTR_ADDU "%[src], %[src], %[stride1] \n\t" \ 1812 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1813 "bnez $10, 1b \n\t" \ 1814 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 1815 [src]"+r"(src), [dst]"+r"(dst) \ 1816 : [offset]"r"(offset), [offset_x2n]"r"(-2*offset), \ 1817 [stride]"r"(stride), [rnd]"m"(rnd), \ 1818 [stride1]"r"(stride-offset), \ 1819 [ff_pw_9]"m"(ff_pw_9) \ 1820 : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \ 1821 "$f12", "$f14", "$f16", "memory" \ 1839 #define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4) \ 1840 PTR_ADDU "$9, %[src], "#A1" \n\t" \ 1841 LOAD($f2, $9, M*0) \ 1842 LOAD($f4, $9, M*4) \ 1845 "pmullh $f2, $f2, %[ff_pw_3] \n\t" \ 1846 "pmullh $f4, $f4, %[ff_pw_3] \n\t" \ 1847 PTR_ADDU "$9, %[src], "#A2" \n\t" \ 1848 LOAD($f6, $9, M*0) \ 1849 LOAD($f8, $9, M*4) \ 1852 "pmullh $f6, $f6, $f12 \n\t" \ 1853 "pmullh $f8, $f8, $f12 \n\t" \ 1854 "psubh $f6, 
$f6, $f2 \n\t" \ 1855 "psubh $f8, $f8, $f4 \n\t" \ 1856 PTR_ADDU "$9, %[src], "#A4" \n\t" \ 1857 LOAD($f2, $9, M*0) \ 1858 LOAD($f4, $9, M*4) \ 1861 "li $8, 0x02 \n\t" \ 1862 "mtc1 $8, $f16 \n\t" \ 1863 "psllh $f2, $f2, $f16 \n\t" \ 1864 "psllh $f4, $f4, $f16 \n\t" \ 1865 "psubh $f6, $f6, $f2 \n\t" \ 1866 "psubh $f8, $f8, $f4 \n\t" \ 1867 PTR_ADDU "$9, %[src], "#A3" \n\t" \ 1868 LOAD($f2, $9, M*0) \ 1869 LOAD($f4, $9, M*4) \ 1872 "pmullh $f2, $f2, $f10 \n\t" \ 1873 "pmullh $f4, $f4, $f10 \n\t" \ 1874 "paddh $f6, $f6, $f2 \n\t" \ 1875 "paddh $f8, $f8, $f4 \n\t" 1885 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ 1887 vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src, \ 1888 mips_reg src_stride, \ 1889 int rnd, int64_t shift) \ 1892 DECLARE_VAR_LOW32; \ 1893 DECLARE_VAR_ADDRT; \ 1895 src -= src_stride; \ 1898 "xor $f0, $f0, $f0 \n\t" \ 1899 LOAD_ROUNDER_MMI("%[rnd]") \ 1900 "ldc1 $f10, %[ff_pw_53] \n\t" \ 1901 "ldc1 $f12, %[ff_pw_18] \n\t" \ 1904 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \ 1905 NORMALIZE_MMI("%[shift]") \ 1906 TRANSFER_DONT_PACK(OP_PUT) \ 1908 PTR_ADDU "$9, %[src], "#A1" \n\t" \ 1909 MMI_ULWC1($f2, $9, 0x08) \ 1911 "mov.d $f6, $f2 \n\t" \ 1912 "paddh $f2, $f2, $f2 \n\t" \ 1913 "paddh $f2, $f2, $f6 \n\t" \ 1914 PTR_ADDU "$9, %[src], "#A2" \n\t" \ 1915 MMI_ULWC1($f6, $9, 0x08) \ 1917 "pmullh $f6, $f6, $f12 \n\t" \ 1918 "psubh $f6, $f6, $f2 \n\t" \ 1919 PTR_ADDU "$9, %[src], "#A3" \n\t" \ 1920 MMI_ULWC1($f2, $9, 0x08) \ 1922 "pmullh $f2, $f2, $f10 \n\t" \ 1923 "paddh $f6, $f6, $f2 \n\t" \ 1924 PTR_ADDU "$9, %[src], "#A4" \n\t" \ 1925 MMI_ULWC1($f2, $9, 0x08) \ 1927 "li $8, 0x02 \n\t" \ 1928 "mtc1 $8, $f16 \n\t" \ 1929 "psllh $f2, $f2, $f16 \n\t" \ 1930 "psubh $f6, $f6, $f2 \n\t" \ 1931 "paddh $f6, $f6, $f14 \n\t" \ 1932 "li $8, 0x06 \n\t" \ 1933 "mtc1 $8, $f16 \n\t" \ 1934 "psrah $f6, $f6, $f16 \n\t" \ 1935 "sdc1 $f6, 0x10(%[dst]) \n\t" \ 1936 "addiu %[h], %[h], -0x01 \n\t" \ 1937 PTR_ADDU "%[src], %[src], 
%[stride_x1] \n\t" \ 1938 PTR_ADDIU "%[dst], %[dst], 0x18 \n\t" \ 1939 "bnez %[h], 1b \n\t" \ 1940 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 1942 [src]"+r"(src), [dst]"+r"(dst) \ 1943 : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride), \ 1944 [stride_x3]"r"(3*src_stride), \ 1945 [rnd]"m"(rnd), [shift]"f"(shift), \ 1946 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \ 1947 [ff_pw_3]"f"(ff_pw_3) \ 1948 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \ 1949 "$f14", "$f16", "memory" \ 1960 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ 1962 OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride, \ 1963 const int16_t *src, int rnd) \ 1966 DECLARE_VAR_ALL64; \ 1967 DECLARE_VAR_ADDRT; \ 1970 rnd -= (-4+58+13-3)*256; \ 1973 "xor $f0, $f0, $f0 \n\t" \ 1974 LOAD_ROUNDER_MMI("%[rnd]") \ 1975 "ldc1 $f10, %[ff_pw_53] \n\t" \ 1976 "ldc1 $f12, %[ff_pw_18] \n\t" \ 1979 MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4) \ 1980 "li $8, 0x07 \n\t" \ 1981 "mtc1 $8, $f16 \n\t" \ 1982 NORMALIZE_MMI("$f16") \ 1984 "paddh $f6, $f6, %[ff_pw_128] \n\t" \ 1985 "paddh $f8, $f8, %[ff_pw_128] \n\t" \ 1986 TRANSFER_DO_PACK(OP) \ 1987 "addiu %[h], %[h], -0x01 \n\t" \ 1988 PTR_ADDU "%[src], %[src], 0x18 \n\t" \ 1989 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 1990 "bnez %[h], 1b \n\t" \ 1991 : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT \ 1993 [src]"+r"(src), [dst]"+r"(dst) \ 1994 : [stride]"r"(stride), [rnd]"m"(rnd), \ 1995 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \ 1996 [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128) \ 1997 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \ 1998 "$f14", "$f16", "memory" \ 2010 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ 2012 OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src, \ 2013 mips_reg stride, int rnd, mips_reg offset) \ 2016 DECLARE_VAR_LOW32; \ 2017 DECLARE_VAR_ADDRT; \ 2022 __asm__ volatile ( \ 2023 "xor $f0, $f0, $f0 \n\t" \ 2024 
LOAD_ROUNDER_MMI("%[rnd]") \ 2025 "ldc1 $f10, %[ff_pw_53] \n\t" \ 2026 "ldc1 $f12, %[ff_pw_18] \n\t" \ 2029 MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4) \ 2030 "li $8, 0x06 \n\t" \ 2031 "mtc1 $8, $f16 \n\t" \ 2032 NORMALIZE_MMI("$f16") \ 2033 TRANSFER_DO_PACK(OP) \ 2034 "addiu %[h], %[h], -0x01 \n\t" \ 2035 PTR_ADDU "%[src], %[src], %[stride] \n\t" \ 2036 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \ 2037 "bnez %[h], 1b \n\t" \ 2038 : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT \ 2040 [src]"+r"(src), [dst]"+r"(dst) \ 2041 : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset), \ 2042 [offset_x3]"r"(3*offset), [stride]"r"(stride), \ 2044 [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18), \ 2045 [ff_pw_3]"f"(ff_pw_3) \ 2046 : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \ 2047 "$f14", "$f16", "memory" \ 2060 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
2061 MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
2086 #define VC1_MSPEL_MC(OP) \ 2087 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ 2088 int hmode, int vmode, int rnd) \ 2090 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ 2091 { NULL, vc1_put_ver_16b_shift1_mmi, \ 2092 vc1_put_ver_16b_shift2_mmi, \ 2093 vc1_put_ver_16b_shift3_mmi }; \ 2094 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ 2095 { NULL, OP ## vc1_hor_16b_shift1_mmi, \ 2096 OP ## vc1_hor_16b_shift2_mmi, \ 2097 OP ## vc1_hor_16b_shift3_mmi }; \ 2098 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \ 2099 { NULL, OP ## vc1_shift1_mmi, \ 2100 OP ## vc1_shift2_mmi, \ 2101 OP ## vc1_shift3_mmi }; \ 2105 static const int shift_value[] = { 0, 5, 1, 5 }; \ 2106 int shift = (shift_value[hmode]+shift_value[vmode])>>1; \ 2108 LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \ 2110 r = (1<<(shift-1)) + rnd-1; \ 2111 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \ 2113 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \ 2117 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \ 2123 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \ 2125 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ 2126 int stride, int hmode, int vmode, int rnd)\ 2128 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ 2129 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 2130 dst += 8*stride; src += 8*stride; \ 2131 OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ 2132 OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ 2139 #define DECLARE_FUNCTION(a, b) \ 2140 void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \ 2141 const uint8_t *src, \ 2145 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ 2147 void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst, \ 2148 const uint8_t *src, \ 2152 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ 2154 void ff_put_vc1_mspel_mc ## a 
## b ## _16_mmi(uint8_t *dst, \ 2155 const uint8_t *src, \ 2159 put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ 2161 void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst, \ 2162 const uint8_t *src, \ 2166 avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ 2188 #define CHROMA_MC_8_MMI \ 2189 "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ 2190 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 2191 "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ 2192 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 2193 "punpckhbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ 2194 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 2195 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \ 2196 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 2198 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \ 2199 "pmullh %[ftmp5], %[ftmp5], %[A] \n\t" \ 2200 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \ 2201 "pmullh %[ftmp6], %[ftmp6], %[B] \n\t" \ 2202 "pmullh %[ftmp3], %[ftmp3], %[C] \n\t" \ 2203 "pmullh %[ftmp7], %[ftmp7], %[C] \n\t" \ 2204 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \ 2205 "pmullh %[ftmp8], %[ftmp8], %[D] \n\t" \ 2207 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 2208 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 2209 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ 2210 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \ 2212 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \ 2213 "paddh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" \ 2214 "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ 2215 "paddh %[ftmp5], %[ftmp5], %[ff_pw_28] \n\t" \ 2217 "psrlh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ 2218 "psrlh %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ 2219 "packushb %[ftmp1], %[ftmp1], %[ftmp5] \n\t" 2222 #define CHROMA_MC_4_MMI \ 2223 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \ 2224 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ 2225 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \ 2226 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ 2228 "pmullh %[ftmp1], %[ftmp1], %[A] \n\t" \ 2229 "pmullh %[ftmp2], %[ftmp2], %[B] \n\t" \ 2230 "pmullh %[ftmp3], 
%[ftmp3], %[C] \n\t" \ 2231 "pmullh %[ftmp4], %[ftmp4], %[D] \n\t" \ 2233 "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ 2234 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \ 2235 "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ 2236 "paddh %[ftmp1], %[ftmp1], %[ff_pw_28] \n\t" \ 2238 "psrlh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \ 2239 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" 2244 ptrdiff_t stride,
int h,
int x,
int y)
2246 const int A = (8 - x) * (8 - y);
2247 const int B = (x) * (8 - y);
2248 const int C = (8 - x) * (y);
2249 const int D = (x) * (y);
2255 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2258 "li %[tmp0], 0x06 \n\t" 2259 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2260 "mtc1 %[tmp0], %[ftmp9] \n\t" 2261 "pshufh %[A], %[A], %[ftmp0] \n\t" 2262 "pshufh %[B], %[B], %[ftmp0] \n\t" 2263 "pshufh %[C], %[C], %[ftmp0] \n\t" 2264 "pshufh %[D], %[D], %[ftmp0] \n\t" 2267 MMI_ULDC1(%[ftmp1], %[src], 0x00)
2268 MMI_ULDC1(%[ftmp2], %[src], 0x01)
2269 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2270 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2271 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2275 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2276 "addiu %[h], %[h], -0x01 \n\t" 2277 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2278 "bnez %[h], 1b \n\t" 2279 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2280 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2281 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2282 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2283 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2286 [tmp0]
"=&r"(tmp[0]),
2287 [src]
"+&r"(src), [dst]
"+&r"(dst),
2290 [
A]
"f"(
A), [B]
"f"(B),
2291 [
C]
"f"(
C), [D]
"f"(D),
2299 ptrdiff_t stride,
int h,
int x,
int y)
2301 const int A = (8 - x) * (8 - y);
2302 const int B = (x) * (8 - y);
2303 const int C = (8 - x) * (y);
2304 const int D = (x) * (y);
2310 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2313 "li %[tmp0], 0x06 \n\t" 2314 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2315 "mtc1 %[tmp0], %[ftmp5] \n\t" 2316 "pshufh %[A], %[A], %[ftmp0] \n\t" 2317 "pshufh %[B], %[B], %[ftmp0] \n\t" 2318 "pshufh %[C], %[C], %[ftmp0] \n\t" 2319 "pshufh %[D], %[D], %[ftmp0] \n\t" 2322 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2323 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2324 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2325 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2326 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2330 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2331 "addiu %[h], %[h], -0x01 \n\t" 2332 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2333 "bnez %[h], 1b \n\t" 2334 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2335 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2336 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2337 [tmp0]
"=&r"(tmp[0]),
2340 [src]
"+&r"(src), [dst]
"+&r"(dst),
2343 [
A]
"f"(
A), [B]
"f"(B),
2344 [
C]
"f"(
C), [D]
"f"(D),
2352 ptrdiff_t stride,
int h,
int x,
int y)
2354 const int A = (8 - x) * (8 - y);
2355 const int B = (x) * (8 - y);
2356 const int C = (8 - x) * (y);
2357 const int D = (x) * (y);
2363 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2366 "li %[tmp0], 0x06 \n\t" 2367 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2368 "mtc1 %[tmp0], %[ftmp9] \n\t" 2369 "pshufh %[A], %[A], %[ftmp0] \n\t" 2370 "pshufh %[B], %[B], %[ftmp0] \n\t" 2371 "pshufh %[C], %[C], %[ftmp0] \n\t" 2372 "pshufh %[D], %[D], %[ftmp0] \n\t" 2375 MMI_ULDC1(%[ftmp1], %[src], 0x00)
2376 MMI_ULDC1(%[ftmp2], %[src], 0x01)
2377 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2378 MMI_ULDC1(%[ftmp3], %[src], 0x00)
2379 MMI_ULDC1(%[ftmp4], %[src], 0x01)
2383 MMI_LDC1(%[ftmp2], %[dst], 0x00)
2384 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 2386 MMI_SDC1(%[ftmp1], %[dst], 0x00)
2387 "addiu %[h], %[h], -0x01 \n\t" 2388 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2389 "bnez %[h], 1b \n\t" 2390 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2391 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2392 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2393 [ftmp6]
"=&f"(ftmp[6]), [ftmp7]
"=&f"(ftmp[7]),
2394 [ftmp8]
"=&f"(ftmp[8]), [ftmp9]
"=&f"(ftmp[9]),
2395 [tmp0]
"=&r"(tmp[0]),
2398 [src]
"+&r"(src), [dst]
"+&r"(dst),
2401 [
A]
"f"(
A), [B]
"f"(B),
2402 [
C]
"f"(
C), [D]
"f"(D),
2410 ptrdiff_t stride,
int h,
int x,
int y)
2412 const int A = (8 - x) * (8 - y);
2413 const int B = ( x) * (8 - y);
2414 const int C = (8 - x) * ( y);
2415 const int D = ( x) * ( y);
2421 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2424 "li %[tmp0], 0x06 \n\t" 2425 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" 2426 "mtc1 %[tmp0], %[ftmp5] \n\t" 2427 "pshufh %[A], %[A], %[ftmp0] \n\t" 2428 "pshufh %[B], %[B], %[ftmp0] \n\t" 2429 "pshufh %[C], %[C], %[ftmp0] \n\t" 2430 "pshufh %[D], %[D], %[ftmp0] \n\t" 2433 MMI_ULWC1(%[ftmp1], %[src], 0x00)
2434 MMI_ULWC1(%[ftmp2], %[src], 0x01)
2435 PTR_ADDU "%[src], %[src], %[stride] \n\t" 2436 MMI_ULWC1(%[ftmp3], %[src], 0x00)
2437 MMI_ULWC1(%[ftmp4], %[src], 0x01)
2441 MMI_LWC1(%[ftmp2], %[dst], 0x00)
2442 "pavgb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" 2444 MMI_SWC1(%[ftmp1], %[dst], 0x00)
2445 "addiu %[h], %[h], -0x01 \n\t" 2446 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" 2447 "bnez %[h], 1b \n\t" 2448 : [ftmp0]
"=&f"(ftmp[0]), [ftmp1]
"=&f"(ftmp[1]),
2449 [ftmp2]
"=&f"(ftmp[2]), [ftmp3]
"=&f"(ftmp[3]),
2450 [ftmp4]
"=&f"(ftmp[4]), [ftmp5]
"=&f"(ftmp[5]),
2451 [tmp0]
"=&r"(tmp[0]),
2454 [src]
"+&r"(src), [dst]
"+&r"(dst),
2457 [
A]
"f"(
A), [B]
"f"(B),
2458 [
C]
"f"(
C), [D]
"f"(D),
void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
static int shift(int a, int b)
void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)
void(* vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd)
void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
#define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3,fr_t0, fr_t1, fr_t2, fr_t3)
brief: Transpose 4X4 half word packaged data.
#define DECLARE_FUNCTION(a, b)
Macro to ease bicubic filter interpolation functions declarations.
static void vc1_loop_filter(uint8_t *src, int step, int stride, int len, int pq)
VC-1 in-loop deblocking filter.
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
void(* vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd, int64_t shift)
1/4 shift bicubic interpolation
The exact code depends on how similar the blocks are and how related they are to the block
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
#define LOAD_ROUNDER_MMI(ROUND)
Compute the rounder 32-r or 8-r and unpacks it to $f14.
#define VC1_SHIFT2(OP, OPNAME)
Purely vertical or horizontal 1/2 shift interpolation.
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
static void vc1_put_ver_16b_shift2_mmi(int16_t *dst, const uint8_t *src, mips_reg stride, int rnd, int64_t shift)
Sacrificing $f12 makes it possible to pipeline loads from src.
void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
#define i(width, name, range_min, range_max)
static const int shift1[6]
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)
static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
VC-1 in-loop deblocking filter for one line.
simple assert() macros that are a bit more flexible than ISO C assert().
void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
#define VC1_MSPEL_MC(OP)
Interpolate fractional pel values by applying proper vertical then horizontal filter.
void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride, int right_stride, int flags)
void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)
Macro to build the 8bits, any direction, version of vc1_put_shift[13].
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd)
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion 
y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff)*mv_scale Intra DC Prediction block[y][x] dc[1]
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion 
y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
#define SHIFT2_LINE(OFF, R0, R1, R2, R3)
void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int32_t h)
void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
#define flags(name, subs,...)
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
GLint GLenum GLboolean GLsizei stride
void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
static double clip(void *opaque, double val)
Clip value val in the minval - maxval range.
void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
static const double coeff[2][5]
#define VC1_HOR_16B_SHIFT2(OP, OPNAME)
Data is already unpacked, so some operations can directly be made from memory.
void(* vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd, mips_reg offset)
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)
Macro to build the vertical 16bits version of vc1_put_shift[13].
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)
Macro to build the horizontal 16bits version of vc1_put_shift[13].
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step