    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
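    /* First lifting step (apparently the SSE2 counterpart of the
     * ff_snow_horizontal_compose97i_mmx function further below). The setup
     * block builds two per-word constants: xmm7 = -1 in every 16-bit lane and
     * xmm3 = -3 << 13, so that pmulhw against xmm3 computes roughly
     * -(3*x) >> 3. Each loop iteration then performs, 16 coefficients at a
     * time, something close to
     *     b[i] -= (3*(ref[i] + ref[i+1]) + rounding) >> 3;
     * (an interpretation of the instruction sequence; the -1 in xmm7 supplies
     * the rounding bias). */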
    __asm__ volatile(
        "pcmpeqd %%xmm7, %%xmm7 \n\t"
        "pcmpeqd %%xmm3, %%xmm3 \n\t"
        "psllw       $1, %%xmm3 \n\t"
        "paddw   %%xmm7, %%xmm3 \n\t"
        "psllw      $13, %%xmm3 \n\t"
    ::);
    for(; i<w_l-15; i+=16){
        __asm__ volatile(
            "movdqu   (%1), %%xmm1 \n\t"
            "movdqu 16(%1), %%xmm5 \n\t"
            "movdqu  2(%1), %%xmm2 \n\t"
            "movdqu 18(%1), %%xmm6 \n\t"
            "paddw  %%xmm1, %%xmm2 \n\t"
            "paddw  %%xmm5, %%xmm6 \n\t"
            "paddw  %%xmm7, %%xmm2 \n\t"
            "paddw  %%xmm7, %%xmm6 \n\t"
            "pmulhw %%xmm3, %%xmm2 \n\t"
            "pmulhw %%xmm3, %%xmm6 \n\t"
            "paddw    (%0), %%xmm2 \n\t"
            "paddw  16(%0), %%xmm6 \n\t"
            "movdqa %%xmm2, (%0)   \n\t"
            "movdqa %%xmm6, 16(%0) \n\t"
            :: "r"(&b[i]), "r"(&ref[i]));
    }
    for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
        dst[i] = dst[i] - (b[i] + b[i + 1]);
    }
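    /* The SSE2 loop below repeats the same dst[i] -= b[i] + b[i+1] update 16
     * coefficients at a time; the scalar loop above only ran until &dst[i]
     * was 32-byte aligned, so the vector code can use aligned movdqa accesses
     * on dst while reading b[] unaligned. */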
    for(; i<w_r-15; i+=16){
        __asm__ volatile(
            "movdqu   (%1), %%xmm1 \n\t"
            "movdqu 16(%1), %%xmm5 \n\t"
            "movdqu  2(%1), %%xmm2 \n\t"
            "movdqu 18(%1), %%xmm6 \n\t"
            "paddw  %%xmm1, %%xmm2 \n\t"
            "paddw  %%xmm5, %%xmm6 \n\t"
            "movdqa   (%0), %%xmm0 \n\t"
            "movdqa 16(%0), %%xmm4 \n\t"
            "psubw  %%xmm2, %%xmm0 \n\t"
            "psubw  %%xmm6, %%xmm4 \n\t"
            "movdqa %%xmm0, (%0)   \n\t"
            "movdqa %%xmm4, 16(%0) \n\t"
            :: "r"(&dst[i]), "r"(&b[i]));
    }
    __asm__ volatile(
        "psllw      $15, %%xmm7 \n\t"
        "pcmpeqw %%xmm6, %%xmm6 \n\t"
        "psrlw      $13, %%xmm6 \n\t"
        "paddw   %%xmm7, %%xmm6 \n\t"
    ::);
    for(; i<w_l-15; i+=16){
        __asm__ volatile(
            "movdqu   (%1), %%xmm0 \n\t"
            "movdqu 16(%1), %%xmm4 \n\t"
            "movdqu  2(%1), %%xmm1 \n\t"
            "movdqu 18(%1), %%xmm5 \n\t"
            "paddw  %%xmm6, %%xmm0 \n\t"
            "paddw  %%xmm6, %%xmm4 \n\t"
            "paddw  %%xmm7, %%xmm1 \n\t"
            "paddw  %%xmm7, %%xmm5 \n\t"
            "pavgw  %%xmm1, %%xmm0 \n\t"
            "pavgw  %%xmm5, %%xmm4 \n\t"
            "psubw  %%xmm7, %%xmm0 \n\t"
            "psubw  %%xmm7, %%xmm4 \n\t"
            "psraw      $1, %%xmm0 \n\t"
            "psraw      $1, %%xmm4 \n\t"
            "movdqa   (%0), %%xmm1 \n\t"
            "movdqa 16(%0), %%xmm5 \n\t"
            "paddw  %%xmm1, %%xmm0 \n\t"
            "paddw  %%xmm5, %%xmm4 \n\t"
            "psraw      $2, %%xmm0 \n\t"
            "psraw      $2, %%xmm4 \n\t"
            "paddw  %%xmm1, %%xmm0 \n\t"
            "paddw  %%xmm5, %%xmm4 \n\t"
            "movdqa %%xmm0, (%0)   \n\t"
            "movdqa %%xmm4, 16(%0) \n\t"
            :: "r"(&b[i]), "r"(&ref[i]));
    }
    b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
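    /* b[0] is recomputed above from the copy b_0 saved before the vector loop,
     * which presumably overwrites the first coefficient with a cheaper,
     * boundary-incorrect value. */

    /* Last lifting step, writing into temp[]: a scalar loop runs until
     * &temp[i] is 32-byte aligned, then the SSE2 loop computes, with
     * s = b[i] + b[i+1],
     *     temp[i] = src[i] + s + (s >> 1);    // ~ src[i] + 3*s/2
     * which appears to match the scalar form below up to rounding, assuming
     * W_AM == 3 and W_AS == 1 (the constants are defined elsewhere). */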
    for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
        temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
    }
    for(; i<w_r-7; i+=8){
        __asm__ volatile(
            "movdqu  2(%1), %%xmm2 \n\t"
            "movdqu 18(%1), %%xmm6 \n\t"
            "paddw    (%1), %%xmm2 \n\t"
            "paddw  16(%1), %%xmm6 \n\t"
            "movdqu   (%0), %%xmm0 \n\t"
            "movdqu 16(%0), %%xmm4 \n\t"
            "paddw  %%xmm2, %%xmm0 \n\t"
            "paddw  %%xmm6, %%xmm4 \n\t"
            "psraw      $1, %%xmm2 \n\t"
            "psraw      $1, %%xmm6 \n\t"
            "paddw  %%xmm0, %%xmm2 \n\t"
            "paddw  %%xmm4, %%xmm6 \n\t"
            "movdqa %%xmm2, (%2)   \n\t"
            "movdqa %%xmm6, 16(%2) \n\t"
            :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]));
    }
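    /* Interleave stage: the low band (first half of b) and the high band
     * (temp) are merged word-by-word back into b using punpcklwd/punpckhwd.
     * The scalar loop below first steps i down in pairs until it reaches a
     * 64-sample boundary; each asm iteration then emits 128 bytes (64
     * interleaved coefficients) of output, walking backwards through the
     * buffer. */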
    for (; (i & 0x3E) != 0x3E; i-=2){
    for (i-=62; i>=0; i-=64){
        __asm__ volatile(
            "movdqa      (%1), %%xmm0 \n\t"
            "movdqa    16(%1), %%xmm2 \n\t"
            "movdqa    32(%1), %%xmm4 \n\t"
            "movdqa    48(%1), %%xmm6 \n\t"
            "movdqa      (%1), %%xmm1 \n\t"
            "movdqa    16(%1), %%xmm3 \n\t"
            "movdqa    32(%1), %%xmm5 \n\t"
            "movdqa    48(%1), %%xmm7 \n\t"
            "punpcklwd   (%2), %%xmm0 \n\t"
            "punpcklwd 16(%2), %%xmm2 \n\t"
            "punpcklwd 32(%2), %%xmm4 \n\t"
            "punpcklwd 48(%2), %%xmm6 \n\t"
            "movdqa %%xmm0,   (%0)    \n\t"
            "movdqa %%xmm2, 32(%0)    \n\t"
            "movdqa %%xmm4, 64(%0)    \n\t"
            "movdqa %%xmm6, 96(%0)    \n\t"
            "punpckhwd   (%2), %%xmm1 \n\t"
            "punpckhwd 16(%2), %%xmm3 \n\t"
            "punpckhwd 32(%2), %%xmm5 \n\t"
            "punpckhwd 48(%2), %%xmm7 \n\t"
            "movdqa %%xmm1,  16(%0)   \n\t"
            "movdqa %%xmm3,  48(%0)   \n\t"
            "movdqa %%xmm5,  80(%0)   \n\t"
            "movdqa %%xmm7, 112(%0)   \n\t"
            :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]));
    }
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
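    /* The MMX version mirrors the SSE2 function above step by step, using
     * 64-bit mm registers and 8 coefficients per loop iteration instead of 16. */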
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "pcmpeqw %%mm3, %%mm3 \n\t"
        "psllw      $1, %%mm3 \n\t"
        "paddw   %%mm7, %%mm3 \n\t"
        "psllw     $13, %%mm3 \n\t"
    ::);
    for(; i<w_l-7; i+=8){
        __asm__ volatile(
            "movq    (%1), %%mm2 \n\t"
            "movq   8(%1), %%mm6 \n\t"
            "paddw  2(%1), %%mm2 \n\t"
            "paddw 10(%1), %%mm6 \n\t"
            "paddw  %%mm7, %%mm2 \n\t"
            "paddw  %%mm7, %%mm6 \n\t"
            "pmulhw %%mm3, %%mm2 \n\t"
            "pmulhw %%mm3, %%mm6 \n\t"
            "paddw   (%0), %%mm2 \n\t"
            "paddw  8(%0), %%mm6 \n\t"
            "movq   %%mm2, (%0)  \n\t"
            "movq   %%mm6, 8(%0) \n\t"
            :: "r"(&b[i]), "r"(&ref[i]));
    }
    for(; i<w_r-7; i+=8){
        __asm__ volatile(
            "movq    (%1), %%mm2 \n\t"
            "movq   8(%1), %%mm6 \n\t"
            "paddw  2(%1), %%mm2 \n\t"
            "paddw 10(%1), %%mm6 \n\t"
            "movq    (%0), %%mm0 \n\t"
            "movq   8(%0), %%mm4 \n\t"
            "psubw  %%mm2, %%mm0 \n\t"
            "psubw  %%mm6, %%mm4 \n\t"
            "movq   %%mm0, (%0)  \n\t"
            "movq   %%mm4, 8(%0) \n\t"
            :: "r"(&dst[i]), "r"(&b[i]));
    }
    b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
    __asm__ volatile(
        "psllw     $15, %%mm7 \n\t"
        "pcmpeqw %%mm6, %%mm6 \n\t"
        "psrlw     $13, %%mm6 \n\t"
        "paddw   %%mm7, %%mm6 \n\t"
    ::);
    for(; i<w_l-7; i+=8){
        __asm__ volatile(
            "movq    (%1), %%mm0 \n\t"
            "movq   8(%1), %%mm4 \n\t"
            "movq   2(%1), %%mm1 \n\t"
            "movq  10(%1), %%mm5 \n\t"
            "paddw  %%mm6, %%mm0 \n\t"
            "paddw  %%mm6, %%mm4 \n\t"
            "paddw  %%mm7, %%mm1 \n\t"
            "paddw  %%mm7, %%mm5 \n\t"
            "pavgw  %%mm1, %%mm0 \n\t"
            "pavgw  %%mm5, %%mm4 \n\t"
            "psubw  %%mm7, %%mm0 \n\t"
            "psubw  %%mm7, %%mm4 \n\t"
            "psraw     $1, %%mm0 \n\t"
            "psraw     $1, %%mm4 \n\t"
            "movq    (%0), %%mm1 \n\t"
            "movq   8(%0), %%mm5 \n\t"
            "paddw  %%mm1, %%mm0 \n\t"
            "paddw  %%mm5, %%mm4 \n\t"
            "psraw     $2, %%mm0 \n\t"
            "psraw     $2, %%mm4 \n\t"
            "paddw  %%mm1, %%mm0 \n\t"
            "paddw  %%mm5, %%mm4 \n\t"
            "movq   %%mm0, (%0)  \n\t"
            "movq   %%mm4, 8(%0) \n\t"
            :: "r"(&b[i]), "r"(&ref[i]));
    }
    for(; i<w_r-7; i+=8){
        __asm__ volatile(
            "movq   2(%1), %%mm2 \n\t"
            "movq  10(%1), %%mm6 \n\t"
            "paddw   (%1), %%mm2 \n\t"
            "paddw  8(%1), %%mm6 \n\t"
            "movq    (%0), %%mm0 \n\t"
            "movq   8(%0), %%mm4 \n\t"
            "paddw  %%mm2, %%mm0 \n\t"
            "paddw  %%mm6, %%mm4 \n\t"
            "psraw     $1, %%mm2 \n\t"
            "psraw     $1, %%mm6 \n\t"
            "paddw  %%mm0, %%mm2 \n\t"
            "paddw  %%mm4, %%mm6 \n\t"
            "movq   %%mm2, (%2)  \n\t"
            "movq   %%mm6, 8(%2) \n\t"
            :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]));
    }
    for (; (i & 0x1E) != 0x1E; i-=2){
    for (i-=30; i>=0; i-=32){
        __asm__ volatile(
            "movq        (%1), %%mm0 \n\t"
            "movq       8(%1), %%mm2 \n\t"
            "movq      16(%1), %%mm4 \n\t"
            "movq      24(%1), %%mm6 \n\t"
            "movq        (%1), %%mm1 \n\t"
            "movq       8(%1), %%mm3 \n\t"
            "movq      16(%1), %%mm5 \n\t"
            "movq      24(%1), %%mm7 \n\t"
            "punpcklwd   (%2), %%mm0 \n\t"
            "punpcklwd  8(%2), %%mm2 \n\t"
            "punpcklwd 16(%2), %%mm4 \n\t"
            "punpcklwd 24(%2), %%mm6 \n\t"
            "movq %%mm0,  (%0)       \n\t"
            "movq %%mm2, 16(%0)      \n\t"
            "movq %%mm4, 32(%0)      \n\t"
            "movq %%mm6, 48(%0)      \n\t"
            "punpckhwd   (%2), %%mm1 \n\t"
            "punpckhwd  8(%2), %%mm3 \n\t"
            "punpckhwd 16(%2), %%mm5 \n\t"
            "punpckhwd 24(%2), %%mm7 \n\t"
            "movq %%mm1,  8(%0)      \n\t"
            "movq %%mm3, 24(%0)      \n\t"
            "movq %%mm5, 40(%0)      \n\t"
            "movq %%mm7, 56(%0)      \n\t"
            :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]));
    }
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"FF_REG_d"), %%"t0"   \n\t"\
    ""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\
    ""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\
    ""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    "psubw %%"s0", %%"t0" \n\t"\
    "psubw %%"s1", %%"t1" \n\t"\
    "psubw %%"s2", %%"t2" \n\t"\
    "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
    "movdqa %%"s0", ("w",%%"FF_REG_d")   \n\t"\
    "movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\
    "movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
    "psraw $"n", %%"t0" \n\t"\
    "psraw $"n", %%"t1" \n\t"\
    "psraw $"n", %%"t2" \n\t"\
    "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    "paddw %%"s0", %%"t0" \n\t"\
    "paddw %%"s1", %%"t1" \n\t"\
    "paddw %%"s2", %%"t2" \n\t"\
    "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
    "pmulhw %%"s0", %%"t0" \n\t"\
    "pmulhw %%"s1", %%"t1" \n\t"\
    "pmulhw %%"s2", %%"t2" \n\t"\
    "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movdqa %%"s0", %%"t0" \n\t"\
    "movdqa %%"s1", %%"t1" \n\t"\
    "movdqa %%"s2", %%"t2" \n\t"\
    "movdqa %%"s3", %%"t3" \n\t"

        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqw %%xmm0, %%xmm0 \n\t"
        "pcmpeqw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "psllw $13, %%xmm2 \n\t"
        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "pcmpeqw %%xmm5, %%xmm5 \n\t"
        "psllw $15, %%xmm7 \n\t"
        "psrlw $13, %%xmm5 \n\t"
        "paddw %%xmm7, %%xmm5 \n\t"
        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
        "movq   (%2,%%"FF_REG_d"), %%xmm1 \n\t"
        "movq  8(%2,%%"FF_REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm0 \n\t"
        "pavgw %%xmm3, %%xmm2 \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm4 \n\t"
        "pavgw %%xmm3, %%xmm6 \n\t"
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
        "sub $64, %%"FF_REG_d" \n\t"
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
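/* Vertical compose, MMX flavour: the helper macros below are the 64-bit
 * analogues of the SSE2 ones used above. Each emits four instructions covering
 * a 32-byte stripe (offsets 0, 8, 16, 24) of one of the six row pointers
 * b0..b5, addressed through the running byte offset in FF_REG_d, which here is
 * stepped by 32 per iteration instead of 64. */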
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"FF_REG_d"), %%"t0"   \n\t"\
    ""op" 8("r",%%"FF_REG_d"), %%"t1"  \n\t"\
    ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
    ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
    "movq %%"s0", ("w",%%"FF_REG_d")   \n\t"\
    "movq %%"s1", 8("w",%%"FF_REG_d")  \n\t"\
    "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
    "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movq %%"s0", %%"t0" \n\t"\
    "movq %%"s1", %%"t1" \n\t"\
    "movq %%"s2", %%"t2" \n\t"\
    "movq %%"s3", %%"t3" \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw %%mm0, %%mm0 \n\t"
        "pcmpeqw %%mm2, %%mm2 \n\t"
        "paddw %%mm2, %%mm2 \n\t"
        "paddw %%mm0, %%mm2 \n\t"
        "psllw $13, %%mm2 \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "pcmpeqw %%mm5, %%mm5 \n\t"
        "psllw $15, %%mm7 \n\t"
        "psrlw $13, %%mm5 \n\t"
        "paddw %%mm7, %%mm5 \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq   (%2,%%"FF_REG_d"), %%mm1 \n\t"
        "movq  8(%2,%%"FF_REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm0 \n\t"
        "pavgw %%mm3, %%mm2 \n\t"
        "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
        "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm4 \n\t"
        "pavgw %%mm3, %%mm6 \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
        "sub $32, %%"FF_REG_d" \n\t"
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
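/* inner_add_yblock helpers (SSE2): these macros implement the OBMC
 * accumulation. For each destination pixel the four overlapping prediction
 * blocks (pointers held in the array addressed through FF_REG_a) are read as
 * bytes, zero-extended, multiplied by the matching obmc window weights
 * (pmullw) and summed with unsigned saturation (paddusw). The per-width
 * function bodies then add the IDWT line from dst_array, apply the bias built
 * in xmm3, shift back down and pack the result into dst8, while the _end
 * macros advance the block, obmc and destination pointers to the next row. */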
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    "mov %7, %%"FF_REG_c" \n\t"\
    "mov %4, %%"FF_REG_S" \n\t"\
    "pxor %%xmm7, %%xmm7 \n\t"\
    "pcmpeqd %%xmm3, %%xmm3 \n\t"\
    "psllw $15, %%xmm3 \n\t"\
    "psrlw $12, %%xmm3 \n\t"\
    "mov %1, %%"FF_REG_D" \n\t"\
    "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
    "add %3, %%"FF_REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
    "add $32, %%"FF_REG_S" \n\t"\
    "add %%"FF_REG_c", %0 \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
    "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
    "sal $1, %%"FF_REG_c" \n\t"\
    "add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "sar $1, %%"FF_REG_c" \n\t"\
    snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
    "add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    snow_inner_add_yblock_sse2_end_common2

    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_8("2", "8")
    snow_inner_add_yblock_sse2_accum_8("1", "128")
    snow_inner_add_yblock_sse2_accum_8("0", "136")
    "mov %0, %%"FF_REG_d" \n\t"
    "movdqa (%%"FF_REG_D"), %%xmm0 \n\t"
    "movdqa %%xmm1, %%xmm2 \n\t"
    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm2 \n\t"
    "paddd %%xmm2, %%xmm0 \n\t"
    "movdqa 16(%%"FF_REG_D"), %%xmm2 \n\t"
    "paddd %%xmm1, %%xmm2 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm2 \n\t"

    "mov %1, %%"FF_REG_D" \n\t"
    "mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t"
    "add %3, %%"FF_REG_D" \n\t"

    "movdqa (%%"FF_REG_D"), %%xmm4 \n\t"
    "movdqa %%xmm5, %%xmm6 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "punpcklwd %%xmm7, %%xmm6 \n\t"
    "paddd %%xmm6, %%xmm4 \n\t"
    "movdqa 16(%%"FF_REG_D"), %%xmm6 \n\t"
    "paddd %%xmm5, %%xmm6 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm6 \n\t"

    "psrad $8, %%xmm0 \n\t"
    "psrad $8, %%xmm2 \n\t"
    "packssdw %%xmm2, %%xmm0 \n\t"
    "packuswb %%xmm7, %%xmm0 \n\t"
    "movq %%xmm0, (%%"FF_REG_d") \n\t"

    "psrad $8, %%xmm4 \n\t"
    "psrad $8, %%xmm6 \n\t"
    "packssdw %%xmm6, %%xmm4 \n\t"
    "packuswb %%xmm7, %%xmm4 \n\t"
    "movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t"
    snow_inner_add_yblock_sse2_end_8
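/* 16-pixel-wide variant: with an obmc stride of 32 (per the function name
 * below) a whole row fits in the two word-lane accumulators, so unlike the
 * 8-wide path above, which widens to 32 bits (punpcklwd/punpckhwd plus paddd
 * and psrad $8), the sums stay in 16-bit lanes and are simply shifted, added
 * to the IDWT line and packed with packuswb. */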
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride,
                                                uint8_t * * block, int b_w, x86_reg b_h,
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_16("2", "16")
    snow_inner_add_yblock_sse2_accum_16("1", "512")
    snow_inner_add_yblock_sse2_accum_16("0", "528")

    "mov %0, %%"FF_REG_d" \n\t"
    "psrlw $4, %%xmm1 \n\t"
    "psrlw $4, %%xmm5 \n\t"
    "paddw (%%"FF_REG_D"), %%xmm1 \n\t"
    "paddw 16(%%"FF_REG_D"), %%xmm5 \n\t"
    "paddw %%xmm3, %%xmm1 \n\t"
    "paddw %%xmm3, %%xmm5 \n\t"
    "psraw $4, %%xmm1 \n\t"
    "psraw $4, %%xmm5 \n\t"
    "packuswb %%xmm5, %%xmm1 \n\t"

    "movdqu %%xmm1, (%%"FF_REG_d") \n\t"

    snow_inner_add_yblock_sse2_end_16
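/* MMX fallback for the same operation: each start/accum step works on 8 pixels
 * (two movd loads), so the 16-pixel-wide body further below runs the
 * start/accum/mix sequence twice per row, once per 8-pixel half, before its
 * _end macro advances the pointers. */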
#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    "mov %7, %%"FF_REG_c" \n\t"\
    "mov %4, %%"FF_REG_S" \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pcmpeqd %%mm3, %%mm3 \n\t"\
    "psllw $15, %%mm3 \n\t"\
    "psrlw $12, %%mm3 \n\t"\
    "mov %1, %%"FF_REG_D" \n\t"\
    "mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\
    "add %3, %%"FF_REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
    "mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\
    "movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\
    "movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%mm7, %%"out_reg1" \n\t"\
    "punpcklbw %%mm7, %%"out_reg2" \n\t"\
    "movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\
    "movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "pmullw %%mm0, %%"out_reg1" \n\t"\
    "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
    snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
    "paddusw %%mm2, %%mm1 \n\t"\
    "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
    "mov %0, %%"FF_REG_d" \n\t"\
    "psrlw $4, %%mm1 \n\t"\
    "psrlw $4, %%mm5 \n\t"\
    "paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\
    "paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\
    "paddw %%mm3, %%mm1 \n\t"\
    "paddw %%mm3, %%mm5 \n\t"\
    "psraw $4, %%mm1 \n\t"\
    "psraw $4, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm1 \n\t"\
    "movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $"s_step", %%"FF_REG_S" \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\
    "add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\
    "add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\
    "add %%"FF_REG_c", %0 \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    "%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d"");

    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "8", "0")
    snow_inner_add_yblock_mmx_accum("1", "128", "0")
    snow_inner_add_yblock_mmx_accum("0", "136", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_end("16")
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "16", "0")
    snow_inner_add_yblock_mmx_accum("1", "512", "0")
    snow_inner_add_yblock_mmx_accum("0", "528", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")

    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
    snow_inner_add_yblock_mmx_accum("2", "24", "8")
    snow_inner_add_yblock_mmx_accum("1", "520", "8")
    snow_inner_add_yblock_mmx_accum("0", "536", "8")
    snow_inner_add_yblock_mmx_mix("16", "8")
    snow_inner_add_yblock_mmx_end("32")
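/* Public wrappers: pick the specialised inner loop based on block width and
 * obmc stride, falling back to the generic C ff_snow_inner_add_yblock()
 * otherwise. For the 8-wide case the SSE2 wrapper appears to use the
 * _bh_even_ SSE2 routine only for even block heights and the MMX routine for
 * the rest. */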
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride,
                                          uint8_t * * block, int b_w, int b_h,
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, src_y, src_stride, sb, add, dst8);
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride,
                                         uint8_t * * block, int b_w, int b_h,
871 else if (b_w == 8 && obmc_stride == 16)
872 inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
874 ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);