                 int line_size, int h);
                 int line_size, int h);
                 int line_size, int h);

#define hadamard_func(cpu)                                                   \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,          \
                                  uint8_t *src2, int stride, int h);         \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,        \
                                    uint8_t *src2, int stride, int h);
        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);

        return score1 + FFABS(score2) * 8;

    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);

        return score1 + FFABS(score2) * 8;
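
/*
 * Hedged sketch (illustration only, not code from this file): the score
 * combination applied by the nsse wrappers above.  score1 is a plain SSE of
 * the two blocks, score2 the difference between a high-frequency "noise"
 * measure of each block, and 8 is the default weight used when no codec
 * context supplies one.  The function name is made up for this sketch.
 */
static inline int nsse_combine_sketch(int score1, int score2)
{
    return score1 + FFABS(score2) * 8;
}
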
#define SUM(in0, in1, out0, out1)                       \
    "movq (%0), %%mm2\n"                                \
    "movq 8(%0), %%mm3\n"                               \
    "movq %%mm2, " #out0 "\n"                           \
    "movq %%mm3, " #out1 "\n"                           \
    "psubusb " #in0 ", %%mm2\n"                         \
    "psubusb " #in1 ", %%mm3\n"                         \
    "psubusb " #out0 ", " #in0 "\n"                     \
    "psubusb " #out1 ", " #in1 "\n"                     \
    "por %%mm2, " #in0 "\n"                             \
    "por %%mm3, " #in1 "\n"                             \
    "movq " #in0 ", %%mm2\n"                            \
    "movq " #in1 ", %%mm3\n"                            \
    "punpcklbw %%mm7, " #in0 "\n"                       \
    "punpcklbw %%mm7, " #in1 "\n"                       \
    "punpckhbw %%mm7, %%mm2\n"                          \
    "punpckhbw %%mm7, %%mm3\n"                          \
    "paddw " #in1 ", " #in0 "\n"                        \
    "paddw %%mm3, %%mm2\n"                              \
    "paddw %%mm2, " #in0 "\n"                           \
    "paddw " #in0 ", %%mm6\n"
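
/*
 * Hedged scalar sketch (not code from this file): each SUM() invocation above
 * adds the absolute differences between one 16-byte row and the previous row
 * into %%mm6, so over a whole block the routine accumulates the "vertical
 * SAD" of a single picture.  Function and parameter names are illustrative.
 */
static int vsad_intra16_scalar_sketch(const uint8_t *pix, int line_size, int h)
{
    int score = 0;

    for (int y = 0; y < h - 1; y++) {        /* compare row y with row y + 1 */
        for (int x = 0; x < 16; x++)
            score += FFABS(pix[x] - pix[x + line_size]);
        pix += line_size;
    }
    return score;
}
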
    "pxor %%mm6, %%mm6\n"
    "pxor %%mm7, %%mm7\n"

    "movq 8(%0), %%mm1\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    "movq %%mm6, %%mm0\n"
    "paddw %%mm6, %%mm0\n"
    "movq %%mm0, %%mm6\n"
    "paddw %%mm6, %%mm0\n"

    : "+r" (pix), "=r" (tmp)
    : "r" ((x86_reg) line_size), "m" (h)
                               int line_size, int h)

#define SUM(in0, in1, out0, out1)                       \
    "movq (%0), " #out0 "\n"                            \
    "movq 8(%0), " #out1 "\n"                           \
    "psadbw " #out0 ", " #in0 "\n"                      \
    "psadbw " #out1 ", " #in1 "\n"                      \
    "paddw " #in1 ", " #in0 "\n"                        \
    "paddw " #in0 ", %%mm6\n"

    "pxor %%mm6, %%mm6\n"
    "pxor %%mm7, %%mm7\n"

    "movq 8(%0), %%mm1\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    : "+r" (pix), "=r" (tmp)
    : "r" ((x86_reg) line_size), "m" (h)
                      int line_size, int h)

#define SUM(in0, in1, out0, out1)                       \
    "movq (%0), %%mm2\n"                                \
    "movq (%1), " #out0 "\n"                            \
    "movq 8(%0), %%mm3\n"                               \
    "movq 8(%1), " #out1 "\n"                           \
    "psubb " #out0 ", %%mm2\n"                          \
    "psubb " #out1 ", %%mm3\n"                          \
    "pxor %%mm7, %%mm2\n"                               \
    "pxor %%mm7, %%mm3\n"                               \
    "movq %%mm2, " #out0 "\n"                           \
    "movq %%mm3, " #out1 "\n"                           \
    "psubusb " #in0 ", %%mm2\n"                         \
    "psubusb " #in1 ", %%mm3\n"                         \
    "psubusb " #out0 ", " #in0 "\n"                     \
    "psubusb " #out1 ", " #in1 "\n"                     \
    "por %%mm2, " #in0 "\n"                             \
    "por %%mm3, " #in1 "\n"                             \
    "movq " #in0 ", %%mm2\n"                            \
    "movq " #in1 ", %%mm3\n"                            \
    "punpcklbw %%mm7, " #in0 "\n"                       \
    "punpcklbw %%mm7, " #in1 "\n"                       \
    "punpckhbw %%mm7, %%mm2\n"                          \
    "punpckhbw %%mm7, %%mm3\n"                          \
    "paddw " #in1 ", " #in0 "\n"                        \
    "paddw %%mm3, %%mm2\n"                              \
    "paddw %%mm2, " #in0 "\n"                           \
    "paddw " #in0 ", %%mm6\n"
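
/*
 * Hedged scalar sketch (not code from this file): here SUM() works on the
 * row-wise differences pix1 - pix2 (re-biased to unsigned via the pxor with
 * %%mm7 set up below), so the routine accumulates the vertical SAD of the
 * difference signal, mirroring the generic scalar metric up to the byte
 * wrap-around of psubb.  Names are illustrative only.
 */
static int vsad16_scalar_sketch(const uint8_t *pix1, const uint8_t *pix2,
                                int line_size, int h)
{
    int score = 0;

    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++)
            score += FFABS((pix1[x] - pix2[x]) -
                           (pix1[x + line_size] - pix2[x + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return score;
}
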
    "pxor %%mm6, %%mm6\n"
    "pcmpeqw %%mm7, %%mm7\n"

    "packsswb %%mm7, %%mm7\n"

    "movq 8(%0), %%mm1\n"
    "movq 8(%1), %%mm3\n"

    "psubb %%mm2, %%mm0\n"
    "psubb %%mm3, %%mm1\n"
    "pxor %%mm7, %%mm0\n"
    "pxor %%mm7, %%mm1\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    "movq %%mm6, %%mm0\n"
    "paddw %%mm6, %%mm0\n"
    "movq %%mm0, %%mm6\n"
    "paddw %%mm6, %%mm0\n"

    : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    : "r" ((x86_reg) line_size), "m" (h)
                          int line_size, int h)

#define SUM(in0, in1, out0, out1)                       \
    "movq (%0), " #out0 "\n"                            \
    "movq (%1), %%mm2\n"                                \
    "movq 8(%0), " #out1 "\n"                           \
    "movq 8(%1), %%mm3\n"                               \
    "psubb %%mm2, " #out0 "\n"                          \
    "psubb %%mm3, " #out1 "\n"                          \
    "pxor %%mm7, " #out0 "\n"                           \
    "pxor %%mm7, " #out1 "\n"                           \
    "psadbw " #out0 ", " #in0 "\n"                      \
    "psadbw " #out1 ", " #in1 "\n"                      \
    "paddw " #in1 ", " #in0 "\n"                        \
    "paddw " #in0 ", %%mm6\n"

    "pxor %%mm6, %%mm6\n"
    "pcmpeqw %%mm7, %%mm7\n"

    "packsswb %%mm7, %%mm7\n"

    "movq 8(%0), %%mm1\n"
    "movq 8(%1), %%mm3\n"

    "psubb %%mm2, %%mm0\n"
    "psubb %%mm3, %%mm1\n"
    "pxor %%mm7, %%mm0\n"
    "pxor %%mm7, %%mm1\n"

    SUM(%%mm4, %%mm5, %%mm0, %%mm1)

    SUM(%%mm0, %%mm1, %%mm4, %%mm5)

    : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    : "r" ((x86_reg) line_size), "m" (h)
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"

        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
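
/*
 * Hedged scalar sketch (not code from this file): the plain 8-wide SAD that
 * the MMX loop above computes two rows at a time, with the per-byte absolute
 * difference built from the psubusb/por pair and widened via punpck*bw.
 * Names are illustrative only.
 */
static int sad8_scalar_sketch(const uint8_t *blk1, const uint8_t *blk2,
                              int stride, int h)
{
    int sum = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += FFABS(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}
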
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"

        : "+r" (h), "+r" (blk1), "+r" (blk2)
        "pxor %%xmm2, %%xmm2            \n\t"

        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"

        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "movd %%xmm2, %3                \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"

        : "+r" (h), "+r" (blk1), "+r" (blk2)
        "movq (%1), %%mm0               \n\t"

        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"

        : "+r" (h), "+r" (blk1), "+r" (blk2)
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"

        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"

        : "+r" (h), "+r" (blk1), "+r" (blk2)
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"

        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              int stride, int h)

        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"

        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "movq %%mm3, %%mm1              \n\t"
        "add %4, %%"REG_a"              \n\t"

        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride), "m" (round_tab[2]));
static inline int sum_mmx(void)

        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"

static inline int sum_mmxext(void)

        "movd %%mm6, %0                 \n\t"
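
/*
 * Hedged scalar sketch (not code from this file): the horizontal reduction
 * done by sum_mmx() above.  The running SAD sits in four 16-bit lanes of
 * %%mm6; two shift-and-add steps fold them into the lowest lane, so the low
 * word of the value read with movd is the lane total modulo 2^16.
 * sum_mmxext() can read %%mm6 directly because psadbw already leaves a
 * single scalar there.
 */
static int sum_lanes_sketch(const uint16_t lanes[4])
{
    return (uint16_t) (lanes[0] + lanes[1] + lanes[2] + lanes[3]);
}
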
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
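
/*
 * Hedged scalar sketch (not code from this file): the x2 ("horizontal
 * half-pel") case evaluated through the wrapper above.  Each pixel of blk1
 * is averaged with its right neighbour using upward rounding, the +1 bias
 * from round_tab[1] in the MMX path and pavgb in the MMXEXT path, before the
 * SAD against blk2.  Names are illustrative only.
 */
static int sad8_x2_scalar_sketch(const uint8_t *blk1, const uint8_t *blk2,
                                 int stride, int h)
{
    int sum = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int avg = (blk1[x] + blk1[x + 1] + 1) >> 1;   /* rounded average */
            sum += FFABS(avg - blk2[x]);
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}
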
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
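
/*
 * Hedged scalar sketch (not code from this file): the xy2 ("half-pel in both
 * directions") case handled by sad8_4_mmx above.  Four neighbouring source
 * pixels are averaged with the +2 bias from round_tab[2] and a shift by two
 * before the SAD against blk2.  Names are illustrative only.
 */
static int sad8_xy2_scalar_sketch(const uint8_t *blk1, const uint8_t *blk2,
                                  int stride, int h)
{
    int sum = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int avg = (blk1[x] + blk1[x + 1] +
                       blk1[x + stride] + blk1[x + stride + 1] + 2) >> 2;
            sum += FFABS(avg - blk2[x]);
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}
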
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
    av_assert2(h == 8);                                                 \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
    return sum_ ## suf();                                               \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
    av_assert2(h == 8);                                                 \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
    return sum_ ## suf();                                               \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
    av_assert2(h == 8);                                                 \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
    return sum_ ## suf();                                               \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
    av_assert2(h == 8);                                                 \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
    return sum_ ## suf();                                               \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
    sad8_1_ ## suf(blk1, blk2, stride, h);                              \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
    return sum_ ## suf();                                               \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
    sad8_x2a_ ## suf(blk1, blk2, stride, h);                            \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
    return sum_ ## suf();                                               \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
    sad8_y2a_ ## suf(blk1, blk2, stride, h);                            \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
    return sum_ ## suf();                                               \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
    sad8_4_ ## suf(blk1, blk2, stride, h);                              \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
    return sum_ ## suf();                                               \
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;

        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        c->vsad[0] = vsad16_mmx;

        c->vsad[4] = vsad_intra16_mmxext;

        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        c->pix_abs[0][1] = sad16_x2_mmxext;
        c->pix_abs[0][2] = sad16_y2_mmxext;
        c->pix_abs[1][1] = sad8_x2_mmxext;
        c->pix_abs[1][2] = sad8_y2_mmxext;

        c->pix_abs[0][3] = sad16_xy2_mmxext;
        c->pix_abs[1][3] = sad8_xy2_mmxext;

        c->vsad[0] = vsad16_mmxext;

        c->sad[0] = sad16_sse2;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
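
/*
 * Hedged usage sketch (illustration only, not code from this file): how a
 * caller is expected to pick one of the comparators installed above.
 * pix_abs[0][*] are the 16x16 variants and pix_abs[1][*] the 8x8 ones; the
 * second index selects full-pel (0), horizontal half-pel (1), vertical
 * half-pel (2) or both (3), with the half-pel interpolation applied to the
 * second pixel argument.  The MECmpContext type name for c is an assumption
 * based on the mecc member referenced earlier in this file.
 */
static int sad16_candidate_sketch(MECmpContext *c, MpegEncContext *s,
                                  uint8_t *cur, uint8_t *ref,
                                  int stride, int hpel_x, int hpel_y)
{
    int type = (hpel_x ? 1 : 0) | (hpel_y ? 2 : 0);   /* 0..3 as above */

    return c->pix_abs[0][type](s, cur, ref, stride, 16);
}
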
#if HAVE_ALIGNED_STACK

#if HAVE_ALIGNED_STACK