{ 0x8000000080000000ULL, 0x8000000080000000ULL };

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))

#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)   \
    "movq  "#rega", "#regr"             \n\t"       \
    "pand  "#regb", "#regr"             \n\t"       \
    "pxor  "#rega", "#regb"             \n\t"       \
    "pand  "#regfe", "#regb"            \n\t"       \
    "psrlq      $1, "#regb"             \n\t"       \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)          \
    "movq  "#rega", "#regr"             \n\t"       \
    "por   "#regb", "#regr"             \n\t"       \
    "pxor  "#rega", "#regb"             \n\t"       \
    "pand  "#regfe", "#regb"            \n\t"       \
    "psrlq      $1, "#regb"             \n\t"       \
    "psubb "#regb", "#regr"             \n\t"

#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
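/* Scalar sketch (not from the original file) of the identity these PAVGB*
 * macros exploit, with regfe/%%mm6 holding 0xFE in every byte: for two
 * bytes a and b,
 *     (a + b)     >> 1  ==  (a & b) + (((a ^ b) & 0xFE) >> 1)
 *     (a + b + 1) >> 1  ==  (a | b) - (((a ^ b) & 0xFE) >> 1)
 * The & 0xFE mask drops the bit that psrlq would otherwise shift across
 * byte-lane boundaries, which is what lets one 64-bit shift average eight
 * bytes at once. */
static inline int pavgb_no_rnd_c(int a, int b)
{
    return (a & b) + (((a ^ b) & 0xFE) >> 1);
}

static inline int pavgb_rnd_c(int a, int b)
{
    return (a | b) - (((a ^ b) & 0xFE) >> 1);
}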
#define DEF(x, y)                x ## _no_rnd_ ## y ## _mmx
#define SET_RND                  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#define DEF(x, y)                x ## _ ## y ## _mmx
#define SET_RND                  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#define DEF(x)  x ## _3dnow
#define PAVGB   "pavgusb"

#define SKIP_FOR_3DNOW

#undef SKIP_FOR_3DNOW

#define DEF(x)  x ## _mmxext

#define PAVGB   "pavgb"

#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
        "movq        (%3), %%mm0        \n\t"
        "movq       8(%3), %%mm1        \n\t"
        "movq      16(%3), %%mm2        \n\t"
        "movq      24(%3), %%mm3        \n\t"
        "movq      32(%3), %%mm4        \n\t"
        "movq      40(%3), %%mm5        \n\t"
        "movq      48(%3), %%mm6        \n\t"
        "movq      56(%3), %%mm7        \n\t"
        "packuswb   %%mm1, %%mm0        \n\t"
        "packuswb   %%mm3, %%mm2        \n\t"
        "packuswb   %%mm5, %%mm4        \n\t"
        "packuswb   %%mm7, %%mm6        \n\t"
        "movq       %%mm0, (%0)         \n\t"
        "movq       %%mm2, (%0, %1)     \n\t"
        "movq       %%mm4, (%0, %1, 2)  \n\t"
        "movq       %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
    pix += line_size * 4;
        "movq        (%3), %%mm0        \n\t"
        "movq       8(%3), %%mm1        \n\t"
        "movq      16(%3), %%mm2        \n\t"
        "movq      24(%3), %%mm3        \n\t"
        "movq      32(%3), %%mm4        \n\t"
        "movq      40(%3), %%mm5        \n\t"
        "movq      48(%3), %%mm6        \n\t"
        "movq      56(%3), %%mm7        \n\t"
        "packuswb   %%mm1, %%mm0        \n\t"
        "packuswb   %%mm3, %%mm2        \n\t"
        "packuswb   %%mm5, %%mm4        \n\t"
        "packuswb   %%mm7, %%mm6        \n\t"
        "movq       %%mm0, (%0)         \n\t"
        "movq       %%mm2, (%0, %1)     \n\t"
        "movq       %%mm4, (%0, %1, 2)  \n\t"
        "movq       %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1    \n\t"               \
    "movq     16 + "#off"(%2), %%mm2    \n\t"               \
    "movq     32 + "#off"(%2), %%mm3    \n\t"               \
    "movq     48 + "#off"(%2), %%mm4    \n\t"               \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"               \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"               \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"               \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"               \
    "paddb              %%mm0, %%mm1    \n\t"               \
    "paddb              %%mm0, %%mm2    \n\t"               \
    "paddb              %%mm0, %%mm3    \n\t"               \
    "paddb              %%mm0, %%mm4    \n\t"               \
    "movq               %%mm1, (%0)         \n\t"           \
    "movq               %%mm2, (%0, %3)     \n\t"           \
    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
    "movq               %%mm4, (%0, %1)     \n\t"
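/* Scalar sketch (not from the original file): %%mm0 holds 0x80 in every
 * byte, so packsswb followed by paddb maps the signed IDCT output onto
 * unsigned pixels, i.e. roughly (assuming 16-bit DCTELEMs):
 *     pixels[i] = av_clip_int8(block[i]) + 128;  */
static inline void put_signed_pixels_clamped_row_c(const int16_t *block,
                                                   uint8_t *pixels, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        int v = block[i];
        if (v < -128) v = -128;
        if (v >  127) v =  127;
        pixels[i] = (uint8_t)(v + 128);
    }
}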
        "lea (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        "movq        (%2), %%mm0    \n\t"
        "movq       8(%2), %%mm1    \n\t"
        "movq      16(%2), %%mm2    \n\t"
        "movq      24(%2), %%mm3    \n\t"
        "movq          %0, %%mm4    \n\t"
        "movq          %1, %%mm6    \n\t"
        "movq       %%mm4, %%mm5    \n\t"
        "punpcklbw  %%mm7, %%mm4    \n\t"
        "punpckhbw  %%mm7, %%mm5    \n\t"
        "paddsw     %%mm4, %%mm0    \n\t"
        "paddsw     %%mm5, %%mm1    \n\t"
        "movq       %%mm6, %%mm5    \n\t"
        "punpcklbw  %%mm7, %%mm6    \n\t"
        "punpckhbw  %%mm7, %%mm5    \n\t"
        "paddsw     %%mm6, %%mm2    \n\t"
        "paddsw     %%mm5, %%mm3    \n\t"
        "packuswb   %%mm1, %%mm0    \n\t"
        "packuswb   %%mm3, %%mm2    \n\t"
        "movq       %%mm0, %0       \n\t"
        "movq       %%mm2, %1       \n\t"
        : "+m"(*pix), "+m"(*(pix + line_size))
    pix += line_size * 2;
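/* Scalar sketch (not from the original file): the block above widens the
 * pixels with punpcklbw/punpckhbw against a zeroed %%mm7, adds the
 * residual with signed saturation and packs back with unsigned
 * saturation, i.e.
 *     pix[i] = av_clip_uint8(pix[i] + block[i]);  */
static inline void add_pixels_clamped_row_c(const int16_t *block,
                                            uint8_t *pix, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        int v = pix[i] + block[i];
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        pix[i] = (uint8_t)v;
    }
}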
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
        "lea   (%3, %3), %%"REG_a"  \n\t"
        "movq  (%1    ), %%mm0      \n\t"
        "movq  (%1, %3), %%mm1      \n\t"
        "movq  %%mm0, (%2)          \n\t"
        "movq  %%mm1, (%2, %3)      \n\t"
        "add   %%"REG_a", %1        \n\t"
        "add   %%"REG_a", %2        \n\t"
        "movq  (%1    ), %%mm0      \n\t"
        "movq  (%1, %3), %%mm1      \n\t"
        "movq  %%mm0, (%2)          \n\t"
        "movq  %%mm1, (%2, %3)      \n\t"
        "add   %%"REG_a", %1        \n\t"
        "add   %%"REG_a", %2        \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
        "lea    (%3, %3), %%"REG_a" \n\t"
        "movq   (%1    ), %%mm0     \n\t"
        "movq  8(%1    ), %%mm4     \n\t"
        "movq   (%1, %3), %%mm1     \n\t"
        "movq  8(%1, %3), %%mm5     \n\t"
        "movq   %%mm0,  (%2)        \n\t"
        "movq   %%mm4, 8(%2)        \n\t"
        "movq   %%mm1,  (%2, %3)    \n\t"
        "movq   %%mm5, 8(%2, %3)    \n\t"
        "add    %%"REG_a", %1       \n\t"
        "add    %%"REG_a", %2       \n\t"
        "movq   (%1    ), %%mm0     \n\t"
        "movq  8(%1    ), %%mm4     \n\t"
        "movq   (%1, %3), %%mm1     \n\t"
        "movq  8(%1, %3), %%mm5     \n\t"
        "movq   %%mm0,  (%2)        \n\t"
        "movq   %%mm4, 8(%2)        \n\t"
        "movq   %%mm1,  (%2, %3)    \n\t"
        "movq   %%mm5, 8(%2, %3)    \n\t"
        "add    %%"REG_a", %1       \n\t"
        "add    %%"REG_a", %2       \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \

CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
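/* Functional sketch (not from the original file; assumes <string.h> and
 * 16-bit DCTELEMs): one DCT block is 64 coefficients == 128 bytes, so the
 * macro above is equivalent to zeroing n consecutive blocks: */
static void clear_blocks_c_sketch(int16_t *blocks, int n)
{
    memset(blocks, 0, n * 64 * sizeof(*blocks));
}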
static void clear_block_sse(DCTELEM *block)
        "xorps  %%xmm0, %%xmm0      \n"
        "movaps %%xmm0,    (%0)     \n"
        "movaps %%xmm0,  16(%0)     \n"
        "movaps %%xmm0,  32(%0)     \n"
        "movaps %%xmm0,  48(%0)     \n"
        "movaps %%xmm0,  64(%0)     \n"
        "movaps %%xmm0,  80(%0)     \n"
        "movaps %%xmm0,  96(%0)     \n"
        "movaps %%xmm0, 112(%0)     \n"
static void clear_blocks_sse(DCTELEM *blocks)
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
        "movq   (%1, %0), %%mm0     \n\t"
        "movq   (%2, %0), %%mm1     \n\t"
        "paddb     %%mm0, %%mm1     \n\t"
        "movq      %%mm1, (%2, %0)  \n\t"
        "movq  8(%1, %0), %%mm0     \n\t"
        "movq  8(%2, %0), %%mm1     \n\t"
        "paddb     %%mm0, %%mm1     \n\t"
        "movq      %%mm1, 8(%2, %0) \n\t"
        dst[i + 0] += src[i + 0];
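/* Scalar reference sketch (assumption: mirrors the C fallback) for the
 * cmov-based median predictor below: each output adds the HuffYUV residual
 * to the median of left, top and the gradient left + top - top_left. */
static void add_hfyu_median_prediction_c_sketch(uint8_t *dst, const uint8_t *top,
                                                const uint8_t *diff, int w,
                                                int *left, int *left_top)
{
    int i, l = *left & 0xff, tl = *left_top & 0xff;
    for (i = 0; i < w; i++) {
        int t  = top[i];
        int lo = l < t ? l : t;
        int hi = l < t ? t : l;
        int p  = l + t - tl;                      /* gradient predictor */
        int m  = p < lo ? lo : (p > hi ? hi : p); /* median of l, t, p  */
        l      = (m + diff[i]) & 0xff;
        tl     = t;
        dst[i] = l;
    }
    *left     = l;
    *left_top = tl;
}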
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
        "movzbl (%3, %4), %2    \n"
        "add    (%6, %4), %b0   \n"
        "mov    %b0, (%5, %4)   \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        "movd      (%1), %%mm0      \n\t"
        "movd      (%1), %%mm1      \n\t"
        "movd (%1,%3,1), %%mm2      \n\t"
        "movd (%1,%3,2), %%mm3      \n\t"
        "punpcklbw %%mm1, %%mm0     \n\t"
        "punpcklbw %%mm3, %%mm2     \n\t"
        "movq      %%mm0, %%mm1     \n\t"
        "punpcklwd %%mm2, %%mm0     \n\t"
        "punpckhwd %%mm2, %%mm1     \n\t"
        "movd      %%mm0, (%0)      \n\t"
        "punpckhdq %%mm0, %%mm0     \n\t"
        "movd      %%mm0, (%0)      \n\t"
        "movd      %%mm1, (%0,%2,1) \n\t"
        "punpckhdq %%mm1, %%mm1     \n\t"
        "movd      %%mm1, (%0,%2,2) \n\t"
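/* Plain C sketch (not from the original file) of the 4x4 byte transpose
 * that the punpcklbw/punpcklwd/punpckhdq ladder above performs: */
static void transpose4x4_c_sketch(uint8_t *dst, const uint8_t *src,
                                  int dst_stride, int src_stride)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            dst[i * dst_stride + j] = src[j * src_stride + i];
}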
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
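/* Scalar sketch of the filter H263_LOOP_FILTER implements (assumption:
 * matches the C reference h263_v_loop_filter_c; p0..p3 are the four pixels
 * straddling the block edge, `strength` is derived from qscale): */
static void h263_loop_filter_c_sketch(uint8_t *src, int step, int strength)
{
    int p0 = src[-2 * step], p1 = src[-1 * step];
    int p2 = src[ 0       ], p3 = src[ 1 * step];
    int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;
    int d1, ad1, d2;

    /* ramp nonlinearity: pass small d through, fold medium d back,
     * leave large d (real edges) untouched */
    if      (d < -2 * strength) d1 = 0;
    else if (d <     -strength) d1 = -2 * strength - d;
    else if (d <      strength) d1 = d;
    else if (d <  2 * strength) d1 = 2 * strength - d;
    else                        d1 = 0;

    p1 += d1;
    p2 -= d1;
    if (p1 < 0) p1 = 0; else if (p1 > 255) p1 = 255;
    if (p2 < 0) p2 = 0; else if (p2 > 255) p2 = 255;

    ad1 = (d1 < 0 ? -d1 : d1) >> 1;
    d2  = (p0 - p3) / 4;
    if (d2 < -ad1) d2 = -ad1;
    if (d2 >  ad1) d2 =  ad1;

    src[-2 * step] = p0 - d2;
    src[-1 * step] = p1;
    src[ 0       ] = p2;
    src[ 1 * step] = p3 + d2;
}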
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        "movq %%mm3, %1             \n\t"
        "movq %%mm4, %2             \n\t"
        "movq %%mm5, %0             \n\t"
        "movq %%mm6, %3             \n\t"
        : "+m"(*(uint64_t*)(src - 2 * stride)),
          "+m"(*(uint64_t*)(src - 1 * stride)),
          "+m"(*(uint64_t*)(src + 0 * stride)),
          "+m"(*(uint64_t*)(src + 1 * stride))

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        transpose4x4(btemp,     src,              8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        "movq      %%mm5, %%mm1         \n\t"
        "movq      %%mm4, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm5         \n\t"
        "punpcklbw %%mm6, %%mm4         \n\t"
        "punpckhbw %%mm3, %%mm1         \n\t"
        "punpckhbw %%mm6, %%mm0         \n\t"
        "movq      %%mm5, %%mm3         \n\t"
        "movq      %%mm1, %%mm6         \n\t"
        "punpcklwd %%mm4, %%mm5         \n\t"
        "punpcklwd %%mm0, %%mm1         \n\t"
        "punpckhwd %%mm4, %%mm3         \n\t"
        "punpckhwd %%mm0, %%mm6         \n\t"
        "movd      %%mm5, (%0)          \n\t"
        "punpckhdq %%mm5, %%mm5         \n\t"
        "movd      %%mm5, (%0, %2)      \n\t"
        "movd      %%mm3, (%0, %2, 2)   \n\t"
        "punpckhdq %%mm3, %%mm3         \n\t"
        "movd      %%mm3, (%0, %3)      \n\t"
        "movd      %%mm1, (%1)          \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd      %%mm1, (%1, %2)      \n\t"
        "movd      %%mm6, (%1, %2, 2)   \n\t"
        "punpckhdq %%mm6, %%mm6         \n\t"
        "movd      %%mm6, (%1, %3)      \n\t"
          "r"(src + 4 * stride),
                           int w, int h, int sides)
    last_line = buf + (height - 1) * wrap;

        "movd            (%0), %%mm0    \n\t"
        "punpcklbw      %%mm0, %%mm0    \n\t"
        "punpcklwd      %%mm0, %%mm0    \n\t"
        "punpckldq      %%mm0, %%mm0    \n\t"
        "movq           %%mm0, -8(%0)   \n\t"
        "movq      -8(%0, %2), %%mm1    \n\t"
        "punpckhbw      %%mm1, %%mm1    \n\t"
        "punpckhwd      %%mm1, %%mm1    \n\t"
        "punpckhdq      %%mm1, %%mm1    \n\t"
        "movq           %%mm1, (%0, %2) \n\t"

        "movd            (%0), %%mm0    \n\t"
        "punpcklbw      %%mm0, %%mm0    \n\t"
        "punpcklwd      %%mm0, %%mm0    \n\t"
        "punpckldq      %%mm0, %%mm0    \n\t"
        "movq           %%mm0, -8(%0)   \n\t"
        "movq           %%mm0, -16(%0)  \n\t"
        "movq      -8(%0, %2), %%mm1    \n\t"
        "punpckhbw      %%mm1, %%mm1    \n\t"
        "punpckhwd      %%mm1, %%mm1    \n\t"
        "punpckhdq      %%mm1, %%mm1    \n\t"
        "movq           %%mm1,  (%0, %2) \n\t"
        "movq           %%mm1, 8(%0, %2) \n\t"

        "movd            (%0), %%mm0    \n\t"
        "punpcklbw      %%mm0, %%mm0    \n\t"
        "punpcklwd      %%mm0, %%mm0    \n\t"
        "movd           %%mm0, -4(%0)   \n\t"
        "movd      -4(%0, %2), %%mm1    \n\t"
        "punpcklbw      %%mm1, %%mm1    \n\t"
        "punpckhwd      %%mm1, %%mm1    \n\t"
        "punpckhdq      %%mm1, %%mm1    \n\t"
        "movd           %%mm1, (%0, %2) \n\t"

        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0       \n\t"
                "movq    %%mm0, (%0)        \n\t"
                "movq    %%mm0, (%0, %2)    \n\t"
                "movq    %%mm0, (%0, %2, 2) \n\t"
                "movq    %%mm0, (%0, %3)    \n\t"
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)

        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0       \n\t"
                "movq    %%mm0, (%0)        \n\t"
                "movq    %%mm0, (%0, %2)    \n\t"
                "movq    %%mm0, (%0, %2, 2) \n\t"
                "movq    %%mm0, (%0, %3)    \n\t"
                  "r"(ptr + width + 2 * w)
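/* Scalar sketch (not from the original file) of what the punpcklbw/
 * punpcklwd/punpckldq splats above do: replicate the first and last pixel
 * of each line into the left/right margins (the two loops above then copy
 * whole edge lines outward for the top/bottom margins). */
static void draw_edges_row_c_sketch(uint8_t *line, int width, int w)
{
    int i;
    for (i = 1; i <= w; i++) {
        line[-i]            = line[0];          /* left margin  */
        line[width - 1 + i] = line[width - 1];  /* right margin */
    }
}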
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,            \
                   in0, in1, in2, in7, out, OP)                 \
    "paddw  "#m4", "#m3"                \n\t"                   \
    "movq  "MANGLE(ff_pw_20)", %%mm4    \n\t"                   \
    "pmullw "#m3", %%mm4                \n\t"                   \
    "movq  "#in7", "#m3"                \n\t"                   \
    "movq  "#in0", %%mm5                \n\t"                   \
    "paddw "#m3", %%mm5                 \n\t"                   \
    "psubw %%mm5, %%mm4                 \n\t"                   \
    "movq  "#in1", %%mm5                \n\t"                   \
    "movq  "#in2", %%mm6                \n\t"                   \
    "paddw "#m6", %%mm5                 \n\t"                   \
    "paddw "#m5", %%mm6                 \n\t"                   \
    "paddw %%mm6, %%mm6                 \n\t"                   \
    "psubw %%mm6, %%mm5                 \n\t"                   \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t"                   \
    "paddw "#rnd", %%mm4                \n\t"                   \
    "paddw %%mm4, %%mm5                 \n\t"                   \
    "psraw    $5, %%mm5                 \n\t"                   \
    "packuswb %%mm5, %%mm5              \n\t"                   \
    OP(%%mm5, out, %%mm7, d)
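/* What one QPEL_V_LOW step computes, in scalar form (sketch, not from the
 * original file): MPEG-4 half-pel interpolation with the 8-tap filter
 * (-1, 3, -6, 20, 20, -6, 3, -1), a rounder and >> 5.  For the horizontal
 * case, with rnd == 16 (rounding, ff_pw_16) or 15 (no-rounding, ff_pw_15)
 * and edge samples mirrored at block borders: */
static inline int mpeg4_qpel_tap_c(const uint8_t *s, int rnd)
{
    int v = 20 * (s[0]  + s[1]) - 6 * (s[-1] + s[2])
          +  3 * (s[-2] + s[3]) -     (s[-3] + s[4]);
    v = (v + rnd) >> 5;
    return v < 0 ? 0 : (v > 255 ? 255 : v);
}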
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                        \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,         \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq      %%mm0, %%mm1             \n\t"                         \
        "movq      %%mm0, %%mm2             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpckhbw %%mm7, %%mm1             \n\t"                         \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t"                         \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t"                         \
        "movq      %%mm2, %%mm3             \n\t"                         \
        "movq      %%mm2, %%mm4             \n\t"                         \
        "psllq        $8, %%mm2             \n\t"                         \
        "psllq       $16, %%mm3             \n\t"                         \
        "psllq       $24, %%mm4             \n\t"                         \
        "punpckhbw %%mm7, %%mm2             \n\t"                         \
        "punpckhbw %%mm7, %%mm3             \n\t"                         \
        "punpckhbw %%mm7, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm5             \n\t"                         \
        "paddw     %%mm2, %%mm6             \n\t"                         \
        "paddw     %%mm5, %%mm5             \n\t"                         \
        "psubw     %%mm5, %%mm6             \n\t"                         \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t"                         \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t"                         \
        "paddw     %%mm4, %%mm0             \n\t"                         \
        "paddw     %%mm1, %%mm5             \n\t"                         \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t"                         \
        "psubw     %%mm5, %%mm0             \n\t"                         \
        "paddw        %6, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t"                         \
        "psraw        $5, %%mm0             \n\t"                         \
        "movq      %%mm0, %5                \n\t"                         \
        "movq      5(%0), %%mm0             \n\t"                         \
        "movq      %%mm0, %%mm5             \n\t"                         \
        "movq      %%mm0, %%mm6             \n\t"                         \
        "psrlq        $8, %%mm0             \n\t"                         \
        "psrlq       $16, %%mm5             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpcklbw %%mm7, %%mm5             \n\t"                         \
        "paddw     %%mm0, %%mm2             \n\t"                         \
        "paddw     %%mm5, %%mm3             \n\t"                         \
        "paddw     %%mm2, %%mm2             \n\t"                         \
        "psubw     %%mm2, %%mm3             \n\t"                         \
        "movq      %%mm6, %%mm2             \n\t"                         \
        "psrlq       $24, %%mm6             \n\t"                         \
        "punpcklbw %%mm7, %%mm2             \n\t"                         \
        "punpcklbw %%mm7, %%mm6             \n\t"                         \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t"                         \
        "paddw     %%mm2, %%mm1             \n\t"                         \
        "paddw     %%mm6, %%mm4             \n\t"                         \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t"                         \
        "psubw     %%mm4, %%mm3             \n\t"                         \
        "paddw        %6, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t"                         \
        "psraw        $5, %%mm3             \n\t"                         \
        "movq         %5, %%mm1             \n\t"                         \
        "packuswb  %%mm3, %%mm1             \n\t"                         \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                  \
        "movq      9(%0), %%mm1             \n\t"                         \
        "movq      %%mm1, %%mm4             \n\t"                         \
        "movq      %%mm1, %%mm3             \n\t"                         \
        "psrlq        $8, %%mm1             \n\t"                         \
        "psrlq       $16, %%mm4             \n\t"                         \
        "punpcklbw %%mm7, %%mm1             \n\t"                         \
        "punpcklbw %%mm7, %%mm4             \n\t"                         \
        "paddw     %%mm1, %%mm5             \n\t"                         \
        "paddw     %%mm4, %%mm0             \n\t"                         \
        "paddw     %%mm5, %%mm5             \n\t"                         \
        "psubw     %%mm5, %%mm0             \n\t"                         \
        "movq      %%mm3, %%mm5             \n\t"                         \
        "psrlq       $24, %%mm3             \n\t"                         \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t"                         \
        "punpcklbw %%mm7, %%mm3             \n\t"                         \
        "paddw     %%mm3, %%mm2             \n\t"                         \
        "psubw     %%mm2, %%mm0             \n\t"                         \
        "movq      %%mm5, %%mm2             \n\t"                         \
        "punpcklbw %%mm7, %%mm2             \n\t"                         \
        "punpckhbw %%mm7, %%mm5             \n\t"                         \
        "paddw     %%mm2, %%mm6             \n\t"                         \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t"                         \
        "paddw        %6, %%mm0             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t"                         \
        "psraw        $5, %%mm0             \n\t"                         \
        "paddw     %%mm5, %%mm3             \n\t"                         \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t"                         \
        "paddw     %%mm4, %%mm6             \n\t"                         \
        "pshufw    $0xBE, %%mm5, %%mm4      \n\t"                         \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t"                         \
        "paddw     %%mm1, %%mm4             \n\t"                         \
        "paddw     %%mm2, %%mm5             \n\t"                         \
        "paddw     %%mm6, %%mm6             \n\t"                         \
        "psubw     %%mm6, %%mm4             \n\t"                         \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t"                         \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t"                         \
        "psubw     %%mm5, %%mm3             \n\t"                         \
        "paddw        %6, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm4             \n\t"                         \
        "psraw        $5, %%mm4             \n\t"                         \
        "packuswb  %%mm4, %%mm0             \n\t"                         \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                 \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          "m"(temp), "m"(ROUNDER)                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,          \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq      %%mm0, %%mm1             \n\t"                         \
        "movq      %%mm0, %%mm2             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpckhbw %%mm7, %%mm1             \n\t"                         \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t"                         \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t"                         \
        "movq      %%mm2, %%mm3             \n\t"                         \
        "movq      %%mm2, %%mm4             \n\t"                         \
        "psllq        $8, %%mm2             \n\t"                         \
        "psllq       $16, %%mm3             \n\t"                         \
        "psllq       $24, %%mm4             \n\t"                         \
        "punpckhbw %%mm7, %%mm2             \n\t"                         \
        "punpckhbw %%mm7, %%mm3             \n\t"                         \
        "punpckhbw %%mm7, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm5             \n\t"                         \
        "paddw     %%mm2, %%mm6             \n\t"                         \
        "paddw     %%mm5, %%mm5             \n\t"                         \
        "psubw     %%mm5, %%mm6             \n\t"                         \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t"                         \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t"                         \
        "paddw     %%mm4, %%mm0             \n\t"                         \
        "paddw     %%mm1, %%mm5             \n\t"                         \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t"                         \
        "psubw     %%mm5, %%mm0             \n\t"                         \
        "paddw        %5, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t"                         \
        "psraw        $5, %%mm0             \n\t"                         \
        "movd      5(%0), %%mm5             \n\t"                         \
        "punpcklbw %%mm7, %%mm5             \n\t"                         \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t"                         \
        "paddw     %%mm5, %%mm1             \n\t"                         \
        "paddw     %%mm6, %%mm2             \n\t"                         \
        "pshufw    $0xBE, %%mm5, %%mm6      \n\t"                         \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t"                         \
        "paddw     %%mm6, %%mm3             \n\t"                         \
        "paddw     %%mm5, %%mm4             \n\t"                         \
        "paddw     %%mm2, %%mm2             \n\t"                         \
        "psubw     %%mm2, %%mm3             \n\t"                         \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t"                         \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t"                         \
        "psubw     %%mm4, %%mm3             \n\t"                         \
        "paddw        %5, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t"                         \
        "psraw        $5, %%mm3             \n\t"                         \
        "packuswb  %%mm3, %%mm0             \n\t"                         \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                  \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                            \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,        \
    uint64_t temp[17 * 4];                                                \
    uint64_t *temp_ptr = temp;                                            \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq       (%0), %%mm1             \n\t"                         \
        "movq      8(%0), %%mm2             \n\t"                         \
        "movq      8(%0), %%mm3             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpckhbw %%mm7, %%mm1             \n\t"                         \
        "punpcklbw %%mm7, %%mm2             \n\t"                         \
        "punpckhbw %%mm7, %%mm3             \n\t"                         \
        "movq      %%mm0, (%1)              \n\t"                         \
        "movq      %%mm1, 17 * 8(%1)        \n\t"                         \
        "movq      %%mm2, 2 * 17 * 8(%1)    \n\t"                         \
        "movq      %%mm3, 3 * 17 * 8(%1)    \n\t"                         \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
    __asm__ volatile (                                                    \
        "movq     (%0), %%mm0               \n\t"                         \
        "movq    8(%0), %%mm1               \n\t"                         \
        "movq   16(%0), %%mm2               \n\t"                         \
        "movq   24(%0), %%mm3               \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0),  32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0),  40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0),  48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0),  56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0),  64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0),  72(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0),  80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0),  88(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1),    OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0                       \n\t"                         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          "g"(4 - 14 * (x86_reg)dstStride)                                \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,         \
    uint64_t temp[9 * 2];                                                 \
    uint64_t *temp_ptr = temp;                                            \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq       (%0), %%mm1             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpckhbw %%mm7, %%mm1             \n\t"                         \
        "movq      %%mm0, (%1)              \n\t"                         \
        "movq      %%mm1, 9*8(%1)           \n\t"                         \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
    __asm__ volatile (                                                    \
        "movq     (%0), %%mm0               \n\t"                         \
        "movq    8(%0), %%mm1               \n\t"                         \
        "movq   16(%0), %%mm2               \n\t"                         \
        "movq   24(%0), %%mm3               \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0                        \n\t"                         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          "g"(4 - 6 * (x86_reg)dstStride)                                 \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,     \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                       \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,      \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,             \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,              \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,      \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);    \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,         \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,    \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                     \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,     \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                    \
                                             stride, stride, 16);         \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                     \
                                  stride, stride, 16);                    \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,     \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);   \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,                \
                                  stride, stride, 16);                    \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size)         \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"
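/* Scalar meaning of the two OPs (sketch, not from the original file):
 * PUT stores the filtered value; AVG rounds it together with what is
 * already in dst, which is exactly pavgb's per-byte semantics. */
static inline void put_op_c(uint8_t *dst, int v) { *dst = v; }
static inline void avg_op_c(uint8_t *dst, int v) { *dst = (*dst + v + 1) >> 1; }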
QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                           \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                               \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                               \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                                  \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =       \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                   \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =       \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                              \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =       \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                              \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,         \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,         \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,                 \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                   \

QPEL_2TAP(put_, 16, mmxext)
QPEL_2TAP(avg_, 16, mmxext)
QPEL_2TAP(put_,  8, mmxext)
QPEL_2TAP(avg_,  8, mmxext)
    put_pixels8_xy2_mmx(dst, src, stride, 8);
    put_pixels16_xy2_mmx(dst, src, stride, 16);
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
    avg_pixels16_xy2_mmx(dst, src, stride, 16);

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);

                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height,
                emulated_edge_mc_func *emu_edge_fn)
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    const int oxs  = ox >> 4;
    const int oys  = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r,    r,    r,    r    };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };

#define MAX_STRIDE 4096U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu  = (unsigned)ix >= width  - w ||
                    (unsigned)iy >= height - h;

        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);

        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        "movd         %0, %%mm6     \n\t"
        "pxor      %%mm7, %%mm7     \n\t"
        "punpcklwd %%mm6, %%mm6     \n\t"
        "punpcklwd %%mm6, %%mm6     \n\t"

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)

                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t"
                "pmullw    %%mm5, %%mm2 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t"
                "pmullw    %%mm4, %%mm2 \n\t"

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
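/* Per-pixel sketch (not from the original file) of the bilinear blend the
 * gmc MMX block performs: after psrlw $12 the registers hold 4-bit
 * fractional positions dx, dy with s == 16 steps per sample, r4 supplies
 * the rounding term, and the final psrlw shift is assumed here to account
 * for the 2 * 4 fractional bits: */
static inline int gmc_bilinear_c_sketch(const uint8_t *src, int stride,
                                        int dx, int dy, int r)
{
    const int s = 16;
    int v = (s - dx) * (s - dy) * src[0]      + dx * (s - dy) * src[1] +
            (s - dx) * dy       * src[stride] + dx * dy       * src[stride + 1];
    v = (v + r) >> 8;
    return v > 255 ? 255 : v;   /* packuswb saturates the store */
}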
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);

                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);

                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);

                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);

                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                             (uint8_t *dst, uint8_t *src,               \
                              int stride, int h, int x, int y);
    put_pixels8_mmx(dst, src, stride, 8);
    avg_pixels8_mmx(dst, src, stride, 8);
    put_pixels16_mmx(dst, src, stride, 16);
    avg_pixels16_mmx(dst, src, stride, 16);

                                int stride, int rnd)
    put_pixels8_mmx(dst, src, stride, 8);

                                int stride, int rnd)
    avg_pixels8_mmxext(dst, src, stride, 8);

        "movq  (%1), %%mm0          \n\t"
        "movq  (%2), %%mm1          \n\t"
        "movq  (%1,%4), %%mm2       \n\t"
        "movq  (%2,%4), %%mm3       \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq  %%mm4, (%3)          \n\t"
        "movq  %%mm5, (%3,%4)       \n\t"

        "movq  (%1,%4,2), %%mm0     \n\t"
        "movq  (%2,%4,2), %%mm1     \n\t"
        "movq  (%1,%5), %%mm2       \n\t"
        "movq  (%2,%5), %%mm3       \n\t"
        "lea   (%1,%4,4), %1        \n\t"
        "lea   (%2,%4,4), %2        \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq  %%mm4, (%3,%4,2)     \n\t"
        "movq  %%mm5, (%3,%5)       \n\t"
        "lea   (%3,%4,4), %3        \n\t"
        : "+r"(h), "+r"(a), "+r"(b), "+r"(dst)

static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a,
                                          const uint8_t *b, int stride, int h)
    put_vp_no_rnd_pixels8_l2_mmx(dst,     a,     b,     stride, h);
    put_vp_no_rnd_pixels8_l2_mmx(dst + 8, a + 8, b + 8, stride, h);
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)                                                                      \
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)  \
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);                                               \
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);                                              \
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels16_ ## EXT(dst,      src[0],      stride, h);                                    \
    OPNAME ## _pixels16_ ## EXT(dst + 16, src[0] + 16, stride, h);                                    \

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmxext)

    ff_put_pixels16_sse2(dst, src[0], stride, h);
    ff_avg_pixels16_sse2(dst, src[0], stride, h);
    ff_put_pixels16_sse2(dst,      src[0],      stride, h);
    ff_put_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
    ff_avg_pixels16_sse2(dst,      src[0],      stride, h);
    ff_avg_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
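/* Scalar reference sketch (assumption: matches the Vorbis decoder's C
 * fallback) for the two vorbis_inverse_coupling implementations below:
 * magnitude/angle pairs are rotated back into the two channels, with the
 * sign of mag selecting the add/subtract direction. */
static void vorbis_inverse_coupling_c_sketch(float *mag, float *ang, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        float m = mag[i], a = ang[i];
        if (m > 0.0f) {
            if (a > 0.0f) { ang[i] = m - a; }
            else          { ang[i] = m; mag[i] = m + a; }
        } else {
            if (a > 0.0f) { ang[i] = m + a; }
            else          { ang[i] = m; mag[i] = m - a; }
        }
    }
}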
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
            "movq       %0, %%mm0   \n\t"
            "movq       %1, %%mm1   \n\t"
            "movq    %%mm0, %%mm2   \n\t"
            "movq    %%mm1, %%mm3   \n\t"
            "pfcmpge %%mm7, %%mm2   \n\t"
            "pfcmpge %%mm7, %%mm3   \n\t"
            "pslld     $31, %%mm2   \n\t"
            "pxor    %%mm2, %%mm1   \n\t"
            "movq    %%mm3, %%mm4   \n\t"
            "pand    %%mm1, %%mm3   \n\t"
            "pandn   %%mm1, %%mm4   \n\t"
            "pfadd   %%mm0, %%mm3   \n\t"
            "pfsub   %%mm4, %%mm0   \n\t"
            "movq    %%mm3, %1      \n\t"
            "movq    %%mm0, %0      \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
    __asm__ volatile ("femms");
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
        "movaps %0, %%xmm5          \n\t"
    for (i = 0; i < blocksize; i += 4) {
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t"
            "cmpleps %%xmm1, %%xmm3 \n\t"
            "andps   %%xmm5, %%xmm2 \n\t"
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t"
            "subps   %%xmm4, %%xmm0 \n\t"
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
        "pswapd (%5, %1), %%mm1     \n"
        "movq   (%5, %0), %%mm0     \n"
        "pswapd (%4, %1), %%mm5     \n"
        "movq   (%3, %0), %%mm4     \n"
        "movq      %%mm0, %%mm2     \n"
        "movq      %%mm1, %%mm3     \n"
        "pfmul     %%mm4, %%mm2     \n"
        "pfmul     %%mm5, %%mm3     \n"
        "pfmul     %%mm4, %%mm1     \n"
        "pfmul     %%mm5, %%mm0     \n"
        "pfadd     %%mm3, %%mm2     \n"
        "pfsub     %%mm0, %%mm1     \n"
        "pswapd    %%mm2, %%mm2     \n"
        "movq      %%mm1, (%2, %0)  \n"
        "movq      %%mm2, (%2, %1)  \n"
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win,
                                   int len)
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n"
        "mulps         %%xmm5, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm1   \n"
        "mulps         %%xmm5, %%xmm0   \n"
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t"
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        : "r"(dst), "r"(src), "m"(min), "m"(max)
                                      int order, int mul);
                                      int order, int mul);
                                      int order, int mul);

                               const int16_t *window, unsigned int len);
                               const int16_t *window, unsigned int len);
                               const int16_t *window, unsigned int len);
                               const int16_t *window, unsigned int len);
                               const int16_t *window, unsigned int len);
                               const int16_t *window, unsigned int len);

                                          int *left, int *left_top);

                                   const float *src1, int len);
                                   const float *src1, int len);
                                  const float *src2, int len);
                                  const float *src2, int len);

                                    const float *src1, int len);
                                    const float *src1, int len);
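/* Scalar reference sketches (assumption: mirror the C fallbacks) for the
 * two float vector routines above.  vector_fmul_window_* walks the window
 * symmetrically from both ends, which is what the pswapd/shufps $0x1b
 * reversals implement: */
static void vector_fmul_window_c_sketch(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}

/* ... and vector_clipf_sse is four unrolled maxps/minps passes over: */
static void vector_clipf_c_sketch(float *dst, const float *src,
                                  float min, float max, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float v = src[i];
        dst[i] = v < min ? min : (v > max ? max : v);
    }
}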
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                           \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU;  \

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                   \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _    ## CPU;    \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;    \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;    \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;   \

#define H264_QPEL_FUNCS(x, y, CPU)                                                        \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU;  \

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                           \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
    if (!high_bit_depth) {

#if ARCH_X86_32 || !HAVE_YASM

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {

    if (!high_bit_depth && CONFIG_H264CHROMA) {

    const int high_bit_depth = bit_depth > 8;

    if (!high_bit_depth) {

    if (!high_bit_depth) {

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_H264QPEL) {
        if (!high_bit_depth) {
        } else if (bit_depth == 10) {

    if (!high_bit_depth && CONFIG_H264CHROMA) {

    if (bit_depth == 10 && CONFIG_H264CHROMA) {

    if (!high_bit_depth) {

    if (!high_bit_depth && CONFIG_H264CHROMA) {

#if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS

    if (!high_bit_depth) {

#if HAVE_INLINE_ASM && CONFIG_VIDEODSP

    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE

#if HAVE_SSE2_EXTERNAL

    if (!high_bit_depth) {

    if (CONFIG_H264QPEL)

    if (!high_bit_depth && CONFIG_H264QPEL) {

    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {

        if (CONFIG_H264CHROMA) {

    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {

#if HAVE_SSSE3_EXTERNAL

    if (!high_bit_depth && CONFIG_H264QPEL) {

    if (bit_depth == 10 && CONFIG_H264QPEL) {

    if (!high_bit_depth && CONFIG_H264CHROMA) {

#if HAVE_SSE4_EXTERNAL

#if HAVE_AVX_EXTERNAL

    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {

        if (CONFIG_H264CHROMA) {

#if HAVE_7REGS && HAVE_INLINE_ASM

        c->idct_put = ff_libmpeg2mmx2_idct_put;
        c->idct_add = ff_libmpeg2mmx2_idct_add;

        c->idct_put = ff_libmpeg2mmx_idct_put;
        c->idct_add = ff_libmpeg2mmx_idct_add;

    if (CONFIG_ENCODERS)