#if HAVE_6REGS && HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
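/**
 * Add the rounding bias held in %%mm7 to the 16-bit accumulators
 * %%mm3/%%mm4 and shift them down by SHIFT (a string such as "$7" or "%6").
 */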
#define NORMALIZE_MMX(SHIFT)                    \
     "paddw     %%mm7, %%mm3           \n\t"    \
     "paddw     %%mm7, %%mm4           \n\t"    \
     "psraw     "SHIFT", %%mm3         \n\t"    \
     "psraw     "SHIFT", %%mm4         \n\t"
#define TRANSFER_DO_PACK(OP)                    \
     "packuswb  %%mm4, %%mm3           \n\t"    \
     OP((%2), %%mm3)                            \
     "movq      %%mm3, (%2)            \n\t"
#define TRANSFER_DONT_PACK(OP)                  \
     OP(0(%2), %%mm3)                           \
     OP(8(%2), %%mm4)                           \
     "movq      %%mm3, 0(%2)           \n\t"    \
     "movq      %%mm4, 8(%2)           \n\t"
#define DO_UNPACK(reg)  "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)
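/**
 * Broadcast the 16-bit rounder ROUND to all four words of %%mm7;
 * e.g. a rounder of 8 yields the vector {8, 8, 8, 8}.
 */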
#define LOAD_ROUNDER_MMX(ROUND)                 \
     "movd      "ROUND", %%mm7         \n\t"    \
     "punpcklwd %%mm7, %%mm7           \n\t"    \
     "punpckldq %%mm7, %%mm7           \n\t"
#define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
    "paddw     %%mm"#R2", %%mm"#R1"    \n\t"    \
    "movd      (%0,%3), %%mm"#R0"      \n\t"    \
    "pmullw    %%mm6, %%mm"#R1"        \n\t"    \
    "punpcklbw %%mm0, %%mm"#R0"        \n\t"    \
    "movd      (%0,%2), %%mm"#R3"      \n\t"    \
    "psubw     %%mm"#R0", %%mm"#R1"    \n\t"    \
    "punpcklbw %%mm0, %%mm"#R3"        \n\t"    \
    "paddw     %%mm7, %%mm"#R1"        \n\t"    \
    "psubw     %%mm"#R3", %%mm"#R1"    \n\t"    \
    "psraw     %4, %%mm"#R1"           \n\t"    \
    "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
    "add       %2, %0                  \n\t"
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov       $3, %%"REG_c"           \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1:                                \n\t"
93 "movd (%0), %%mm2 \n\t"
95 "movd (%0), %%mm3 \n\t"
96 "punpcklbw %%mm0, %%mm2 \n\t"
97 "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
110 : "+
r"(src), "+
r"(dst)
111 : "
r"(stride), "
r"(-2*stride),
112 "
m"(shift), "
m"(rnd), "
r"(9*stride-4)
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1:                                \n\t"\
135 "movq 2*0+0(%1), %%mm1 \n\t"\
136 "movq 2*0+8(%1), %%mm2 \n\t"\
137 "movq 2*1+0(%1), %%mm3 \n\t"\
138 "movq 2*1+8(%1), %%mm4 \n\t"\
139 "paddw 2*3+0(%1), %%mm1 \n\t"\
140 "paddw 2*3+8(%1), %%mm2 \n\t"\
141 "paddw 2*2+0(%1), %%mm3 \n\t"\
142 "paddw 2*2+8(%1), %%mm4 \n\t"\
143 "pmullw %%mm5, %%mm3 \n\t"\
144 "pmullw %%mm5, %%mm4 \n\t"\
145 "psubw %%mm1, %%mm3 \n\t"\
146 "psubw %%mm2, %%mm4 \n\t"\
149 "paddw %%mm6, %%mm3 \n\t"\
150 "paddw %%mm6, %%mm4 \n\t"\
151 TRANSFER_DO_PACK(OP)\
156 : "+r"(h), "+r" (src), "+r" (dst)\
157 : "r"(stride), "m"(rnd)\
158 NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
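/**
 * Purely vertical or horizontal half-pel shift (single-pass case);
 * `offset` selects the direction: stride for vertical, 1 for horizontal.
 */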
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov       $8, %%"REG_c"           \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1:                                \n\t"\
181 "movd 0(%0 ), %%mm3 \n\t"\
182 "movd 4(%0 ), %%mm4 \n\t"\
183 "movd 0(%0,%2), %%mm1 \n\t"\
184 "movd 4(%0,%2), %%mm2 \n\t"\
186 "punpcklbw %%mm0, %%mm3 \n\t"\
187 "punpcklbw %%mm0, %%mm4 \n\t"\
188 "punpcklbw %%mm0, %%mm1 \n\t"\
189 "punpcklbw %%mm0, %%mm2 \n\t"\
190 "paddw %%mm1, %%mm3 \n\t"\
191 "paddw %%mm2, %%mm4 \n\t"\
192 "movd 0(%0,%3), %%mm1 \n\t"\
193 "movd 4(%0,%3), %%mm2 \n\t"\
194 "pmullw %%mm6, %%mm3 \n\t" \
195 "pmullw %%mm6, %%mm4 \n\t" \
196 "punpcklbw %%mm0, %%mm1 \n\t"\
197 "punpcklbw %%mm0, %%mm2 \n\t"\
198 "psubw %%mm1, %%mm3 \n\t" \
199 "psubw %%mm2, %%mm4 \n\t" \
200 "movd 0(%0,%2), %%mm1 \n\t"\
201 "movd 4(%0,%2), %%mm2 \n\t"\
202 "punpcklbw %%mm0, %%mm1 \n\t"\
203 "punpcklbw %%mm0, %%mm2 \n\t"\
204 "psubw %%mm1, %%mm3 \n\t" \
205 "psubw %%mm2, %%mm4 \n\t" \
207 "packuswb %%mm4, %%mm3 \n\t"\
209 "movq %%mm3, (%1) \n\t"\
212 "dec %%"REG_c" \n\t"\
214 : "+r"(src), "+r"(dst)\
215 : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
217 NAMED_CONSTRAINTS_ADD(ff_pw_9)\
218 : "%"REG_c, "memory"\
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
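/**
 * Core of the quarter/three-quarter-pel bicubic interpolation: accumulate
 * the four taps of the (-4,53,18,-3)/64 kernel into %%mm3/%%mm4 (tap order
 * depends on the A1..A4 addressing expressions). UNPACK widens packed
 * bytes to words (DO_UNPACK) or is a no-op for 16-bit input (DONT_UNPACK);
 * MOVQ is "movd 1" for byte data or "movq 2" for word data, the digit
 * being the sample size multiplied into the "*0+"/"*4+" offsets.
 */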
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)       \
     MOVQ "*0+"A1", %%mm1       \n\t"                           \
     MOVQ "*4+"A1", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t"                   \
     "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"                   \
     MOVQ "*0+"A2", %%mm3       \n\t"                           \
     MOVQ "*4+"A2", %%mm4       \n\t"                           \
     UNPACK("%%mm3")                                            \
     UNPACK("%%mm4")                                            \
     "pmullw    %%mm6, %%mm3    \n\t" /* *18 */                 \
     "pmullw    %%mm6, %%mm4    \n\t" /* *18 */                 \
     "psubw     %%mm1, %%mm3    \n\t" /* 18,-3 */               \
     "psubw     %%mm2, %%mm4    \n\t" /* 18,-3 */               \
     MOVQ "*0+"A4", %%mm1       \n\t"                           \
     MOVQ "*4+"A4", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "psllw     $2, %%mm1       \n\t" /* 4* */                  \
     "psllw     $2, %%mm2       \n\t" /* 4* */                  \
     "psubw     %%mm1, %%mm3    \n\t" /* -4,18,-3 */            \
     "psubw     %%mm2, %%mm4    \n\t" /* -4,18,-3 */            \
     MOVQ "*0+"A3", %%mm1       \n\t"                           \
     MOVQ "*4+"A3", %%mm2       \n\t"                           \
     UNPACK("%%mm1")                                            \
     UNPACK("%%mm2")                                            \
     "pmullw    %%mm5, %%mm1    \n\t" /* *53 */                 \
     "pmullw    %%mm5, %%mm2    \n\t" /* *53 */                 \
     "paddw     %%mm1, %%mm3    \n\t" /* 4,53,18,-3 */          \
     "paddw     %%mm2, %%mm4    \n\t" /* 4,53,18,-3 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                    \
static void                                                             \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
                                 x86_reg src_stride,                    \
                                 int rnd, int64_t shift)                \
{                                                                       \
    int h = 8;                                                          \
    src -= src_stride;                                                  \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMX("%5")                                          \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"                       \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"                       \
        "1:                        \n\t"                                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd  1", A1, A2, A3, A4)       \
        NORMALIZE_MMX("%6")                                             \
        TRANSFER_DONT_PACK(OP_PUT)                                      \
293 "movd 8+"A1", %%mm1 \n\t" \
295 "movq %%mm1, %%mm3 \n\t" \
296 "paddw %%mm1, %%mm1 \n\t" \
297 "paddw %%mm3, %%mm1 \n\t" \
298 "movd 8+"A2", %%mm3 \n\t" \
300 "pmullw %%mm6, %%mm3 \n\t" \
301 "psubw %%mm1, %%mm3 \n\t" \
302 "movd 8+"A3", %%mm1 \n\t" \
304 "pmullw %%mm5, %%mm1 \n\t" \
305 "paddw %%mm1, %%mm3 \n\t" \
306 "movd 8+"A4", %%mm1 \n\t" \
308 "psllw $2, %%mm1 \n\t" \
309 "psubw %%mm1, %%mm3 \n\t" \
310 "paddw %%mm7, %%mm3 \n\t" \
311 "psraw %6, %%mm3 \n\t" \
312 "movq %%mm3, 16(%2) \n\t" \
317 : "+r"(h), "+r" (src), "+r" (dst) \
318 : "r"(src_stride), "r"(3*src_stride), \
319 "m"(rnd), "m"(shift) \
320 NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
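/**
 * Build the horizontal 16-bit second pass OPNAME##vc1_hor_16b_shiftN_mmx.
 * Input is already 16 bits, so A1..A4 are plain "2*k(%1)" offsets and the
 * bias added by the vertical pass is removed with ff_pw_128 before packing.
 */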
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)        \
static void                                                             \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
                                       const int16_t *src, int rnd)     \
{                                                                       \
    int h = 8;                                                          \
    src -= 1;                                                           \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                        \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMX("%4")                                          \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"                    \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"                    \
        "1:                        \n\t"                                \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4)      \
        NORMALIZE_MMX("$7")                                             \
        /* Remove bias */                                               \
        "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t"                    \
        "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t"                    \
        TRANSFER_DO_PACK(OP)                                            \
        "add       $24, %1         \n\t"                                \
        "add       %3, %2          \n\t"                                \
        "decl      %0              \n\t"                                \
        "jnz 1b                    \n\t"                                \
356 : "+r"(h), "+r" (src), "+r" (dst) \
357 : "r"(stride), "m"(rnd) \
358 NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
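/**
 * Build the single-pass 8-bit version OPNAME##vc1_shiftN_mmx, used when
 * only one direction needs the bicubic filter; A1..A4 address the taps
 * through %3 (offset) and %4 (3*offset), with offset = stride or 1.
 */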
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)             \
static void                                                             \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
                              x86_reg stride, int rnd, x86_reg offset)  \
{                                                                       \
    int h = 8;                                                          \
    src -= offset;                                                      \
    rnd = 32-rnd;                                                       \
    __asm__ volatile (                                                  \
        LOAD_ROUNDER_MMX("%6")                                          \
        "movq      "MANGLE(ff_pw_53)", %%mm5       \n\t"                \
        "movq      "MANGLE(ff_pw_18)", %%mm6       \n\t"                \
        "1:                        \n\t"                                \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd  1", A1, A2, A3, A4)       \
        NORMALIZE_MMX("$6")                                             \
        TRANSFER_DO_PACK(OP)                                            \
392 : "+r"(h), "+r" (src), "+r" (dst) \
393 : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
394 NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
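/* 1/4 shift bicubic interpolation */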
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
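/* 3/4 shift bicubic interpolation */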
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src,
                                               x86_reg src_stride,
                                               int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride,
                                               const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src,
                                          x86_reg stride, int rnd,
                                          x86_reg offset);
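/**
 * Interpolate fractional-pel values by applying the proper vertical then
 * horizontal filter; hmode/vmode select no shift (0), 1/4, 1/2 or 3/4.
 * The two-pass case goes through a 12x8 array of 16-bit intermediates,
 * and the _16 variant below tiles a 16x16 block from four 8x8 calls.
 */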
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0         \n\t"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int    r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
        } else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
        }\
    } else { /* No vertical filter: output just a horizontal one */\
        vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
    }\
}\
\
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd)\
{ \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)
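/** Declare the put/avg 8x8 and 16x16 wrappers for one (hmode, vmode) pair */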
#define DECLARE_FUNCTION(a, b)                                          \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst,            \
                                               const uint8_t *src,      \
                                               ptrdiff_t stride, int rnd) \
{                                                                       \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}                                                                       \
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,         \
                                                  const uint8_t *src,   \
                                                  ptrdiff_t stride, int rnd) \
{                                                                       \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
}                                                                       \
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,         \
                                                  const uint8_t *src,   \
                                                  ptrdiff_t stride, int rnd) \
{                                                                       \
     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
}                                                                       \
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,      \
                                                     const uint8_t *src,\
                                                     ptrdiff_t stride, int rnd) \
{                                                                       \
     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
}
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
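/*
 * DC-only inverse transforms: the two scaling steps fold the row and
 * column transform gains into the DC coefficient (for the 4x4 case
 * dc' = (17 * ((17 * dc + 4) >> 3) + 64) >> 7, roughly dc * 289/1024),
 * then %%mm0/%%mm1 hold the positive/negative part of the broadcast DC
 * so saturating byte adds and subs can apply it to the destination.
 */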
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        :: "r"(dc)
    );
    __asm__ volatile(
545 "movd %0, %%mm2 \n\t"
546 "movd %1, %%mm3 \n\t"
547 "movd %2, %%mm4 \n\t"
548 "movd %3, %%mm5 \n\t"
549 "paddusb %%mm0, %%mm2 \n\t"
550 "paddusb %%mm0, %%mm3 \n\t"
551 "paddusb %%mm0, %%mm4 \n\t"
552 "paddusb %%mm0, %%mm5 \n\t"
553 "psubusb %%mm1, %%mm2 \n\t"
554 "psubusb %%mm1, %%mm3 \n\t"
555 "psubusb %%mm1, %%mm4 \n\t"
556 "psubusb %%mm1, %%mm5 \n\t"
557 "movd %%mm2, %0 \n\t"
558 "movd %%mm3, %1 \n\t"
559 "movd %%mm4, %2 \n\t"
560 "movd %%mm5, %3 \n\t"
        : "+m"(*(uint32_t*)(dest+0*linesize)),
          "+m"(*(uint32_t*)(dest+1*linesize)),
          "+m"(*(uint32_t*)(dest+2*linesize)),
          "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        :: "r"(dc)
    );
    __asm__ volatile(
584 "movd %0, %%mm2 \n\t"
585 "movd %1, %%mm3 \n\t"
586 "movd %2, %%mm4 \n\t"
587 "movd %3, %%mm5 \n\t"
588 "paddusb %%mm0, %%mm2 \n\t"
589 "paddusb %%mm0, %%mm3 \n\t"
590 "paddusb %%mm0, %%mm4 \n\t"
591 "paddusb %%mm0, %%mm5 \n\t"
592 "psubusb %%mm1, %%mm2 \n\t"
593 "psubusb %%mm1, %%mm3 \n\t"
594 "psubusb %%mm1, %%mm4 \n\t"
595 "psubusb %%mm1, %%mm5 \n\t"
596 "movd %%mm2, %0 \n\t"
597 "movd %%mm3, %1 \n\t"
598 "movd %%mm4, %2 \n\t"
599 "movd %%mm5, %3 \n\t"
        : "+m"(*(uint32_t*)(dest+0*linesize)),
          "+m"(*(uint32_t*)(dest+1*linesize)),
          "+m"(*(uint32_t*)(dest+2*linesize)),
          "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
607 "movd %0, %%mm2 \n\t"
608 "movd %1, %%mm3 \n\t"
609 "movd %2, %%mm4 \n\t"
610 "movd %3, %%mm5 \n\t"
611 "paddusb %%mm0, %%mm2 \n\t"
612 "paddusb %%mm0, %%mm3 \n\t"
613 "paddusb %%mm0, %%mm4 \n\t"
614 "paddusb %%mm0, %%mm5 \n\t"
615 "psubusb %%mm1, %%mm2 \n\t"
616 "psubusb %%mm1, %%mm3 \n\t"
617 "psubusb %%mm1, %%mm4 \n\t"
618 "psubusb %%mm1, %%mm5 \n\t"
619 "movd %%mm2, %0 \n\t"
620 "movd %%mm3, %1 \n\t"
621 "movd %%mm4, %2 \n\t"
622 "movd %%mm5, %3 \n\t"
        : "+m"(*(uint32_t*)(dest+0*linesize)),
          "+m"(*(uint32_t*)(dest+1*linesize)),
          "+m"(*(uint32_t*)(dest+2*linesize)),
          "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        :: "r"(dc)
    );
    __asm__ volatile(
646 "movq %0, %%mm2 \n\t"
647 "movq %1, %%mm3 \n\t"
648 "movq %2, %%mm4 \n\t"
649 "movq %3, %%mm5 \n\t"
650 "paddusb %%mm0, %%mm2 \n\t"
651 "paddusb %%mm0, %%mm3 \n\t"
652 "paddusb %%mm0, %%mm4 \n\t"
653 "paddusb %%mm0, %%mm5 \n\t"
654 "psubusb %%mm1, %%mm2 \n\t"
655 "psubusb %%mm1, %%mm3 \n\t"
656 "psubusb %%mm1, %%mm4 \n\t"
657 "psubusb %%mm1, %%mm5 \n\t"
658 "movq %%mm2, %0 \n\t"
659 "movq %%mm3, %1 \n\t"
660 "movq %%mm4, %2 \n\t"
661 "movq %%mm5, %3 \n\t"
        : "+m"(*(uint32_t*)(dest+0*linesize)),
          "+m"(*(uint32_t*)(dest+1*linesize)),
          "+m"(*(uint32_t*)(dest+2*linesize)),
          "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd          %0, %%mm0   \n\t"
        "pshufw $0, %%mm0, %%mm0   \n\t"
        "pxor       %%mm1, %%mm1   \n\t"
        "psubw      %%mm0, %%mm1   \n\t"
        "packuswb   %%mm0, %%mm0   \n\t"
        "packuswb   %%mm1, %%mm1   \n\t"
        :: "r"(dc)
    );
    __asm__ volatile(
685 "movq %0, %%mm2 \n\t"
686 "movq %1, %%mm3 \n\t"
687 "movq %2, %%mm4 \n\t"
688 "movq %3, %%mm5 \n\t"
689 "paddusb %%mm0, %%mm2 \n\t"
690 "paddusb %%mm0, %%mm3 \n\t"
691 "paddusb %%mm0, %%mm4 \n\t"
692 "paddusb %%mm0, %%mm5 \n\t"
693 "psubusb %%mm1, %%mm2 \n\t"
694 "psubusb %%mm1, %%mm3 \n\t"
695 "psubusb %%mm1, %%mm4 \n\t"
696 "psubusb %%mm1, %%mm5 \n\t"
697 "movq %%mm2, %0 \n\t"
698 "movq %%mm3, %1 \n\t"
699 "movq %%mm4, %2 \n\t"
700 "movq %%mm5, %3 \n\t"
        : "+m"(*(uint32_t*)(dest+0*linesize)),
          "+m"(*(uint32_t*)(dest+1*linesize)),
          "+m"(*(uint32_t*)(dest+2*linesize)),
          "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
708 "movq %0, %%mm2 \n\t"
709 "movq %1, %%mm3 \n\t"
710 "movq %2, %%mm4 \n\t"
711 "movq %3, %%mm5 \n\t"
712 "paddusb %%mm0, %%mm2 \n\t"
713 "paddusb %%mm0, %%mm3 \n\t"
714 "paddusb %%mm0, %%mm4 \n\t"
715 "paddusb %%mm0, %%mm5 \n\t"
716 "psubusb %%mm1, %%mm2 \n\t"
717 "psubusb %%mm1, %%mm3 \n\t"
718 "psubusb %%mm1, %%mm4 \n\t"
719 "psubusb %%mm1, %%mm5 \n\t"
720 "movq %%mm2, %0 \n\t"
721 "movq %%mm3, %1 \n\t"
722 "movq %%mm4, %2 \n\t"
723 "movq %%mm5, %3 \n\t"
        : "+m"(*(uint32_t*)(dest+0*linesize)),
          "+m"(*(uint32_t*)(dest+1*linesize)),
          "+m"(*(uint32_t*)(dest+2*linesize)),
          "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
#if HAVE_MMX_EXTERNAL
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}
#endif
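/** Assign the 8x8 ([1]) and 16x16 ([0]) function for one (X, Y) fractional-pel position */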
#define FN_ASSIGN(OP, X, Y, INSN) \
    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
#if HAVE_MMX_EXTERNAL