#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"

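/* pixel operation constants */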
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    = 0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };

#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
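/* With PIC the constant pool cannot be addressed directly, so these
 * variants synthesize the constants in the register instead. */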
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "packuswb %%"#regd", %%"#regd" \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "psllw $1, %%"#regd" \n\t" ::)

#endif

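/* Byte-wise average of rega and regb into regr (regr also serves as the
 * temporary); rega is left unmodified, regb is trashed, and regfe must
 * hold 0xfefefefefefefefe (see MOVQ_BFE). */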
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "paddb "#regb", "#regr" \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t"

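/* %%mm6 is expected to hold 0xfefefefefefefefe here (loaded with MOVQ_BFE). */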
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pand "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "paddb "#regb", "#regr" \n\t" \
    "paddb "#regd", "#regp" \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "por "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"

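/***********************************/
/* MMX no rounding */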
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

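/***********************************/
/* MMX rounding */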
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

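/***********************************/
/* 3Dnow specific */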
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

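/***********************************/
/* MMXEXT (MMX2) specific */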
#define DEF(x) x ## _mmx2

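/* pavgb is only available with MMXEXT and later instruction sets. */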
#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx

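/***********************************/
/* standard MMX */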
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

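    /* read the pixels */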
    p   = block;
    pix = pixels;

    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

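    /* same sequence again for the remaining four rows */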
    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t" \
    "movq 16 + "#off"(%2), %%mm2 \n\t" \
    "movq 32 + "#off"(%2), %%mm3 \n\t" \
    "movq 48 + "#off"(%2), %%mm4 \n\t" \
    "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
    "paddb %%mm0, %%mm1 \n\t" \
    "paddb %%mm0, %%mm2 \n\t" \
    "paddb %%mm0, %%mm3 \n\t" \
    "paddb %%mm0, %%mm4 \n\t" \
    "movq %%mm1, (%0) \n\t" \
    "movq %%mm2, (%0, %3) \n\t" \
    "movq %%mm3, (%0, %3, 2) \n\t" \
    "movq %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

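    /* read the pixels */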
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq (%2), %%mm0 \n\t"
            "movq 8(%2), %%mm1 \n\t"
            "movq 16(%2), %%mm2 \n\t"
            "movq 24(%2), %%mm3 \n\t"
            "movq %0, %%mm4 \n\t"
            "movq %1, %%mm6 \n\t"
            "movq %%mm4, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0 \n\t"
            "paddsw %%mm5, %%mm1 \n\t"
            "movq %%mm6, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2 \n\t"
            "paddsw %%mm5, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %0 \n\t"
            "movq %%mm2, %1 \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

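/* Copy an h-row block from 'pixels' to 'block' (both with stride
 * 'line_size'), four rows per loop iteration. */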
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

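/* SSE2 versions: loads from 'pixels' are unaligned (movdqu), while stores
 * assume a 16-byte-aligned 'block' (movdqa). */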
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1, %3), %%xmm1 \n\t"
        "movdqu (%1, %3, 2), %%xmm2 \n\t"
        "movdqu (%1, %4), %%xmm3 \n\t"
        "lea (%1, %3, 4), %1 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2, %3) \n\t"
        "movdqa %%xmm2, (%2, %3, 2) \n\t"
        "movdqa %%xmm3, (%2, %4) \n\t"
        "subl $4, %0 \n\t"
        "lea (%2, %3, 4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1, %3), %%xmm1 \n\t"
        "movdqu (%1, %3, 2), %%xmm2 \n\t"
        "movdqu (%1, %4), %%xmm3 \n\t"
        "lea (%1, %3, 4), %1 \n\t"
        "pavgb (%2), %%xmm0 \n\t"
        "pavgb (%2, %3), %%xmm1 \n\t"
        "pavgb (%2, %3, 2), %%xmm2 \n\t"
        "pavgb (%2, %4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2, %3) \n\t"
        "movdqa %%xmm2, (%2, %3, 2) \n\t"
        "movdqa %%xmm3, (%2, %4) \n\t"
        "subl $4, %0 \n\t"
        "lea (%2, %3, 4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}

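/* Zero n consecutive 64-coefficient (128-byte) DCT blocks. */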
#define CLEAR_BLOCKS(name, n) \
static void name(DCTELEM *blocks) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "mov %1, %%"REG_a" \n\t" \
        "1: \n\t" \
        "movq %%mm7, (%0, %%"REG_a") \n\t" \
        "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
        "add $32, %%"REG_a" \n\t" \
        "js 1b \n\t" \
        :: "r"(((uint8_t *)blocks) + 128 * n), \
           "i"(-128 * n) \
        : "%"REG_a \
        ); \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
        );
}

static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "1: \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        "js 1b \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}

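/* dst[i] += src[i] for w bytes; the scalar loop below the asm handles the
 * remaining (up to 15) bytes. */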
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %3, %0 \n\t"
        "js 1b \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

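/* HuffYUV median prediction implemented with cmov; needs seven
 * general-purpose registers, hence the HAVE_7REGS guard below. */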
#if HAVE_7REGS
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3, %4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6, %4), %b0 \n"
        "mov %b0, (%5, %4) \n"
        "inc %4 \n"
        "jl 1b \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = l;
    *left_top = tl;
}
#endif

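/* Transpose a 4x4 block of bytes (helper for the horizontal loop filter). */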
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile (
        "movd (%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1, %3, 1), %%mm2 \n\t"
        "movd (%1, %3, 2), %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        "movd %%mm0, (%0) \n\t"
        "add %2, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%0) \n\t"
        "movd %%mm1, (%0, %2, 1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%0, %2, 2) \n\t"
        : "+&r"(dst),
          "+&r"(src)
        : "r"(dst_stride),
          "r"(src_stride)
        : "memory"
        );
}

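/* H.263 loop-filter core: filters the four rows (or transposed columns)
 * passed in %0..%3, with 2 * strength in %4 and the ff_pb_FC mask in %5. */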
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t" \
    "movq %0, %%mm0 \n\t" \
    "movq %0, %%mm1 \n\t" \
    "movq %3, %%mm2 \n\t" \
    "movq %3, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpckhbw %%mm7, %%mm1 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "psubw %%mm2, %%mm0 \n\t" \
    "psubw %%mm3, %%mm1 \n\t" \
    "movq %1, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "movq %2, %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm5 \n\t" \
    "psubw %%mm2, %%mm4 \n\t" \
    "psubw %%mm3, %%mm5 \n\t" \
    "psllw $2, %%mm4 \n\t" \
    "psllw $2, %%mm5 \n\t" \
    "paddw %%mm0, %%mm4 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "pxor %%mm6, %%mm6 \n\t" \
    "pcmpgtw %%mm4, %%mm6 \n\t" \
    "pcmpgtw %%mm5, %%mm7 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "pxor %%mm7, %%mm5 \n\t" \
    "psubw %%mm6, %%mm4 \n\t" \
    "psubw %%mm7, %%mm5 \n\t" \
    "psrlw $3, %%mm4 \n\t" \
    "psrlw $3, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm4 \n\t" \
    "packsswb %%mm7, %%mm6 \n\t" \
    "pxor %%mm7, %%mm7 \n\t" \
    "movd %4, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "psubusb %%mm4, %%mm2 \n\t" \
    "movq %%mm2, %%mm3 \n\t" \
    "psubusb %%mm4, %%mm3 \n\t" \
    "psubb %%mm3, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm3 \n\t" \
    "psubusb %%mm2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm2 \n\t" \
    "packsswb %%mm1, %%mm0 \n\t" \
    "pcmpgtb %%mm0, %%mm7 \n\t" \
    "pxor %%mm7, %%mm0 \n\t" \
    "psubb %%mm7, %%mm0 \n\t" \
    "movq %%mm0, %%mm1 \n\t" \
    "psubusb %%mm2, %%mm0 \n\t" \
    "psubb %%mm0, %%mm1 \n\t" \
    "pand %5, %%mm1 \n\t" \
    "psrlw $2, %%mm1 \n\t" \
    "pxor %%mm7, %%mm1 \n\t" \
    "psubb %%mm7, %%mm1 \n\t" \
    "movq %0, %%mm5 \n\t" \
    "movq %3, %%mm6 \n\t" \
    "psubb %%mm1, %%mm5 \n\t" \
    "paddb %%mm1, %%mm6 \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER

            "movq %%mm3, %1 \n\t"
            "movq %%mm4, %2 \n\t"
            "movq %%mm5, %0 \n\t"
            "movq %%mm6, %3 \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER

            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq %%mm5, %%mm1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpcklbw %%mm6, %%mm4 \n\t"
            "punpckhbw %%mm3, %%mm1 \n\t"
            "punpckhbw %%mm6, %%mm0 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "movq %%mm1, %%mm6 \n\t"
            "punpcklwd %%mm4, %%mm5 \n\t"
            "punpcklwd %%mm0, %%mm1 \n\t"
            "punpckhwd %%mm4, %%mm3 \n\t"
            "punpckhwd %%mm0, %%mm6 \n\t"
            "movd %%mm5, (%0) \n\t"
            "punpckhdq %%mm5, %%mm5 \n\t"
            "movd %%mm5, (%0, %2) \n\t"
            "movd %%mm3, (%0, %2, 2) \n\t"
            "punpckhdq %%mm3, %%mm3 \n\t"
            "movd %%mm3, (%0, %3) \n\t"
            "movd %%mm1, (%1) \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%1, %2) \n\t"
            "movd %%mm6, (%1, %2, 2) \n\t"
            "punpckhdq %%mm6, %%mm6 \n\t"
            "movd %%mm6, (%1, %3) \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            );
    }
}

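/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */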
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

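    /* left and right */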
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

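    /* top and bottom (and hopefully also the corners) */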
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}

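/* MPEG-4 quarter-pel lowpass filter with taps (-1, 3, -6, 20, 20, -6, 3, -1),
 * rounded and shifted right by 5; compare the C reference code in the
 * 3DNow! fallbacks below. */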
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
                   in0, in1, in2, in7, out, OP) \
    "paddw "#m4", "#m3" \n\t" \
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
    "pmullw "#m3", %%mm4 \n\t" \
    "movq "#in7", "#m3" \n\t" \
    "movq "#in0", %%mm5 \n\t" \
    "paddw "#m3", %%mm5 \n\t" \
    "psubw %%mm5, %%mm4 \n\t" \
    "movq "#in1", %%mm5 \n\t" \
    "movq "#in2", %%mm6 \n\t" \
    "paddw "#m6", %%mm5 \n\t" \
    "paddw "#m5", %%mm6 \n\t" \
    "paddw %%mm6, %%mm6 \n\t" \
    "psubw %%mm6, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
    "paddw "#rnd", %%mm4 \n\t" \
    "paddw %%mm4, %%mm5 \n\t" \
    "psraw $5, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm5 \n\t" \
    OP(%%mm5, out, %%mm7, d)

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
                                                  uint8_t *src, \
                                                  int dstStride, \
                                                  int srcStride, \
                                                  int h) \
{ \
    uint64_t temp; \
\
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "1: \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm1 \n\t" \
        "movq %%mm0, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" \
        "movq %%mm2, %%mm3 \n\t" \
        "movq %%mm2, %%mm4 \n\t" \
        "psllq $8, %%mm2 \n\t" \
        "psllq $16, %%mm3 \n\t" \
        "psllq $24, %%mm4 \n\t" \
        "punpckhbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "punpckhbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm3, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm6 \n\t" \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "paddw %6, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        "movq %%mm0, %5 \n\t" \
        \
        "movq 5(%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm5 \n\t" \
        "movq %%mm0, %%mm6 \n\t" \
        "psrlq $8, %%mm0 \n\t" \
        "psrlq $16, %%mm5 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm5 \n\t" \
        "paddw %%mm0, %%mm2 \n\t" \
        "paddw %%mm5, %%mm3 \n\t" \
        "paddw %%mm2, %%mm2 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "movq %%mm6, %%mm2 \n\t" \
        "psrlq $24, %%mm6 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm6 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "paddw %%mm2, %%mm1 \n\t" \
        "paddw %%mm6, %%mm4 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "psubw %%mm4, %%mm3 \n\t" \
        "paddw %6, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" \
        "psraw $5, %%mm3 \n\t" \
        "movq %5, %%mm1 \n\t" \
        "packuswb %%mm3, %%mm1 \n\t" \
        OP_MMX2(%%mm1, (%1), %%mm4, q) \
        \
        "movq 9(%0), %%mm1 \n\t" \
        "movq %%mm1, %%mm4 \n\t" \
        "movq %%mm1, %%mm3 \n\t" \
        "psrlq $8, %%mm1 \n\t" \
        "psrlq $16, %%mm4 \n\t" \
        "punpcklbw %%mm7, %%mm1 \n\t" \
        "punpcklbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "movq %%mm3, %%mm5 \n\t" \
        "psrlq $24, %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm3 \n\t" \
        "paddw %%mm3, %%mm2 \n\t" \
        "psubw %%mm2, %%mm0 \n\t" \
        "movq %%mm5, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
        "paddw %6, %%mm0 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        \
        "paddw %%mm5, %%mm3 \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
        "paddw %%mm4, %%mm6 \n\t" \
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
        "paddw %%mm1, %%mm4 \n\t" \
        "paddw %%mm2, %%mm5 \n\t" \
        "paddw %%mm6, %%mm6 \n\t" \
        "psubw %%mm6, %%mm4 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
        "psubw %%mm5, %%mm3 \n\t" \
        "paddw %6, %%mm4 \n\t" \
        "paddw %%mm3, %%mm4 \n\t" \
        "psraw $5, %%mm4 \n\t" \
        "packuswb %%mm4, %%mm0 \n\t" \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q) \
        \
        "add %3, %0 \n\t" \
        "add %4, %1 \n\t" \
        "decl %2 \n\t" \
        "jnz 1b \n\t" \
        : "+a"(src), "+c"(dst), "+D"(h) \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
          "m"(temp), "m"(ROUNDER) \
        : "memory" \
        ); \
} \
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \
                                                   uint8_t *src, \
                                                   int dstStride, \
                                                   int srcStride, \
                                                   int h) \
{ \
    int i; \
    int16_t temp[16]; \
\
    for (i = 0; i < h; i++) { \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
                   (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
                   (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
                   (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
                   (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
                   (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
                   (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
                   (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
                   (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
                   (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
                   (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
                   (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
                   (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
                   (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
                   (src[11] + src[16]) * 3 - (src[10] + src[16]); \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
                   (src[12] + src[16]) * 3 - (src[11] + src[15]); \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
                   (src[13] + src[15]) * 3 - (src[12] + src[14]); \
        __asm__ volatile ( \
            "movq (%0), %%mm0 \n\t" \
            "movq 8(%0), %%mm1 \n\t" \
            "paddw %2, %%mm0 \n\t" \
            "paddw %2, %%mm1 \n\t" \
            "psraw $5, %%mm0 \n\t" \
            "psraw $5, %%mm1 \n\t" \
            "packuswb %%mm1, %%mm0 \n\t" \
            OP_3DNOW(%%mm0, (%1), %%mm1, q) \
            "movq 16(%0), %%mm0 \n\t" \
            "movq 24(%0), %%mm1 \n\t" \
            "paddw %2, %%mm0 \n\t" \
            "paddw %2, %%mm1 \n\t" \
            "psraw $5, %%mm0 \n\t" \
            "psraw $5, %%mm1 \n\t" \
            "packuswb %%mm1, %%mm0 \n\t" \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q) \
            :: "r"(temp), "r"(dst), "m"(ROUNDER) \
            : "memory" \
            ); \
        dst += dstStride; \
        src += srcStride; \
    } \
} \
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \
                                                 uint8_t *src, \
                                                 int dstStride, \
                                                 int srcStride, \
                                                 int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "1: \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm1 \n\t" \
        "movq %%mm0, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" \
        "movq %%mm2, %%mm3 \n\t" \
        "movq %%mm2, %%mm4 \n\t" \
        "psllq $8, %%mm2 \n\t" \
        "psllq $16, %%mm3 \n\t" \
        "psllq $24, %%mm4 \n\t" \
        "punpckhbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "punpckhbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm3, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm6 \n\t" \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "paddw %5, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        \
        "movd 5(%0), %%mm5 \n\t" \
        "punpcklbw %%mm7, %%mm5 \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
        "paddw %%mm5, %%mm1 \n\t" \
        "paddw %%mm6, %%mm2 \n\t" \
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
        "paddw %%mm6, %%mm3 \n\t" \
        "paddw %%mm5, %%mm4 \n\t" \
        "paddw %%mm2, %%mm2 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "psubw %%mm4, %%mm3 \n\t" \
        "paddw %5, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" \
        "psraw $5, %%mm3 \n\t" \
        "packuswb %%mm3, %%mm0 \n\t" \
        OP_MMX2(%%mm0, (%1), %%mm4, q) \
        \
        "add %3, %0 \n\t" \
        "add %4, %1 \n\t" \
        "decl %2 \n\t" \
        "jnz 1b \n\t" \
        : "+a"(src), "+c"(dst), "+d"(h) \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
          "m"(ROUNDER) \
        : "memory" \
        ); \
} \
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, \
                                                  uint8_t *src, \
                                                  int dstStride, \
                                                  int srcStride, \
                                                  int h) \
{ \
    int i; \
    int16_t temp[8]; \
\
    for (i = 0; i < h; i++) { \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + \
                  (src[1] + src[3]) * 3 - (src[2] + src[4]); \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \
                  (src[0] + src[4]) * 3 - (src[1] + src[5]); \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \
                  (src[0] + src[5]) * 3 - (src[0] + src[6]); \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \
                  (src[1] + src[6]) * 3 - (src[0] + src[7]); \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \
                  (src[2] + src[7]) * 3 - (src[1] + src[8]); \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + \
                  (src[3] + src[8]) * 3 - (src[2] + src[8]); \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \
                  (src[4] + src[8]) * 3 - (src[3] + src[7]); \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \
                  (src[5] + src[7]) * 3 - (src[4] + src[6]); \
        __asm__ volatile ( \
            "movq (%0), %%mm0 \n\t" \
            "movq 8(%0), %%mm1 \n\t" \
            "paddw %2, %%mm0 \n\t" \
            "paddw %2, %%mm1 \n\t" \
            "psraw $5, %%mm0 \n\t" \
            "psraw $5, %%mm1 \n\t" \
            "packuswb %%mm1, %%mm0 \n\t" \
            OP_3DNOW(%%mm0, (%1), %%mm1, q) \
            :: "r"(temp), "r"(dst), "m"(ROUNDER) \
            : "memory" \
            ); \
        dst += dstStride; \
        src += srcStride; \
    } \
}

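/* Vertical lowpass filters and the full set of quarter-pel motion
 * compensation functions (mcXY) built from the lowpass primitives. */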
01254 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
01255 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
01256 uint8_t *src, \
01257 int dstStride, \
01258 int srcStride) \
01259 { \
01260 uint64_t temp[17 * 4]; \
01261 uint64_t *temp_ptr = temp; \
01262 int count = 17; \
01263 \
01264 \
01265 __asm__ volatile ( \
01266 "pxor %%mm7, %%mm7 \n\t" \
01267 "1: \n\t" \
01268 "movq (%0), %%mm0 \n\t" \
01269 "movq (%0), %%mm1 \n\t" \
01270 "movq 8(%0), %%mm2 \n\t" \
01271 "movq 8(%0), %%mm3 \n\t" \
01272 "punpcklbw %%mm7, %%mm0 \n\t" \
01273 "punpckhbw %%mm7, %%mm1 \n\t" \
01274 "punpcklbw %%mm7, %%mm2 \n\t" \
01275 "punpckhbw %%mm7, %%mm3 \n\t" \
01276 "movq %%mm0, (%1) \n\t" \
01277 "movq %%mm1, 17 * 8(%1) \n\t" \
01278 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
01279 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
01280 "add $8, %1 \n\t" \
01281 "add %3, %0 \n\t" \
01282 "decl %2 \n\t" \
01283 "jnz 1b \n\t" \
01284 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
01285 : "r"((x86_reg)srcStride) \
01286 : "memory" \
01287 ); \
01288 \
01289 temp_ptr = temp; \
01290 count = 4; \
01291 \
01292 \
01293 __asm__ volatile ( \
01294 \
01295 "1: \n\t" \
01296 "movq (%0), %%mm0 \n\t" \
01297 "movq 8(%0), %%mm1 \n\t" \
01298 "movq 16(%0), %%mm2 \n\t" \
01299 "movq 24(%0), %%mm3 \n\t" \
01300 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
01301 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
01302 "add %4, %1 \n\t" \
01303 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
01304 \
01305 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
01306 "add %4, %1 \n\t" \
01307 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
01308 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
01309 "add %4, %1 \n\t" \
01310 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
01311 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
01312 "add %4, %1 \n\t" \
01313 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
01314 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
01315 "add %4, %1 \n\t" \
01316 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
01317 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
01318 "add %4, %1 \n\t" \
01319 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
01320 \
01321 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
01322 "add %4, %1 \n\t" \
01323 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
01324 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
01325 \
01326 "add $136, %0 \n\t" \
01327 "add %6, %1 \n\t" \
01328 "decl %2 \n\t" \
01329 "jnz 1b \n\t" \
01330 \
01331 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
01332 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
01333 "m"(ROUNDER), \
01334 "g"(4 - 14 * (x86_reg)dstStride) \
01335 : "memory" \
01336 ); \
01337 } \
01338 \
01339 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
01340 uint8_t *src, \
01341 int dstStride, \
01342 int srcStride) \
01343 { \
01344 uint64_t temp[9 * 2]; \
01345 uint64_t *temp_ptr = temp; \
01346 int count = 9; \
01347 \
01348 \
01349 __asm__ volatile ( \
01350 "pxor %%mm7, %%mm7 \n\t" \
01351 "1: \n\t" \
01352 "movq (%0), %%mm0 \n\t" \
01353 "movq (%0), %%mm1 \n\t" \
01354 "punpcklbw %%mm7, %%mm0 \n\t" \
01355 "punpckhbw %%mm7, %%mm1 \n\t" \
01356 "movq %%mm0, (%1) \n\t" \
01357 "movq %%mm1, 9*8(%1) \n\t" \
01358 "add $8, %1 \n\t" \
01359 "add %3, %0 \n\t" \
01360 "decl %2 \n\t" \
01361 "jnz 1b \n\t" \
01362 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
01363 : "r"((x86_reg)srcStride) \
01364 : "memory" \
01365 ); \
01366 \
01367 temp_ptr = temp; \
01368 count = 2; \
01369 \
01370 \
01371 __asm__ volatile ( \
01372 \
01373 "1: \n\t" \
01374 "movq (%0), %%mm0 \n\t" \
01375 "movq 8(%0), %%mm1 \n\t" \
01376 "movq 16(%0), %%mm2 \n\t" \
01377 "movq 24(%0), %%mm3 \n\t" \
01378 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
01379 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
01380 "add %4, %1 \n\t" \
01381 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
01382 \
01383 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
01384 "add %4, %1 \n\t" \
01385 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
01386 \
01387 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
01388 "add %4, %1 \n\t" \
01389 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
01390 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
01391 \
01392 "add $72, %0 \n\t" \
01393 "add %6, %1 \n\t" \
01394 "decl %2 \n\t" \
01395 "jnz 1b \n\t" \
01396 \
01397 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
01398 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
01399 "m"(ROUNDER), \
01400 "g"(4 - 6 * (x86_reg)dstStride) \
01401 : "memory" \
01402 ); \
01403 } \
01404 \
01405 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
01406 int stride) \
01407 { \
01408 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
01409 } \
01410 \
01411 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
01412 int stride) \
01413 { \
01414 uint64_t temp[8]; \
01415 uint8_t * const half = (uint8_t*)temp; \
01416 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
01417 stride, 8); \
01418 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
01419 } \
01420 \
01421 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
01422 int stride) \
01423 { \
01424 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
01425 stride, 8); \
01426 } \
01427 \
01428 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
01429 int stride) \
01430 { \
01431 uint64_t temp[8]; \
01432 uint8_t * const half = (uint8_t*)temp; \
01433 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
01434 stride, 8); \
01435 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
01436 stride, 8); \
01437 } \
01438 \
01439 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
01440 int stride) \
01441 { \
01442 uint64_t temp[8]; \
01443 uint8_t * const half = (uint8_t*)temp; \
01444 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
01445 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
01446 } \
01447 \
01448 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
01449 int stride) \
01450 { \
01451 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
01452 } \
01453 \
01454 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
01455 int stride) \
01456 { \
01457 uint64_t temp[8]; \
01458 uint8_t * const half = (uint8_t*)temp; \
01459 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
01460 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
01461 stride, 8); \
01462 } \
01463 \
01464 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
01465 int stride) \
01466 { \
01467 uint64_t half[8 + 9]; \
01468 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01469 uint8_t * const halfHV = ((uint8_t*)half); \
01470 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01471 stride, 9); \
01472 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01473 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01474 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01475 } \
01476 \
01477 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
01478 int stride) \
01479 { \
01480 uint64_t half[8 + 9]; \
01481 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01482 uint8_t * const halfHV = ((uint8_t*)half); \
01483 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01484 stride, 9); \
01485 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01486 stride, 9); \
01487 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01488 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01489 } \
01490 \
01491 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
01492 int stride) \
01493 { \
01494 uint64_t half[8 + 9]; \
01495 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01496 uint8_t * const halfHV = ((uint8_t*)half); \
01497 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01498 stride, 9); \
01499 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01500 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01501 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01502 } \
01503 \
01504 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
01505 int stride) \
01506 { \
01507 uint64_t half[8 + 9]; \
01508 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01509 uint8_t * const halfHV = ((uint8_t*)half); \
01510 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01511 stride, 9); \
01512 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01513 stride, 9); \
01514 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01515 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01516 } \
01517 \
01518 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
01519 int stride) \
01520 { \
01521 uint64_t half[8 + 9]; \
01522 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01523 uint8_t * const halfHV = ((uint8_t*)half); \
01524 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01525 stride, 9); \
01526 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01527 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01528 } \
01529 \
01530 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
01531 int stride) \
01532 { \
01533 uint64_t half[8 + 9]; \
01534 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01535 uint8_t * const halfHV = ((uint8_t*)half); \
01536 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01537 stride, 9); \
01538 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01539 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01540 } \
01541 \
01542 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
01543 int stride) \
01544 { \
01545 uint64_t half[8 + 9]; \
01546 uint8_t * const halfH = ((uint8_t*)half); \
01547 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01548 stride, 9); \
01549 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01550 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01551 } \
01552 \
01553 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
01554 int stride) \
01555 { \
01556 uint64_t half[8 + 9]; \
01557 uint8_t * const halfH = ((uint8_t*)half); \
01558 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01559 stride, 9); \
01560 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01561 stride, 9); \
01562 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01563 } \
01564 \
01565 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
01566 int stride) \
01567 { \
01568 uint64_t half[9]; \
01569 uint8_t * const halfH = ((uint8_t*)half); \
01570 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01571 stride, 9); \
01572 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01573 } \
01574 \
01575 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
01576 int stride) \
01577 { \
01578 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
01579 } \
01580 \
01581 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
01582 int stride) \
01583 { \
01584 uint64_t temp[32]; \
01585 uint8_t * const half = (uint8_t*)temp; \
01586 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
01587 stride, 16); \
01588 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
01589 } \
01590 \
01591 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
01592 int stride) \
01593 { \
01594 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
01595 stride, stride, 16); \
01596 } \
01597 \
01598 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
01599 int stride) \
01600 { \
01601 uint64_t temp[32]; \
01602 uint8_t * const half = (uint8_t*)temp; \
01603 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
01604 stride, 16); \
01605 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
01606 stride, stride, 16); \
01607 } \
01608 \
01609 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
01610 int stride) \
01611 { \
01612 uint64_t temp[32]; \
01613 uint8_t * const half = (uint8_t*)temp; \
01614 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
01615 stride); \
01616 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
01617 } \
01618 \
01619 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
01620 int stride) \
01621 { \
01622 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
01623 } \
01624 \
01625 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
01626 int stride) \
01627 { \
01628 uint64_t temp[32]; \
01629 uint8_t * const half = (uint8_t*)temp; \
01630 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
01631 stride); \
01632 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
01633 stride, stride, 16); \
01634 } \
01635 \
01636 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
01637 int stride) \
01638 { \
01639 uint64_t half[16 * 2 + 17 * 2]; \
01640 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01641 uint8_t * const halfHV = ((uint8_t*)half); \
01642 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01643 stride, 17); \
01644 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01645 stride, 17); \
01646 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01647 16, 16); \
01648 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01649 } \
01650 \
01651 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
01652 int stride) \
01653 { \
01654 uint64_t half[16 * 2 + 17 * 2]; \
01655 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01656 uint8_t * const halfHV = ((uint8_t*)half); \
01657 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01658 stride, 17); \
01659 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01660 stride, 17); \
01661 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01662 16, 16); \
01663 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01664 } \
01665 \
01666 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
01667 int stride) \
01668 { \
01669 uint64_t half[16 * 2 + 17 * 2]; \
01670 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01671 uint8_t * const halfHV = ((uint8_t*)half); \
01672 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01673 stride, 17); \
01674 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01675 stride, 17); \
01676 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01677 16, 16); \
01678 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01679 16, 16); \
01680 } \
01681 \
01682 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
01683 int stride) \
01684 { \
01685 uint64_t half[16 * 2 + 17 * 2]; \
01686 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01687 uint8_t * const halfHV = ((uint8_t*)half); \
01688 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01689 stride, 17); \
01690 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01691 stride, 17); \
01692 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01693 16, 16); \
01694 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01695 16, 16); \
01696 } \
01697 \
01698 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
01699 int stride) \
01700 { \
01701 uint64_t half[16 * 2 + 17 * 2]; \
01702 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01703 uint8_t * const halfHV = ((uint8_t*)half); \
01704 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01705 stride, 17); \
01706 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01707 16, 16); \
01708 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01709 } \
01710 \
01711 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
01712 int stride) \
01713 { \
01714 uint64_t half[16 * 2 + 17 * 2]; \
01715 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01716 uint8_t * const halfHV = ((uint8_t*)half); \
01717 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01718 stride, 17); \
01719 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01720 16, 16); \
01721 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01722 16, 16); \
01723 } \
01724 \
01725 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
01726 int stride) \
01727 { \
01728 uint64_t half[17 * 2]; \
01729 uint8_t * const halfH = ((uint8_t*)half); \
01730 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01731 stride, 17); \
01732 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01733 stride, 17); \
01734 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01735 } \
01736 \
01737 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
01738 int stride) \
01739 { \
01740 uint64_t half[17 * 2]; \
01741 uint8_t * const halfH = ((uint8_t*)half); \
01742 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01743 stride, 17); \
01744 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01745 stride, 17); \
01746 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01747 } \
01748 \
01749 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
01750 int stride) \
01751 { \
01752 uint64_t half[17 * 2]; \
01753 uint8_t * const halfH = ((uint8_t*)half); \
01754 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01755 stride, 17); \
01756 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01757 }
01758
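/* Store/average micro-ops plugged into the QPEL macros above: PUT_OP is a
 * plain store, while the AVG variants average the freshly filtered result
 * with the bytes already at the destination (pavgusb on 3DNow!, pavgb on
 * MMXEXT). */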
01759 #define PUT_OP(a, b, temp, size) \
01760 "mov"#size" "#a", "#b" \n\t"
01761
01762 #define AVG_3DNOW_OP(a, b, temp, size) \
01763 "mov"#size" "#b", "#temp" \n\t" \
01764 "pavgusb "#temp", "#a" \n\t" \
01765 "mov"#size" "#a", "#b" \n\t"
01766
01767 #define AVG_MMX2_OP(a, b, temp, size) \
01768 "mov"#size" "#b", "#temp" \n\t" \
01769 "pavgb "#temp", "#a" \n\t" \
01770 "mov"#size" "#a", "#b" \n\t"
01771
01772 QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP)
01773 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP)
01774 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
01775 QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow)
01776 QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow)
01777 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
01778 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2)
01779 QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2)
01780 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
01781
01782
01783
01784
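/* Bilinear (2-tap) qpel: not compliant with any spec, intended only for fast
 * decoding. Each quarter-pel position is approximated with the half-pel
 * primitives or, for the mixed positions, the _l3_ averaging helpers
 * referenced by QPEL_2TAP_L3. */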
01785 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
01786 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
01787 uint8_t *src, \
01788 int stride) \
01789 { \
01790 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
01791 }
01792
01793 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
01794 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
01795 uint8_t *src, \
01796 int stride) \
01797 { \
01798 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
01799 S1, S2); \
01800 }
01801
01802 #define QPEL_2TAP(OPNAME, SIZE, MMX) \
01803 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
01804 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
01805 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
01806 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
01807 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
01808 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
01809 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
01810 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
01811 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
01812 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
01813 uint8_t *src, \
01814 int stride) \
01815 { \
01816 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
01817 } \
01818 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
01819 uint8_t *src, \
01820 int stride) \
01821 { \
01822 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
01823 stride, SIZE); \
01824 } \
01825 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
01826 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
01827 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
01828 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
01829 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
01830 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
01831 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
01832 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
01833
01834 QPEL_2TAP(put_, 16, mmx2)
01835 QPEL_2TAP(avg_, 16, mmx2)
01836 QPEL_2TAP(put_, 8, mmx2)
01837 QPEL_2TAP(avg_, 8, mmx2)
01838 QPEL_2TAP(put_, 16, 3dnow)
01839 QPEL_2TAP(avg_, 16, 3dnow)
01840 QPEL_2TAP(put_, 8, 3dnow)
01841 QPEL_2TAP(avg_, 8, 3dnow)
01842
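/* RV40 (3,3) qpel position: reduces to the generic xy2 half-pel average. */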
01843 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01844 {
01845 put_pixels8_xy2_mmx(dst, src, stride, 8);
01846 }
01847 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01848 {
01849 put_pixels16_xy2_mmx(dst, src, stride, 16);
01850 }
01851 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01852 {
01853 avg_pixels8_xy2_mmx(dst, src, stride, 8);
01854 }
01855 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01856 {
01857 avg_pixels16_xy2_mmx(dst, src, stride, 16);
01858 }
01859
01860 #endif /* HAVE_INLINE_ASM */
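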
01861
01862 #if HAVE_YASM
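/* Edge emulation: when a motion vector points (partially) outside the
 * reference frame, the block is copied into a temporary buffer and the
 * border pixels are replicated. The inner loops live in yasm
 * (ff_emu_edge_core_*); the wrapper below only clamps the coordinates. */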
01863 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
01864 x86_reg linesize, x86_reg start_y,
01865 x86_reg end_y, x86_reg block_h,
01866 x86_reg start_x, x86_reg end_x,
01867 x86_reg block_w);
01868 extern emu_edge_core_func ff_emu_edge_core_mmx;
01869 extern emu_edge_core_func ff_emu_edge_core_sse;
01870
01871 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
01872 int linesize,
01873 int block_w, int block_h,
01874 int src_x, int src_y,
01875 int w, int h,
01876 emu_edge_core_func *core_fn)
01877 {
01878 int start_y, start_x, end_y, end_x, src_y_add = 0;
01879
01880 if (src_y >= h) {
01881 src_y_add = h - 1 - src_y;
01882 src_y = h - 1;
01883 } else if (src_y <= -block_h) {
01884 src_y_add = 1 - block_h - src_y;
01885 src_y = 1 - block_h;
01886 }
01887 if (src_x >= w) {
01888 src += w - 1 - src_x;
01889 src_x = w - 1;
01890 } else if (src_x <= -block_w) {
01891 src += 1 - block_w - src_x;
01892 src_x = 1 - block_w;
01893 }
01894
01895 start_y = FFMAX(0, -src_y);
01896 start_x = FFMAX(0, -src_x);
01897 end_y = FFMIN(block_h, h-src_y);
01898 end_x = FFMIN(block_w, w-src_x);
01899 assert(start_x < end_x && block_w > 0);
01900 assert(start_y < end_y && block_h > 0);
01901
01902
01903 src += (src_y_add + start_y) * linesize + start_x;
01904 buf += start_x;
01905 core_fn(buf, src, linesize, start_y, end_y,
01906 block_h, start_x, end_x, block_w);
01907 }
01908
01909 #if ARCH_X86_32
01910 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
01911 int linesize,
01912 int block_w, int block_h,
01913 int src_x, int src_y, int w, int h)
01914 {
01915 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01916 w, h, &ff_emu_edge_core_mmx);
01917 }
01918 #endif /* ARCH_X86_32 */
01919
01920 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
01921 int linesize,
01922 int block_w, int block_h,
01923 int src_x, int src_y, int w, int h)
01924 {
01925 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01926 w, h, &ff_emu_edge_core_sse);
01927 }
01928 #endif /* HAVE_YASM */
01929
01930 #if HAVE_INLINE_ASM
01931
01932 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
01933 int linesize, int block_w, int block_h,
01934 int src_x, int src_y, int w, int h);
01935
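/* MPEG-4 global motion compensation with a bilinear filter. ox/oy are the
 * start offsets and dxx/dxy/dyx/dyy the per-pixel increments, in 16.16 fixed
 * point scaled by 1 << shift. Blocks whose offsets overflow that range, or
 * whose increments need more than 16 bits of subpel precision, fall back to
 * ff_gmc_c(). */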
01936 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
01937 int stride, int h, int ox, int oy,
01938 int dxx, int dxy, int dyx, int dyy,
01939 int shift, int r, int width, int height,
01940 emulated_edge_mc_func *emu_edge_fn)
01941 {
01942 const int w = 8;
01943 const int ix = ox >> (16 + shift);
01944 const int iy = oy >> (16 + shift);
01945 const int oxs = ox >> 4;
01946 const int oys = oy >> 4;
01947 const int dxxs = dxx >> 4;
01948 const int dxys = dxy >> 4;
01949 const int dyxs = dyx >> 4;
01950 const int dyys = dyy >> 4;
01951 const uint16_t r4[4] = { r, r, r, r };
01952 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
01953 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
01954 const uint64_t shift2 = 2 * shift;
01955 uint8_t edge_buf[(h + 1) * stride];
01956 int x, y;
01957
01958 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
01959 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
01960 const int dxh = dxy * (h - 1);
01961 const int dyw = dyx * (w - 1);
01962 if (/* non-constant fullpel offset (3% of blocks) */
01963 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
01964 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
01965 /* uses more than 16 bits of subpel mv (only at huge resolutions) */
01966 || (dxx | dxy | dyx | dyy) & 15) {
01967 /* FIXME: could still use MMX for 2x2 blocks */
01968 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
01969 shift, r, width, height);
01970 return;
01971 }
01972
01973 src += ix + iy * stride;
01974 if ((unsigned)ix >= width - w ||
01975 (unsigned)iy >= height - h) {
01976 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
01977 src = edge_buf;
01978 }
01979
01980 __asm__ volatile (
01981 "movd %0, %%mm6 \n\t"
01982 "pxor %%mm7, %%mm7 \n\t"
01983 "punpcklwd %%mm6, %%mm6 \n\t"
01984 "punpcklwd %%mm6, %%mm6 \n\t"
01985 :: "r"(1<<shift)
01986 );
01987
01988 for (x = 0; x < w; x += 4) {
01989 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
01990 oxs - dxys + dxxs * (x + 1),
01991 oxs - dxys + dxxs * (x + 2),
01992 oxs - dxys + dxxs * (x + 3) };
01993 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
01994 oys - dyys + dyxs * (x + 1),
01995 oys - dyys + dyxs * (x + 2),
01996 oys - dyys + dyxs * (x + 3) };
01997
01998 for (y = 0; y < h; y++) {
01999 __asm__ volatile (
02000 "movq %0, %%mm4 \n\t"
02001 "movq %1, %%mm5 \n\t"
02002 "paddw %2, %%mm4 \n\t"
02003 "paddw %3, %%mm5 \n\t"
02004 "movq %%mm4, %0 \n\t"
02005 "movq %%mm5, %1 \n\t"
02006 "psrlw $12, %%mm4 \n\t"
02007 "psrlw $12, %%mm5 \n\t"
02008 : "+m"(*dx4), "+m"(*dy4)
02009 : "m"(*dxy4), "m"(*dyy4)
02010 );
02011
02012 __asm__ volatile (
02013 "movq %%mm6, %%mm2 \n\t"
02014 "movq %%mm6, %%mm1 \n\t"
02015 "psubw %%mm4, %%mm2 \n\t"
02016 "psubw %%mm5, %%mm1 \n\t"
02017 "movq %%mm2, %%mm0 \n\t"
02018 "movq %%mm4, %%mm3 \n\t"
02019 "pmullw %%mm1, %%mm0 \n\t"
02020 "pmullw %%mm5, %%mm3 \n\t"
02021 "pmullw %%mm5, %%mm2 \n\t"
02022 "pmullw %%mm4, %%mm1 \n\t"
02023
02024 "movd %4, %%mm5 \n\t"
02025 "movd %3, %%mm4 \n\t"
02026 "punpcklbw %%mm7, %%mm5 \n\t"
02027 "punpcklbw %%mm7, %%mm4 \n\t"
02028 "pmullw %%mm5, %%mm3 \n\t"
02029 "pmullw %%mm4, %%mm2 \n\t"
02030
02031 "movd %2, %%mm5 \n\t"
02032 "movd %1, %%mm4 \n\t"
02033 "punpcklbw %%mm7, %%mm5 \n\t"
02034 "punpcklbw %%mm7, %%mm4 \n\t"
02035 "pmullw %%mm5, %%mm1 \n\t"
02036 "pmullw %%mm4, %%mm0 \n\t"
02037 "paddw %5, %%mm1 \n\t"
02038 "paddw %%mm3, %%mm2 \n\t"
02039 "paddw %%mm1, %%mm0 \n\t"
02040 "paddw %%mm2, %%mm0 \n\t"
02041
02042 "psrlw %6, %%mm0 \n\t"
02043 "packuswb %%mm0, %%mm0 \n\t"
02044 "movd %%mm0, %0 \n\t"
02045
02046 : "=m"(dst[x + y * stride])
02047 : "m"(src[0]), "m"(src[1]),
02048 "m"(src[stride]), "m"(src[stride + 1]),
02049 "m"(*r4), "m"(shift2)
02050 );
02051 src += stride;
02052 }
02053 src += 4 - h * stride;
02054 }
02055 }
02056
02057 #if HAVE_YASM
02058 #if ARCH_X86_32
02059 static void gmc_mmx(uint8_t *dst, uint8_t *src,
02060 int stride, int h, int ox, int oy,
02061 int dxx, int dxy, int dyx, int dyy,
02062 int shift, int r, int width, int height)
02063 {
02064 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
02065 width, height, &emulated_edge_mc_mmx);
02066 }
02067 #endif /* ARCH_X86_32 */
02068 static void gmc_sse(uint8_t *dst, uint8_t *src,
02069 int stride, int h, int ox, int oy,
02070 int dxx, int dxy, int dyx, int dyy,
02071 int shift, int r, int width, int height)
02072 {
02073 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
02074 width, height, &emulated_edge_mc_sse);
02075 }
02076 #else
02077 static void gmc_mmx(uint8_t *dst, uint8_t *src,
02078 int stride, int h, int ox, int oy,
02079 int dxx, int dxy, int dyx, int dyy,
02080 int shift, int r, int width, int height)
02081 {
02082 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
02083 width, height, &ff_emulated_edge_mc_8);
02084 }
02085 #endif /* HAVE_YASM */
02086
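/* Generate one software-prefetch helper per CPU flavour: prefetcht0 on
 * MMXEXT, the 3DNow! prefetch instruction otherwise. Touches one address
 * per row for h rows. */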
02087 #define PREFETCH(name, op) \
02088 static void name(void *mem, int stride, int h) \
02089 { \
02090 const uint8_t *p = mem; \
02091 do { \
02092 __asm__ volatile (#op" %0" :: "m"(*p)); \
02093 p += stride; \
02094 } while (--h); \
02095 }
02096
02097 PREFETCH(prefetch_mmx2, prefetcht0)
02098 PREFETCH(prefetch_3dnow, prefetch)
02099 #undef PREFETCH
02100
02101 #endif /* HAVE_INLINE_ASM */
02102
02103 #include "h264_qpel.c"
02104
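/* The H.264 luma qpel bodies are textually included just above; what follows
 * are prototypes for the chroma motion-compensation routines implemented in
 * yasm (the _rnd variants differ only in rounding behaviour). */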
02105 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
02106 int stride, int h, int x, int y);
02107 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
02108 int stride, int h, int x, int y);
02109 void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
02110 int stride, int h, int x, int y);
02111
02112 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
02113 int stride, int h, int x, int y);
02114 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
02115 int stride, int h, int x, int y);
02116 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
02117 int stride, int h, int x, int y);
02118
02119 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
02120 int stride, int h, int x, int y);
02121 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
02122 int stride, int h, int x, int y);
02123
02124 void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
02125 int stride, int h, int x, int y);
02126 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
02127 int stride, int h, int x, int y);
02128
02129 void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
02130 int stride, int h, int x, int y);
02131 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
02132 int stride, int h, int x, int y);
02133
02134 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
02135 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
02136 (uint8_t *dst, uint8_t *src, \
02137 int stride, int h, int x, int y);
02138
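/* For example, CHROMA_MC(put, 2, 10, mmx2) declares
 * ff_put_h264_chroma_mc2_10_mmx2(). */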
02139 CHROMA_MC(put, 2, 10, mmx2)
02140 CHROMA_MC(avg, 2, 10, mmx2)
02141 CHROMA_MC(put, 4, 10, mmx2)
02142 CHROMA_MC(avg, 4, 10, mmx2)
02143 CHROMA_MC(put, 8, 10, sse2)
02144 CHROMA_MC(avg, 8, 10, sse2)
02145 CHROMA_MC(put, 8, 10, avx)
02146 CHROMA_MC(avg, 8, 10, avx)
02147
02148 #if HAVE_INLINE_ASM
02149
02150
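/* CAVS-specific: the (0,0) qpel position is a plain block copy/average. */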
02151 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02152 {
02153 put_pixels8_mmx(dst, src, stride, 8);
02154 }
02155
02156 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02157 {
02158 avg_pixels8_mmx(dst, src, stride, 8);
02159 }
02160
02161 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02162 {
02163 put_pixels16_mmx(dst, src, stride, 16);
02164 }
02165
02166 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02167 {
02168 avg_pixels16_mmx(dst, src, stride, 16);
02169 }
02170
02171
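/* VC-1-specific: likewise, the (0,0) mspel position is a plain
 * copy/average. */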
02172 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
02173 int stride, int rnd)
02174 {
02175 put_pixels8_mmx(dst, src, stride, 8);
02176 }
02177
02178 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
02179 int stride, int rnd)
02180 {
02181 avg_pixels8_mmx2(dst, src, stride, 8);
02182 }
02183
02184
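/* No-rounding average of two 8-pixel-wide sources (exposed through
 * put_no_rnd_pixels_l2 below); processes four rows per iteration, so h must
 * be a multiple of 4. */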
02185 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
02186 {
02187
02188 MOVQ_BFE(mm6);
02189 __asm__ volatile(
02190 "1: \n\t"
02191 "movq (%1), %%mm0 \n\t"
02192 "movq (%2), %%mm1 \n\t"
02193 "movq (%1,%4), %%mm2 \n\t"
02194 "movq (%2,%4), %%mm3 \n\t"
02195 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
02196 "movq %%mm4, (%3) \n\t"
02197 "movq %%mm5, (%3,%4) \n\t"
02198
02199 "movq (%1,%4,2), %%mm0 \n\t"
02200 "movq (%2,%4,2), %%mm1 \n\t"
02201 "movq (%1,%5), %%mm2 \n\t"
02202 "movq (%2,%5), %%mm3 \n\t"
02203 "lea (%1,%4,4), %1 \n\t"
02204 "lea (%2,%4,4), %2 \n\t"
02205 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
02206 "movq %%mm4, (%3,%4,2) \n\t"
02207 "movq %%mm5, (%3,%5) \n\t"
02208 "lea (%3,%4,4), %3 \n\t"
02209 "subl $4, %0 \n\t"
02210 "jnz 1b \n\t"
02211 :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
02212 :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
02213 :"memory");
02214
02215 }
02216 static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
02217 {
02218 put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
02219 put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
02220 }
02221
02222 #if CONFIG_DIRAC_DECODER
02223 #define DIRAC_PIXOP(OPNAME, EXT)\
02224 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
02225 {\
02226 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
02227 }\
02228 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
02229 {\
02230 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
02231 }\
02232 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
02233 {\
02234 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
02235 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
02236 }
02237
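/* DIRAC_PIXOP(put, mmx), for instance, emits ff_put_dirac_pixels8_mmx() and
 * friends, which forward to the plain hpel copies (only src[0] is used). */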
02238 DIRAC_PIXOP(put, mmx)
02239 DIRAC_PIXOP(avg, mmx)
02240 DIRAC_PIXOP(avg, mmx2)
02241
02242 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02243 {
02244 put_pixels16_sse2(dst, src[0], stride, h);
02245 }
02246 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02247 {
02248 avg_pixels16_sse2(dst, src[0], stride, h);
02249 }
02250 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02251 {
02252 put_pixels16_sse2(dst , src[0] , stride, h);
02253 put_pixels16_sse2(dst+16, src[0]+16, stride, h);
02254 }
02255 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02256 {
02257 avg_pixels16_sse2(dst , src[0] , stride, h);
02258 avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
02259 }
02260 #endif /* CONFIG_DIRAC_DECODER */
02261
02262
02263
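/* Wrappers binding the GPL libmpeg2 MMX/MMXEXT IDCTs to the generic
 * idct_put/idct_add interface. */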
02264 #if CONFIG_GPL
02265 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
02266 DCTELEM *block)
02267 {
02268 ff_mmx_idct(block);
02269 ff_put_pixels_clamped_mmx(block, dest, line_size);
02270 }
02271
02272 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
02273 DCTELEM *block)
02274 {
02275 ff_mmx_idct(block);
02276 ff_add_pixels_clamped_mmx(block, dest, line_size);
02277 }
02278
02279 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
02280 DCTELEM *block)
02281 {
02282 ff_mmxext_idct(block);
02283 ff_put_pixels_clamped_mmx(block, dest, line_size);
02284 }
02285
02286 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
02287 DCTELEM *block)
02288 {
02289 ff_mmxext_idct(block);
02290 ff_add_pixels_clamped_mmx(block, dest, line_size);
02291 }
02292 #endif /* CONFIG_GPL */
02293
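/* Vorbis square-polar coupling: rebuilds the two channels from
 * magnitude/angle pairs, replacing the branches of the C reference with
 * sign masks (pfcmpge produces an all-ones mask per element). */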
02294 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
02295 {
02296 int i;
02297 __asm__ volatile ("pxor %%mm7, %%mm7":);
02298 for (i = 0; i < blocksize; i += 2) {
02299 __asm__ volatile (
02300 "movq %0, %%mm0 \n\t"
02301 "movq %1, %%mm1 \n\t"
02302 "movq %%mm0, %%mm2 \n\t"
02303 "movq %%mm1, %%mm3 \n\t"
02304 "pfcmpge %%mm7, %%mm2 \n\t"
02305 "pfcmpge %%mm7, %%mm3 \n\t"
02306 "pslld $31, %%mm2 \n\t"
02307 "pxor %%mm2, %%mm1 \n\t"
02308 "movq %%mm3, %%mm4 \n\t"
02309 "pand %%mm1, %%mm3 \n\t"
02310 "pandn %%mm1, %%mm4 \n\t"
02311 "pfadd %%mm0, %%mm3 \n\t"
02312 "pfsub %%mm4, %%mm0 \n\t"
02313 "movq %%mm3, %1 \n\t"
02314 "movq %%mm0, %0 \n\t"
02315 : "+m"(mag[i]), "+m"(ang[i])
02316 :: "memory"
02317 );
02318 }
02319 __asm__ volatile ("femms");
02320 }
02321
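/* The same coupling, four floats per iteration with SSE. */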
02322 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02323 {
02324 int i;
02325
02326 __asm__ volatile (
02327 "movaps %0, %%xmm5 \n\t"
02328 :: "m"(ff_pdw_80000000[0])
02329 );
02330 for (i = 0; i < blocksize; i += 4) {
02331 __asm__ volatile (
02332 "movaps %0, %%xmm0 \n\t"
02333 "movaps %1, %%xmm1 \n\t"
02334 "xorps %%xmm2, %%xmm2 \n\t"
02335 "xorps %%xmm3, %%xmm3 \n\t"
02336 "cmpleps %%xmm0, %%xmm2 \n\t"
02337 "cmpleps %%xmm1, %%xmm3 \n\t"
02338 "andps %%xmm5, %%xmm2 \n\t"
02339 "xorps %%xmm2, %%xmm1 \n\t"
02340 "movaps %%xmm3, %%xmm4 \n\t"
02341 "andps %%xmm1, %%xmm3 \n\t"
02342 "andnps %%xmm1, %%xmm4 \n\t"
02343 "addps %%xmm0, %%xmm3 \n\t"
02344 "subps %%xmm4, %%xmm0 \n\t"
02345 "movaps %%xmm3, %1 \n\t"
02346 "movaps %%xmm0, %0 \n\t"
02347 : "+m"(mag[i]), "+m"(ang[i])
02348 :: "memory"
02349 );
02350 }
02351 }
02352
02353 #if HAVE_6REGS
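/* Overlap-add windowing. A scalar sketch of what the two versions below
 * compute (mirroring the generic C reference; len is the half-window
 * length):
 *
 *     dst += len; win += len; src0 += len;
 *     for (i = -len, j = len - 1; i < 0; i++, j--) {
 *         dst[i] = src0[i] * win[j] - src1[j] * win[i];
 *         dst[j] = src0[i] * win[i] + src1[j] * win[j];
 *     }
 */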
02354 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
02355 const float *src1, const float *win,
02356 int len)
02357 {
02358 x86_reg i = -len * 4;
02359 x86_reg j = len * 4 - 8;
02360 __asm__ volatile (
02361 "1: \n"
02362 "pswapd (%5, %1), %%mm1 \n"
02363 "movq (%5, %0), %%mm0 \n"
02364 "pswapd (%4, %1), %%mm5 \n"
02365 "movq (%3, %0), %%mm4 \n"
02366 "movq %%mm0, %%mm2 \n"
02367 "movq %%mm1, %%mm3 \n"
02368 "pfmul %%mm4, %%mm2 \n"
02369 "pfmul %%mm5, %%mm3 \n"
02370 "pfmul %%mm4, %%mm1 \n"
02371 "pfmul %%mm5, %%mm0 \n"
02372 "pfadd %%mm3, %%mm2 \n"
02373 "pfsub %%mm0, %%mm1 \n"
02374 "pswapd %%mm2, %%mm2 \n"
02375 "movq %%mm1, (%2, %0) \n"
02376 "movq %%mm2, (%2, %1) \n"
02377 "sub $8, %1 \n"
02378 "add $8, %0 \n"
02379 "jl 1b \n"
02380 "femms \n"
02381 : "+r"(i), "+r"(j)
02382 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
02383 );
02384 }
02385
02386 static void vector_fmul_window_sse(float *dst, const float *src0,
02387 const float *src1, const float *win, int len)
02388 {
02389 x86_reg i = -len * 4;
02390 x86_reg j = len * 4 - 16;
02391 __asm__ volatile (
02392 "1: \n"
02393 "movaps (%5, %1), %%xmm1 \n"
02394 "movaps (%5, %0), %%xmm0 \n"
02395 "movaps (%4, %1), %%xmm5 \n"
02396 "movaps (%3, %0), %%xmm4 \n"
02397 "shufps $0x1b, %%xmm1, %%xmm1 \n"
02398 "shufps $0x1b, %%xmm5, %%xmm5 \n"
02399 "movaps %%xmm0, %%xmm2 \n"
02400 "movaps %%xmm1, %%xmm3 \n"
02401 "mulps %%xmm4, %%xmm2 \n"
02402 "mulps %%xmm5, %%xmm3 \n"
02403 "mulps %%xmm4, %%xmm1 \n"
02404 "mulps %%xmm5, %%xmm0 \n"
02405 "addps %%xmm3, %%xmm2 \n"
02406 "subps %%xmm0, %%xmm1 \n"
02407 "shufps $0x1b, %%xmm2, %%xmm2 \n"
02408 "movaps %%xmm1, (%2, %0) \n"
02409 "movaps %%xmm2, (%2, %1) \n"
02410 "sub $16, %1 \n"
02411 "add $16, %0 \n"
02412 "jl 1b \n"
02413 : "+r"(i), "+r"(j)
02414 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
02415 );
02416 }
02417 #endif /* HAVE_6REGS */
02418
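/* Clamp every float in src to [min, max], 16 floats per iteration; assumes
 * len is a multiple of 16 and both buffers are 16-byte aligned (movaps). */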
02419 static void vector_clipf_sse(float *dst, const float *src,
02420 float min, float max, int len)
02421 {
02422 x86_reg i = (len - 16) * 4;
02423 __asm__ volatile (
02424 "movss %3, %%xmm4 \n\t"
02425 "movss %4, %%xmm5 \n\t"
02426 "shufps $0, %%xmm4, %%xmm4 \n\t"
02427 "shufps $0, %%xmm5, %%xmm5 \n\t"
02428 "1: \n\t"
02429 "movaps (%2, %0), %%xmm0 \n\t"
02430 "movaps 16(%2, %0), %%xmm1 \n\t"
02431 "movaps 32(%2, %0), %%xmm2 \n\t"
02432 "movaps 48(%2, %0), %%xmm3 \n\t"
02433 "maxps %%xmm4, %%xmm0 \n\t"
02434 "maxps %%xmm4, %%xmm1 \n\t"
02435 "maxps %%xmm4, %%xmm2 \n\t"
02436 "maxps %%xmm4, %%xmm3 \n\t"
02437 "minps %%xmm5, %%xmm0 \n\t"
02438 "minps %%xmm5, %%xmm1 \n\t"
02439 "minps %%xmm5, %%xmm2 \n\t"
02440 "minps %%xmm5, %%xmm3 \n\t"
02441 "movaps %%xmm0, (%1, %0) \n\t"
02442 "movaps %%xmm1, 16(%1, %0) \n\t"
02443 "movaps %%xmm2, 32(%1, %0) \n\t"
02444 "movaps %%xmm3, 48(%1, %0) \n\t"
02445 "sub $64, %0 \n\t"
02446 "jge 1b \n\t"
02447 : "+&r"(i)
02448 : "r"(dst), "r"(src), "m"(min), "m"(max)
02449 : "memory"
02450 );
02451 }
02452
02453 #endif /* HAVE_INLINE_ASM */
02454
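/* Prototypes for the integer and float vector routines implemented in
 * yasm. */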
02455 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
02456 int order);
02457 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
02458 int order);
02459 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
02460 const int16_t *v3,
02461 int order, int mul);
02462 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
02463 const int16_t *v3,
02464 int order, int mul);
02465 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
02466 const int16_t *v3,
02467 int order, int mul);
02468
02469 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
02470 const int16_t *window, unsigned int len);
02471 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
02472 const int16_t *window, unsigned int len);
02473 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
02474 const int16_t *window, unsigned int len);
02475 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
02476 const int16_t *window, unsigned int len);
02477 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
02478 const int16_t *window, unsigned int len);
02479 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
02480 const int16_t *window, unsigned int len);
02481
02482 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
02483 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
02484
02485 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
02486 const uint8_t *diff, int w,
02487 int *left, int *left_top);
02488 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
02489 int w, int left);
02490 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
02491 int w, int left);
02492
02493 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
02494
02495 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
02496 const float *src1, int len);
02497 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
02498 const float *src1, int len);
02499
02500 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
02501 const float *src2, int len);
02502 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
02503 const float *src2, int len);
02504
02505 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
02506 int32_t min, int32_t max, unsigned int len);
02507 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
02508 int32_t min, int32_t max, unsigned int len);
02509 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
02510 int32_t min, int32_t max, unsigned int len);
02511 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
02512 int32_t min, int32_t max, unsigned int len);
02513
02514 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
02515 const float *src1, int len);
02516 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
02517 const float *src1, int len);
02518
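/* Fill one 16-entry qpel function table. For example,
 * SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ) assigns
 * c->put_qpel_pixels_tab[0][1] = put_qpel16_mc10_mmx2 and so on for all 16
 * (x, y) subpel positions. */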
02519 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
02520 do { \
02521 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
02522 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
02523 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
02524 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
02525 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
02526 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
02527 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
02528 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
02529 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
02530 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
02531 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
02532 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
02533 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
02534 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
02535 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
02536 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
02537 } while (0)
02538
02539 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02540 do { \
02541 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
02542 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
02543 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
02544 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
02545 } while (0)
02546
02547 #define H264_QPEL_FUNCS(x, y, CPU) \
02548 do { \
02549 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
02550 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
02551 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
02552 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
02553 } while (0)
02554
02555 #define H264_QPEL_FUNCS_10(x, y, CPU) \
02556 do { \
02557 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
02558 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
02559 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
02560 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
02561 } while (0)
02562
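/* Per-feature init helpers: each installs only the pointers for the CPU
 * feature it is named after. ff_dsputil_init_mmx() at the bottom calls them
 * in increasing order of capability, so later sets override earlier ones. */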
02563 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02564 {
02565 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02566
02567 #if HAVE_INLINE_ASM
02568 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
02569 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
02570 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
02571
02572 if (!high_bit_depth) {
02573 c->clear_block = clear_block_mmx;
02574 c->clear_blocks = clear_blocks_mmx;
02575 c->draw_edges = draw_edges_mmx;
02576
02577 SET_HPEL_FUNCS(put, 0, 16, mmx);
02578 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
02579 SET_HPEL_FUNCS(avg, 0, 16, mmx);
02580 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
02581 SET_HPEL_FUNCS(put, 1, 8, mmx);
02582 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
02583 SET_HPEL_FUNCS(avg, 1, 8, mmx);
02584 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
02585 }
02586
02587 #if ARCH_X86_32 || !HAVE_YASM
02588 c->gmc = gmc_mmx;
02589 #endif
02590
02591 c->add_bytes = add_bytes_mmx;
02592
02593 c->put_no_rnd_pixels_l2[0] = put_vp_no_rnd_pixels16_l2_mmx;
02594 c->put_no_rnd_pixels_l2[1] = put_vp_no_rnd_pixels8_l2_mmx;
02595
02596 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
02597 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
02598 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
02599 }
02600 #endif /* HAVE_INLINE_ASM */
02601
02602 #if HAVE_YASM
02603 #if ARCH_X86_32
02604 if (!high_bit_depth)
02605 c->emulated_edge_mc = emulated_edge_mc_mmx;
02606 #endif
02607
02608 if (!high_bit_depth && CONFIG_H264CHROMA) {
02609 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
02610 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
02611 }
02612
02613 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
02614 #endif /* HAVE_YASM */
02615
02616 }
02617
02618 static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
02619 int mm_flags)
02620 {
02621 const int bit_depth = avctx->bits_per_raw_sample;
02622 const int high_bit_depth = bit_depth > 8;
02623
02624 #if HAVE_INLINE_ASM
02625 c->prefetch = prefetch_mmx2;
02626
02627 if (!high_bit_depth) {
02628 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
02629 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
02630
02631 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
02632 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
02633 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
02634
02635 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
02636 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
02637
02638 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
02639 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
02640 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
02641 }
02642
02643 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
02644 if (!high_bit_depth) {
02645 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
02646 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
02647 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
02648 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
02649
02650 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
02651 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
02652 }
02653 }
02654
02655 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
02656 avctx->codec_id == AV_CODEC_ID_THEORA)) {
02657 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
02658 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
02659 }
02660 #endif /* HAVE_INLINE_ASM */
02661
02662 if (CONFIG_H264QPEL) {
02663 #if HAVE_INLINE_ASM
02664 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
02665 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
02666 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
02667 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
02668 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
02669 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
02670 #endif
02671
02672 if (!high_bit_depth) {
02673 #if HAVE_INLINE_ASM
02674 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
02675 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
02676 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
02677 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
02678 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
02679 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
02680 #endif
02681 } else if (bit_depth == 10) {
02682 #if HAVE_YASM
02683 #if !ARCH_X86_64
02684 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
02685 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
02686 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
02687 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
02688 #endif /* !ARCH_X86_64 */
02689 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
02690 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
02691 #endif /* HAVE_YASM */
02692 }
02693
02694 #if HAVE_INLINE_ASM
02695 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
02696 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
02697 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
02698 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
02699 #endif
02700 }
02701
02702 #if HAVE_YASM
02703 if (!high_bit_depth && CONFIG_H264CHROMA) {
02704 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
02705 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
02706 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
02707 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
02708 }
02709 if (bit_depth == 10 && CONFIG_H264CHROMA) {
02710 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
02711 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
02712 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
02713 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
02714 }
02715
02716
02717 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
02718 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
02719
02720 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
02721 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
02722
02723 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02724 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
02725 } else {
02726 c->apply_window_int16 = ff_apply_window_int16_mmxext;
02727 }
02728 #endif /* HAVE_YASM */
02729 }
02730
02731 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
02732 int mm_flags)
02733 {
02734 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02735
02736 #if HAVE_INLINE_ASM
02737 c->prefetch = prefetch_3dnow;
02738
02739 if (!high_bit_depth) {
02740 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02741 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02742
02743 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02744 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02745 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02746
02747 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02748 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02749
02750 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02751 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02752 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02753
02754 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
02755 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02756 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02757 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02758 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02759
02760 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02761 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02762 }
02763 }
02764
02765 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
02766 avctx->codec_id == AV_CODEC_ID_THEORA)) {
02767 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
02768 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
02769 }
02770
02771 if (CONFIG_H264QPEL) {
02772 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
02773 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
02774 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
02775 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
02776 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
02777 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
02778
02779 if (!high_bit_depth) {
02780 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
02781 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
02782 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
02783 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
02784 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
02785 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
02786 }
02787
02788 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
02789 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
02790 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
02791 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
02792 }
02793
02794 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
02795 #endif /* HAVE_INLINE_ASM */
02796
02797 #if HAVE_YASM
02798 if (!high_bit_depth && CONFIG_H264CHROMA) {
02799 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
02800 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
02801 }
02802 #endif
02803 }
02804
02805 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
02806 int mm_flags)
02807 {
02808 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
02809 c->vector_fmul_window = vector_fmul_window_3dnowext;
02810 #endif
02811 }
02812
02813 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02814 {
02815 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02816
02817 #if HAVE_INLINE_ASM
02818 if (!high_bit_depth) {
02819 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
02820
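/* XvMC blocks are allocated externally and may not be 16-byte aligned, so
 * keep the MMX clear routines in that case. */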
02821 c->clear_block = clear_block_sse;
02822 c->clear_blocks = clear_blocks_sse;
02823 }
02824 }
02825
02826 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
02827
02828 #if HAVE_6REGS
02829 c->vector_fmul_window = vector_fmul_window_sse;
02830 #endif
02831
02832 c->vector_clipf = vector_clipf_sse;
02833 #endif /* HAVE_INLINE_ASM */
02834
02835 #if HAVE_YASM
02836 c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
02837 c->vector_fmul_add = ff_vector_fmul_add_sse;
02838
02839 c->scalarproduct_float = ff_scalarproduct_float_sse;
02840 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
02841
02842 if (!high_bit_depth)
02843 c->emulated_edge_mc = emulated_edge_mc_sse;
02844 #if HAVE_INLINE_ASM
02845 c->gmc = gmc_sse;
02846 #endif /* HAVE_INLINE_ASM */
02847 #endif /* HAVE_YASM */
02848 }
02849
02850 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
02851 int mm_flags)
02852 {
02853 const int bit_depth = avctx->bits_per_raw_sample;
02854
02855 #if HAVE_INLINE_ASM
02856 const int high_bit_depth = bit_depth > 8;
02857
02858 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02859
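/* On SSE2SLOW CPUs (e.g. first-generation Athlon 64) SSE2 instructions are
 * executed in 64-bit halves, so the MMX(EXT) versions stay faster there. */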
02860 if (!high_bit_depth) {
02861 c->put_pixels_tab[0][0] = put_pixels16_sse2;
02862 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
02863 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
02864 if (CONFIG_H264QPEL)
02865 H264_QPEL_FUNCS(0, 0, sse2);
02866 }
02867 }
02868
02869 if (!high_bit_depth && CONFIG_H264QPEL) {
02870 H264_QPEL_FUNCS(0, 1, sse2);
02871 H264_QPEL_FUNCS(0, 2, sse2);
02872 H264_QPEL_FUNCS(0, 3, sse2);
02873 H264_QPEL_FUNCS(1, 1, sse2);
02874 H264_QPEL_FUNCS(1, 2, sse2);
02875 H264_QPEL_FUNCS(1, 3, sse2);
02876 H264_QPEL_FUNCS(2, 1, sse2);
02877 H264_QPEL_FUNCS(2, 2, sse2);
02878 H264_QPEL_FUNCS(2, 3, sse2);
02879 H264_QPEL_FUNCS(3, 1, sse2);
02880 H264_QPEL_FUNCS(3, 2, sse2);
02881 H264_QPEL_FUNCS(3, 3, sse2);
02882 }
02883 #endif /* HAVE_INLINE_ASM */
02884
02885 #if HAVE_YASM
02886 if (bit_depth == 10) {
02887 if (CONFIG_H264QPEL) {
02888 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
02889 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
02890 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
02891 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
02892 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
02893 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
02894 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
02895 }
02896 if (CONFIG_H264CHROMA) {
02897 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
02898 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
02899 }
02900 }
02901
02902 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
02903 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
02904 if (mm_flags & AV_CPU_FLAG_ATOM) {
02905 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
02906 } else {
02907 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
02908 }
02909 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02910 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
02911 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02912 c->apply_window_int16 = ff_apply_window_int16_sse2;
02913 }
02914 c->bswap_buf = ff_bswap32_buf_sse2;
02915 #endif /* HAVE_YASM */
02916 }
02917
02918 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
02919 int mm_flags)
02920 {
02921 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02922 const int bit_depth = avctx->bits_per_raw_sample;
02923
02924 #if HAVE_SSSE3_INLINE
02925 if (!high_bit_depth && CONFIG_H264QPEL) {
02926 H264_QPEL_FUNCS(1, 0, ssse3);
02927 H264_QPEL_FUNCS(1, 1, ssse3);
02928 H264_QPEL_FUNCS(1, 2, ssse3);
02929 H264_QPEL_FUNCS(1, 3, ssse3);
02930 H264_QPEL_FUNCS(2, 0, ssse3);
02931 H264_QPEL_FUNCS(2, 1, ssse3);
02932 H264_QPEL_FUNCS(2, 2, ssse3);
02933 H264_QPEL_FUNCS(2, 3, ssse3);
02934 H264_QPEL_FUNCS(3, 0, ssse3);
02935 H264_QPEL_FUNCS(3, 1, ssse3);
02936 H264_QPEL_FUNCS(3, 2, ssse3);
02937 H264_QPEL_FUNCS(3, 3, ssse3);
02938 }
02939 #endif /* HAVE_SSSE3_INLINE */
02940
02941 #if HAVE_SSSE3_EXTERNAL
02942 if (bit_depth == 10 && CONFIG_H264QPEL) {
02943 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
02944 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
02945 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
02946 }
02947 if (!high_bit_depth && CONFIG_H264CHROMA) {
02948 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
02949 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
02950 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
02951 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
02952 }
02953 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
02954 if (mm_flags & AV_CPU_FLAG_SSE4)
02955 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
02956
02957 if (mm_flags & AV_CPU_FLAG_ATOM)
02958 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
02959 else
02960 c->apply_window_int16 = ff_apply_window_int16_ssse3;
02961 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)))
02962 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
02963 c->bswap_buf = ff_bswap32_buf_ssse3;
02964 #endif /* HAVE_SSSE3_EXTERNAL */
02965 }
02966
02967 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
02968 int mm_flags)
02969 {
02970 #if HAVE_SSE4_EXTERNAL
02971 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
02972 #endif
02973 }
02974
02975 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02976 {
02977 #if HAVE_AVX_EXTERNAL
02978 const int bit_depth = avctx->bits_per_raw_sample;
02979
02980 if (bit_depth == 10) {
02981
02982
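/* No dedicated AVX qpel yet; with AVX present the cache64 workarounds are
 * unnecessary, so reinstall the plain sse2 versions. */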
02983 if (CONFIG_H264QPEL) {
02984 H264_QPEL_FUNCS_10(1, 0, sse2);
02985 H264_QPEL_FUNCS_10(2, 0, sse2);
02986 H264_QPEL_FUNCS_10(3, 0, sse2);
02987 }
02988
02989 if (CONFIG_H264CHROMA) {
02990 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
02991 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
02992 }
02993 }
02994 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
02995 c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
02996 c->vector_fmul_add = ff_vector_fmul_add_avx;
02997 #endif /* HAVE_AVX_EXTERNAL */
02998 }
02999
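/* Entry point, invoked from the generic dsputil init: selects the IDCT and
 * then layers the per-feature initialisations according to the CPU flags. */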
03000 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
03001 {
03002 int mm_flags = av_get_cpu_flags();
03003
03004 #if HAVE_7REGS && HAVE_INLINE_ASM
03005 if (mm_flags & AV_CPU_FLAG_CMOV)
03006 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
03007 #endif
03008
03009 if (mm_flags & AV_CPU_FLAG_MMX) {
03010 #if HAVE_INLINE_ASM
03011 const int idct_algo = avctx->idct_algo;
03012
03013 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
03014 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
03015 c->idct_put = ff_simple_idct_put_mmx;
03016 c->idct_add = ff_simple_idct_add_mmx;
03017 c->idct = ff_simple_idct_mmx;
03018 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
03019 #if CONFIG_GPL
03020 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
03021 if (mm_flags & AV_CPU_FLAG_MMXEXT) {
03022 c->idct_put = ff_libmpeg2mmx2_idct_put;
03023 c->idct_add = ff_libmpeg2mmx2_idct_add;
03024 c->idct = ff_mmxext_idct;
03025 } else {
03026 c->idct_put = ff_libmpeg2mmx_idct_put;
03027 c->idct_add = ff_libmpeg2mmx_idct_add;
03028 c->idct = ff_mmx_idct;
03029 }
03030 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
03031 #endif /* CONFIG_GPL */
03032 } else if (idct_algo == FF_IDCT_XVIDMMX) {
03033 if (mm_flags & AV_CPU_FLAG_SSE2) {
03034 c->idct_put = ff_idct_xvid_sse2_put;
03035 c->idct_add = ff_idct_xvid_sse2_add;
03036 c->idct = ff_idct_xvid_sse2;
03037 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
03038 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
03039 c->idct_put = ff_idct_xvid_mmx2_put;
03040 c->idct_add = ff_idct_xvid_mmx2_add;
03041 c->idct = ff_idct_xvid_mmx2;
03042 } else {
03043 c->idct_put = ff_idct_xvid_mmx_put;
03044 c->idct_add = ff_idct_xvid_mmx_add;
03045 c->idct = ff_idct_xvid_mmx;
03046 }
03047 }
03048 }
03049 #endif /* HAVE_INLINE_ASM */
03050
03051 dsputil_init_mmx(c, avctx, mm_flags);
03052 }
03053
03054 if (mm_flags & AV_CPU_FLAG_MMXEXT)
03055 dsputil_init_mmx2(c, avctx, mm_flags);
03056
03057 if (mm_flags & AV_CPU_FLAG_3DNOW)
03058 dsputil_init_3dnow(c, avctx, mm_flags);
03059
03060 if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
03061 dsputil_init_3dnowext(c, avctx, mm_flags);
03062
03063 if (mm_flags & AV_CPU_FLAG_SSE)
03064 dsputil_init_sse(c, avctx, mm_flags);
03065
03066 if (mm_flags & AV_CPU_FLAG_SSE2)
03067 dsputil_init_sse2(c, avctx, mm_flags);
03068
03069 if (mm_flags & AV_CPU_FLAG_SSSE3)
03070 dsputil_init_ssse3(c, avctx, mm_flags);
03071
03072 if (mm_flags & AV_CPU_FLAG_SSE4)
03073 dsputil_init_sse4(c, avctx, mm_flags);
03074
03075 if (mm_flags & AV_CPU_FLAG_AVX)
03076 dsputil_init_avx(c, avctx, mm_flags);
03077
03078 if (CONFIG_ENCODERS)
03079 ff_dsputilenc_init_mmx(c, avctx);
03080 }