00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/avassert.h"
00026 #include "libavutil/mem.h"
00027 #include "libavutil/x86/asm.h"
00028 #include "libavcodec/dsputil.h"
00029 #include "dsputil_mmx.h"
00030
00031 #if HAVE_INLINE_ASM
00032
/* Rounding constants for the plain-MMX averaging kernels.  Entry n holds the
 * value n replicated in each of its four 16-bit words (n = 0, 1, 2).
 * Entry 1 is loaded into mm5 by the x2/y2 wrappers generated by PIX_SAD;
 * entry 2 is fetched at byte offset 16 inside sad8_4_mmx.
 * NOTE(review): the leading 8 is presumably the alignment in bytes -- confirm
 * against the DECLARE_ASM_CONST definition. */
00033 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
00034 0x0000000000000000ULL,
00035 0x0001000100010001ULL,
00036 0x0002000200020002ULL,
00037 };
00038
/* The value 1 in each of the eight bytes.  sad8_4_mmxext loads it into mm5
 * and subtracts it (unsigned saturating) from one of its averaged rows. */
00039 DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
00040
/**
 * Add the sum of absolute differences of an 8-byte-wide block to mm6
 * (plain MMX version).
 *
 * Register contract (set up by the callers generated by PIX_SAD):
 *   mm7 must be zero (used to widen bytes to 16-bit words),
 *   mm6 is the running accumulator of four 16-bit partial sums.
 * mm0-mm5 are overwritten; the asm statement declares no clobber list.
 *
 * @param blk1   first block
 * @param blk2   second block
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; two rows are consumed per loop pass, so an
 *               odd h would make the last pass read one extra row
 */
00041 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00042 {
/* len counts up from -(stride*h) towards zero; the pointers passed to the asm
 * are biased by -len so that pointer+len addresses the current row and the
 * sign flag of len doubles as the loop condition. */
00043 x86_reg len= -(x86_reg)stride*h;
00044 __asm__ volatile(
00045 ".p2align 4 \n\t"
00046 "1: \n\t"
/* row A: mm0 = blk1 row, mm2 = mm4 = blk2 row */
00047 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00048 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00049 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00050 "add %3, %%"REG_a" \n\t"
/* the two saturating differences are OR-ed further down to form |blk1 - blk2| */
00051 "psubusb %%mm0, %%mm2 \n\t"
00052 "psubusb %%mm4, %%mm0 \n\t"
/* row B: mm1 = blk1 row, mm3 = mm5 = blk2 row */
00053 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00054 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00055 "movq (%2, %%"REG_a"), %%mm5 \n\t"
00056 "psubusb %%mm1, %%mm3 \n\t"
00057 "psubusb %%mm5, %%mm1 \n\t"
/* mm0 = absolute differences of row A, mm3 = absolute differences of row B */
00058 "por %%mm2, %%mm0 \n\t"
00059 "por %%mm1, %%mm3 \n\t"
00060 "movq %%mm0, %%mm1 \n\t"
00061 "movq %%mm3, %%mm2 \n\t"
/* widen the bytes to words using the zero in mm7, then sum everything */
00062 "punpcklbw %%mm7, %%mm0 \n\t"
00063 "punpckhbw %%mm7, %%mm1 \n\t"
00064 "punpcklbw %%mm7, %%mm3 \n\t"
00065 "punpckhbw %%mm7, %%mm2 \n\t"
00066 "paddw %%mm1, %%mm0 \n\t"
00067 "paddw %%mm3, %%mm2 \n\t"
00068 "paddw %%mm2, %%mm0 \n\t"
00069 "paddw %%mm0, %%mm6 \n\t"
00070 "add %3, %%"REG_a" \n\t"
/* keep looping while len is still negative */
00071 " js 1b \n\t"
00072 : "+a" (len)
00073 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
00074 );
00075 }
00076
/**
 * Add the sum of absolute differences of an 8-byte-wide block to mm6
 * (MMXEXT version, using psadbw).
 *
 * mm6 is the running accumulator and must have been initialised by the
 * caller; mm0 and mm1 are overwritten.  No clobber list is declared.
 *
 * @param blk1   first block (loaded into registers)
 * @param blk2   second block (used as the psadbw memory operand)
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; decremented by 2 per pass, and the loop runs
 *               while the remainder is greater than zero
 */
00077 static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
00078 int stride, int h)
00079 {
00080 __asm__ volatile(
00081 ".p2align 4 \n\t"
00082 "1: \n\t"
/* two rows per pass: the current row and the one at +stride */
00083 "movq (%1), %%mm0 \n\t"
00084 "movq (%1, %3), %%mm1 \n\t"
00085 "psadbw (%2), %%mm0 \n\t"
00086 "psadbw (%2, %3), %%mm1 \n\t"
00087 "paddw %%mm0, %%mm6 \n\t"
00088 "paddw %%mm1, %%mm6 \n\t"
/* advance both block pointers by two rows */
00089 "lea (%1,%3,2), %1 \n\t"
00090 "lea (%2,%3,2), %2 \n\t"
00091 "sub $2, %0 \n\t"
00092 " jg 1b \n\t"
00093 : "+r" (h), "+r" (blk1), "+r" (blk2)
00094 : "r" ((x86_reg)stride)
00095 );
00096 }
00097
/**
 * Sum of absolute differences over a 16-byte-wide block, SSE2 version.
 *
 * Unlike the MMX helpers this function is self-contained: it zeroes its own
 * accumulator (xmm2) and returns the final sum.  xmm0-xmm2 are overwritten
 * and no clobber list is declared.
 *
 * @param v      not referenced by this function
 * @param blk2   block used directly as the psadbw memory operand
 *               NOTE(review): non-VEX SSE memory operands normally require
 *               16-byte alignment, so blk2 rows are presumably aligned --
 *               confirm with the callers
 * @param blk1   block loaded with unaligned loads (movdqu)
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; two rows are processed per loop pass
 * @return the accumulated sum of absolute differences
 */
00098 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
00099 {
00100 int ret;
00101 __asm__ volatile(
/* xmm2 = accumulator */
00102 "pxor %%xmm2, %%xmm2 \n\t"
00103 ".p2align 4 \n\t"
00104 "1: \n\t"
00105 "movdqu (%1), %%xmm0 \n\t"
00106 "movdqu (%1, %4), %%xmm1 \n\t"
00107 "psadbw (%2), %%xmm0 \n\t"
00108 "psadbw (%2, %4), %%xmm1 \n\t"
00109 "paddw %%xmm0, %%xmm2 \n\t"
00110 "paddw %%xmm1, %%xmm2 \n\t"
00111 "lea (%1,%4,2), %1 \n\t"
00112 "lea (%2,%4,2), %2 \n\t"
00113 "sub $2, %0 \n\t"
00114 " jg 1b \n\t"
/* psadbw leaves one partial sum per 64-bit half; fold the upper half onto
 * the lower one and move the low 32 bits to the result */
00115 "movhlps %%xmm2, %%xmm0 \n\t"
00116 "paddw %%xmm0, %%xmm2 \n\t"
00117 "movd %%xmm2, %3 \n\t"
00118 : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
00119 : "r" ((x86_reg)stride)
00120 );
00121 return ret;
00122 }
00123
/**
 * Horizontal half-sample variant of sad8_1_mmxext: every blk1 row is averaged
 * (pavgb) with the same row read one byte further on before its sum of
 * absolute differences against blk2 is added to mm6.
 *
 * mm6 must have been initialised by the caller; mm0 and mm1 are overwritten.
 * Nine bytes are read from each blk1 row.
 *
 * @param blk1   block that is interpolated
 * @param blk2   block compared against (psadbw memory operand)
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; two rows are processed per loop pass
 */
00124 static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
00125 int stride, int h)
00126 {
00127 __asm__ volatile(
00128 ".p2align 4 \n\t"
00129 "1: \n\t"
00130 "movq (%1), %%mm0 \n\t"
00131 "movq (%1, %3), %%mm1 \n\t"
/* average each row with its neighbour one byte to the right */
00132 "pavgb 1(%1), %%mm0 \n\t"
00133 "pavgb 1(%1, %3), %%mm1 \n\t"
00134 "psadbw (%2), %%mm0 \n\t"
00135 "psadbw (%2, %3), %%mm1 \n\t"
00136 "paddw %%mm0, %%mm6 \n\t"
00137 "paddw %%mm1, %%mm6 \n\t"
00138 "lea (%1,%3,2), %1 \n\t"
00139 "lea (%2,%3,2), %2 \n\t"
00140 "sub $2, %0 \n\t"
00141 " jg 1b \n\t"
00142 : "+r" (h), "+r" (blk1), "+r" (blk2)
00143 : "r" ((x86_reg)stride)
00144 );
00145 }
00146
/**
 * Vertical half-sample variant of sad8_1_mmxext: every blk1 row is averaged
 * (pavgb) with the row below it before its sum of absolute differences
 * against blk2 is added to mm6.
 *
 * The first row is loaded before the loop and the last row loaded in each
 * pass is carried over in mm0, so every blk1 row is read once; in total one
 * more blk1 row than blk2 rows is read.
 * mm6 must have been initialised by the caller; mm0-mm2 are overwritten.
 *
 * @param blk1   block that is interpolated
 * @param blk2   block compared against (psadbw memory operand)
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; two rows are processed per loop pass
 */
00147 static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
00148 int stride, int h)
00149 {
00150 __asm__ volatile(
/* prologue: mm0 = first row, blk1 now points at the second row */
00151 "movq (%1), %%mm0 \n\t"
00152 "add %3, %1 \n\t"
00153 ".p2align 4 \n\t"
00154 "1: \n\t"
00155 "movq (%1), %%mm1 \n\t"
00156 "movq (%1, %3), %%mm2 \n\t"
/* mm0 = avg(previous row, row n), mm1 = avg(row n, row n+1) */
00157 "pavgb %%mm1, %%mm0 \n\t"
00158 "pavgb %%mm2, %%mm1 \n\t"
00159 "psadbw (%2), %%mm0 \n\t"
00160 "psadbw (%2, %3), %%mm1 \n\t"
00161 "paddw %%mm0, %%mm6 \n\t"
00162 "paddw %%mm1, %%mm6 \n\t"
/* keep the last loaded row for the next pass */
00163 "movq %%mm2, %%mm0 \n\t"
00164 "lea (%1,%3,2), %1 \n\t"
00165 "lea (%2,%3,2), %2 \n\t"
00166 "sub $2, %0 \n\t"
00167 " jg 1b \n\t"
00168 : "+r" (h), "+r" (blk1), "+r" (blk2)
00169 : "r" ((x86_reg)stride)
00170 );
00171 }
00172
/**
 * Diagonal (horizontal + vertical) half-sample variant of sad8_1_mmxext.
 *
 * Each blk1 row is first averaged with itself shifted by one byte, then
 * vertically adjacent averaged rows are averaged again, and the sum of
 * absolute differences of the result against blk2 is added to mm6.
 * Before the vertical averaging, 1 is subtracted (unsigned saturating, using
 * the bone constant in mm5) from the first horizontally averaged row of each
 * pass.
 * NOTE(review): this subtraction presumably compensates the double round-up
 * of the nested pavgb, so the result is likely an approximation of the
 * four-sample average computed by sad8_4_mmx rather than bit-identical --
 * confirm.
 *
 * mm6 must have been initialised by the caller; mm0-mm2 and mm5 are
 * overwritten.  One more blk1 row than blk2 rows is read, nine bytes each.
 *
 * @param blk1   block that is interpolated
 * @param blk2   block compared against (psadbw memory operand)
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; two rows are processed per loop pass
 */
00173 static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
00174 int stride, int h)
00175 {
00176 __asm__ volatile(
00177 "movq "MANGLE(bone)", %%mm5 \n\t"
/* prologue: mm0 = horizontally averaged first row */
00178 "movq (%1), %%mm0 \n\t"
00179 "pavgb 1(%1), %%mm0 \n\t"
00180 "add %3, %1 \n\t"
00181 ".p2align 4 \n\t"
00182 "1: \n\t"
/* mm1 / mm2 = horizontally averaged rows n and n+1 */
00183 "movq (%1), %%mm1 \n\t"
00184 "movq (%1,%3), %%mm2 \n\t"
00185 "pavgb 1(%1), %%mm1 \n\t"
00186 "pavgb 1(%1,%3), %%mm2 \n\t"
00187 "psubusb %%mm5, %%mm1 \n\t"
/* vertical averaging of neighbouring rows */
00188 "pavgb %%mm1, %%mm0 \n\t"
00189 "pavgb %%mm2, %%mm1 \n\t"
00190 "psadbw (%2), %%mm0 \n\t"
00191 "psadbw (%2,%3), %%mm1 \n\t"
00192 "paddw %%mm0, %%mm6 \n\t"
00193 "paddw %%mm1, %%mm6 \n\t"
/* keep the last averaged row for the next pass */
00194 "movq %%mm2, %%mm0 \n\t"
00195 "lea (%1,%3,2), %1 \n\t"
00196 "lea (%2,%3,2), %2 \n\t"
00197 "sub $2, %0 \n\t"
00198 " jg 1b \n\t"
00199 : "+r" (h), "+r" (blk1), "+r" (blk2)
00200 : "r" ((x86_reg)stride)
00201 );
00202 }
00203
/**
 * Plain-MMX kernel that averages two 8-byte-wide sources and adds the sum of
 * absolute differences between that average and blk2 to mm6.
 *
 * Per byte the value compared against blk2 is (a + b + mm5) >> 1, computed
 * in 16-bit words.
 *
 * Register contract (set up by the callers generated by PIX_SAD):
 *   mm7 must be zero, mm6 is the accumulator, mm5 holds the per-word
 *   rounding constant (round_tab[1] in the visible callers).
 * mm0-mm4 are overwritten; no clobber list is declared.
 *
 * @param blk1a  first source to average
 * @param blk1b  second source to average
 * @param blk2   block compared against
 * @param stride distance in bytes between consecutive rows of all blocks
 * @param h      number of rows; one row is processed per loop pass
 */
00204 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
00205 {
/* negative byte offset counting up to zero; see the pointer bias in the
 * operand list below */
00206 x86_reg len= -(x86_reg)stride*h;
00207 __asm__ volatile(
00208 ".p2align 4 \n\t"
00209 "1: \n\t"
/* load both sources twice: once for the low and once for the high 4 bytes */
00210 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00211 "movq (%2, %%"REG_a"), %%mm1 \n\t"
00212 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00213 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00214 "punpcklbw %%mm7, %%mm0 \n\t"
00215 "punpcklbw %%mm7, %%mm1 \n\t"
00216 "punpckhbw %%mm7, %%mm2 \n\t"
00217 "punpckhbw %%mm7, %%mm3 \n\t"
/* mm1 / mm3 = a + b for the low / high half */
00218 "paddw %%mm0, %%mm1 \n\t"
00219 "paddw %%mm2, %%mm3 \n\t"
00220 "movq (%3, %%"REG_a"), %%mm4 \n\t"
00221 "movq (%3, %%"REG_a"), %%mm2 \n\t"
/* add the rounding constant, halve, and pack back to bytes in mm1 */
00222 "paddw %%mm5, %%mm1 \n\t"
00223 "paddw %%mm5, %%mm3 \n\t"
00224 "psrlw $1, %%mm1 \n\t"
00225 "psrlw $1, %%mm3 \n\t"
00226 "packuswb %%mm3, %%mm1 \n\t"
/* absolute difference = OR of the two saturating differences */
00227 "psubusb %%mm1, %%mm4 \n\t"
00228 "psubusb %%mm2, %%mm1 \n\t"
00229 "por %%mm4, %%mm1 \n\t"
00230 "movq %%mm1, %%mm0 \n\t"
00231 "punpcklbw %%mm7, %%mm0 \n\t"
00232 "punpckhbw %%mm7, %%mm1 \n\t"
00233 "paddw %%mm1, %%mm0 \n\t"
00234 "paddw %%mm0, %%mm6 \n\t"
00235 "add %4, %%"REG_a" \n\t"
00236 " js 1b \n\t"
00237 : "+a" (len)
00238 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
00239 );
00240 }
00241
/**
 * Plain-MMX diagonal half-sample kernel: averages each 2x2 neighbourhood of
 * blk1 and adds the sum of absolute differences between that average and
 * blk2 to mm6.
 *
 * Per byte the value compared against blk2 is
 * (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2, where p and q are two vertically
 * adjacent blk1 rows; the constant 2 comes from round_tab[2].
 * The horizontal pair sums of the lower row are kept in mm0/mm1 for the next
 * pass, so each blk1 row is loaded once.
 *
 * Register contract (set up by the callers generated by PIX_SAD):
 *   mm7 must be zero, mm6 is the accumulator.
 * mm0-mm5 are overwritten; no clobber list is declared.
 *
 * @param blk1   block that is interpolated (h + 1 rows, 9 bytes each, are read)
 * @param blk2   block compared against
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; one row is processed per loop pass
 */
00242 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00243 {
/* negative byte offset counting up to zero; see the pointer bias in the
 * operand list below */
00244 x86_reg len= -(x86_reg)stride*h;
00245 __asm__ volatile(
/* prologue: mm0 / mm1 = p[x] + p[x+1] of the first row, low / high half */
00246 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00247 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
00248 "movq %%mm0, %%mm1 \n\t"
00249 "movq %%mm2, %%mm3 \n\t"
00250 "punpcklbw %%mm7, %%mm0 \n\t"
00251 "punpckhbw %%mm7, %%mm1 \n\t"
00252 "punpcklbw %%mm7, %%mm2 \n\t"
00253 "punpckhbw %%mm7, %%mm3 \n\t"
00254 "paddw %%mm2, %%mm0 \n\t"
00255 "paddw %%mm3, %%mm1 \n\t"
00256 ".p2align 4 \n\t"
00257 "1: \n\t"
/* mm2 / mm3 = q[x] + q[x+1] of the next row (operand 2 is blk1 + stride) */
00258 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00259 "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
00260 "movq %%mm2, %%mm3 \n\t"
00261 "movq %%mm4, %%mm5 \n\t"
00262 "punpcklbw %%mm7, %%mm2 \n\t"
00263 "punpckhbw %%mm7, %%mm3 \n\t"
00264 "punpcklbw %%mm7, %%mm4 \n\t"
00265 "punpckhbw %%mm7, %%mm5 \n\t"
00266 "paddw %%mm4, %%mm2 \n\t"
00267 "paddw %%mm5, %%mm3 \n\t"
/* byte offset 16 selects round_tab[2], i.e. 2 in every word */
00268 "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
00269 "paddw %%mm2, %%mm0 \n\t"
00270 "paddw %%mm3, %%mm1 \n\t"
00271 "paddw %%mm5, %%mm0 \n\t"
00272 "paddw %%mm5, %%mm1 \n\t"
00273 "movq (%3, %%"REG_a"), %%mm4 \n\t"
00274 "movq (%3, %%"REG_a"), %%mm5 \n\t"
/* divide by four and pack the averaged row back to bytes in mm0 */
00275 "psrlw $2, %%mm0 \n\t"
00276 "psrlw $2, %%mm1 \n\t"
00277 "packuswb %%mm1, %%mm0 \n\t"
/* absolute difference = OR of the two saturating differences */
00278 "psubusb %%mm0, %%mm4 \n\t"
00279 "psubusb %%mm5, %%mm0 \n\t"
00280 "por %%mm4, %%mm0 \n\t"
00281 "movq %%mm0, %%mm4 \n\t"
00282 "punpcklbw %%mm7, %%mm0 \n\t"
00283 "punpckhbw %%mm7, %%mm4 \n\t"
00284 "paddw %%mm0, %%mm6 \n\t"
00285 "paddw %%mm4, %%mm6 \n\t"
/* the lower row's pair sums become the upper row's for the next pass */
00286 "movq %%mm2, %%mm0 \n\t"
00287 "movq %%mm3, %%mm1 \n\t"
00288 "add %4, %%"REG_a" \n\t"
00289 " js 1b \n\t"
00290 : "+a" (len)
00291 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
00292 );
00293 }
00294
/**
 * Reduce the accumulator left in mm6 by the plain-MMX kernels to a scalar.
 *
 * The four 16-bit words of mm6 are added together (modulo 2^16) by two
 * shift-and-add steps; the lowest word of the result is returned.
 * mm0 and mm6 are overwritten.
 *
 * @return the folded sum, limited to 16 bits by the final mask
 */
00295 static inline int sum_mmx(void)
00296 {
00297 int ret;
00298 __asm__ volatile(
/* add the upper two words onto the lower two */
00299 "movq %%mm6, %%mm0 \n\t"
00300 "psrlq $32, %%mm6 \n\t"
00301 "paddw %%mm0, %%mm6 \n\t"
/* add word 1 onto word 0 */
00302 "movq %%mm6, %%mm0 \n\t"
00303 "psrlq $16, %%mm6 \n\t"
00304 "paddw %%mm0, %%mm6 \n\t"
00305 "movd %%mm6, %0 \n\t"
00306 : "=r" (ret)
00307 );
/* only the lowest word holds the total; the word above it is a partial sum */
00308 return ret&0xFFFF;
00309 }
00310
/**
 * Fetch the accumulator left in mm6 by the MMXEXT kernels.
 *
 * Those kernels add psadbw results into mm6, so no folding of words is done
 * here; the low 32 bits of mm6 are returned as they are.
 *
 * @return the low 32 bits of mm6
 */
00311 static inline int sum_mmxext(void)
00312 {
00313 int ret;
00314 __asm__ volatile(
00315 "movd %%mm6, %0 \n\t"
00316 : "=r" (ret)
00317 );
00318 return ret;
00319 }
00320
/**
 * Plain-MMX horizontal half-sample kernel: forwards to sad8_2_mmx with blk1
 * and blk1 + 1 as the two sources to average.  The register contract of
 * sad8_2_mmx (mm5, mm6, mm7 prepared by the caller) applies unchanged.
 */
00321 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00322 {
00323 sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
00324 }
/**
 * Plain-MMX vertical half-sample kernel: forwards to sad8_2_mmx with blk1
 * and blk1 + stride (the row below) as the two sources to average.  The
 * register contract of sad8_2_mmx (mm5, mm6, mm7 prepared by the caller)
 * applies unchanged.
 */
00325 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00326 {
00327 sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
00328 }
00329
00330
/*
 * PIX_SAD(suf) generates the eight entry points built on the sad8_*_suf
 * kernels and sum_suf():
 *
 *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf   (8 bytes wide)
 *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf  (16 bytes wide)
 *
 * Every generated function:
 *   - takes (void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h);
 *     v is not referenced, and the blocks are handed to the kernels in the
 *     order (blk1, blk2);
 *   - zeroes mm7 (unpack helper) and mm6 (accumulator) in a separate asm
 *     statement, so the MMX register state is relied upon to survive between
 *     that statement, the kernel call(s) and sum_suf();
 *   - returns sum_suf().
 *
 * The x2 and y2 variants additionally load round_tab[1] into mm5, the
 * rounding constant consumed by sad8_2_mmx; the mmxext x2a/y2a kernels
 * visible in this file do not read mm5.
 *
 * The 8-wide variants check h == 8 with av_assert2() and pass the literal 8
 * to the kernel.  The 16-wide variants run the 8-wide kernel twice, on byte
 * columns 0-7 and 8-15, with the caller's h.
 */
00331 #define PIX_SAD(suf)\
00332 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00333 {\
00334 av_assert2(h==8);\
00335 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00336 "pxor %%mm6, %%mm6 \n\t":);\
00337 \
00338 sad8_1_ ## suf(blk1, blk2, stride, 8);\
00339 \
00340 return sum_ ## suf();\
00341 }\
00342 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00343 {\
00344 av_assert2(h==8);\
00345 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00346 "pxor %%mm6, %%mm6 \n\t"\
00347 "movq %0, %%mm5 \n\t"\
00348 :: "m"(round_tab[1]) \
00349 );\
00350 \
00351 sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
00352 \
00353 return sum_ ## suf();\
00354 }\
00355 \
00356 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00357 {\
00358 av_assert2(h==8);\
00359 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00360 "pxor %%mm6, %%mm6 \n\t"\
00361 "movq %0, %%mm5 \n\t"\
00362 :: "m"(round_tab[1]) \
00363 );\
00364 \
00365 sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
00366 \
00367 return sum_ ## suf();\
00368 }\
00369 \
00370 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00371 {\
00372 av_assert2(h==8);\
00373 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00374 "pxor %%mm6, %%mm6 \n\t"\
00375 ::);\
00376 \
00377 sad8_4_ ## suf(blk1, blk2, stride, 8);\
00378 \
00379 return sum_ ## suf();\
00380 }\
00381 \
00382 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00383 {\
00384 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00385 "pxor %%mm6, %%mm6 \n\t":);\
00386 \
00387 sad8_1_ ## suf(blk1 , blk2 , stride, h);\
00388 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
00389 \
00390 return sum_ ## suf();\
00391 }\
00392 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00393 {\
00394 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00395 "pxor %%mm6, %%mm6 \n\t"\
00396 "movq %0, %%mm5 \n\t"\
00397 :: "m"(round_tab[1]) \
00398 );\
00399 \
00400 sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
00401 sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
00402 \
00403 return sum_ ## suf();\
00404 }\
00405 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00406 {\
00407 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00408 "pxor %%mm6, %%mm6 \n\t"\
00409 "movq %0, %%mm5 \n\t"\
00410 :: "m"(round_tab[1]) \
00411 );\
00412 \
00413 sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
00414 sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
00415 \
00416 return sum_ ## suf();\
00417 }\
00418 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00419 {\
00420 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00421 "pxor %%mm6, %%mm6 \n\t"\
00422 ::);\
00423 \
00424 sad8_4_ ## suf(blk1 , blk2 , stride, h);\
00425 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
00426 \
00427 return sum_ ## suf();\
00428 }\
00429
/* Instantiate the entry points twice: once on top of the plain-MMX kernels
 * (sad8_*_mmx, sum_mmx) and once on top of the MMXEXT kernels
 * (sad8_*_mmxext, sum_mmxext). */
00430 PIX_SAD(mmx)
00431 PIX_SAD(mmxext)
00432
00433 #endif
00434
/**
 * Install the x86 SAD implementations from this file into a DSPContext.
 *
 * The CPU flags returned by av_get_cpu_flags() are tested in three
 * independent steps (MMX, MMXEXT, SSE2); a later step overwrites pointers
 * set by an earlier one.  In pix_abs[size][variant] and sad[size] as filled
 * in below, size 0 receives the sad16_* functions and size 1 the sad8_*
 * functions; variants 0..3 receive the plain, x2, y2 and xy2 functions.
 *
 * When HAVE_INLINE_ASM is false the function does nothing.
 *
 * @param c     context whose pix_abs and sad function pointers are set
 * @param avctx codec context; its flags and codec_id are consulted
 */
00435 void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
00436 {
00437 #if HAVE_INLINE_ASM
00438 int mm_flags = av_get_cpu_flags();
00439
/* baseline: plain-MMX versions of every variant */
00440 if (mm_flags & AV_CPU_FLAG_MMX) {
00441 c->pix_abs[0][0] = sad16_mmx;
00442 c->pix_abs[0][1] = sad16_x2_mmx;
00443 c->pix_abs[0][2] = sad16_y2_mmx;
00444 c->pix_abs[0][3] = sad16_xy2_mmx;
00445 c->pix_abs[1][0] = sad8_mmx;
00446 c->pix_abs[1][1] = sad8_x2_mmx;
00447 c->pix_abs[1][2] = sad8_y2_mmx;
00448 c->pix_abs[1][3] = sad8_xy2_mmx;
00449
00450 c->sad[0]= sad16_mmx;
00451 c->sad[1]= sad8_mmx;
00452 }
/* MMXEXT: the plain (non-interpolated) versions are always replaced */
00453 if (mm_flags & AV_CPU_FLAG_MMXEXT) {
00454 c->pix_abs[0][0] = sad16_mmxext;
00455 c->pix_abs[1][0] = sad8_mmxext;
00456
00457 c->sad[0] = sad16_mmxext;
00458 c->sad[1] = sad8_mmxext;
00459
/* the interpolating MMXEXT versions are used only when bit-exact output
 * has not been requested */
00460 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00461 c->pix_abs[0][1] = sad16_x2_mmxext;
00462 c->pix_abs[0][2] = sad16_y2_mmxext;
00463 c->pix_abs[0][3] = sad16_xy2_mmxext;
00464 c->pix_abs[1][1] = sad8_x2_mmxext;
00465 c->pix_abs[1][2] = sad8_y2_mmxext;
00466 c->pix_abs[1][3] = sad8_xy2_mmxext;
00467 }
00468 }
/* SSE2 16-wide SAD replaces sad[0] only; it is skipped when the CPU also
 * reports 3DNow! and for the Snow codec.
 * NOTE(review): the reasons for these two exclusions are not visible in
 * this file -- confirm before changing the condition. */
00469 if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
00470 c->sad[0]= sad16_sse2;
00471 }
00472 #endif
00473 }