00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/mem.h"
00026 #include "libavutil/x86/asm.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "dsputil_mmx.h"
00029
00030 #if HAVE_INLINE_ASM
00031
/*
 * Rounding constants for the word-wide averaging code in this file.
 * Entry n holds the value n replicated in each of its four 16-bit words:
 *   [0] = 0, [1] = 1 (loaded into mm5 by the x2/y2 wrappers generated by
 *   PIX_SAD), [2] = 2 (read by sad8_4_mmx through the byte offset 16).
 */
00032 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
00033 0x0000000000000000ULL,
00034 0x0001000100010001ULL,
00035 0x0002000200020002ULL,
00036 };
00037
/*
 * The byte value 1 replicated in all eight bytes. sad8_4_mmx2 loads it into
 * mm5 and subtracts it (with unsigned saturation) from an intermediate
 * average.
 */
00038 DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
00039
/**
 * Add the sum of absolute differences of an 8-byte-wide column of rows of
 * blk1 and blk2 to the running total kept in the 16-bit words of mm6.
 *
 * Register contract (set up by the caller, see PIX_SAD): mm7 must be zero
 * (it is the zero source for the byte->word unpacks) and mm6 holds the
 * accumulator. Nothing is returned; the result stays in mm6.
 *
 * @param blk1   first block, read 8 bytes per row
 * @param blk2   second block, read 8 bytes per row
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      number of rows; two rows are consumed per loop iteration
 *               NOTE(review): looks like h is expected to be even - confirm
 */
00040 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00041 {
00042 x86_reg len= -(stride*h); /* negative byte offset of the first row; counts up towards 0 */
00043 __asm__ volatile(
00044 ".p2align 4 \n\t"
00045 "1: \n\t"
00046 "movq (%1, %%"REG_a"), %%mm0 \n\t" /* mm0 = blk1 row */
00047 "movq (%2, %%"REG_a"), %%mm2 \n\t" /* mm2 = mm4 = blk2 row */
00048 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00049 "add %3, %%"REG_a" \n\t" /* step to the second row of this iteration */
00050 "psubusb %%mm0, %%mm2 \n\t" /* saturating blk2 - blk1 */
00051 "psubusb %%mm4, %%mm0 \n\t" /* saturating blk1 - blk2 */
00052 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00053 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00054 "movq (%2, %%"REG_a"), %%mm5 \n\t"
00055 "psubusb %%mm1, %%mm3 \n\t"
00056 "psubusb %%mm5, %%mm1 \n\t"
00057 "por %%mm2, %%mm0 \n\t" /* OR of both saturated differences = |blk1 - blk2|, first row */
00058 "por %%mm1, %%mm3 \n\t" /* same for the second row */
00059 "movq %%mm0, %%mm1 \n\t"
00060 "movq %%mm3, %%mm2 \n\t"
00061 "punpcklbw %%mm7, %%mm0 \n\t" /* widen bytes to words using the zero in mm7 */
00062 "punpckhbw %%mm7, %%mm1 \n\t"
00063 "punpcklbw %%mm7, %%mm3 \n\t"
00064 "punpckhbw %%mm7, %%mm2 \n\t"
00065 "paddw %%mm1, %%mm0 \n\t"
00066 "paddw %%mm3, %%mm2 \n\t"
00067 "paddw %%mm2, %%mm0 \n\t"
00068 "paddw %%mm0, %%mm6 \n\t" /* accumulate both rows into mm6 */
00069 "add %3, %%"REG_a" \n\t"
00070 " js 1b \n\t" /* loop while the offset is still negative */
00071 : "+a" (len)
00072 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) /* pointers biased so that base + len is row 0 */
00073 );
00074 }
00075
/**
 * psadbw-based variant of the 8-byte-wide SAD: adds the sum of absolute
 * differences of blk1 and blk2 to the accumulator in mm6, two rows per loop
 * iteration.
 *
 * mm6 is only added to here; the caller (see PIX_SAD) clears it beforehand
 * and reads it back afterwards.
 *
 * @param blk1   first block, loaded 8 bytes per row
 * @param blk2   second block, used as the memory operand of psadbw
 * @param stride distance in bytes between consecutive rows
 * @param h      row count; decremented by 2 per iteration until it is <= 0
 */
00076 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00077 {
00078 __asm__ volatile(
00079 ".p2align 4 \n\t"
00080 "1: \n\t"
00081 "movq (%1), %%mm0 \n\t"
00082 "movq (%1, %3), %%mm1 \n\t"
00083 "psadbw (%2), %%mm0 \n\t"
00084 "psadbw (%2, %3), %%mm1 \n\t"
00085 "paddw %%mm0, %%mm6 \n\t"
00086 "paddw %%mm1, %%mm6 \n\t"
00087 "lea (%1,%3,2), %1 \n\t" /* advance both pointers by two rows */
00088 "lea (%2,%3,2), %2 \n\t"
00089 "sub $2, %0 \n\t"
00090 " jg 1b \n\t"
00091 : "+r" (h), "+r" (blk1), "+r" (blk2)
00092 : "r" ((x86_reg)stride)
00093 );
00094 }
00095
/**
 * Sum of absolute differences over 16-byte-wide rows using SSE2.
 *
 * Unlike the MMX helpers in this file, this function is self-contained: it
 * clears its own accumulator (xmm2) and returns the total.
 *
 * Note the parameter order: the second argument is named blk2 and the third
 * blk1. Rows of blk1 are fetched with movdqu, rows of blk2 are used directly
 * as the memory operand of psadbw.
 * NOTE(review): the memory-operand form of psadbw presumably requires blk2
 * rows to be 16-byte aligned - confirm against the callers.
 * NOTE(review): the asm statement declares no clobbers although it writes
 * xmm0-xmm2 - confirm this is acceptable for the targeted compilers.
 *
 * @param v      unused
 * @param blk2   block used as the psadbw memory operand
 * @param blk1   block loaded with unaligned 16-byte loads
 * @param stride distance in bytes between consecutive rows
 * @param h      row count; two rows are processed per loop iteration
 * @return the value moved out of the low 32 bits of the accumulator after
 *         its two 64-bit halves have been added together
 */
00096 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
00097 {
00098 int ret;
00099 __asm__ volatile(
00100 "pxor %%xmm2, %%xmm2 \n\t" /* clear the accumulator */
00101 ".p2align 4 \n\t"
00102 "1: \n\t"
00103 "movdqu (%1), %%xmm0 \n\t"
00104 "movdqu (%1, %4), %%xmm1 \n\t"
00105 "psadbw (%2), %%xmm0 \n\t"
00106 "psadbw (%2, %4), %%xmm1 \n\t"
00107 "paddw %%xmm0, %%xmm2 \n\t"
00108 "paddw %%xmm1, %%xmm2 \n\t"
00109 "lea (%1,%4,2), %1 \n\t"
00110 "lea (%2,%4,2), %2 \n\t"
00111 "sub $2, %0 \n\t"
00112 " jg 1b \n\t"
00113 "movhlps %%xmm2, %%xmm0 \n\t" /* bring the upper 64-bit half down ... */
00114 "paddw %%xmm0, %%xmm2 \n\t" /* ... and add it to the lower half */
00115 "movd %%xmm2, %3 \n\t"
00116 : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
00117 : "r" ((x86_reg)stride)
00118 );
00119 return ret;
00120 }
00121
/**
 * 8-byte-wide SAD against a horizontally interpolated reference: each row of
 * blk1 is averaged (pavgb) with the same row shifted by one byte before
 * psadbw compares it with blk2. The result is added to the accumulator in
 * mm6; two rows are handled per loop iteration.
 *
 * @param blk1   block that is interpolated; 9 bytes per row are read
 * @param blk2   block used as the psadbw memory operand
 * @param stride distance in bytes between consecutive rows
 * @param h      row count; decremented by 2 per iteration until it is <= 0
 */
00122 static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00123 {
00124 __asm__ volatile(
00125 ".p2align 4 \n\t"
00126 "1: \n\t"
00127 "movq (%1), %%mm0 \n\t"
00128 "movq (%1, %3), %%mm1 \n\t"
00129 "pavgb 1(%1), %%mm0 \n\t" /* average with the right-hand neighbour */
00130 "pavgb 1(%1, %3), %%mm1 \n\t"
00131 "psadbw (%2), %%mm0 \n\t"
00132 "psadbw (%2, %3), %%mm1 \n\t"
00133 "paddw %%mm0, %%mm6 \n\t"
00134 "paddw %%mm1, %%mm6 \n\t"
00135 "lea (%1,%3,2), %1 \n\t"
00136 "lea (%2,%3,2), %2 \n\t"
00137 "sub $2, %0 \n\t"
00138 " jg 1b \n\t"
00139 : "+r" (h), "+r" (blk1), "+r" (blk2)
00140 : "r" ((x86_reg)stride)
00141 );
00142 }
00143
/**
 * 8-byte-wide SAD against a vertically interpolated reference: each row of
 * blk1 is averaged (pavgb) with the row below it before psadbw compares it
 * with blk2. The result is added to the accumulator in mm6.
 *
 * The first row is loaded before the loop and kept in mm0; every iteration
 * loads the next two rows, produces two averaged rows, and carries the last
 * loaded row over in mm0, so h + 1 rows of blk1 are read in total.
 *
 * @param blk1   block that is interpolated
 * @param blk2   block used as the psadbw memory operand
 * @param stride distance in bytes between consecutive rows
 * @param h      row count; decremented by 2 per iteration until it is <= 0
 */
00144 static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00145 {
00146 __asm__ volatile(
00147 "movq (%1), %%mm0 \n\t" /* prime mm0 with row 0 */
00148 "add %3, %1 \n\t"
00149 ".p2align 4 \n\t"
00150 "1: \n\t"
00151 "movq (%1), %%mm1 \n\t"
00152 "movq (%1, %3), %%mm2 \n\t"
00153 "pavgb %%mm1, %%mm0 \n\t" /* avg(previous row, this row) */
00154 "pavgb %%mm2, %%mm1 \n\t" /* avg(this row, next row) */
00155 "psadbw (%2), %%mm0 \n\t"
00156 "psadbw (%2, %3), %%mm1 \n\t"
00157 "paddw %%mm0, %%mm6 \n\t"
00158 "paddw %%mm1, %%mm6 \n\t"
00159 "movq %%mm2, %%mm0 \n\t" /* last loaded row becomes the "previous" row */
00160 "lea (%1,%3,2), %1 \n\t"
00161 "lea (%2,%3,2), %2 \n\t"
00162 "sub $2, %0 \n\t"
00163 " jg 1b \n\t"
00164 : "+r" (h), "+r" (blk1), "+r" (blk2)
00165 : "r" ((x86_reg)stride)
00166 );
00167 }
00168
/**
 * 8-byte-wide SAD against a reference interpolated both horizontally and
 * vertically, built from nested pavgb operations: every row is first
 * averaged with itself shifted by one byte, then consecutive averaged rows
 * are averaged with each other. The result is added to the accumulator in
 * mm6.
 *
 * mm5 is loaded with the all-ones-bytes constant `bone`, which is subtracted
 * with saturation from one of the horizontal averages before the vertical
 * averaging.
 * NOTE(review): presumably this compensates for the upward rounding of the
 * chained pavgb steps, so the result is only an approximation of the exact
 * four-pixel average - confirm. The init function in this file installs the
 * mmx2 xy2 functions only when CODEC_FLAG_BITEXACT is not set.
 *
 * @param blk1   block that is interpolated; 9 bytes per row are read
 * @param blk2   block used as the psadbw memory operand
 * @param stride distance in bytes between consecutive rows
 * @param h      row count; decremented by 2 per iteration until it is <= 0
 */
00169 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00170 {
00171 __asm__ volatile(
00172 "movq "MANGLE(bone)", %%mm5 \n\t"
00173 "movq (%1), %%mm0 \n\t"
00174 "pavgb 1(%1), %%mm0 \n\t" /* mm0 = horizontal average of row 0 */
00175 "add %3, %1 \n\t"
00176 ".p2align 4 \n\t"
00177 "1: \n\t"
00178 "movq (%1), %%mm1 \n\t"
00179 "movq (%1,%3), %%mm2 \n\t"
00180 "pavgb 1(%1), %%mm1 \n\t"
00181 "pavgb 1(%1,%3), %%mm2 \n\t"
00182 "psubusb %%mm5, %%mm1 \n\t" /* subtract 1 from every byte, saturating at 0 */
00183 "pavgb %%mm1, %%mm0 \n\t"
00184 "pavgb %%mm2, %%mm1 \n\t"
00185 "psadbw (%2), %%mm0 \n\t"
00186 "psadbw (%2,%3), %%mm1 \n\t"
00187 "paddw %%mm0, %%mm6 \n\t"
00188 "paddw %%mm1, %%mm6 \n\t"
00189 "movq %%mm2, %%mm0 \n\t" /* carry the last horizontal average into the next iteration */
00190 "lea (%1,%3,2), %1 \n\t"
00191 "lea (%2,%3,2), %2 \n\t"
00192 "sub $2, %0 \n\t"
00193 " jg 1b \n\t"
00194 : "+r" (h), "+r" (blk1), "+r" (blk2)
00195 : "r" ((x86_reg)stride)
00196 );
00197 }
00198
/**
 * 8-byte-wide SAD between blk2 and the average of two reference rows
 * (blk1a, blk1b), using plain MMX. One row is processed per loop iteration
 * and the result is added to the 16-bit words of mm6.
 *
 * The average is computed in 16-bit precision as (a + b + mm5) >> 1.
 *
 * Register contract (set up by the caller, see PIX_SAD): mm7 must be zero,
 * mm5 must hold the per-word rounding constant, mm6 is the accumulator.
 *
 * @param blk1a  first reference block
 * @param blk1b  second reference block, averaged with blk1a
 * @param blk2   block the average is compared against
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
00199 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
00200 {
00201 x86_reg len= -(stride*h); /* negative byte offset of the first row; counts up towards 0 */
00202 __asm__ volatile(
00203 ".p2align 4 \n\t"
00204 "1: \n\t"
00205 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00206 "movq (%2, %%"REG_a"), %%mm1 \n\t"
00207 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00208 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00209 "punpcklbw %%mm7, %%mm0 \n\t" /* widen low/high halves of both rows to words */
00210 "punpcklbw %%mm7, %%mm1 \n\t"
00211 "punpckhbw %%mm7, %%mm2 \n\t"
00212 "punpckhbw %%mm7, %%mm3 \n\t"
00213 "paddw %%mm0, %%mm1 \n\t" /* a + b */
00214 "paddw %%mm2, %%mm3 \n\t"
00215 "movq (%3, %%"REG_a"), %%mm4 \n\t" /* mm4 = mm2 = blk2 row */
00216 "movq (%3, %%"REG_a"), %%mm2 \n\t"
00217 "paddw %%mm5, %%mm1 \n\t" /* add the rounding constant supplied in mm5 */
00218 "paddw %%mm5, %%mm3 \n\t"
00219 "psrlw $1, %%mm1 \n\t"
00220 "psrlw $1, %%mm3 \n\t"
00221 "packuswb %%mm3, %%mm1 \n\t" /* mm1 = averaged row as bytes */
00222 "psubusb %%mm1, %%mm4 \n\t" /* two saturating differences ... */
00223 "psubusb %%mm2, %%mm1 \n\t"
00224 "por %%mm4, %%mm1 \n\t" /* ... OR'ed give the absolute difference */
00225 "movq %%mm1, %%mm0 \n\t"
00226 "punpcklbw %%mm7, %%mm0 \n\t"
00227 "punpckhbw %%mm7, %%mm1 \n\t"
00228 "paddw %%mm1, %%mm0 \n\t"
00229 "paddw %%mm0, %%mm6 \n\t"
00230 "add %4, %%"REG_a" \n\t"
00231 " js 1b \n\t" /* loop while the offset is still negative */
00232 : "+a" (len)
00233 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
00234 );
00235 }
00236
/**
 * 8-byte-wide SAD between blk2 and a reference interpolated from four
 * neighbouring pixels of blk1, using plain MMX. One row is processed per
 * loop iteration and the result is added to the 16-bit words of mm6.
 *
 * For every output byte the reference is
 *   (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2
 * where p is the current row of blk1 and q the row below it. The horizontal
 * pair sums of the current row are kept in mm0/mm1 (low/high four words)
 * and the sums of the row below are carried over from one iteration to the
 * next. The rounding term 2 is read from round_tab at byte offset 16.
 *
 * Register contract (set up by the caller, see PIX_SAD): mm7 must be zero,
 * mm6 is the accumulator. mm5 is overwritten.
 *
 * @param blk1   block that is interpolated; 9 bytes per row and h + 1 rows
 *               are read
 * @param blk2   block the interpolated reference is compared against
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows
 */
00237 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00238 {
00239 x86_reg len= -(stride*h); /* negative byte offset of the first row; counts up towards 0 */
00240 __asm__ volatile(
00241 "movq (%1, %%"REG_a"), %%mm0 \n\t" /* row 0 and row 0 shifted by one byte */
00242 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
00243 "movq %%mm0, %%mm1 \n\t"
00244 "movq %%mm2, %%mm3 \n\t"
00245 "punpcklbw %%mm7, %%mm0 \n\t"
00246 "punpckhbw %%mm7, %%mm1 \n\t"
00247 "punpcklbw %%mm7, %%mm2 \n\t"
00248 "punpckhbw %%mm7, %%mm3 \n\t"
00249 "paddw %%mm2, %%mm0 \n\t" /* mm0/mm1 = horizontal pair sums of row 0 */
00250 "paddw %%mm3, %%mm1 \n\t"
00251 ".p2align 4 \n\t"
00252 "1: \n\t"
00253 "movq (%2, %%"REG_a"), %%mm2 \n\t" /* %2 points one row below %1 */
00254 "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
00255 "movq %%mm2, %%mm3 \n\t"
00256 "movq %%mm4, %%mm5 \n\t"
00257 "punpcklbw %%mm7, %%mm2 \n\t"
00258 "punpckhbw %%mm7, %%mm3 \n\t"
00259 "punpcklbw %%mm7, %%mm4 \n\t"
00260 "punpckhbw %%mm7, %%mm5 \n\t"
00261 "paddw %%mm4, %%mm2 \n\t" /* mm2/mm3 = horizontal pair sums of the row below */
00262 "paddw %%mm5, %%mm3 \n\t"
00263 "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" /* third table entry: 2 in every word */
00264 "paddw %%mm2, %%mm0 \n\t"
00265 "paddw %%mm3, %%mm1 \n\t"
00266 "paddw %%mm5, %%mm0 \n\t"
00267 "paddw %%mm5, %%mm1 \n\t"
00268 "movq (%3, %%"REG_a"), %%mm4 \n\t" /* mm4 = mm5 = blk2 row */
00269 "movq (%3, %%"REG_a"), %%mm5 \n\t"
00270 "psrlw $2, %%mm0 \n\t"
00271 "psrlw $2, %%mm1 \n\t"
00272 "packuswb %%mm1, %%mm0 \n\t" /* mm0 = interpolated row as bytes */
00273 "psubusb %%mm0, %%mm4 \n\t" /* two saturating differences ... */
00274 "psubusb %%mm5, %%mm0 \n\t"
00275 "por %%mm4, %%mm0 \n\t" /* ... OR'ed give the absolute difference */
00276 "movq %%mm0, %%mm4 \n\t"
00277 "punpcklbw %%mm7, %%mm0 \n\t"
00278 "punpckhbw %%mm7, %%mm4 \n\t"
00279 "paddw %%mm0, %%mm6 \n\t"
00280 "paddw %%mm4, %%mm6 \n\t"
00281 "movq %%mm2, %%mm0 \n\t" /* sums of the row below become the current-row sums */
00282 "movq %%mm3, %%mm1 \n\t"
00283 "add %4, %%"REG_a" \n\t"
00284 " js 1b \n\t" /* loop while the offset is still negative */
00285 : "+a" (len)
00286 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
00287 );
00288 }
00289
/**
 * Reduce the accumulator kept in mm6 by the plain-MMX helpers to a scalar:
 * the four 16-bit words of mm6 are added together with wrapping 16-bit
 * arithmetic and the low 16 bits of the result are returned.
 *
 * mm0 and mm6 are overwritten.
 *
 * @return the sum of the four words of mm6, truncated to 16 bits
 */
00290 static inline int sum_mmx(void)
00291 {
00292 int ret;
00293 __asm__ volatile(
00294 "movq %%mm6, %%mm0 \n\t"
00295 "psrlq $32, %%mm6 \n\t"
00296 "paddw %%mm0, %%mm6 \n\t" /* words 0+2 and 1+3 */
00297 "movq %%mm6, %%mm0 \n\t"
00298 "psrlq $16, %%mm6 \n\t"
00299 "paddw %%mm0, %%mm6 \n\t" /* low word now holds the sum of all four */
00300 "movd %%mm6, %0 \n\t"
00301 : "=r" (ret)
00302 );
00303 return ret&0xFFFF; /* the upper bits hold partial sums, drop them */
00304 }
00305
/**
 * Fetch the result of the psadbw-based helpers: returns the low 32 bits of
 * the accumulator register mm6 without any further reduction.
 * NOTE(review): presumably sufficient because psadbw already leaves a
 * single total in the low word - confirm.
 *
 * @return the low 32 bits of mm6
 */
00306 static inline int sum_mmx2(void)
00307 {
00308 int ret;
00309 __asm__ volatile(
00310 "movd %%mm6, %0 \n\t"
00311 : "=r" (ret)
00312 );
00313 return ret;
00314 }
00315
/**
 * Plain-MMX SAD against a horizontally interpolated reference: forwards to
 * sad8_2_mmx with blk1 and blk1 + 1 (the right-hand neighbour) as the two
 * rows to average. The register contract of sad8_2_mmx applies unchanged.
 */
00316 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00317 {
00318 sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
00319 }
/**
 * Plain-MMX SAD against a vertically interpolated reference: forwards to
 * sad8_2_mmx with blk1 and blk1 + stride (the row below) as the two rows to
 * average. The register contract of sad8_2_mmx applies unchanged.
 */
00320 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00321 {
00322 sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
00323 }
00324
00325
/*
 * PIX_SAD(suf) generates the eight public-facing SAD functions for one
 * instruction-set suffix:
 *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf,
 *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf.
 *
 * Every generated function
 *   1. clears mm7 (zero register for the helpers) and mm6 (accumulator);
 *      the x2/y2 variants additionally load round_tab[1] into mm5,
 *   2. calls the matching helper (sad8_1_, sad8_x2a_, sad8_y2a_ or sad8_4_
 *      followed by suf), and
 *   3. returns sum_suf().
 *
 * The 8-wide variants assert h == 8 and pass the literal 8 to the helper.
 * The 16-wide variants call the helper twice, once for the left 8 columns
 * and once at blk1 + 8 / blk2 + 8 for the right 8 columns, accumulating
 * both halves in mm6 before summing.
 *
 * Note the parameter order of the generated functions: the second argument
 * is named blk2 and the third blk1; the helpers are called as (blk1, blk2).
 * The first argument v is not used.
 */
00326 #define PIX_SAD(suf)\
00327 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00328 {\
00329 assert(h==8);\
00330 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00331 "pxor %%mm6, %%mm6 \n\t":);\
00332 \
00333 sad8_1_ ## suf(blk1, blk2, stride, 8);\
00334 \
00335 return sum_ ## suf();\
00336 }\
00337 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00338 {\
00339 assert(h==8);\
00340 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00341 "pxor %%mm6, %%mm6 \n\t"\
00342 "movq %0, %%mm5 \n\t"\
00343 :: "m"(round_tab[1]) \
00344 );\
00345 \
00346 sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
00347 \
00348 return sum_ ## suf();\
00349 }\
00350 \
00351 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00352 {\
00353 assert(h==8);\
00354 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00355 "pxor %%mm6, %%mm6 \n\t"\
00356 "movq %0, %%mm5 \n\t"\
00357 :: "m"(round_tab[1]) \
00358 );\
00359 \
00360 sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
00361 \
00362 return sum_ ## suf();\
00363 }\
00364 \
00365 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00366 {\
00367 assert(h==8);\
00368 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00369 "pxor %%mm6, %%mm6 \n\t"\
00370 ::);\
00371 \
00372 sad8_4_ ## suf(blk1, blk2, stride, 8);\
00373 \
00374 return sum_ ## suf();\
00375 }\
00376 \
00377 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00378 {\
00379 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00380 "pxor %%mm6, %%mm6 \n\t":);\
00381 \
00382 sad8_1_ ## suf(blk1 , blk2 , stride, h);\
00383 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
00384 \
00385 return sum_ ## suf();\
00386 }\
00387 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00388 {\
00389 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00390 "pxor %%mm6, %%mm6 \n\t"\
00391 "movq %0, %%mm5 \n\t"\
00392 :: "m"(round_tab[1]) \
00393 );\
00394 \
00395 sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
00396 sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
00397 \
00398 return sum_ ## suf();\
00399 }\
00400 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00401 {\
00402 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00403 "pxor %%mm6, %%mm6 \n\t"\
00404 "movq %0, %%mm5 \n\t"\
00405 :: "m"(round_tab[1]) \
00406 );\
00407 \
00408 sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
00409 sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
00410 \
00411 return sum_ ## suf();\
00412 }\
00413 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00414 {\
00415 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00416 "pxor %%mm6, %%mm6 \n\t"\
00417 ::);\
00418 \
00419 sad8_4_ ## suf(blk1 , blk2 , stride, h);\
00420 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
00421 \
00422 return sum_ ## suf();\
00423 }\
00424
/* Instantiate the function families for the plain-MMX and the mmx2 helpers. */
00425 PIX_SAD(mmx)
00426 PIX_SAD(mmx2)
00427
00428 #endif
00429
00430 void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
00431 {
00432 #if HAVE_INLINE_ASM
00433 int mm_flags = av_get_cpu_flags();
00434
00435 if (mm_flags & AV_CPU_FLAG_MMX) {
00436 c->pix_abs[0][0] = sad16_mmx;
00437 c->pix_abs[0][1] = sad16_x2_mmx;
00438 c->pix_abs[0][2] = sad16_y2_mmx;
00439 c->pix_abs[0][3] = sad16_xy2_mmx;
00440 c->pix_abs[1][0] = sad8_mmx;
00441 c->pix_abs[1][1] = sad8_x2_mmx;
00442 c->pix_abs[1][2] = sad8_y2_mmx;
00443 c->pix_abs[1][3] = sad8_xy2_mmx;
00444
00445 c->sad[0]= sad16_mmx;
00446 c->sad[1]= sad8_mmx;
00447 }
00448 if (mm_flags & AV_CPU_FLAG_MMXEXT) {
00449 c->pix_abs[0][0] = sad16_mmx2;
00450 c->pix_abs[1][0] = sad8_mmx2;
00451
00452 c->sad[0]= sad16_mmx2;
00453 c->sad[1]= sad8_mmx2;
00454
00455 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00456 c->pix_abs[0][1] = sad16_x2_mmx2;
00457 c->pix_abs[0][2] = sad16_y2_mmx2;
00458 c->pix_abs[0][3] = sad16_xy2_mmx2;
00459 c->pix_abs[1][1] = sad8_x2_mmx2;
00460 c->pix_abs[1][2] = sad8_y2_mmx2;
00461 c->pix_abs[1][3] = sad8_xy2_mmx2;
00462 }
00463 }
00464 if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
00465 c->sad[0]= sad16_sse2;
00466 }
00467 #endif
00468 }