00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/x86_cpu.h"
00026 #include "libavcodec/dsputil.h"
00027 #include "dsputil_mmx.h"
00028
/*
 * Rounding constants, one value replicated into each of the four 16-bit
 * lanes of a quadword: entry 0 adds 0, entry 1 adds 1, entry 2 adds 2.
 * In this file entry 1 is loaded into mm5 by the x2/y2 entry points
 * (rounding for a two-sample average) and entry 2 is read at byte offset 16
 * by sad8_4_mmx (rounding for a four-sample average).
 */
00029 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
00030 0x0000000000000000ULL,
00031 0x0001000100010001ULL,
00032 0x0002000200020002ULL,
00033 };
00034
/*
 * The value 1 replicated into every byte of a quadword. sad8_4_mmx2 loads it
 * into mm5 and subtracts it (with unsigned saturation) from a byte average.
 */
00035 DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
00036
/*
 * Add the sum of absolute differences between two 8-byte-wide blocks to the
 * word lanes of mm6, using only baseline MMX instructions.
 *
 * blk1, blk2: top-left bytes of the two blocks.
 * stride:     distance in bytes between consecutive rows (same for both).
 * h:          number of rows; the loop consumes two rows per iteration, so
 *             an even h is presumably expected -- TODO confirm with callers.
 *
 * Register contract visible in this file: mm7 must already be zero (it is
 * the unpack partner) and mm6 is the running accumulator; the PIX_SAD entry
 * points clear both before calling. mm0-mm5 are overwritten. The asm has no
 * clobber list, so the compiler is not told about any of this.
 */
00037 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00038 {
/* Negative byte offset that counts up towards zero; the pointers passed to
 * the asm are biased by -len so that (pointer + len) is the first row. */
00039 x86_reg len= -(stride*h);
00040 __asm__ volatile(
00041 ASMALIGN(4)
00042 "1: \n\t"
/* first row of the pair: mm0 = blk1 row, mm2 = mm4 = blk2 row */
00043 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00044 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00045 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00046 "add %3, %%"REG_a" \n\t"
/* saturating a-b and b-a; one of the two is zero for every byte */
00047 "psubusb %%mm0, %%mm2 \n\t"
00048 "psubusb %%mm4, %%mm0 \n\t"
/* second row of the pair: mm1 = blk1 row, mm3 = mm5 = blk2 row */
00049 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00050 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00051 "movq (%2, %%"REG_a"), %%mm5 \n\t"
00052 "psubusb %%mm1, %%mm3 \n\t"
00053 "psubusb %%mm5, %%mm1 \n\t"
/* OR the two one-sided differences to get |a-b| per byte */
00054 "por %%mm2, %%mm0 \n\t"
00055 "por %%mm1, %%mm3 \n\t"
/* widen bytes to words against mm7 and add everything into mm6 */
00056 "movq %%mm0, %%mm1 \n\t"
00057 "movq %%mm3, %%mm2 \n\t"
00058 "punpcklbw %%mm7, %%mm0 \n\t"
00059 "punpckhbw %%mm7, %%mm1 \n\t"
00060 "punpcklbw %%mm7, %%mm3 \n\t"
00061 "punpckhbw %%mm7, %%mm2 \n\t"
00062 "paddw %%mm1, %%mm0 \n\t"
00063 "paddw %%mm3, %%mm2 \n\t"
00064 "paddw %%mm2, %%mm0 \n\t"
00065 "paddw %%mm0, %%mm6 \n\t"
/* step to the next row pair; keep looping while the offset is negative */
00066 "add %3, %%"REG_a" \n\t"
00067 " js 1b \n\t"
00068 : "+a" (len)
00069 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
00070 );
00071 }
00072
/*
 * Add the sum of absolute differences between two 8-byte-wide blocks to mm6,
 * using the psadbw instruction.
 *
 * blk1, blk2: top-left bytes of the two blocks.
 * stride:     distance in bytes between consecutive rows.
 * h:          number of rows; it is decremented by 2 per iteration and the
 *             loop runs while it stays positive, so two rows are always
 *             handled together.
 *
 * mm6 is the accumulator and is expected to be initialised by the caller
 * (the PIX_SAD entry points clear it). mm0 and mm1 are overwritten. The asm
 * has no clobber list.
 */
00073 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00074 {
00075 __asm__ volatile(
00076 ASMALIGN(4)
00077 "1: \n\t"
/* load two rows of blk1 and take their SAD against the same rows of blk2 */
00078 "movq (%1), %%mm0 \n\t"
00079 "movq (%1, %3), %%mm1 \n\t"
00080 "psadbw (%2), %%mm0 \n\t"
00081 "psadbw (%2, %3), %%mm1 \n\t"
00082 "paddw %%mm0, %%mm6 \n\t"
00083 "paddw %%mm1, %%mm6 \n\t"
/* advance both pointers by two rows */
00084 "lea (%1,%3,2), %1 \n\t"
00085 "lea (%2,%3,2), %2 \n\t"
00086 "sub $2, %0 \n\t"
00087 " jg 1b \n\t"
00088 : "+r" (h), "+r" (blk1), "+r" (blk2)
00089 : "r" ((x86_reg)stride)
00090 );
00091 }
00092
00093 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
00094 {
00095 int ret;
00096 __asm__ volatile(
00097 "pxor %%xmm6, %%xmm6 \n\t"
00098 ASMALIGN(4)
00099 "1: \n\t"
00100 "movdqu (%1), %%xmm0 \n\t"
00101 "movdqu (%1, %3), %%xmm1 \n\t"
00102 "psadbw (%2), %%xmm0 \n\t"
00103 "psadbw (%2, %3), %%xmm1 \n\t"
00104 "paddw %%xmm0, %%xmm6 \n\t"
00105 "paddw %%xmm1, %%xmm6 \n\t"
00106 "lea (%1,%3,2), %1 \n\t"
00107 "lea (%2,%3,2), %2 \n\t"
00108 "sub $2, %0 \n\t"
00109 " jg 1b \n\t"
00110 : "+r" (h), "+r" (blk1), "+r" (blk2)
00111 : "r" ((x86_reg)stride)
00112 );
00113 __asm__ volatile(
00114 "movhlps %%xmm6, %%xmm0 \n\t"
00115 "paddw %%xmm0, %%xmm6 \n\t"
00116 "movd %%xmm6, %0 \n\t"
00117 : "=r"(ret)
00118 );
00119 return ret;
00120 }
00121
/*
 * Add to mm6 the SAD between blk2 and the horizontal half-sample
 * interpolation of blk1, for an 8-byte-wide block.
 *
 * Each blk1 row is averaged with the same row shifted by one byte (pavgb)
 * before psadbw against blk2. Two rows are handled per iteration; h is
 * decremented by 2 and the loop runs while it stays positive.
 *
 * mm6 is the accumulator initialised by the caller; mm0 and mm1 are
 * overwritten. The asm has no clobber list.
 */
00122 static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00123 {
00124 __asm__ volatile(
00125 ASMALIGN(4)
00126 "1: \n\t"
00127 "movq (%1), %%mm0 \n\t"
00128 "movq (%1, %3), %%mm1 \n\t"
/* average every byte with its right-hand neighbour */
00129 "pavgb 1(%1), %%mm0 \n\t"
00130 "pavgb 1(%1, %3), %%mm1 \n\t"
00131 "psadbw (%2), %%mm0 \n\t"
00132 "psadbw (%2, %3), %%mm1 \n\t"
00133 "paddw %%mm0, %%mm6 \n\t"
00134 "paddw %%mm1, %%mm6 \n\t"
00135 "lea (%1,%3,2), %1 \n\t"
00136 "lea (%2,%3,2), %2 \n\t"
00137 "sub $2, %0 \n\t"
00138 " jg 1b \n\t"
00139 : "+r" (h), "+r" (blk1), "+r" (blk2)
00140 : "r" ((x86_reg)stride)
00141 );
00142 }
00143
/*
 * Add to mm6 the SAD between blk2 and the vertical half-sample
 * interpolation of blk1, for an 8-byte-wide block.
 *
 * Each output row is the pavgb average of a blk1 row and the row below it.
 * The row loaded last in one iteration is kept in mm0 and reused as the
 * upper row of the next iteration, so h output rows read h+1 rows of blk1.
 * Two output rows are produced per iteration.
 *
 * mm6 is the accumulator initialised by the caller; mm0-mm2 are
 * overwritten. The asm has no clobber list.
 */
00144 static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00145 {
00146 __asm__ volatile(
/* prime mm0 with the first row and step blk1 to the second one */
00147 "movq (%1), %%mm0 \n\t"
00148 "add %3, %1 \n\t"
00149 ASMALIGN(4)
00150 "1: \n\t"
00151 "movq (%1), %%mm1 \n\t"
00152 "movq (%1, %3), %%mm2 \n\t"
/* mm0 = avg(row n, row n+1), mm1 = avg(row n+1, row n+2) */
00153 "pavgb %%mm1, %%mm0 \n\t"
00154 "pavgb %%mm2, %%mm1 \n\t"
00155 "psadbw (%2), %%mm0 \n\t"
00156 "psadbw (%2, %3), %%mm1 \n\t"
00157 "paddw %%mm0, %%mm6 \n\t"
00158 "paddw %%mm1, %%mm6 \n\t"
/* carry the lowest row over as the upper row of the next iteration */
00159 "movq %%mm2, %%mm0 \n\t"
00160 "lea (%1,%3,2), %1 \n\t"
00161 "lea (%2,%3,2), %2 \n\t"
00162 "sub $2, %0 \n\t"
00163 " jg 1b \n\t"
00164 : "+r" (h), "+r" (blk1), "+r" (blk2)
00165 : "r" ((x86_reg)stride)
00166 );
00167 }
00168
/*
 * Add to mm6 the SAD between blk2 and an approximate four-sample
 * (horizontal + vertical half-sample) interpolation of blk1, for an
 * 8-byte-wide block.
 *
 * Every row is first averaged with itself shifted by one byte, then
 * consecutive averaged rows are averaged with each other. Chaining pavgb
 * this way is not the same as one exactly rounded four-sample average; the
 * saturating subtraction of 1 from the middle row presumably compensates
 * for the double rounding -- NOTE(review): confirm. In this file the
 * routine is only installed when CODEC_FLAG_BITEXACT is not set.
 *
 * mm5 is loaded with the `bone` constant here; mm6 is the accumulator
 * initialised by the caller; mm0-mm2 are overwritten. The asm has no
 * clobber list.
 */
00169 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00170 {
00171 __asm__ volatile(
00172 "movq "MANGLE(bone)", %%mm5 \n\t"
/* prime mm0 with the horizontally averaged first row */
00173 "movq (%1), %%mm0 \n\t"
00174 "pavgb 1(%1), %%mm0 \n\t"
00175 "add %3, %1 \n\t"
00176 ASMALIGN(4)
00177 "1: \n\t"
00178 "movq (%1), %%mm1 \n\t"
00179 "movq (%1,%3), %%mm2 \n\t"
00180 "pavgb 1(%1), %%mm1 \n\t"
00181 "pavgb 1(%1,%3), %%mm2 \n\t"
/* subtract 1 per byte from the middle row before the vertical averages */
00182 "psubusb %%mm5, %%mm1 \n\t"
00183 "pavgb %%mm1, %%mm0 \n\t"
00184 "pavgb %%mm2, %%mm1 \n\t"
00185 "psadbw (%2), %%mm0 \n\t"
00186 "psadbw (%2,%3), %%mm1 \n\t"
00187 "paddw %%mm0, %%mm6 \n\t"
00188 "paddw %%mm1, %%mm6 \n\t"
/* the unadjusted average of the lowest row becomes the next upper row */
00189 "movq %%mm2, %%mm0 \n\t"
00190 "lea (%1,%3,2), %1 \n\t"
00191 "lea (%2,%3,2), %2 \n\t"
00192 "sub $2, %0 \n\t"
00193 " jg 1b \n\t"
00194 : "+r" (h), "+r" (blk1), "+r" (blk2)
00195 : "r" ((x86_reg)stride)
00196 );
00197 }
00198
/*
 * Add to mm6 the SAD between blk2 and the average of two source blocks
 * blk1a and blk1b, for an 8-byte-wide block, using only baseline MMX.
 *
 * Per byte the average is (a + b + r) >> 1, where r is whatever word
 * constant the caller has placed in mm5 (the PIX_SAD x2/y2 entry points
 * load round_tab[1]). One row is processed per iteration.
 *
 * Register contract visible in this file: mm7 must be zero, mm5 holds the
 * rounding constant and mm6 is the accumulator, all set up by the caller.
 * mm0-mm4 are overwritten. The asm has no clobber list.
 */
00199 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
00200 {
/* Negative byte offset counting up to zero; the pointers handed to the asm
 * are biased by -len so that (pointer + len) is the first row. */
00201 x86_reg len= -(stride*h);
00202 __asm__ volatile(
00203 ASMALIGN(4)
00204 "1: \n\t"
/* load both source rows twice: one copy for the low, one for the high bytes */
00205 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00206 "movq (%2, %%"REG_a"), %%mm1 \n\t"
00207 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00208 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00209 "punpcklbw %%mm7, %%mm0 \n\t"
00210 "punpcklbw %%mm7, %%mm1 \n\t"
00211 "punpckhbw %%mm7, %%mm2 \n\t"
00212 "punpckhbw %%mm7, %%mm3 \n\t"
/* word sums a+b: low half in mm1, high half in mm3 */
00213 "paddw %%mm0, %%mm1 \n\t"
00214 "paddw %%mm2, %%mm3 \n\t"
/* two copies of the blk2 row for the two-sided saturating subtraction */
00215 "movq (%3, %%"REG_a"), %%mm4 \n\t"
00216 "movq (%3, %%"REG_a"), %%mm2 \n\t"
/* add the rounding constant, halve, and pack back to bytes */
00217 "paddw %%mm5, %%mm1 \n\t"
00218 "paddw %%mm5, %%mm3 \n\t"
00219 "psrlw $1, %%mm1 \n\t"
00220 "psrlw $1, %%mm3 \n\t"
00221 "packuswb %%mm3, %%mm1 \n\t"
/* |avg - blk2| per byte */
00222 "psubusb %%mm1, %%mm4 \n\t"
00223 "psubusb %%mm2, %%mm1 \n\t"
00224 "por %%mm4, %%mm1 \n\t"
/* widen to words and accumulate */
00225 "movq %%mm1, %%mm0 \n\t"
00226 "punpcklbw %%mm7, %%mm0 \n\t"
00227 "punpckhbw %%mm7, %%mm1 \n\t"
00228 "paddw %%mm1, %%mm0 \n\t"
00229 "paddw %%mm0, %%mm6 \n\t"
00230 "add %4, %%"REG_a" \n\t"
00231 " js 1b \n\t"
00232 : "+a" (len)
00233 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
00234 );
00235 }
00236
/*
 * Add to mm6 the SAD between blk2 and the four-sample average of blk1
 * (each pixel averaged with its right, lower and lower-right neighbours),
 * for an 8-byte-wide block, using only baseline MMX.
 *
 * The average is computed in 16-bit words as
 * (p[x] + p[x+1] + q[x] + q[x+1] + 2) >> 2, with p the current row and q
 * the row below. The horizontal pair sum of the lower row is kept in
 * mm0/mm1 and reused as the upper row of the next iteration. One row is
 * processed per iteration.
 *
 * Register contract visible in this file: mm7 must be zero and mm6 is the
 * accumulator, both set up by the caller. mm0-mm5 are overwritten. The asm
 * has no clobber list.
 */
00237 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00238 {
/* Negative byte offset counting up to zero; the pointers handed to the asm
 * are biased by -len so that (pointer + len) is the first row. */
00239 x86_reg len= -(stride*h);
00240 __asm__ volatile(
/* prologue: mm0/mm1 = p[x] + p[x+1] of the first row (low/high words) */
00241 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00242 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
00243 "movq %%mm0, %%mm1 \n\t"
00244 "movq %%mm2, %%mm3 \n\t"
00245 "punpcklbw %%mm7, %%mm0 \n\t"
00246 "punpckhbw %%mm7, %%mm1 \n\t"
00247 "punpcklbw %%mm7, %%mm2 \n\t"
00248 "punpckhbw %%mm7, %%mm3 \n\t"
00249 "paddw %%mm2, %%mm0 \n\t"
00250 "paddw %%mm3, %%mm1 \n\t"
00251 ASMALIGN(4)
00252 "1: \n\t"
/* mm2/mm3 = q[x] + q[x+1] of the row below (operand %2 is blk1 + stride) */
00253 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00254 "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
00255 "movq %%mm2, %%mm3 \n\t"
00256 "movq %%mm4, %%mm5 \n\t"
00257 "punpcklbw %%mm7, %%mm2 \n\t"
00258 "punpckhbw %%mm7, %%mm3 \n\t"
00259 "punpcklbw %%mm7, %%mm4 \n\t"
00260 "punpckhbw %%mm7, %%mm5 \n\t"
00261 "paddw %%mm4, %%mm2 \n\t"
00262 "paddw %%mm5, %%mm3 \n\t"
/* byte offset 16 into round_tab is its third entry: 2 in every word */
00263 "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
00264 "paddw %%mm2, %%mm0 \n\t"
00265 "paddw %%mm3, %%mm1 \n\t"
00266 "paddw %%mm5, %%mm0 \n\t"
00267 "paddw %%mm5, %%mm1 \n\t"
/* two copies of the blk2 row for the two-sided saturating subtraction */
00268 "movq (%3, %%"REG_a"), %%mm4 \n\t"
00269 "movq (%3, %%"REG_a"), %%mm5 \n\t"
/* divide the four-sample sums by 4 and pack back to bytes */
00270 "psrlw $2, %%mm0 \n\t"
00271 "psrlw $2, %%mm1 \n\t"
00272 "packuswb %%mm1, %%mm0 \n\t"
/* |avg - blk2| per byte, widened to words and accumulated */
00273 "psubusb %%mm0, %%mm4 \n\t"
00274 "psubusb %%mm5, %%mm0 \n\t"
00275 "por %%mm4, %%mm0 \n\t"
00276 "movq %%mm0, %%mm4 \n\t"
00277 "punpcklbw %%mm7, %%mm0 \n\t"
00278 "punpckhbw %%mm7, %%mm4 \n\t"
00279 "paddw %%mm0, %%mm6 \n\t"
00280 "paddw %%mm4, %%mm6 \n\t"
/* the lower row's pair sums become the upper row of the next iteration */
00281 "movq %%mm2, %%mm0 \n\t"
00282 "movq %%mm3, %%mm1 \n\t"
00283 "add %4, %%"REG_a" \n\t"
00284 " js 1b \n\t"
00285 : "+a" (len)
00286 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
00287 );
00288 }
00289
/*
 * Reduce the four 16-bit partial sums held in mm6 to a single value.
 *
 * The upper dword is added onto the lower dword, then the upper word onto
 * the lower word, so the lowest word ends up holding the sum of all four
 * lanes (modulo 2^16). The other lanes then contain leftovers, which is why
 * the result is masked to 16 bits. mm0 and mm6 are overwritten.
 */
00290 static inline int sum_mmx(void)
00291 {
00292 int ret;
00293 __asm__ volatile(
00294 "movq %%mm6, %%mm0 \n\t"
00295 "psrlq $32, %%mm6 \n\t"
00296 "paddw %%mm0, %%mm6 \n\t"
00297 "movq %%mm6, %%mm0 \n\t"
00298 "psrlq $16, %%mm6 \n\t"
00299 "paddw %%mm0, %%mm6 \n\t"
00300 "movd %%mm6, %0 \n\t"
00301 : "=r" (ret)
00302 );
00303 return ret&0xFFFF;
00304 }
00305
/*
 * Return the low 32 bits of mm6.
 *
 * The *_mmx2 helpers in this file only add psadbw results into mm6, so no
 * cross-lane reduction is done here, unlike sum_mmx().
 */
00306 static inline int sum_mmx2(void)
00307 {
00308 int ret;
00309 __asm__ volatile(
00310 "movd %%mm6, %0 \n\t"
00311 : "=r" (ret)
00312 );
00313 return ret;
00314 }
00315
/*
 * Baseline-MMX SAD against the horizontal half-sample interpolation of blk1:
 * every pixel is averaged with its right-hand neighbour by sad8_2_mmx().
 * The caller must have prepared the registers sad8_2_mmx() relies on.
 */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    uint8_t *const right_neighbour = blk1 + 1;

    sad8_2_mmx(blk1, right_neighbour, blk2, stride, h);
}
/*
 * Baseline-MMX SAD against the vertical half-sample interpolation of blk1:
 * every pixel is averaged with the pixel one row below by sad8_2_mmx().
 * The caller must have prepared the registers sad8_2_mmx() relies on.
 */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    uint8_t *const row_below = blk1 + stride;

    sad8_2_mmx(blk1, row_below, blk2, stride, h);
}
00324
00325
/*
 * PIX_SAD(suf) generates the eight public SAD entry points for one
 * instruction-set flavour `suf`:
 *
 *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf   (8 bytes wide)
 *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf  (16 bytes wide)
 *
 * Every entry point
 *   1. clears mm7 (zero operand for the unpack instructions) and mm6 (the
 *      accumulator); the x2 and y2 variants also load round_tab[1] into mm5;
 *   2. calls the matching helper (sad8_1_, sad8_x2a_, sad8_y2a_ or sad8_4_
 *      followed by suf), once for the 8-wide variants and twice (columns
 *      0-7, then 8-15) for the 16-wide variants;
 *   3. returns sum_suf().
 *
 * The entry points receive (v, blk2, blk1, ...) but call the helpers with
 * (blk1, blk2, ...). `v` is not used. The 8-wide variants assert h == 8 and
 * pass the constant 8 to the helper; the 16-wide variants forward h.
 *
 * NOTE(review): the register state prepared in step 1 has to survive until
 * the helper's own asm statement runs, and none of the asm statements
 * declares clobbers; correctness therefore depends on the compiler not
 * touching MMX registers in between.
 */
00326 #define PIX_SAD(suf)\
00327 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00328 {\
00329 assert(h==8);\
00330 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00331 "pxor %%mm6, %%mm6 \n\t":);\
00332 \
00333 sad8_1_ ## suf(blk1, blk2, stride, 8);\
00334 \
00335 return sum_ ## suf();\
00336 }\
00337 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00338 {\
00339 assert(h==8);\
00340 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00341 "pxor %%mm6, %%mm6 \n\t"\
00342 "movq %0, %%mm5 \n\t"\
00343 :: "m"(round_tab[1]) \
00344 );\
00345 \
00346 sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
00347 \
00348 return sum_ ## suf();\
00349 }\
00350 \
00351 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00352 {\
00353 assert(h==8);\
00354 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00355 "pxor %%mm6, %%mm6 \n\t"\
00356 "movq %0, %%mm5 \n\t"\
00357 :: "m"(round_tab[1]) \
00358 );\
00359 \
00360 sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
00361 \
00362 return sum_ ## suf();\
00363 }\
00364 \
00365 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00366 {\
00367 assert(h==8);\
00368 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00369 "pxor %%mm6, %%mm6 \n\t"\
00370 ::);\
00371 \
00372 sad8_4_ ## suf(blk1, blk2, stride, 8);\
00373 \
00374 return sum_ ## suf();\
00375 }\
00376 \
00377 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00378 {\
00379 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00380 "pxor %%mm6, %%mm6 \n\t":);\
00381 \
00382 sad8_1_ ## suf(blk1 , blk2 , stride, h);\
00383 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
00384 \
00385 return sum_ ## suf();\
00386 }\
00387 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00388 {\
00389 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00390 "pxor %%mm6, %%mm6 \n\t"\
00391 "movq %0, %%mm5 \n\t"\
00392 :: "m"(round_tab[1]) \
00393 );\
00394 \
00395 sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
00396 sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
00397 \
00398 return sum_ ## suf();\
00399 }\
00400 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00401 {\
00402 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00403 "pxor %%mm6, %%mm6 \n\t"\
00404 "movq %0, %%mm5 \n\t"\
00405 :: "m"(round_tab[1]) \
00406 );\
00407 \
00408 sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
00409 sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
00410 \
00411 return sum_ ## suf();\
00412 }\
00413 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00414 {\
00415 __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
00416 "pxor %%mm6, %%mm6 \n\t"\
00417 ::);\
00418 \
00419 sad8_4_ ## suf(blk1 , blk2 , stride, h);\
00420 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
00421 \
00422 return sum_ ## suf();\
00423 }\
00424
/* Instantiate the entry points for the baseline-MMX and the MMX2 helpers. */
00425 PIX_SAD(mmx)
00426 PIX_SAD(mmx2)
00427
00428 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
00429 {
00430 if (mm_flags & FF_MM_MMX) {
00431 c->pix_abs[0][0] = sad16_mmx;
00432 c->pix_abs[0][1] = sad16_x2_mmx;
00433 c->pix_abs[0][2] = sad16_y2_mmx;
00434 c->pix_abs[0][3] = sad16_xy2_mmx;
00435 c->pix_abs[1][0] = sad8_mmx;
00436 c->pix_abs[1][1] = sad8_x2_mmx;
00437 c->pix_abs[1][2] = sad8_y2_mmx;
00438 c->pix_abs[1][3] = sad8_xy2_mmx;
00439
00440 c->sad[0]= sad16_mmx;
00441 c->sad[1]= sad8_mmx;
00442 }
00443 if (mm_flags & FF_MM_MMX2) {
00444 c->pix_abs[0][0] = sad16_mmx2;
00445 c->pix_abs[1][0] = sad8_mmx2;
00446
00447 c->sad[0]= sad16_mmx2;
00448 c->sad[1]= sad8_mmx2;
00449
00450 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00451 c->pix_abs[0][1] = sad16_x2_mmx2;
00452 c->pix_abs[0][2] = sad16_y2_mmx2;
00453 c->pix_abs[0][3] = sad16_xy2_mmx2;
00454 c->pix_abs[1][1] = sad8_x2_mmx2;
00455 c->pix_abs[1][2] = sad8_y2_mmx2;
00456 c->pix_abs[1][3] = sad8_xy2_mmx2;
00457 }
00458 }
00459 if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW) && avctx->codec_id != CODEC_ID_SNOW) {
00460 c->sad[0]= sad16_sse2;
00461 }
00462 }