00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00026 #include "libavutil/x86_cpu.h"
00027
/*
 * Instruction-selection macros used by the inline assembly below.
 * Each macro expands to a string fragment of AT&T-syntax assembly built by
 * stringifying its arguments. Any earlier definitions are dropped first
 * (presumably because this template is expanded more than once with different
 * HAVE_* settings -- confirm against the including file).
 */
00028 #undef REAL_PAVGB
00029 #undef PAVGB
00030 #undef PMINUB
00031 #undef PMAXUB
00032
/*
 * PAVGB(a,b): byte-wise average of a and b, result in b.
 * Emits "pavgb" when MMX2 is available, "pavgusb" for 3DNow!.
 * With neither, REAL_PAVGB stays undefined, so PAVGB must not be used.
 * The PAVGB -> REAL_PAVGB indirection lets arguments that are themselves
 * macros be expanded before they are stringified.
 */
00033 #if HAVE_MMX2
00034 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
00035 #elif HAVE_AMD3DNOW
00036 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
00037 #endif
00038 #define PAVGB(a,b) REAL_PAVGB(a,b)
00039
/*
 * PMINUB(x,y,t): unsigned byte-wise minimum of x and y, result in y.
 * MMX2 uses "pminub" directly and ignores t.
 * Plain MMX emulates it with t as scratch: t = y; t = sat(t - x); y -= t.
 * Note the emulation names its parameters (b,a,t), i.e. swapped relative to
 * the MMX2 variant, so the second argument is the destination in both.
 */
00040 #if HAVE_MMX2
00041 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
00042 #elif HAVE_MMX
00043 #define PMINUB(b,a,t) \
00044 "movq " #a ", " #t " \n\t"\
00045 "psubusb " #b ", " #t " \n\t"\
00046 "psubb " #t ", " #a " \n\t"
00047 #endif
00048
/*
 * PMAXUB(a,b): unsigned byte-wise maximum of a and b, result in b.
 * MMX2 uses "pmaxub"; plain MMX computes b = sat(b - a) + a.
 */
00049 #if HAVE_MMX2
00050 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
00051 #elif HAVE_MMX
00052 #define PMAXUB(a,b) \
00053 "psubusb " #a ", " #b " \n\t"\
00054 "paddb " #a ", " #b " \n\t"
00055 #endif
00056
00057
00058 #if HAVE_MMX
00059
/**
 * Classify the 8-byte-wide, 8-row area that starts 4 rows below src.
 *
 * For each of the 7 pairs of vertically adjacent rows, the byte-wise
 * difference plus c->mmxDcOffset[c->nonBQP] is compared (signed) against
 * c->mmxDcThreshold[c->nonBQP]; the number of bytes passing that test is
 * counted in numEq. In parallel the per-column maximum and minimum over the
 * 8 rows are tracked.
 * (Presumably offset/threshold are set up so the test means "difference is
 * small" -- confirm where mmxDcOffset/mmxDcThreshold are initialised.)
 *
 * @param src    pointer to the block; rows src+4*stride .. src+11*stride are read
 * @param stride distance in bytes between two rows
 * @param c      context supplying mmxDcOffset, mmxDcThreshold, nonBQP, pQPb and
 *               ppMode.flatnessThreshold
 * @return 2 if numEq is not greater than c->ppMode.flatnessThreshold;
 *         otherwise 0 if any column has (max - min) > 2*pQPb (saturating),
 *         else 1
 */
00062 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
00063 int numEq= 0, dcOk;
00064 src+= stride*4;
/* Preload mm7/mm6; the next asm statement relies on these registers still
 * holding the offset and threshold. */
00065 __asm__ volatile(
00066 "movq %0, %%mm7 \n\t" // mm7 = DC offset
00067 "movq %1, %%mm6 \n\t" // mm6 = DC threshold
00068 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
00069 );
00070
00071 __asm__ volatile(
00072 "lea (%2, %3), %%"REG_a" \n\t" // REG_a = src + stride
00073
00074
00075
00076 "movq (%2), %%mm0 \n\t" // row 0
00077 "movq (%%"REG_a"), %%mm1 \n\t" // row 1
00078 "movq %%mm0, %%mm3 \n\t" // mm3 = running minimum
00079 "movq %%mm0, %%mm4 \n\t" // mm4 = running maximum
00080 PMAXUB(%%mm1, %%mm4)
00081 PMINUB(%%mm1, %%mm3, %%mm5)
00082 "psubb %%mm1, %%mm0 \n\t" // row0 - row1 (wrapping)
00083 "paddb %%mm7, %%mm0 \n\t" // + offset
00084 "pcmpgtb %%mm6, %%mm0 \n\t" // 0xFF (-1) where > threshold; mm0 is the accumulator
00085
00086 "movq (%%"REG_a",%3), %%mm2 \n\t" // row 2
00087 PMAXUB(%%mm2, %%mm4)
00088 PMINUB(%%mm2, %%mm3, %%mm5)
00089 "psubb %%mm2, %%mm1 \n\t"
00090 "paddb %%mm7, %%mm1 \n\t"
00091 "pcmpgtb %%mm6, %%mm1 \n\t"
00092 "paddb %%mm1, %%mm0 \n\t" // accumulate -1 per passing byte
00093
00094 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" // row 3
00095 PMAXUB(%%mm1, %%mm4)
00096 PMINUB(%%mm1, %%mm3, %%mm5)
00097 "psubb %%mm1, %%mm2 \n\t"
00098 "paddb %%mm7, %%mm2 \n\t"
00099 "pcmpgtb %%mm6, %%mm2 \n\t"
00100 "paddb %%mm2, %%mm0 \n\t"
00101
00102 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" // REG_a = src + 5*stride
00103
00104 "movq (%2, %3, 4), %%mm2 \n\t" // row 4
00105 PMAXUB(%%mm2, %%mm4)
00106 PMINUB(%%mm2, %%mm3, %%mm5)
00107 "psubb %%mm2, %%mm1 \n\t"
00108 "paddb %%mm7, %%mm1 \n\t"
00109 "pcmpgtb %%mm6, %%mm1 \n\t"
00110 "paddb %%mm1, %%mm0 \n\t"
00111
00112 "movq (%%"REG_a"), %%mm1 \n\t" // row 5
00113 PMAXUB(%%mm1, %%mm4)
00114 PMINUB(%%mm1, %%mm3, %%mm5)
00115 "psubb %%mm1, %%mm2 \n\t"
00116 "paddb %%mm7, %%mm2 \n\t"
00117 "pcmpgtb %%mm6, %%mm2 \n\t"
00118 "paddb %%mm2, %%mm0 \n\t"
00119
00120 "movq (%%"REG_a", %3), %%mm2 \n\t" // row 6
00121 PMAXUB(%%mm2, %%mm4)
00122 PMINUB(%%mm2, %%mm3, %%mm5)
00123 "psubb %%mm2, %%mm1 \n\t"
00124 "paddb %%mm7, %%mm1 \n\t"
00125 "pcmpgtb %%mm6, %%mm1 \n\t"
00126 "paddb %%mm1, %%mm0 \n\t"
00127
00128 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" // row 7
00129 PMAXUB(%%mm1, %%mm4)
00130 PMINUB(%%mm1, %%mm3, %%mm5)
00131 "psubb %%mm1, %%mm2 \n\t"
00132 "paddb %%mm7, %%mm2 \n\t"
00133 "pcmpgtb %%mm6, %%mm2 \n\t"
00134 "paddb %%mm2, %%mm0 \n\t"
00135 "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min per column
00136
00137 " \n\t"
00138 #if HAVE_MMX2
00139 "pxor %%mm7, %%mm7 \n\t"
00140 "psadbw %%mm7, %%mm0 \n\t" // horizontal sum of the 8 counter bytes
00141 #else
// Without psadbw: fold the 8 counter bytes together with shifts and byte adds;
// only the low byte of the result is meaningful (masked with 0xFF below).
00142 "movq %%mm0, %%mm1 \n\t"
00143 "psrlw $8, %%mm0 \n\t"
00144 "paddb %%mm1, %%mm0 \n\t"
00145 "movq %%mm0, %%mm1 \n\t"
00146 "psrlq $16, %%mm0 \n\t"
00147 "paddb %%mm1, %%mm0 \n\t"
00148 "movq %%mm0, %%mm1 \n\t"
00149 "psrlq $32, %%mm0 \n\t"
00150 "paddb %%mm1, %%mm0 \n\t"
00151 #endif
00152 "movq %4, %%mm7 \n\t" // mm7 = pQPb
00153 "paddusb %%mm7, %%mm7 \n\t" // mm7 = 2*pQPb (saturating)
00154 "psubusb %%mm7, %%mm4 \n\t" // nonzero bytes where (max - min) > 2*pQPb
00155 "packssdw %%mm4, %%mm4 \n\t" // squeeze both dwords into the low dword so movd sees all 8 bytes
00156 "movd %%mm0, %0 \n\t"
00157 "movd %%mm4, %1 \n\t"
00158
00159 : "=r" (numEq), "=r" (dcOk)
00160 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
00161 : "%"REG_a
00162 );
00163
/* Each passing byte contributed -1, so negate and keep the low byte to get
 * the count. */
00164 numEq= (-numEq) &0xFF;
00165 if(numEq > c->ppMode.flatnessThreshold){
00166 if(dcOk) return 0;
00167 else return 1;
00168 }else{
00169 return 2;
00170 }
00171 }
00172 #endif //HAVE_MMX
00173
00178 #if !HAVE_ALTIVEC
/**
 * Vertical low-pass filter across a horizontal block edge.
 *
 * Working relative to src + 3*stride, rows 1..8 are rewritten in place using
 * rows 0..9 as input. Rows 0 and 9 are used only to choose the edge values
 * "first" and "last" that stand in for samples outside rows 1..8.
 *
 * The C fallback implements the 9-tap kernel (1,1,2,2,4,2,2,1,1)/16 with
 * rounding (+4 in the partial sums, i.e. +8 before the >>4), one column at a
 * time for BLOCK_SIZE columns. The MMX2/3DNow! variant processes 8 columns
 * at once and builds its results from chained PAVGB byte averages, so its
 * rounding is not necessarily bit-identical to the C code.
 *
 * @param src    pointer to the block; modified in place
 * @param stride distance in bytes between two rows
 * @param c      context; pQPb (asm) or QP (C) is the threshold used when
 *               choosing the edge values
 */
00179 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
00180 {
00181 #if HAVE_MMX2 || HAVE_AMD3DNOW
00182 src+= stride*3;
00183 __asm__ volatile(
00184 "movq %2, %%mm0 \n\t" // mm0 = pQPb
00185 "pxor %%mm4, %%mm4 \n\t" // mm4 = 0
00186
00187 "movq (%0), %%mm6 \n\t" // row 0
00188 "movq (%0, %1), %%mm5 \n\t" // row 1
00189 "movq %%mm5, %%mm1 \n\t"
00190 "movq %%mm6, %%mm2 \n\t"
00191 "psubusb %%mm6, %%mm5 \n\t"
00192 "psubusb %%mm1, %%mm2 \n\t"
00193 "por %%mm5, %%mm2 \n\t" // |row0 - row1|
00194 "psubusb %%mm0, %%mm2 \n\t"
00195 "pcmpeqb %%mm4, %%mm2 \n\t" // mask: |row0 - row1| <= QP
00196
00197 "pand %%mm2, %%mm6 \n\t"
00198 "pandn %%mm1, %%mm2 \n\t"
00199 "por %%mm2, %%mm6 \n\t" // mm6 = "first": row0 where mask set, else row1
00200
00201 "movq (%0, %1, 8), %%mm5 \n\t" // row 8
00202 "lea (%0, %1, 4), %%"REG_a" \n\t" // REG_a -> row 4
00203 "lea (%0, %1, 8), %%"REG_c" \n\t"
00204 "sub %1, %%"REG_c" \n\t" // REG_c -> row 7
00205 "add %1, %0 \n\t" // %0 -> row 1 (undone by the final "sub" below)
00206 "movq (%0, %1, 8), %%mm7 \n\t" // row 9
00207 "movq %%mm5, %%mm1 \n\t"
00208 "movq %%mm7, %%mm2 \n\t"
00209 "psubusb %%mm7, %%mm5 \n\t"
00210 "psubusb %%mm1, %%mm2 \n\t"
00211 "por %%mm5, %%mm2 \n\t" // |row8 - row9|
00212 "psubusb %%mm0, %%mm2 \n\t"
00213 "pcmpeqb %%mm4, %%mm2 \n\t" // mask: |row8 - row9| <= QP
00214
00215 "pand %%mm2, %%mm7 \n\t"
00216 "pandn %%mm1, %%mm2 \n\t"
00217 "por %%mm2, %%mm7 \n\t" // mm7 = "last": row9 where mask set, else row8
00218
00219
00220
00221
00222
00223
00224
00225
// From here on the filtered rows are assembled from chains of PAVGB averages.
// Each row is loaded before it is overwritten so later outputs still see the
// unfiltered input.
00226 "movq (%0, %1), %%mm0 \n\t"
00227 "movq %%mm0, %%mm1 \n\t"
00228 PAVGB(%%mm6, %%mm0)
00229 PAVGB(%%mm6, %%mm0)
00230
00231 "movq (%0, %1, 4), %%mm2 \n\t"
00232 "movq %%mm2, %%mm5 \n\t"
00233 PAVGB((%%REGa), %%mm2)
00234 PAVGB((%0, %1, 2), %%mm2)
00235 "movq %%mm2, %%mm3 \n\t"
00236 "movq (%0), %%mm4 \n\t"
00237 PAVGB(%%mm4, %%mm3)
00238 PAVGB(%%mm0, %%mm3)
00239 "movq %%mm3, (%0) \n\t" // store row 1
00240
00241 "movq %%mm1, %%mm0 \n\t"
00242 PAVGB(%%mm6, %%mm0)
00243 "movq %%mm4, %%mm3 \n\t"
00244 PAVGB((%0,%1,2), %%mm3)
00245 PAVGB((%%REGa,%1,2), %%mm5)
00246 PAVGB((%%REGa), %%mm5)
00247 PAVGB(%%mm5, %%mm3)
00248 PAVGB(%%mm0, %%mm3)
00249 "movq %%mm3, (%0,%1) \n\t" // store row 2
00250
00251 PAVGB(%%mm4, %%mm6)
00252 "movq (%%"REG_c"), %%mm0 \n\t"
00253 PAVGB((%%REGa, %1, 2), %%mm0)
00254 "movq %%mm0, %%mm3 \n\t"
00255 PAVGB(%%mm1, %%mm0)
00256 PAVGB(%%mm6, %%mm0)
00257 PAVGB(%%mm2, %%mm0)
00258 "movq (%0, %1, 2), %%mm2 \n\t" // keep unfiltered row 3 for later rows
00259 "movq %%mm0, (%0, %1, 2) \n\t" // store row 3
00260
00261 "movq (%%"REG_a", %1, 4), %%mm0 \n\t"
00262 PAVGB((%%REGc), %%mm0)
00263 PAVGB(%%mm0, %%mm6)
00264 PAVGB(%%mm1, %%mm4)
00265 PAVGB(%%mm2, %%mm1)
00266 PAVGB(%%mm1, %%mm6)
00267 PAVGB(%%mm5, %%mm6)
00268 "movq (%%"REG_a"), %%mm5 \n\t" // keep unfiltered row 4
00269 "movq %%mm6, (%%"REG_a") \n\t" // store row 4
00270
00271 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
00272 PAVGB(%%mm7, %%mm6)
00273 PAVGB(%%mm4, %%mm6)
00274 PAVGB(%%mm3, %%mm6)
00275 PAVGB(%%mm5, %%mm2)
00276 "movq (%0, %1, 4), %%mm4 \n\t" // keep unfiltered row 5
00277 PAVGB(%%mm4, %%mm2)
00278 PAVGB(%%mm2, %%mm6)
00279 "movq %%mm6, (%0, %1, 4) \n\t" // store row 5
00280
00281 PAVGB(%%mm7, %%mm1)
00282 PAVGB(%%mm4, %%mm5)
00283 PAVGB(%%mm5, %%mm0)
00284 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // keep unfiltered row 6
00285 PAVGB(%%mm6, %%mm1)
00286 PAVGB(%%mm0, %%mm1)
00287 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // store row 6
00288
00289 PAVGB((%%REGc), %%mm2)
00290 "movq (%%"REG_a", %1, 4), %%mm0 \n\t"
00291 PAVGB(%%mm0, %%mm6)
00292 PAVGB(%%mm7, %%mm6)
00293 PAVGB(%%mm2, %%mm6)
00294 "movq %%mm6, (%%"REG_c") \n\t" // store row 7
00295
00296 PAVGB(%%mm7, %%mm5)
00297 PAVGB(%%mm7, %%mm5)
00298
00299 PAVGB(%%mm3, %%mm0)
00300 PAVGB(%%mm0, %%mm5)
00301 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // store row 8
00302 "sub %1, %0 \n\t" // restore %0, which is declared as an input operand
00303
00304 :
00305 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
00306 : "%"REG_a, "%"REG_c
00307 );
00308 #else //HAVE_MMX2 || HAVE_AMD3DNOW
/* Byte offsets of rows 1..9 relative to src (after the += below). */
00309 const int l1= stride;
00310 const int l2= stride + l1;
00311 const int l3= stride + l2;
00312 const int l4= stride + l3;
00313 const int l5= stride + l4;
00314 const int l6= stride + l5;
00315 const int l7= stride + l6;
00316 const int l8= stride + l7;
00317 const int l9= stride + l8;
00318 int x;
00319 src+= stride*3;
00320 for(x=0; x<BLOCK_SIZE; x++){
/* Edge values: use the outer row only if it is within QP of its neighbour. */
00321 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
00322 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
00323
/* sums[i] is a sliding 7-sample sum (plus rounding constant 4) over the
 * column, padded with "first" above row 1 and "last" below row 8. */
00324 int sums[10];
00325 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
00326 sums[1] = sums[0] - first + src[l4];
00327 sums[2] = sums[1] - first + src[l5];
00328 sums[3] = sums[2] - first + src[l6];
00329 sums[4] = sums[3] - first + src[l7];
00330 sums[5] = sums[4] - src[l1] + src[l8];
00331 sums[6] = sums[5] - src[l2] + last;
00332 sums[7] = sums[6] - src[l3] + last;
00333 sums[8] = sums[7] - src[l4] + last;
00334 sums[9] = sums[8] - src[l5] + last;
00335
/* Two overlapping sums plus twice the centre sample give the
 * (1,1,2,2,4,2,2,1,1)/16 weighting. */
00336 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
00337 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
00338 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
00339 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
00340 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
00341 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
00342 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
00343 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
00344
00345 src++;
00346 }
00347 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
00348 }
00349 #endif //HAVE_ALTIVEC
00350
/**
 * Vertical edge-smoothing filter that spreads a correction over six rows.
 *
 * Working relative to src + 3*stride, with a = row3-row4, b = row4-row5 and
 * c = row5-row6, the step across the edge is measured as
 *     d = max(|b| - (|a| + |c|)/2, 0).
 * If d is below 2*QP, rows 2..7 are adjusted by roughly d/8, d/4, 3d/8 on each
 * side of the edge, pulling rows 4 and 5 towards each other.
 *
 * The MMX2/3DNow! variant handles 8 columns at once, uses saturating byte
 * arithmetic for the updates and compares with "<=" against a saturated
 * 2*pQPb; the C variant handles BLOCK_SIZE columns one at a time and stores
 * the results without explicit clipping.
 *
 * @param src    pointer to the block; modified in place
 * @param stride distance in bytes between two rows
 * @param co     context supplying pQPb (asm) or QP (C)
 */
00358 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
00359 {
00360 #if HAVE_MMX2 || HAVE_AMD3DNOW
00361 src+= stride*3;
00362
00363 __asm__ volatile(
00364 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0
00365 "lea (%0, %1), %%"REG_a" \n\t" // REG_a -> row 1
00366 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" // REG_c -> row 5
00367
00368
00369 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // row 3
00370 "movq (%0, %1, 4), %%mm1 \n\t" // row 4
00371 "movq %%mm1, %%mm2 \n\t" // mm2 = row 4 (kept)
00372 "psubusb %%mm0, %%mm1 \n\t"
00373 "psubusb %%mm2, %%mm0 \n\t"
00374 "por %%mm1, %%mm0 \n\t" // |row3 - row4| = |a|
00375 "movq (%%"REG_c"), %%mm3 \n\t" // row 5
00376 "movq (%%"REG_c", %1), %%mm4 \n\t" // row 6
00377 "movq %%mm3, %%mm5 \n\t" // mm5 = row 5 (kept)
00378 "psubusb %%mm4, %%mm3 \n\t"
00379 "psubusb %%mm5, %%mm4 \n\t"
00380 "por %%mm4, %%mm3 \n\t" // |row5 - row6| = |c|
00381 PAVGB(%%mm3, %%mm0) // avg(|a|, |c|)
00382 "movq %%mm2, %%mm1 \n\t"
00383 "psubusb %%mm5, %%mm2 \n\t" // sat(row4 - row5)
00384 "movq %%mm2, %%mm4 \n\t"
00385 "pcmpeqb %%mm7, %%mm2 \n\t" // mm2 = direction mask: 0xFF where row4 <= row5
00386 "psubusb %%mm1, %%mm5 \n\t" // sat(row5 - row4)
00387 "por %%mm5, %%mm4 \n\t" // |row4 - row5| = |b|
00388 "psubusb %%mm0, %%mm4 \n\t" // d = sat(|b| - avg(|a|,|c|))
00389 "movq %%mm4, %%mm3 \n\t" // mm3 = d
00390 "movq %2, %%mm0 \n\t"
00391 "paddusb %%mm0, %%mm0 \n\t" // 2*pQPb (saturating)
00392 "psubusb %%mm0, %%mm4 \n\t"
00393 "pcmpeqb %%mm7, %%mm4 \n\t" // mask: d <= 2*QP
00394 "psubusb "MANGLE(b01)", %%mm3 \n\t" // d minus the b01 constant (presumably 1 per byte -- confirm)
00395 "pand %%mm4, %%mm3 \n\t" // zero d where the threshold test failed
00396
00397 PAVGB(%%mm7, %%mm3) // ~d/2
00398 "movq %%mm3, %%mm1 \n\t" // mm1 = ~d/2
00399 PAVGB(%%mm7, %%mm3) // ~d/4
00400 PAVGB(%%mm1, %%mm3) // avg(d/2, d/4) ~ 3d/8
00401
// The pxor / saturating op / pxor pattern below applies the correction in the
// direction given by the mask in mm2 while still clipping to 0..255.
00402 "movq (%0, %1, 4), %%mm0 \n\t" // row 4
00403 "pxor %%mm2, %%mm0 \n\t"
00404 "psubusb %%mm3, %%mm0 \n\t"
00405 "pxor %%mm2, %%mm0 \n\t"
00406 "movq %%mm0, (%0, %1, 4) \n\t"
00407
00408 "movq (%%"REG_c"), %%mm0 \n\t" // row 5
00409 "pxor %%mm2, %%mm0 \n\t"
00410 "paddusb %%mm3, %%mm0 \n\t"
00411 "pxor %%mm2, %%mm0 \n\t"
00412 "movq %%mm0, (%%"REG_c") \n\t"
00413
00414 PAVGB(%%mm7, %%mm1) // ~d/4
00415
00416 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // row 3
00417 "pxor %%mm2, %%mm0 \n\t"
00418 "psubusb %%mm1, %%mm0 \n\t"
00419 "pxor %%mm2, %%mm0 \n\t"
00420 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
00421
00422 "movq (%%"REG_c", %1), %%mm0 \n\t" // row 6
00423 "pxor %%mm2, %%mm0 \n\t"
00424 "paddusb %%mm1, %%mm0 \n\t"
00425 "pxor %%mm2, %%mm0 \n\t"
00426 "movq %%mm0, (%%"REG_c", %1) \n\t"
00427
00428 PAVGB(%%mm7, %%mm1) // ~d/8
00429
00430 "movq (%%"REG_a", %1), %%mm0 \n\t" // row 2
00431 "pxor %%mm2, %%mm0 \n\t"
00432 "psubusb %%mm1, %%mm0 \n\t"
00433 "pxor %%mm2, %%mm0 \n\t"
00434 "movq %%mm0, (%%"REG_a", %1) \n\t"
00435
00436 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // row 7
00437 "pxor %%mm2, %%mm0 \n\t"
00438 "paddusb %%mm1, %%mm0 \n\t"
00439 "pxor %%mm2, %%mm0 \n\t"
00440 "movq %%mm0, (%%"REG_c", %1, 2) \n\t"
00441
00442 :
00443 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
00444 : "%"REG_a, "%"REG_c
00445 );
00446 #else //HAVE_MMX2 || HAVE_AMD3DNOW
00447
/* Byte offsets of rows 1..7 relative to src (after the += below); l1 is only
 * used to derive the others. */
00448 const int l1= stride;
00449 const int l2= stride + l1;
00450 const int l3= stride + l2;
00451 const int l4= stride + l3;
00452 const int l5= stride + l4;
00453 const int l6= stride + l5;
00454 const int l7= stride + l6;
00455
00456
00457 int x;
00458
00459 src+= stride*3;
00460 for(x=0; x<BLOCK_SIZE; x++){
00461 int a= src[l3] - src[l4];
00462 int b= src[l4] - src[l5];
00463 int c= src[l5] - src[l6];
00464
/* Step across the edge in excess of the average neighbouring gradient. */
00465 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
00466 d= FFMAX(d, 0);
00467
00468 if(d < co->QP*2){
/* v has the sign that moves row 4 and row 5 towards each other. */
00469 int v = d * FFSIGN(-b);
00470
00471 src[l2] +=v>>3;
00472 src[l3] +=v>>2;
00473 src[l4] +=(3*v)>>3;
00474 src[l5] -=(3*v)>>3;
00475 src[l6] -=v>>2;
00476 src[l7] -=v>>3;
00477 }
00478 src++;
00479 }
00480 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
00481 }
00482
00483 #if !HAVE_ALTIVEC
/**
 * Default vertical deblocking filter: corrects only the two rows adjacent to
 * the block edge.
 *
 * Reference behaviour (C fallback, rows relative to src + 3*stride):
 *   middleEnergy = 5*(row5 - row4) + 2*(row3 - row6)
 *   if |middleEnergy| < 8*QP:
 *     leftEnergy  = 5*(row3 - row2) + 2*(row1 - row4)
 *     rightEnergy = 5*(row7 - row6) + 2*(row5 - row8)
 *     d = max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0)
 *     d = (5*d + 32) >> 6, given the sign of -middleEnergy
 *     d is clamped to lie between 0 and q = (row4 - row5)/2
 *     row4 -= d; row5 += d
 *
 * Three variants are selected at compile time:
 *  - MMX2 / 3DNow!: byte-domain version built from PAVGB averages (an
 *    alternative body is kept under "#if 0"); it appears to approximate the
 *    reference arithmetic rather than reproduce it exactly.
 *  - plain MMX: 16-bit word arithmetic following the same steps as the C code,
 *    using tmp[] as scratch storage.
 *  - C: one column at a time for BLOCK_SIZE columns.
 * The asm variants add 4*stride to src and therefore address the same rows as
 * offsets 0..7 (they write offsets 3 and 4).
 *
 * @param src    pointer to the block; modified in place
 * @param stride distance in bytes between two rows
 * @param c      context supplying pQPb (asm) or QP (C)
 */
00484 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
00485 {
00486 #if HAVE_MMX2 || HAVE_AMD3DNOW
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501 src+= stride*4;
00502 __asm__ volatile(
00503
00504 #if 0 //slightly more accurate and slightly slower
// Disabled alternative implementation, kept for reference only.
00505 "pxor %%mm7, %%mm7 \n\t"
00506 "lea (%0, %1), %%"REG_a" \n\t"
00507 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
00508
00509
00510
00511
00512
00513 "movq (%0, %1, 2), %%mm0 \n\t"
00514 "movq (%0), %%mm1 \n\t"
00515 "movq %%mm0, %%mm2 \n\t"
00516 PAVGB(%%mm7, %%mm0)
00517 PAVGB(%%mm1, %%mm0)
00518 PAVGB(%%mm2, %%mm0)
00519
00520 "movq (%%"REG_a"), %%mm1 \n\t"
00521 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
00522 "movq %%mm1, %%mm4 \n\t"
00523 PAVGB(%%mm7, %%mm1)
00524 PAVGB(%%mm3, %%mm1)
00525 PAVGB(%%mm4, %%mm1)
00526
00527 "movq %%mm0, %%mm4 \n\t"
00528 "psubusb %%mm1, %%mm0 \n\t"
00529 "psubusb %%mm4, %%mm1 \n\t"
00530 "por %%mm0, %%mm1 \n\t"
00531
00532
00533 "movq (%0, %1, 4), %%mm0 \n\t"
00534 "movq %%mm0, %%mm4 \n\t"
00535 PAVGB(%%mm7, %%mm0)
00536 PAVGB(%%mm2, %%mm0)
00537 PAVGB(%%mm4, %%mm0)
00538
00539 "movq (%%"REG_c"), %%mm2 \n\t"
00540 "movq %%mm3, %%mm5 \n\t"
00541 PAVGB(%%mm7, %%mm3)
00542 PAVGB(%%mm2, %%mm3)
00543 PAVGB(%%mm5, %%mm3)
00544
00545 "movq %%mm0, %%mm6 \n\t"
00546 "psubusb %%mm3, %%mm0 \n\t"
00547 "psubusb %%mm6, %%mm3 \n\t"
00548 "por %%mm0, %%mm3 \n\t"
00549 "pcmpeqb %%mm7, %%mm0 \n\t"
00550
00551
00552 "movq (%%"REG_c", %1), %%mm6 \n\t"
00553 "movq %%mm6, %%mm5 \n\t"
00554 PAVGB(%%mm7, %%mm6)
00555 PAVGB(%%mm4, %%mm6)
00556 PAVGB(%%mm5, %%mm6)
00557
00558 "movq (%%"REG_c", %1, 2), %%mm5 \n\t"
00559 "movq %%mm2, %%mm4 \n\t"
00560 PAVGB(%%mm7, %%mm2)
00561 PAVGB(%%mm5, %%mm2)
00562 PAVGB(%%mm4, %%mm2)
00563
00564 "movq %%mm6, %%mm4 \n\t"
00565 "psubusb %%mm2, %%mm6 \n\t"
00566 "psubusb %%mm4, %%mm2 \n\t"
00567 "por %%mm6, %%mm2 \n\t"
00568
00569
00570
00571 PMINUB(%%mm2, %%mm1, %%mm4)
00572 "movq %2, %%mm4 \n\t"
00573 "paddusb "MANGLE(b01)", %%mm4 \n\t"
00574 "pcmpgtb %%mm3, %%mm4 \n\t"
00575 "psubusb %%mm1, %%mm3 \n\t"
00576 "pand %%mm4, %%mm3 \n\t"
00577
00578 "movq %%mm3, %%mm1 \n\t"
00579
00580 PAVGB(%%mm7, %%mm3)
00581 PAVGB(%%mm7, %%mm3)
00582 "paddusb %%mm1, %%mm3 \n\t"
00583
00584
00585 "movq (%%"REG_a", %1, 2), %%mm6 \n\t"
00586 "movq (%0, %1, 4), %%mm5 \n\t"
00587 "movq (%0, %1, 4), %%mm4 \n\t"
00588 "psubusb %%mm6, %%mm5 \n\t"
00589 "psubusb %%mm4, %%mm6 \n\t"
00590 "por %%mm6, %%mm5 \n\t"
00591 "pcmpeqb %%mm7, %%mm6 \n\t"
00592 "pxor %%mm6, %%mm0 \n\t"
00593 "pand %%mm0, %%mm3 \n\t"
00594 PMINUB(%%mm5, %%mm3, %%mm0)
00595
00596 "psubusb "MANGLE(b01)", %%mm3 \n\t"
00597 PAVGB(%%mm7, %%mm3)
00598
00599 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
00600 "movq (%0, %1, 4), %%mm2 \n\t"
00601 "pxor %%mm6, %%mm0 \n\t"
00602 "pxor %%mm6, %%mm2 \n\t"
00603 "psubb %%mm3, %%mm0 \n\t"
00604 "paddb %%mm3, %%mm2 \n\t"
00605 "pxor %%mm6, %%mm0 \n\t"
00606 "pxor %%mm6, %%mm2 \n\t"
00607 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
00608 "movq %%mm2, (%0, %1, 4) \n\t"
00609 #endif //0
00610
// Active MMX2/3DNow! implementation. Signed quantities are carried in unsigned
// bytes: a row is complemented with the all-ones mm6 and then averaged with
// another row, which gives about half their difference with a bias (the b80
// constant is presumably 0x80 per byte -- confirm).
00611 "lea (%0, %1), %%"REG_a" \n\t" // REG_a -> row 1
00612 "pcmpeqb %%mm6, %%mm6 \n\t" // mm6 = all ones
00613
00614
00615
00616
00617
00618 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // row 3
00619 "movq (%0, %1, 4), %%mm0 \n\t" // row 4
00620 "pxor %%mm6, %%mm1 \n\t" // ~row3
00621 PAVGB(%%mm1, %%mm0) // avg(row4, ~row3)
00622
00623
00624 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // row 5
00625 "movq (%%"REG_a", %1), %%mm3 \n\t" // row 2
00626 "pxor %%mm6, %%mm2 \n\t" // ~row5
00627 "movq %%mm2, %%mm5 \n\t"
00628 "movq "MANGLE(b80)", %%mm4 \n\t"
00629 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" // REG_c -> row 5
00630 PAVGB(%%mm3, %%mm2)
00631 PAVGB(%%mm0, %%mm4)
00632 PAVGB(%%mm2, %%mm4)
00633 PAVGB(%%mm0, %%mm4) // mm4: biased estimate for the middle term
00634
00635
00636 "movq (%%"REG_a"), %%mm2 \n\t" // row 1
00637 "pxor %%mm6, %%mm2 \n\t"
00638 PAVGB(%%mm3, %%mm2)
00639 PAVGB((%0), %%mm1)
00640 "movq "MANGLE(b80)", %%mm3 \n\t"
00641 PAVGB(%%mm2, %%mm3)
00642 PAVGB(%%mm1, %%mm3)
00643 PAVGB(%%mm2, %%mm3) // mm3: biased estimate for the term built from rows 0..3
00644
00645
00646 PAVGB((%%REGc, %1), %%mm5)
00647 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // row 7
00648 "pxor %%mm6, %%mm1 \n\t"
00649 PAVGB((%0, %1, 4), %%mm1)
00650 "movq "MANGLE(b80)", %%mm2 \n\t"
00651 PAVGB(%%mm5, %%mm2)
00652 PAVGB(%%mm1, %%mm2)
00653 PAVGB(%%mm5, %%mm2) // mm2: biased estimate for the term built from rows 4..7
00654
00655
// max(x, 0 - x) followed by a minimum: presumably the smaller magnitude of the
// two side terms, left in mm3 -- confirm against the C fallback.
00656 "movq "MANGLE(b00)", %%mm1 \n\t"
00657 "movq "MANGLE(b00)", %%mm5 \n\t"
00658 "psubb %%mm2, %%mm1 \n\t"
00659 "psubb %%mm3, %%mm5 \n\t"
00660 PMAXUB(%%mm1, %%mm2)
00661 PMAXUB(%%mm5, %%mm3)
00662 PMINUB(%%mm2, %%mm3, %%mm1)
00663
00664
00665
00666 "movq "MANGLE(b00)", %%mm7 \n\t"
00667 "movq %2, %%mm2 \n\t" // pQPb
00668 PAVGB(%%mm6, %%mm2)
00669 "psubb %%mm6, %%mm2 \n\t"
00670
00671 "movq %%mm4, %%mm1 \n\t"
00672 "pcmpgtb %%mm7, %%mm1 \n\t" // mm1 = sign mask of the middle term
00673 "pxor %%mm1, %%mm4 \n\t"
00674 "psubb %%mm1, %%mm4 \n\t"
00675 "pcmpgtb %%mm4, %%mm2 \n\t" // mm2 = mask from the QP-derived threshold
00676 "psubusb %%mm3, %%mm4 \n\t" // subtract the side minimum (saturating)
00677
00678
00679 "movq %%mm4, %%mm3 \n\t"
00680 "psubusb "MANGLE(b01)", %%mm4 \n\t"
00681 PAVGB(%%mm7, %%mm4)
00682 PAVGB(%%mm7, %%mm4)
00683 "paddb %%mm3, %%mm4 \n\t"
00684 "pand %%mm2, %%mm4 \n\t" // drop the correction where the threshold test failed
00685
00686 "movq "MANGLE(b80)", %%mm5 \n\t"
00687 "psubb %%mm0, %%mm5 \n\t"
00688 "paddsb %%mm6, %%mm5 \n\t"
00689 "pcmpgtb %%mm5, %%mm7 \n\t"
00690 "pxor %%mm7, %%mm5 \n\t"
00691
00692 PMINUB(%%mm5, %%mm4, %%mm3) // limit the correction by the row3/row4-derived bound in mm5
00693 "pxor %%mm1, %%mm7 \n\t"
00694
00695 "pand %%mm7, %%mm4 \n\t"
00696 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // row 3
00697 "movq (%0, %1, 4), %%mm2 \n\t" // row 4
00698 "pxor %%mm1, %%mm0 \n\t"
00699 "pxor %%mm1, %%mm2 \n\t"
00700 "paddb %%mm4, %%mm0 \n\t"
00701 "psubb %%mm4, %%mm2 \n\t"
00702 "pxor %%mm1, %%mm0 \n\t"
00703 "pxor %%mm1, %%mm2 \n\t"
00704 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // store row 3
00705 "movq %%mm2, (%0, %1, 4) \n\t" // store row 4
00706
00707 :
00708 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
00709 : "%"REG_a, "%"REG_c
00710 );
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767 #elif HAVE_MMX
/* Plain MMX variant: every row is widened to 16-bit words (low half in an even
 * register, high half in the following odd one). tmp[0..1] hold the first side
 * energy, tmp[2..3] hold row3 - row4. */
00768 DECLARE_ALIGNED(8, uint64_t, tmp)[4];
00769 src+= stride*4;
00770 __asm__ volatile(
00771 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used for unpacking
00772
00773
00774
00775
00776 "movq (%0), %%mm0 \n\t" // row 0
00777 "movq %%mm0, %%mm1 \n\t"
00778 "punpcklbw %%mm7, %%mm0 \n\t"
00779 "punpckhbw %%mm7, %%mm1 \n\t"
00780
00781 "movq (%0, %1), %%mm2 \n\t" // row 1
00782 "lea (%0, %1, 2), %%"REG_a" \n\t" // REG_a -> row 2
00783 "movq %%mm2, %%mm3 \n\t"
00784 "punpcklbw %%mm7, %%mm2 \n\t"
00785 "punpckhbw %%mm7, %%mm3 \n\t"
00786
00787 "movq (%%"REG_a"), %%mm4 \n\t" // row 2
00788 "movq %%mm4, %%mm5 \n\t"
00789 "punpcklbw %%mm7, %%mm4 \n\t"
00790 "punpckhbw %%mm7, %%mm5 \n\t"
00791
00792 "paddw %%mm0, %%mm0 \n\t" // 2*row0
00793 "paddw %%mm1, %%mm1 \n\t"
00794 "psubw %%mm4, %%mm2 \n\t" // row1 - row2
00795 "psubw %%mm5, %%mm3 \n\t"
00796 "psubw %%mm2, %%mm0 \n\t"
00797 "psubw %%mm3, %%mm1 \n\t"
00798
00799 "psllw $2, %%mm2 \n\t" // 4*(row1 - row2)
00800 "psllw $2, %%mm3 \n\t"
00801 "psubw %%mm2, %%mm0 \n\t" // 2*row0 - 5*row1 + 5*row2
00802 "psubw %%mm3, %%mm1 \n\t"
00803
00804 "movq (%%"REG_a", %1), %%mm2 \n\t" // row 3
00805 "movq %%mm2, %%mm3 \n\t"
00806 "punpcklbw %%mm7, %%mm2 \n\t"
00807 "punpckhbw %%mm7, %%mm3 \n\t"
00808
00809 "psubw %%mm2, %%mm0 \n\t"
00810 "psubw %%mm3, %%mm1 \n\t"
00811 "psubw %%mm2, %%mm0 \n\t" // first side energy: 2*row0 - 5*row1 + 5*row2 - 2*row3
00812 "psubw %%mm3, %%mm1 \n\t"
00813 "movq %%mm0, (%3) \n\t" // save to tmp[0..1]
00814 "movq %%mm1, 8(%3) \n\t"
00815
00816 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // row 4
00817 "movq %%mm0, %%mm1 \n\t"
00818 "punpcklbw %%mm7, %%mm0 \n\t"
00819 "punpckhbw %%mm7, %%mm1 \n\t"
00820
00821 "psubw %%mm0, %%mm2 \n\t" // row3 - row4
00822 "psubw %%mm1, %%mm3 \n\t"
00823 "movq %%mm2, 16(%3) \n\t" // save to tmp[2..3]
00824 "movq %%mm3, 24(%3) \n\t"
00825 "paddw %%mm4, %%mm4 \n\t" // 2*row2
00826 "paddw %%mm5, %%mm5 \n\t"
00827 "psubw %%mm2, %%mm4 \n\t"
00828 "psubw %%mm3, %%mm5 \n\t"
00829
00830 "lea (%%"REG_a", %1), %0 \n\t" // %0 -> row 3 (src is an in/out operand)
00831 "psllw $2, %%mm2 \n\t"
00832 "psllw $2, %%mm3 \n\t"
00833 "psubw %%mm2, %%mm4 \n\t" // 2*row2 - 5*row3 + 5*row4
00834 "psubw %%mm3, %%mm5 \n\t"
00835
00836 "movq (%0, %1, 2), %%mm2 \n\t" // row 5
00837 "movq %%mm2, %%mm3 \n\t"
00838 "punpcklbw %%mm7, %%mm2 \n\t"
00839 "punpckhbw %%mm7, %%mm3 \n\t"
00840 "psubw %%mm2, %%mm4 \n\t"
00841 "psubw %%mm3, %%mm5 \n\t"
00842 "psubw %%mm2, %%mm4 \n\t" // middle energy: 2*row2 - 5*row3 + 5*row4 - 2*row5
00843 "psubw %%mm3, %%mm5 \n\t"
00844
00845 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // row 6
00846 "punpcklbw %%mm7, %%mm6 \n\t"
00847 "psubw %%mm6, %%mm2 \n\t" // row5 - row6 (low half)
00848 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // row 6 again: no spare register for the high half
00849 "punpckhbw %%mm7, %%mm6 \n\t"
00850 "psubw %%mm6, %%mm3 \n\t" // row5 - row6 (high half)
00851
00852 "paddw %%mm0, %%mm0 \n\t" // 2*row4
00853 "paddw %%mm1, %%mm1 \n\t"
00854 "psubw %%mm2, %%mm0 \n\t"
00855 "psubw %%mm3, %%mm1 \n\t"
00856
00857 "psllw $2, %%mm2 \n\t"
00858 "psllw $2, %%mm3 \n\t"
00859 "psubw %%mm2, %%mm0 \n\t" // 2*row4 - 5*row5 + 5*row6
00860 "psubw %%mm3, %%mm1 \n\t"
00861
00862 "movq (%0, %1, 4), %%mm2 \n\t" // row 7
00863 "movq %%mm2, %%mm3 \n\t"
00864 "punpcklbw %%mm7, %%mm2 \n\t"
00865 "punpckhbw %%mm7, %%mm3 \n\t"
00866
00867 "paddw %%mm2, %%mm2 \n\t" // 2*row7
00868 "paddw %%mm3, %%mm3 \n\t"
00869 "psubw %%mm2, %%mm0 \n\t" // second side energy: 2*row4 - 5*row5 + 5*row6 - 2*row7
00870 "psubw %%mm3, %%mm1 \n\t"
00871
00872 "movq (%3), %%mm2 \n\t" // reload first side energy
00873 "movq 8(%3), %%mm3 \n\t"
00874
// Absolute values of both side energies (mm0/mm1 and mm2/mm3).
00875 #if HAVE_MMX2
00876 "movq %%mm7, %%mm6 \n\t"
00877 "psubw %%mm0, %%mm6 \n\t"
00878 "pmaxsw %%mm6, %%mm0 \n\t"
00879 "movq %%mm7, %%mm6 \n\t"
00880 "psubw %%mm1, %%mm6 \n\t"
00881 "pmaxsw %%mm6, %%mm1 \n\t"
00882 "movq %%mm7, %%mm6 \n\t"
00883 "psubw %%mm2, %%mm6 \n\t"
00884 "pmaxsw %%mm6, %%mm2 \n\t"
00885 "movq %%mm7, %%mm6 \n\t"
00886 "psubw %%mm3, %%mm6 \n\t"
00887 "pmaxsw %%mm6, %%mm3 \n\t"
00888 #else
00889 "movq %%mm7, %%mm6 \n\t"
00890 "pcmpgtw %%mm0, %%mm6 \n\t"
00891 "pxor %%mm6, %%mm0 \n\t"
00892 "psubw %%mm6, %%mm0 \n\t"
00893 "movq %%mm7, %%mm6 \n\t"
00894 "pcmpgtw %%mm1, %%mm6 \n\t"
00895 "pxor %%mm6, %%mm1 \n\t"
00896 "psubw %%mm6, %%mm1 \n\t"
00897 "movq %%mm7, %%mm6 \n\t"
00898 "pcmpgtw %%mm2, %%mm6 \n\t"
00899 "pxor %%mm6, %%mm2 \n\t"
00900 "psubw %%mm6, %%mm2 \n\t"
00901 "movq %%mm7, %%mm6 \n\t"
00902 "pcmpgtw %%mm3, %%mm6 \n\t"
00903 "pxor %%mm6, %%mm3 \n\t"
00904 "psubw %%mm6, %%mm3 \n\t"
00905 #endif
00906
// mm0/mm1 = min(|side energy 1|, |side energy 2|).
00907 #if HAVE_MMX2
00908 "pminsw %%mm2, %%mm0 \n\t"
00909 "pminsw %%mm3, %%mm1 \n\t"
00910 #else
00911 "movq %%mm0, %%mm6 \n\t"
00912 "psubusw %%mm2, %%mm6 \n\t"
00913 "psubw %%mm6, %%mm0 \n\t"
00914 "movq %%mm1, %%mm6 \n\t"
00915 "psubusw %%mm3, %%mm6 \n\t"
00916 "psubw %%mm6, %%mm1 \n\t"
00917 #endif
00918
00919 "movd %2, %%mm2 \n\t" // low 4 bytes of pQPb
00920 "punpcklbw %%mm7, %%mm2 \n\t" // widen to words
00921
00922 "movq %%mm7, %%mm6 \n\t"
00923 "pcmpgtw %%mm4, %%mm6 \n\t" // mm6 = sign mask of middle energy (low half)
00924 "pxor %%mm6, %%mm4 \n\t"
00925 "psubw %%mm6, %%mm4 \n\t" // |middle energy|
00926 "pcmpgtw %%mm5, %%mm7 \n\t" // mm7 = sign mask (high half); mm7 is no longer zero
00927 "pxor %%mm7, %%mm5 \n\t"
00928 "psubw %%mm7, %%mm5 \n\t"
00929
00930 "psllw $3, %%mm2 \n\t" // 8*QP
00931 "movq %%mm2, %%mm3 \n\t"
00932 "pcmpgtw %%mm4, %%mm2 \n\t" // mask: |middle energy| < 8*QP
00933 "pcmpgtw %%mm5, %%mm3 \n\t"
00934 "pand %%mm2, %%mm4 \n\t"
00935 "pand %%mm3, %%mm5 \n\t"
00936
00937
00938 "psubusw %%mm0, %%mm4 \n\t" // d = max(|middle| - min(|sides|), 0)
00939 "psubusw %%mm1, %%mm5 \n\t"
00940
00941
// d = (w05*d + w20) >> 6; the C fallback uses 5 and 32 here, so w05/w20 are
// presumably those constants per word -- confirm at their definition.
00942 "movq "MANGLE(w05)", %%mm2 \n\t"
00943 "pmullw %%mm2, %%mm4 \n\t"
00944 "pmullw %%mm2, %%mm5 \n\t"
00945 "movq "MANGLE(w20)", %%mm2 \n\t"
00946 "paddw %%mm2, %%mm4 \n\t"
00947 "paddw %%mm2, %%mm5 \n\t"
00948 "psrlw $6, %%mm4 \n\t"
00949 "psrlw $6, %%mm5 \n\t"
00950
00951 "movq 16(%3), %%mm0 \n\t" // reload row3 - row4
00952 "movq 24(%3), %%mm1 \n\t"
00953
00954 "pxor %%mm2, %%mm2 \n\t"
00955 "pxor %%mm3, %%mm3 \n\t"
00956
00957 "pcmpgtw %%mm0, %%mm2 \n\t" // sign mask of row3 - row4
00958 "pcmpgtw %%mm1, %%mm3 \n\t"
00959 "pxor %%mm2, %%mm0 \n\t"
00960 "pxor %%mm3, %%mm1 \n\t"
00961 "psubw %%mm2, %%mm0 \n\t" // |row3 - row4|
00962 "psubw %%mm3, %%mm1 \n\t"
00963 "psrlw $1, %%mm0 \n\t" // |row3 - row4| / 2, the clamp limit
00964 "psrlw $1, %%mm1 \n\t"
00965
00966 "pxor %%mm6, %%mm2 \n\t" // set where the two signs differ
00967 "pxor %%mm7, %%mm3 \n\t"
00968 "pand %%mm2, %%mm4 \n\t" // keep d only in those lanes
00969 "pand %%mm3, %%mm5 \n\t"
00970
// d = min(d, |row3 - row4| / 2)
00971 #if HAVE_MMX2
00972 "pminsw %%mm0, %%mm4 \n\t"
00973 "pminsw %%mm1, %%mm5 \n\t"
00974 #else
00975 "movq %%mm4, %%mm2 \n\t"
00976 "psubusw %%mm0, %%mm2 \n\t"
00977 "psubw %%mm2, %%mm4 \n\t"
00978 "movq %%mm5, %%mm2 \n\t"
00979 "psubusw %%mm1, %%mm2 \n\t"
00980 "psubw %%mm2, %%mm5 \n\t"
00981 #endif
00982 "pxor %%mm6, %%mm4 \n\t" // negate d where the middle energy was negative
00983 "pxor %%mm7, %%mm5 \n\t"
00984 "psubw %%mm6, %%mm4 \n\t"
00985 "psubw %%mm7, %%mm5 \n\t"
00986 "packsswb %%mm5, %%mm4 \n\t" // back to signed bytes
00987 "movq (%0), %%mm0 \n\t" // row 3 (%0 was moved to row 3 above)
00988 "paddb %%mm4, %%mm0 \n\t"
00989 "movq %%mm0, (%0) \n\t"
00990 "movq (%0, %1), %%mm0 \n\t" // row 4
00991 "psubb %%mm4, %%mm0 \n\t"
00992 "movq %%mm0, (%0, %1) \n\t"
00993
00994 : "+r" (src)
00995 : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
00996 : "%"REG_a
00997 );
00998 #else //HAVE_MMX2 || HAVE_AMD3DNOW
/* Byte offsets of rows 1..8 relative to src (after the += below). */
00999 const int l1= stride;
01000 const int l2= stride + l1;
01001 const int l3= stride + l2;
01002 const int l4= stride + l3;
01003 const int l5= stride + l4;
01004 const int l6= stride + l5;
01005 const int l7= stride + l6;
01006 const int l8= stride + l7;
01007
01008 int x;
01009 src+= stride*3;
01010 for(x=0; x<BLOCK_SIZE; x++){
01011 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
/* Only filter when the step across the edge is small relative to QP. */
01012 if(FFABS(middleEnergy) < 8*c->QP){
01013 const int q=(src[l4] - src[l5])/2;
01014 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
01015 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
01016
01017 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
01018 d= FFMAX(d, 0);
01019
01020 d= (5*d + 32) >> 6;
01021 d*= FFSIGN(-middleEnergy);
01022
/* Clamp d to the range between 0 and q so the correction never exceeds half
 * the row4/row5 difference and never has the opposite sign. */
01023 if(q>0){
01024 d= d<0 ? 0 : d;
01025 d= d>q ? q : d;
01026 }else{
01027 d= d>0 ? 0 : d;
01028 d= d<q ? q : d;
01029 }
01030
01031 src[l4]-= d;
01032 src[l5]+= d;
01033 }
01034 src++;
01035 }
01036 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01037 }
01038 #endif //HAVE_ALTIVEC
01039
01040 #if !HAVE_ALTIVEC
01041 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
01042 {
01043 #if HAVE_MMX2 || HAVE_AMD3DNOW
01044 DECLARE_ALIGNED(8, uint64_t, tmp)[3];
01045 __asm__ volatile(
01046 "pxor %%mm6, %%mm6 \n\t"
01047 "pcmpeqb %%mm7, %%mm7 \n\t"
01048 "movq %2, %%mm0 \n\t"
01049 "punpcklbw %%mm6, %%mm0 \n\t"
01050 "psrlw $1, %%mm0 \n\t"
01051 "psubw %%mm7, %%mm0 \n\t"
01052 "packuswb %%mm0, %%mm0 \n\t"
01053 "movq %%mm0, %3 \n\t"
01054
01055 "lea (%0, %1), %%"REG_a" \n\t"
01056 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01057
01058
01059
01060
01061 #undef REAL_FIND_MIN_MAX
01062 #undef FIND_MIN_MAX
01063 #if HAVE_MMX2
01064 #define REAL_FIND_MIN_MAX(addr)\
01065 "movq " #addr ", %%mm0 \n\t"\
01066 "pminub %%mm0, %%mm7 \n\t"\
01067 "pmaxub %%mm0, %%mm6 \n\t"
01068 #else
01069 #define REAL_FIND_MIN_MAX(addr)\
01070 "movq " #addr ", %%mm0 \n\t"\
01071 "movq %%mm7, %%mm1 \n\t"\
01072 "psubusb %%mm0, %%mm6 \n\t"\
01073 "paddb %%mm0, %%mm6 \n\t"\
01074 "psubusb %%mm0, %%mm1 \n\t"\
01075 "psubb %%mm1, %%mm7 \n\t"
01076 #endif
01077 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
01078
01079 FIND_MIN_MAX((%%REGa))
01080 FIND_MIN_MAX((%%REGa, %1))
01081 FIND_MIN_MAX((%%REGa, %1, 2))
01082 FIND_MIN_MAX((%0, %1, 4))
01083 FIND_MIN_MAX((%%REGd))
01084 FIND_MIN_MAX((%%REGd, %1))
01085 FIND_MIN_MAX((%%REGd, %1, 2))
01086 FIND_MIN_MAX((%0, %1, 8))
01087
01088 "movq %%mm7, %%mm4 \n\t"
01089 "psrlq $8, %%mm7 \n\t"
01090 #if HAVE_MMX2
01091 "pminub %%mm4, %%mm7 \n\t"
01092 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
01093 "pminub %%mm4, %%mm7 \n\t"
01094 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
01095 "pminub %%mm4, %%mm7 \n\t"
01096 #else
01097 "movq %%mm7, %%mm1 \n\t"
01098 "psubusb %%mm4, %%mm1 \n\t"
01099 "psubb %%mm1, %%mm7 \n\t"
01100 "movq %%mm7, %%mm4 \n\t"
01101 "psrlq $16, %%mm7 \n\t"
01102 "movq %%mm7, %%mm1 \n\t"
01103 "psubusb %%mm4, %%mm1 \n\t"
01104 "psubb %%mm1, %%mm7 \n\t"
01105 "movq %%mm7, %%mm4 \n\t"
01106 "psrlq $32, %%mm7 \n\t"
01107 "movq %%mm7, %%mm1 \n\t"
01108 "psubusb %%mm4, %%mm1 \n\t"
01109 "psubb %%mm1, %%mm7 \n\t"
01110 #endif
01111
01112
01113 "movq %%mm6, %%mm4 \n\t"
01114 "psrlq $8, %%mm6 \n\t"
01115 #if HAVE_MMX2
01116 "pmaxub %%mm4, %%mm6 \n\t"
01117 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
01118 "pmaxub %%mm4, %%mm6 \n\t"
01119 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
01120 "pmaxub %%mm4, %%mm6 \n\t"
01121 #else
01122 "psubusb %%mm4, %%mm6 \n\t"
01123 "paddb %%mm4, %%mm6 \n\t"
01124 "movq %%mm6, %%mm4 \n\t"
01125 "psrlq $16, %%mm6 \n\t"
01126 "psubusb %%mm4, %%mm6 \n\t"
01127 "paddb %%mm4, %%mm6 \n\t"
01128 "movq %%mm6, %%mm4 \n\t"
01129 "psrlq $32, %%mm6 \n\t"
01130 "psubusb %%mm4, %%mm6 \n\t"
01131 "paddb %%mm4, %%mm6 \n\t"
01132 #endif
01133 "movq %%mm6, %%mm0 \n\t"
01134 "psubb %%mm7, %%mm6 \n\t"
01135 "push %4 \n\t"
01136 "movd %%mm6, %k4 \n\t"
01137 "cmpb "MANGLE(deringThreshold)", %b4 \n\t"
01138 "pop %4 \n\t"
01139 " jb 1f \n\t"
01140 PAVGB(%%mm0, %%mm7)
01141 "punpcklbw %%mm7, %%mm7 \n\t"
01142 "punpcklbw %%mm7, %%mm7 \n\t"
01143 "punpcklbw %%mm7, %%mm7 \n\t"
01144 "movq %%mm7, (%4) \n\t"
01145
01146 "movq (%0), %%mm0 \n\t"
01147 "movq %%mm0, %%mm1 \n\t"
01148 "movq %%mm0, %%mm2 \n\t"
01149 "psllq $8, %%mm1 \n\t"
01150 "psrlq $8, %%mm2 \n\t"
01151 "movd -4(%0), %%mm3 \n\t"
01152 "movd 8(%0), %%mm4 \n\t"
01153 "psrlq $24, %%mm3 \n\t"
01154 "psllq $56, %%mm4 \n\t"
01155 "por %%mm3, %%mm1 \n\t"
01156 "por %%mm4, %%mm2 \n\t"
01157 "movq %%mm1, %%mm3 \n\t"
01158 PAVGB(%%mm2, %%mm1)
01159 PAVGB(%%mm0, %%mm1)
01160 "psubusb %%mm7, %%mm0 \n\t"
01161 "psubusb %%mm7, %%mm2 \n\t"
01162 "psubusb %%mm7, %%mm3 \n\t"
01163 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t"
01164 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t"
01165 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t"
01166 "paddb %%mm2, %%mm0 \n\t"
01167 "paddb %%mm3, %%mm0 \n\t"
01168
01169 "movq (%%"REG_a"), %%mm2 \n\t"
01170 "movq %%mm2, %%mm3 \n\t"
01171 "movq %%mm2, %%mm4 \n\t"
01172 "psllq $8, %%mm3 \n\t"
01173 "psrlq $8, %%mm4 \n\t"
01174 "movd -4(%%"REG_a"), %%mm5 \n\t"
01175 "movd 8(%%"REG_a"), %%mm6 \n\t"
01176 "psrlq $24, %%mm5 \n\t"
01177 "psllq $56, %%mm6 \n\t"
01178 "por %%mm5, %%mm3 \n\t"
01179 "por %%mm6, %%mm4 \n\t"
01180 "movq %%mm3, %%mm5 \n\t"
01181 PAVGB(%%mm4, %%mm3)
01182 PAVGB(%%mm2, %%mm3)
01183 "psubusb %%mm7, %%mm2 \n\t"
01184 "psubusb %%mm7, %%mm4 \n\t"
01185 "psubusb %%mm7, %%mm5 \n\t"
01186 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t"
01187 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t"
01188 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t"
01189 "paddb %%mm4, %%mm2 \n\t"
01190 "paddb %%mm5, %%mm2 \n\t"
01191
01192 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
01193 "movq " #src ", " #sx " \n\t" \
01194 "movq " #sx ", " #lx " \n\t" \
01195 "movq " #sx ", " #t0 " \n\t" \
01196 "psllq $8, " #lx " \n\t"\
01197 "psrlq $8, " #t0 " \n\t"\
01198 "movd -4" #src ", " #t1 " \n\t"\
01199 "psrlq $24, " #t1 " \n\t"\
01200 "por " #t1 ", " #lx " \n\t" \
01201 "movd 8" #src ", " #t1 " \n\t"\
01202 "psllq $56, " #t1 " \n\t"\
01203 "por " #t1 ", " #t0 " \n\t" \
01204 "movq " #lx ", " #t1 " \n\t" \
01205 PAVGB(t0, lx) \
01206 PAVGB(sx, lx) \
01207 PAVGB(lx, pplx) \
01208 "movq " #lx ", 8(%4) \n\t"\
01209 "movq (%4), " #lx " \n\t"\
01210 "psubusb " #lx ", " #t1 " \n\t"\
01211 "psubusb " #lx ", " #t0 " \n\t"\
01212 "psubusb " #lx ", " #sx " \n\t"\
01213 "movq "MANGLE(b00)", " #lx " \n\t"\
01214 "pcmpeqb " #lx ", " #t1 " \n\t" \
01215 "pcmpeqb " #lx ", " #t0 " \n\t" \
01216 "pcmpeqb " #lx ", " #sx " \n\t" \
01217 "paddb " #t1 ", " #t0 " \n\t"\
01218 "paddb " #t0 ", " #sx " \n\t"\
01219 \
01220 PAVGB(plx, pplx) \
01221 "movq " #dst ", " #t0 " \n\t" \
01222 "movq " #t0 ", " #t1 " \n\t" \
01223 "psubusb %3, " #t0 " \n\t"\
01224 "paddusb %3, " #t1 " \n\t"\
01225 PMAXUB(t0, pplx)\
01226 PMINUB(t1, pplx, t0)\
01227 "paddb " #sx ", " #ppsx " \n\t"\
01228 "paddb " #psx ", " #ppsx " \n\t"\
01229 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
01230 "pand "MANGLE(b08)", " #ppsx " \n\t"\
01231 "pcmpeqb " #lx ", " #ppsx " \n\t"\
01232 "pand " #ppsx ", " #pplx " \n\t"\
01233 "pandn " #dst ", " #ppsx " \n\t"\
01234 "por " #pplx ", " #ppsx " \n\t"\
01235 "movq " #ppsx ", " #dst " \n\t"\
01236 "movq 8(%4), " #lx " \n\t"
01237
01238 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
01239 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
01240
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250
01251
01252
01253
01254
01255
01256 DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
01257 DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
01258 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
01259 DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
01260 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
01261 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
01262 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
01263 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
01264
01265 "1: \n\t"
01266 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
01267 : "%"REG_a, "%"REG_d
01268 );
01269 #else //HAVE_MMX2 || HAVE_AMD3DNOW
01270 int y;
01271 int min=255;
01272 int max=0;
01273 int avg;
01274 uint8_t *p;
01275 int s[10];
01276 const int QP2= c->QP/2 + 1;
01277
01278 for(y=1; y<9; y++){
01279 int x;
01280 p= src + stride*y;
01281 for(x=1; x<9; x++){
01282 p++;
01283 if(*p > max) max= *p;
01284 if(*p < min) min= *p;
01285 }
01286 }
01287 avg= (min + max + 1)>>1;
01288
01289 if(max - min <deringThreshold) return;
01290
01291 for(y=0; y<10; y++){
01292 int t = 0;
01293
01294 if(src[stride*y + 0] > avg) t+= 1;
01295 if(src[stride*y + 1] > avg) t+= 2;
01296 if(src[stride*y + 2] > avg) t+= 4;
01297 if(src[stride*y + 3] > avg) t+= 8;
01298 if(src[stride*y + 4] > avg) t+= 16;
01299 if(src[stride*y + 5] > avg) t+= 32;
01300 if(src[stride*y + 6] > avg) t+= 64;
01301 if(src[stride*y + 7] > avg) t+= 128;
01302 if(src[stride*y + 8] > avg) t+= 256;
01303 if(src[stride*y + 9] > avg) t+= 512;
01304
01305 t |= (~t)<<16;
01306 t &= (t<<1) & (t>>1);
01307 s[y] = t;
01308 }
01309
01310 for(y=1; y<9; y++){
01311 int t = s[y-1] & s[y] & s[y+1];
01312 t|= t>>16;
01313 s[y-1]= t;
01314 }
01315
01316 for(y=1; y<9; y++){
01317 int x;
01318 int t = s[y-1];
01319
01320 p= src + stride*y;
01321 for(x=1; x<9; x++){
01322 p++;
01323 if(t & (1<<x)){
01324 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
01325 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
01326 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
01327 f= (f + 8)>>4;
01328
01329 #ifdef DEBUG_DERING_THRESHOLD
01330 __asm__ volatile("emms\n\t":);
01331 {
01332 static long long numPixels=0;
01333 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
01334
01335
01336
01337 if(max-min < 20){
01338 static int numSkipped=0;
01339 static int errorSum=0;
01340 static int worstQP=0;
01341 static int worstRange=0;
01342 static int worstDiff=0;
01343 int diff= (f - *p);
01344 int absDiff= FFABS(diff);
01345 int error= diff*diff;
01346
01347 if(x==1 || x==8 || y==1 || y==8) continue;
01348
01349 numSkipped++;
01350 if(absDiff > worstDiff){
01351 worstDiff= absDiff;
01352 worstQP= QP;
01353 worstRange= max-min;
01354 }
01355 errorSum+= error;
01356
01357 if(1024LL*1024LL*1024LL % numSkipped == 0){
01358 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
01359 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
01360 (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
01361 worstDiff, (float)numSkipped/numPixels);
01362 }
01363 }
01364 }
01365 #endif
01366 if (*p + QP2 < f) *p= *p + QP2;
01367 else if(*p - QP2 > f) *p= *p - QP2;
01368 else *p=f;
01369 }
01370 }
01371 }
01372 #ifdef DEBUG_DERING_THRESHOLD
01373 if(max-min < 20){
01374 for(y=1; y<9; y++){
01375 int x;
01376 int t = 0;
01377 p= src + stride*y;
01378 for(x=1; x<9; x++){
01379 p++;
01380 *p = FFMIN(*p + 20, 255);
01381 }
01382 }
01383
01384 }
01385 #endif
01386 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01387 }
01388 #endif //HAVE_ALTIVEC
01389
/**
 * Deinterlaces an 8-byte-wide block by linear interpolation.
 *
 * src is first advanced by 4 lines. Relative to that position, lines 1, 3, 5
 * and 7 are overwritten with the average of the lines directly above and
 * below them (the C path rounds the average up); lines 0, 2, 4, 6 and 8 are
 * only read.
 *
 * @param src    block start; lines 4..12 counted from this pointer are accessed
 * @param stride distance in bytes between the starts of two consecutive lines
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= 4*stride;
    __asm__ volatile(
        /* REG_a = line 1, REG_c = line 5 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"

        /* line 1 = avg(line 0, line 2) */
        "movq (%0), %%mm0 \n\t"
        "movq (%%"REG_a", %1), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"REG_a") \n\t"
        /* line 3 = avg(line 2, line 4) */
        "movq (%0, %1, 4), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"REG_a", %1, 2) \n\t"
        /* line 5 = avg(line 4, line 6) */
        "movq (%%"REG_c", %1), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"REG_c") \n\t"
        /* line 7 = avg(line 6, line 8) */
        "movq (%0, %1, 8), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"REG_c", %1, 2) \n\t"

        : : "r" (src), "r" ((x86_reg)stride)
        /* "memory": the asm reads and writes src[], which is not an operand */
        : "%"REG_a, "%"REG_c, "memory"
    );
#else
    int x;
    src+= 4*stride;

    for(x=0; x<8; x++){
        int y;
        /* (a + b + 1)>>1 is the same rounded-up average the packed form
           (a|b) - (((a^b)&0xFE)>>1) produced, but it needs no uint32_t
           access to uint8_t data (no aliasing or alignment assumptions) */
        for(y=1; y<8; y+=2)
            src[stride*y]= (src[stride*(y-1)] + src[stride*(y+1)] + 1)>>1;
        src++;
    }
#endif
}
01441
/**
 * Deinterlaces an 8-byte-wide block by cubic interpolation.
 *
 * src is first advanced by 3 lines. Relative to that position, lines 3, 5, 7
 * and 9 are overwritten; each is computed from the lines 3 above, 1 above,
 * 1 below and 3 below it with the weights (-1 9 9 -1)/16 (C path). Lines
 * 0, 2, 4, ..., 12 are only read.
 *
 * @param src    block start; lines 3..15 counted from this pointer are accessed
 * @param stride distance in bytes between the starts of two consecutive lines
 */
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= stride*3;
    __asm__ volatile(
        /* REG_a = line 1, REG_d = line 5, REG_c = line 10, mm7 = 0 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
        "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
        "add %1, %%"REG_c" \n\t"
        "pxor %%mm7, %%mm7 \n\t"

/* c = avg(b,d) - ((avg(a,e) - avg(b,d))>>3), computed on 16-bit words and
   packed back to bytes with unsigned saturation */
#define REAL_DEINT_CUBIC(a,b,c,d,e)\
        "movq " #a ", %%mm0 \n\t"\
        "movq " #b ", %%mm1 \n\t"\
        "movq " #d ", %%mm2 \n\t"\
        "movq " #e ", %%mm3 \n\t"\
        PAVGB(%%mm2, %%mm1) \
        PAVGB(%%mm3, %%mm0) \
        "movq %%mm0, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm2 \n\t"\
        "movq %%mm1, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm1, %%mm0 \n\t" \
        "psubw %%mm3, %%mm2 \n\t" \
        "psraw $3, %%mm0 \n\t" \
        "psraw $3, %%mm2 \n\t" \
        "psubw %%mm0, %%mm1 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "packuswb %%mm3, %%mm1 \n\t"\
        "movq %%mm1, " #c " \n\t"
#define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)

        /* outputs: line 3, line 5, line 7, line 9 */
        DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
        DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
        DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
        DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))

        : : "r" (src), "r" ((x86_reg)stride)
        /* "memory": the asm reads and writes src[], which is not an operand */
        : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
    );
#else //HAVE_MMX2 || HAVE_AMD3DNOW
    int x;
    src+= stride*3;
    for(x=0; x<8; x++){
        int y;
        /* only odd lines are written and only even lines are read, so the
           four outputs of a column do not depend on each other */
        for(y=3; y<=9; y+=2){
            const int outer= src[stride*(y-3)] + src[stride*(y+3)];
            const int inner= src[stride*(y-1)] + src[stride*(y+1)];
            src[stride*y]= CLIP((9*inner - outer)>>4);
        }
        src++;
    }
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
}
01505
/**
 * Deinterlaces an 8-byte-wide block by filtering every second line with a
 * (-1 4 2 4 -1)/8 vertical filter.
 *
 * src is first advanced by 4 lines. Relative to that position, lines 1, 3, 5
 * and 7 are overwritten; the filter is centred on the line being written and
 * always uses the unfiltered contents of the line two above it.
 *
 * @param src    block start; lines 4..13 counted from this pointer are accessed
 * @param stride distance in bytes between the starts of two consecutive lines
 * @param tmp    8 bytes; on entry the line used as the tap two lines above
 *               line 1, on return the unfiltered contents of line 7
 */
static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= stride*4;
    __asm__ volatile(
        /* REG_a = line 1, REG_d = line 5, mm7 = 0, mm0 = saved line from tmp */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "movq (%2), %%mm0 \n\t"

/* b = (4*avg(a,c) - avg(mm0,d) + b)>>2, where mm0 holds the unfiltered line
   two above b; afterwards mm0 is set to the unfiltered b for the next step */
#define REAL_DEINT_FF(a,b,c,d)\
        "movq " #a ", %%mm1 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "movq " #c ", %%mm3 \n\t"\
        "movq " #d ", %%mm4 \n\t"\
        PAVGB(%%mm3, %%mm1) \
        PAVGB(%%mm4, %%mm0) \
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "movq %%mm1, %%mm4 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpckhbw %%mm7, %%mm4 \n\t"\
        "psllw $2, %%mm1 \n\t"\
        "psllw $2, %%mm4 \n\t"\
        "psubw %%mm0, %%mm1 \n\t"\
        "psubw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm5 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm2, %%mm1 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "psraw $2, %%mm1 \n\t"\
        "psraw $2, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm1 \n\t"\
        "movq %%mm1, " #b " \n\t"

#define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)

        /* outputs: line 1, line 3, line 5, line 7 */
        DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
        DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
        DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
        DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))

        /* save the unfiltered line 7 for the next call */
        "movq %%mm0, (%2) \n\t"
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
        /* "memory": the asm reads and writes src[] and tmp[], which are not operands */
        : "%"REG_a, "%"REG_d, "memory"
    );
#else //HAVE_MMX2 || HAVE_AMD3DNOW
    int x;
    src+= stride*4;
    for(x=0; x<8; x++){
        int y;
        int above= tmp[x];              /* unfiltered line two above the centre */

        for(y=1; y<8; y+=2){
            /* Centre tap is the line being rewritten (3, 5, 7), as in the
               MMX path; the previous C code read lines 4, 6, 8 here. */
            const int centre= src[stride*y];

            src[stride*y]= CLIP((-above + 4*src[stride*(y-1)] + 2*centre
                                 + 4*src[stride*(y+1)] - src[stride*(y+2)] + 4)>>3);
            above= centre;
        }
        /* carry the unfiltered line 7 over, matching what the MMX path stores */
        tmp[x]= above;

        src++;
    }
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
}
01584
/**
 * Deinterlaces an 8-byte-wide block by filtering every line with a
 * (-1 2 6 2 -1)/8 vertical filter.
 *
 * src is first advanced by 4 lines. Relative to that position, lines 0..7 are
 * overwritten. The two taps above the centre always use the unfiltered
 * contents of those lines; the two taps below are read from src, where they
 * have not been filtered yet.
 *
 * @param src    block start; lines 4..13 counted from this pointer are accessed
 * @param stride distance in bytes between the starts of two consecutive lines
 * @param tmp    8 bytes; on entry the tap two lines above line 0, on return
 *               the unfiltered contents of line 6
 * @param tmp2   8 bytes; on entry the tap one line above line 0, on return
 *               the unfiltered contents of line 7
 */
static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= stride*4;
    __asm__ volatile(
        /* REG_a = line 1, REG_d = line 5, mm7 = 0, mm0 = tmp, mm1 = tmp2 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "movq (%2), %%mm0 \n\t"
        "movq (%3), %%mm1 \n\t"

/* a = (3*a + 2*avg(t2,b) - avg(t1,c))>>2 with t1/t2 = unfiltered lines two
   and one above a; t1 is then overwritten with the unfiltered a */
#define REAL_DEINT_L5(t1,t2,a,b,c)\
        "movq " #a ", %%mm2 \n\t"\
        "movq " #b ", %%mm3 \n\t"\
        "movq " #c ", %%mm4 \n\t"\
        PAVGB(t2, %%mm3) \
        PAVGB(t1, %%mm4) \
        "movq %%mm2, %%mm5 \n\t"\
        "movq %%mm2, " #t1 " \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "movq %%mm2, %%mm6 \n\t"\
        "paddw %%mm2, %%mm2 \n\t"\
        "paddw %%mm6, %%mm2 \n\t"\
        "movq %%mm5, %%mm6 \n\t"\
        "paddw %%mm5, %%mm5 \n\t"\
        "paddw %%mm6, %%mm5 \n\t"\
        "movq %%mm3, %%mm6 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpckhbw %%mm7, %%mm6 \n\t"\
        "paddw %%mm3, %%mm3 \n\t"\
        "paddw %%mm6, %%mm6 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "paddw %%mm6, %%mm5 \n\t"\
        "movq %%mm4, %%mm6 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm6 \n\t"\
        "psubw %%mm4, %%mm2 \n\t"\
        "psubw %%mm6, %%mm5 \n\t"\
        "psraw $2, %%mm2 \n\t"\
        "psraw $2, %%mm5 \n\t"\
        "packuswb %%mm5, %%mm2 \n\t"\
        "movq %%mm2, " #a " \n\t"

#define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)

        /* outputs: lines 0..7; mm0/mm1 alternate as the two saved lines */
        DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
        DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
        DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) )
        DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
        DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
        DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
        DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
        DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))

        /* save unfiltered line 6 and line 7 for the next call */
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%3) \n\t"
        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
        /* "memory": the asm reads and writes src[], tmp[] and tmp2[], which are not operands */
        : "%"REG_a, "%"REG_d, "memory"
    );
#else //HAVE_MMX2 || HAVE_AMD3DNOW
    int x;
    src+= stride*4;
    for(x=0; x<8; x++){
        int y;
        int above2= tmp[x];     /* unfiltered line two above the centre */
        int above1= tmp2[x];    /* unfiltered line one above the centre */

        for(y=0; y<8; y++){
            const int centre= src[stride*y];

            src[stride*y]= CLIP((-(above2 + src[stride*(y+2)])
                                 + 2*(above1 + src[stride*(y+1)])
                                 + 6*centre + 4)>>3);
            above2= above1;
            above1= centre;
        }

        /* after the loop these are the unfiltered lines 6 and 7 */
        tmp[x]= above2;
        tmp2[x]= above1;

        src++;
    }
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
}
01685
/**
 * Deinterlaces an 8-byte-wide block by blending each line with its vertical
 * neighbours, i.e. a (1 2 1)/4 vertical filter.
 *
 * src is first advanced by 4 lines. Relative to that position, lines 0..7 are
 * overwritten; every output uses the unfiltered contents of the line above
 * it, the line itself and the line below it.
 *
 * @param src    block start; lines 4..12 counted from this pointer are accessed
 * @param stride distance in bytes between the starts of two consecutive lines
 * @param tmp    8 bytes; on entry the line used as the neighbour above line 0,
 *               on return the unfiltered contents of line 7
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= 4*stride;
    __asm__ volatile(
        /* REG_a = line 1, REG_d = line 5 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"

        /* line 0 = avg(avg(tmp, line 1), line 0) */
        "movq (%2), %%mm0 \n\t"
        "movq (%%"REG_a"), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq (%0), %%mm2 \n\t"
        PAVGB(%%mm2, %%mm0)
        "movq %%mm0, (%0) \n\t"
        /* line 1 */
        "movq (%%"REG_a", %1), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm2)
        PAVGB(%%mm1, %%mm2)
        "movq %%mm2, (%%"REG_a") \n\t"
        /* line 2 */
        "movq (%%"REG_a", %1, 2), %%mm2 \n\t"
        PAVGB(%%mm2, %%mm1)
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"REG_a", %1) \n\t"
        /* line 3 */
        "movq (%0, %1, 4), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        PAVGB(%%mm2, %%mm0)
        "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
        /* line 4 */
        "movq (%%"REG_d"), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm2)
        PAVGB(%%mm1, %%mm2)
        "movq %%mm2, (%0, %1, 4) \n\t"
        /* line 5 */
        "movq (%%"REG_d", %1), %%mm2 \n\t"
        PAVGB(%%mm2, %%mm1)
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"REG_d") \n\t"
        /* line 6 */
        "movq (%%"REG_d", %1, 2), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        PAVGB(%%mm2, %%mm0)
        "movq %%mm0, (%%"REG_d", %1) \n\t"
        /* line 7, then save the unfiltered line 7 (still in mm1) to tmp */
        "movq (%0, %1, 8), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm2)
        PAVGB(%%mm1, %%mm2)
        "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
        "movq %%mm1, (%2) \n\t"

        : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
        /* "memory": the asm reads and writes src[] and tmp[], which are not operands */
        : "%"REG_a, "%"REG_d, "memory"
    );
#else //HAVE_MMX2 || HAVE_AMD3DNOW
    int x;
    src+= 4*stride;

    for(x=0; x<8; x++){
        int y;
        int above= tmp[x];      /* unfiltered line above the current one */
        int cur= src[0];        /* unfiltered current line */

        for(y=0; y<8; y++){
            const int below= src[stride*(y+1)];

            /* Same result as the packed 32-bit form it replaces: the two
               neighbours are averaged rounding down, then that value and
               the centre are averaged rounding up. Working per byte avoids
               reading uint8_t data through uint32_t pointers. */
            src[stride*y]= (((above + below)>>1) + cur + 1)>>1;
            above= cur;
            cur= below;
        }
        tmp[x]= above;          /* unfiltered line 7 */
        src++;
    }
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
}
01787
/**
 * Deinterlaces an 8-byte-wide block with a vertical 3-tap median filter.
 *
 * src is first advanced by 4 lines. Relative to that position, lines 1, 3, 5
 * and 7 are each replaced by the per-byte median of the line above, the line
 * itself and the line below. Lines 0, 2, 4, 6 and 8 are only read.
 *
 * @param src    block start; lines 4..12 counted from this pointer are accessed
 * @param stride distance in bytes between the starts of two consecutive lines
 */
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
{
#if HAVE_MMX
    src+= 4*stride;
#if HAVE_MMX2
    __asm__ volatile(
        /* REG_a = line 1, REG_d = line 5 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"

        /* line 1 = median(line 0, line 1, line 2)
           = min(max(a,b), max(min(a,b), c)) */
        "movq (%0), %%mm0 \n\t"
        "movq (%%"REG_a", %1), %%mm2 \n\t"
        "movq (%%"REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm3, %%mm1 \n\t"
        "pmaxub %%mm2, %%mm1 \n\t"
        "pminub %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"REG_a") \n\t"

        /* line 3 = median(line 2, line 3, line 4) */
        "movq (%0, %1, 4), %%mm0 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm1, %%mm2 \n\t"
        "pminub %%mm3, %%mm1 \n\t"
        "pmaxub %%mm0, %%mm1 \n\t"
        "pminub %%mm1, %%mm2 \n\t"
        "movq %%mm2, (%%"REG_a", %1, 2) \n\t"

        /* line 5 = median(line 4, line 5, line 6) */
        "movq (%%"REG_d"), %%mm2 \n\t"
        "movq (%%"REG_d", %1), %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm0, %%mm2 \n\t"
        "pminub %%mm3, %%mm0 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm0, %%mm2 \n\t"
        "movq %%mm2, (%%"REG_d") \n\t"

        /* line 7 = median(line 6, line 7, line 8) */
        "movq (%%"REG_d", %1, 2), %%mm2 \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm0, %%mm2 \n\t"
        "pminub %%mm3, %%mm0 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm0, %%mm2 \n\t"
        "movq %%mm2, (%%"REG_d", %1, 2) \n\t"


        : : "r" (src), "r" ((x86_reg)stride)
        /* "memory": the asm reads and writes src[], which is not an operand */
        : "%"REG_a, "%"REG_d, "memory"
    );

#else // MMX without MMX2
    __asm__ volatile(
        /* REG_a = line 1, REG_d = line 5, mm7 = 0 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"


        "pxor %%mm7, %%mm7 \n\t"

/* b = median(a, b, c), selected with comparison masks built from saturated
   subtractions because pminub/pmaxub are not available here */
#define REAL_MEDIAN(a,b,c)\
        "movq " #a ", %%mm0 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "movq " #c ", %%mm1 \n\t"\
        "movq %%mm0, %%mm3 \n\t"\
        "movq %%mm1, %%mm4 \n\t"\
        "movq %%mm2, %%mm5 \n\t"\
        "psubusb %%mm1, %%mm3 \n\t"\
        "psubusb %%mm2, %%mm4 \n\t"\
        "psubusb %%mm0, %%mm5 \n\t"\
        "pcmpeqb %%mm7, %%mm3 \n\t"\
        "pcmpeqb %%mm7, %%mm4 \n\t"\
        "pcmpeqb %%mm7, %%mm5 \n\t"\
        "movq %%mm3, %%mm6 \n\t"\
        "pxor %%mm4, %%mm3 \n\t"\
        "pxor %%mm5, %%mm4 \n\t"\
        "pxor %%mm6, %%mm5 \n\t"\
        "por %%mm3, %%mm1 \n\t"\
        "por %%mm4, %%mm2 \n\t"\
        "por %%mm5, %%mm0 \n\t"\
        "pand %%mm2, %%mm0 \n\t"\
        "pand %%mm1, %%mm0 \n\t"\
        "movq %%mm0, " #b " \n\t"
#define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)

        /* outputs: line 1, line 3, line 5, line 7 */
        MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
        MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
        MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
        MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))

        : : "r" (src), "r" ((x86_reg)stride)
        /* "memory": the asm reads and writes src[], which is not an operand */
        : "%"REG_a, "%"REG_d, "memory"
    );
#endif //HAVE_MMX2
#else //HAVE_MMX
    int x, y;
    src+= 4*stride;

    for(x=0; x<8; x++){
        uint8_t *colsrc = src;
        for (y=0; y<4; y++){
            const int a = colsrc[0       ];
            const int b = colsrc[stride  ];
            const int c = colsrc[stride*2];
            /* median(a,b,c) = max(min(a,b), min(max(a,b), c)); plain
               comparisons give the same value as the former (x-y)>>31 sign
               masks without relying on arithmetic right shift of negative
               values or on int being 32 bits wide */
            const int lo = a < b ? a : b;
            const int hi = a < b ? b : a;
            const int capped = c < hi ? c : hi;

            colsrc[stride] = lo > capped ? lo : capped;
            colsrc += stride*2;
        }
        src++;
    }
#endif //HAVE_MMX
}
01910
01911 #if HAVE_MMX
01912
/**
 * Transposes an 8x8 byte block from src into two destination buffers.
 *
 * Column c of the source block (8 bytes, one per source line) is stored as
 * a row at byte offset 16*c of the destination:
 *   - columns 0..4 are written to dst1 + 128 + 16*c
 *   - columns 3..7 are written to dst2 + 16*c
 * Each row is written as two 4-byte halves: source lines 0..3 first, then
 * source lines 4..7 at an additional offset of 4 bytes.
 *
 * @param dst1      first destination; bytes 128..199 may be written
 * @param dst2      second destination; bytes 48..119 may be written
 * @param src       top-left byte of the 8x8 source block
 * @param srcStride distance in bytes between two consecutive source lines
 */
static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
    __asm__(
        /* REG_a = source line 1 */
        "lea (%0, %1), %%"REG_a" \n\t"

        /* interleave the bytes of lines 0/1 and of lines 2/3 */
        "movq (%0), %%mm0 \n\t"
        "movq (%%"REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"

        "movq (%%"REG_a", %1), %%mm1 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        /* interleave words: mm0 = columns 0,1  mm3 = columns 2,3
                             mm2 = columns 4,5  mm1 = columns 6,7 */
        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        /* store the first half (source lines 0..3) of every column */
        "movd %%mm0, 128(%2) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 144(%2) \n\t"
        "movd %%mm3, 160(%2) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 176(%2) \n\t"
        "movd %%mm3, 48(%3) \n\t"
        "movd %%mm2, 192(%2) \n\t"
        "movd %%mm2, 64(%3) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 80(%3) \n\t"
        "movd %%mm1, 96(%3) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 112(%3) \n\t"

        /* REG_a = source line 5; repeat for source lines 4..7 */
        "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"

        "movq (%0, %1, 4), %%mm0 \n\t"
        "movq (%%"REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"

        "movq (%%"REG_a", %1), %%mm1 \n\t"
        "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        /* store the second half (source lines 4..7) 4 bytes further on */
        "movd %%mm0, 132(%2) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 148(%2) \n\t"
        "movd %%mm3, 164(%2) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 180(%2) \n\t"
        "movd %%mm3, 52(%3) \n\t"
        "movd %%mm2, 196(%2) \n\t"
        "movd %%mm2, 68(%3) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 84(%3) \n\t"
        "movd %%mm1, 100(%3) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 116(%3) \n\t"


        :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
        /* "memory": the asm reads src[] and writes dst1[]/dst2[]; none of
           them is an operand, so the compiler must not cache or reorder
           accesses to them across this statement */
        : "%"REG_a, "memory"
    );
}
01996
/**
 * Transposes an 8x8 byte block whose source lines are 16 bytes apart.
 *
 * Source line j is read from src + 16*j. Column c of the source block is
 * written as destination line c, i.e. dst[c*dstStride + j] = src[16*j + c]
 * for c and j in 0..7. Each destination line is written as two 4-byte
 * halves (source lines 0..3, then source lines 4..7).
 *
 * @param dst       top-left byte of the 8x8 destination block
 * @param dstStride distance in bytes between two consecutive destination lines
 * @param src       source buffer; bytes 0..119 are read
 */
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
{
    __asm__(
        /* REG_a = destination line 1, REG_d = destination line 5 */
        "lea (%0, %1), %%"REG_a" \n\t"
        "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"

        /* interleave the bytes of source lines 0/1 and 2/3 */
        "movq (%2), %%mm0 \n\t"
        "movq 16(%2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"

        "movq 32(%2), %%mm1 \n\t"
        "movq 48(%2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        /* interleave words: mm0 = columns 0,1  mm3 = columns 2,3
                             mm2 = columns 4,5  mm1 = columns 6,7 */
        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        /* bytes 0..3 of destination lines 0..7 */
        "movd %%mm0, (%0) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, (%%"REG_a") \n\t"
        "movd %%mm3, (%%"REG_a", %1) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
        "movd %%mm2, (%0, %1, 4) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, (%%"REG_d") \n\t"
        "movd %%mm1, (%%"REG_d", %1) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, (%%"REG_d", %1, 2) \n\t"


        /* same again for source lines 4..7 */
        "movq 64(%2), %%mm0 \n\t"
        "movq 80(%2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"

        "movq 96(%2), %%mm1 \n\t"
        "movq 112(%2), %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "punpcklbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"

        "movq %%mm0, %%mm3 \n\t"
        "punpcklwd %%mm1, %%mm0 \n\t"
        "punpckhwd %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm1 \n\t"
        "punpcklwd %%mm4, %%mm2 \n\t"
        "punpckhwd %%mm4, %%mm1 \n\t"

        /* bytes 4..7 of destination lines 0..7 */
        "movd %%mm0, 4(%0) \n\t"
        "psrlq $32, %%mm0 \n\t"
        "movd %%mm0, 4(%%"REG_a") \n\t"
        "movd %%mm3, 4(%%"REG_a", %1) \n\t"
        "psrlq $32, %%mm3 \n\t"
        "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
        "movd %%mm2, 4(%0, %1, 4) \n\t"
        "psrlq $32, %%mm2 \n\t"
        "movd %%mm2, 4(%%"REG_d") \n\t"
        "movd %%mm1, 4(%%"REG_d", %1) \n\t"
        "psrlq $32, %%mm1 \n\t"
        "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"

        :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
        /* "memory": the asm reads src[] and writes dst[]; neither is an
           operand, so the compiler must not cache or reorder accesses to
           them across this statement */
        : "%"REG_a, "%"REG_d, "memory"
    );
}
02076 #endif //HAVE_MMX
02077
02078
02079 #if !HAVE_ALTIVEC
02080 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
02081 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
02082 {
02083
02084 tempBlurredPast[127]= maxNoise[0];
02085 tempBlurredPast[128]= maxNoise[1];
02086 tempBlurredPast[129]= maxNoise[2];
02087
02088 #define FAST_L2_DIFF
02089
02090 #if HAVE_MMX2 || HAVE_AMD3DNOW
02091 __asm__ volatile(
02092 "lea (%2, %2, 2), %%"REG_a" \n\t"
02093 "lea (%2, %2, 4), %%"REG_d" \n\t"
02094 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02095
02096
02097
02098 #ifdef L1_DIFF //needs mmx2
02099 "movq (%0), %%mm0 \n\t"
02100 "psadbw (%1), %%mm0 \n\t"
02101 "movq (%0, %2), %%mm1 \n\t"
02102 "psadbw (%1, %2), %%mm1 \n\t"
02103 "movq (%0, %2, 2), %%mm2 \n\t"
02104 "psadbw (%1, %2, 2), %%mm2 \n\t"
02105 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02106 "psadbw (%1, %%"REG_a"), %%mm3 \n\t"
02107
02108 "movq (%0, %2, 4), %%mm4 \n\t"
02109 "paddw %%mm1, %%mm0 \n\t"
02110 "psadbw (%1, %2, 4), %%mm4 \n\t"
02111 "movq (%0, %%"REG_d"), %%mm5 \n\t"
02112 "paddw %%mm2, %%mm0 \n\t"
02113 "psadbw (%1, %%"REG_d"), %%mm5 \n\t"
02114 "movq (%0, %%"REG_a", 2), %%mm6 \n\t"
02115 "paddw %%mm3, %%mm0 \n\t"
02116 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t"
02117 "movq (%0, %%"REG_c"), %%mm7 \n\t"
02118 "paddw %%mm4, %%mm0 \n\t"
02119 "psadbw (%1, %%"REG_c"), %%mm7 \n\t"
02120 "paddw %%mm5, %%mm6 \n\t"
02121 "paddw %%mm7, %%mm6 \n\t"
02122 "paddw %%mm6, %%mm0 \n\t"
02123 #else //L1_DIFF
02124 #if defined (FAST_L2_DIFF)
02125 "pcmpeqb %%mm7, %%mm7 \n\t"
02126 "movq "MANGLE(b80)", %%mm6 \n\t"
02127 "pxor %%mm0, %%mm0 \n\t"
02128 #define REAL_L2_DIFF_CORE(a, b)\
02129 "movq " #a ", %%mm5 \n\t"\
02130 "movq " #b ", %%mm2 \n\t"\
02131 "pxor %%mm7, %%mm2 \n\t"\
02132 PAVGB(%%mm2, %%mm5)\
02133 "paddb %%mm6, %%mm5 \n\t"\
02134 "movq %%mm5, %%mm2 \n\t"\
02135 "psllw $8, %%mm5 \n\t"\
02136 "pmaddwd %%mm5, %%mm5 \n\t"\
02137 "pmaddwd %%mm2, %%mm2 \n\t"\
02138 "paddd %%mm2, %%mm5 \n\t"\
02139 "psrld $14, %%mm5 \n\t"\
02140 "paddd %%mm5, %%mm0 \n\t"
02141
02142 #else //defined (FAST_L2_DIFF)
02143 "pxor %%mm7, %%mm7 \n\t"
02144 "pxor %%mm0, %%mm0 \n\t"
02145 #define REAL_L2_DIFF_CORE(a, b)\
02146 "movq " #a ", %%mm5 \n\t"\
02147 "movq " #b ", %%mm2 \n\t"\
02148 "movq %%mm5, %%mm1 \n\t"\
02149 "movq %%mm2, %%mm3 \n\t"\
02150 "punpcklbw %%mm7, %%mm5 \n\t"\
02151 "punpckhbw %%mm7, %%mm1 \n\t"\
02152 "punpcklbw %%mm7, %%mm2 \n\t"\
02153 "punpckhbw %%mm7, %%mm3 \n\t"\
02154 "psubw %%mm2, %%mm5 \n\t"\
02155 "psubw %%mm3, %%mm1 \n\t"\
02156 "pmaddwd %%mm5, %%mm5 \n\t"\
02157 "pmaddwd %%mm1, %%mm1 \n\t"\
02158 "paddd %%mm1, %%mm5 \n\t"\
02159 "paddd %%mm5, %%mm0 \n\t"
02160
02161 #endif //defined (FAST_L2_DIFF)
02162
02163 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
02164
02165 L2_DIFF_CORE((%0) , (%1))
02166 L2_DIFF_CORE((%0, %2) , (%1, %2))
02167 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
02168 L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
02169 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
02170 L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
02171 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
02172 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
02173
02174 #endif //L1_DIFF
02175
02176 "movq %%mm0, %%mm4 \n\t"
02177 "psrlq $32, %%mm0 \n\t"
02178 "paddd %%mm0, %%mm4 \n\t"
02179 "movd %%mm4, %%ecx \n\t"
02180 "shll $2, %%ecx \n\t"
02181 "mov %3, %%"REG_d" \n\t"
02182 "addl -4(%%"REG_d"), %%ecx \n\t"
02183 "addl 4(%%"REG_d"), %%ecx \n\t"
02184 "addl -1024(%%"REG_d"), %%ecx \n\t"
02185 "addl $4, %%ecx \n\t"
02186 "addl 1024(%%"REG_d"), %%ecx \n\t"
02187 "shrl $3, %%ecx \n\t"
02188 "movl %%ecx, (%%"REG_d") \n\t"
02189
02190
02191
02192
02193 "cmpl 512(%%"REG_d"), %%ecx \n\t"
02194 " jb 2f \n\t"
02195 "cmpl 516(%%"REG_d"), %%ecx \n\t"
02196 " jb 1f \n\t"
02197
02198 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02199 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02200 "movq (%0), %%mm0 \n\t"
02201 "movq (%0, %2), %%mm1 \n\t"
02202 "movq (%0, %2, 2), %%mm2 \n\t"
02203 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02204 "movq (%0, %2, 4), %%mm4 \n\t"
02205 "movq (%0, %%"REG_d"), %%mm5 \n\t"
02206 "movq (%0, %%"REG_a", 2), %%mm6 \n\t"
02207 "movq (%0, %%"REG_c"), %%mm7 \n\t"
02208 "movq %%mm0, (%1) \n\t"
02209 "movq %%mm1, (%1, %2) \n\t"
02210 "movq %%mm2, (%1, %2, 2) \n\t"
02211 "movq %%mm3, (%1, %%"REG_a") \n\t"
02212 "movq %%mm4, (%1, %2, 4) \n\t"
02213 "movq %%mm5, (%1, %%"REG_d") \n\t"
02214 "movq %%mm6, (%1, %%"REG_a", 2) \n\t"
02215 "movq %%mm7, (%1, %%"REG_c") \n\t"
02216 "jmp 4f \n\t"
02217
02218 "1: \n\t"
02219 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02220 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02221 "movq (%0), %%mm0 \n\t"
02222 PAVGB((%1), %%mm0)
02223 "movq (%0, %2), %%mm1 \n\t"
02224 PAVGB((%1, %2), %%mm1)
02225 "movq (%0, %2, 2), %%mm2 \n\t"
02226 PAVGB((%1, %2, 2), %%mm2)
02227 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02228 PAVGB((%1, %%REGa), %%mm3)
02229 "movq (%0, %2, 4), %%mm4 \n\t"
02230 PAVGB((%1, %2, 4), %%mm4)
02231 "movq (%0, %%"REG_d"), %%mm5 \n\t"
02232 PAVGB((%1, %%REGd), %%mm5)
02233 "movq (%0, %%"REG_a", 2), %%mm6 \n\t"
02234 PAVGB((%1, %%REGa, 2), %%mm6)
02235 "movq (%0, %%"REG_c"), %%mm7 \n\t"
02236 PAVGB((%1, %%REGc), %%mm7)
02237 "movq %%mm0, (%1) \n\t"
02238 "movq %%mm1, (%1, %2) \n\t"
02239 "movq %%mm2, (%1, %2, 2) \n\t"
02240 "movq %%mm3, (%1, %%"REG_a") \n\t"
02241 "movq %%mm4, (%1, %2, 4) \n\t"
02242 "movq %%mm5, (%1, %%"REG_d") \n\t"
02243 "movq %%mm6, (%1, %%"REG_a", 2) \n\t"
02244 "movq %%mm7, (%1, %%"REG_c") \n\t"
02245 "movq %%mm0, (%0) \n\t"
02246 "movq %%mm1, (%0, %2) \n\t"
02247 "movq %%mm2, (%0, %2, 2) \n\t"
02248 "movq %%mm3, (%0, %%"REG_a") \n\t"
02249 "movq %%mm4, (%0, %2, 4) \n\t"
02250 "movq %%mm5, (%0, %%"REG_d") \n\t"
02251 "movq %%mm6, (%0, %%"REG_a", 2) \n\t"
02252 "movq %%mm7, (%0, %%"REG_c") \n\t"
02253 "jmp 4f \n\t"
02254
02255 "2: \n\t"
02256 "cmpl 508(%%"REG_d"), %%ecx \n\t"
02257 " jb 3f \n\t"
02258
02259 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02260 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02261 "movq (%0), %%mm0 \n\t"
02262 "movq (%0, %2), %%mm1 \n\t"
02263 "movq (%0, %2, 2), %%mm2 \n\t"
02264 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02265 "movq (%1), %%mm4 \n\t"
02266 "movq (%1, %2), %%mm5 \n\t"
02267 "movq (%1, %2, 2), %%mm6 \n\t"
02268 "movq (%1, %%"REG_a"), %%mm7 \n\t"
02269 PAVGB(%%mm4, %%mm0)
02270 PAVGB(%%mm5, %%mm1)
02271 PAVGB(%%mm6, %%mm2)
02272 PAVGB(%%mm7, %%mm3)
02273 PAVGB(%%mm4, %%mm0)
02274 PAVGB(%%mm5, %%mm1)
02275 PAVGB(%%mm6, %%mm2)
02276 PAVGB(%%mm7, %%mm3)
02277 "movq %%mm0, (%1) \n\t"
02278 "movq %%mm1, (%1, %2) \n\t"
02279 "movq %%mm2, (%1, %2, 2) \n\t"
02280 "movq %%mm3, (%1, %%"REG_a") \n\t"
02281 "movq %%mm0, (%0) \n\t"
02282 "movq %%mm1, (%0, %2) \n\t"
02283 "movq %%mm2, (%0, %2, 2) \n\t"
02284 "movq %%mm3, (%0, %%"REG_a") \n\t"
02285
02286 "movq (%0, %2, 4), %%mm0 \n\t"
02287 "movq (%0, %%"REG_d"), %%mm1 \n\t"
02288 "movq (%0, %%"REG_a", 2), %%mm2 \n\t"
02289 "movq (%0, %%"REG_c"), %%mm3 \n\t"
02290 "movq (%1, %2, 4), %%mm4 \n\t"
02291 "movq (%1, %%"REG_d"), %%mm5 \n\t"
02292 "movq (%1, %%"REG_a", 2), %%mm6 \n\t"
02293 "movq (%1, %%"REG_c"), %%mm7 \n\t"
02294 PAVGB(%%mm4, %%mm0)
02295 PAVGB(%%mm5, %%mm1)
02296 PAVGB(%%mm6, %%mm2)
02297 PAVGB(%%mm7, %%mm3)
02298 PAVGB(%%mm4, %%mm0)
02299 PAVGB(%%mm5, %%mm1)
02300 PAVGB(%%mm6, %%mm2)
02301 PAVGB(%%mm7, %%mm3)
02302 "movq %%mm0, (%1, %2, 4) \n\t"
02303 "movq %%mm1, (%1, %%"REG_d") \n\t"
02304 "movq %%mm2, (%1, %%"REG_a", 2) \n\t"
02305 "movq %%mm3, (%1, %%"REG_c") \n\t"
02306 "movq %%mm0, (%0, %2, 4) \n\t"
02307 "movq %%mm1, (%0, %%"REG_d") \n\t"
02308 "movq %%mm2, (%0, %%"REG_a", 2) \n\t"
02309 "movq %%mm3, (%0, %%"REG_c") \n\t"
02310 "jmp 4f \n\t"
02311
02312 "3: \n\t"
02313 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02314 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02315 "movq (%0), %%mm0 \n\t"
02316 "movq (%0, %2), %%mm1 \n\t"
02317 "movq (%0, %2, 2), %%mm2 \n\t"
02318 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02319 "movq (%1), %%mm4 \n\t"
02320 "movq (%1, %2), %%mm5 \n\t"
02321 "movq (%1, %2, 2), %%mm6 \n\t"
02322 "movq (%1, %%"REG_a"), %%mm7 \n\t"
02323 PAVGB(%%mm4, %%mm0)
02324 PAVGB(%%mm5, %%mm1)
02325 PAVGB(%%mm6, %%mm2)
02326 PAVGB(%%mm7, %%mm3)
02327 PAVGB(%%mm4, %%mm0)
02328 PAVGB(%%mm5, %%mm1)
02329 PAVGB(%%mm6, %%mm2)
02330 PAVGB(%%mm7, %%mm3)
02331 PAVGB(%%mm4, %%mm0)
02332 PAVGB(%%mm5, %%mm1)
02333 PAVGB(%%mm6, %%mm2)
02334 PAVGB(%%mm7, %%mm3)
02335 "movq %%mm0, (%1) \n\t"
02336 "movq %%mm1, (%1, %2) \n\t"
02337 "movq %%mm2, (%1, %2, 2) \n\t"
02338 "movq %%mm3, (%1, %%"REG_a") \n\t"
02339 "movq %%mm0, (%0) \n\t"
02340 "movq %%mm1, (%0, %2) \n\t"
02341 "movq %%mm2, (%0, %2, 2) \n\t"
02342 "movq %%mm3, (%0, %%"REG_a") \n\t"
02343
02344 "movq (%0, %2, 4), %%mm0 \n\t"
02345 "movq (%0, %%"REG_d"), %%mm1 \n\t"
02346 "movq (%0, %%"REG_a", 2), %%mm2 \n\t"
02347 "movq (%0, %%"REG_c"), %%mm3 \n\t"
02348 "movq (%1, %2, 4), %%mm4 \n\t"
02349 "movq (%1, %%"REG_d"), %%mm5 \n\t"
02350 "movq (%1, %%"REG_a", 2), %%mm6 \n\t"
02351 "movq (%1, %%"REG_c"), %%mm7 \n\t"
02352 PAVGB(%%mm4, %%mm0)
02353 PAVGB(%%mm5, %%mm1)
02354 PAVGB(%%mm6, %%mm2)
02355 PAVGB(%%mm7, %%mm3)
02356 PAVGB(%%mm4, %%mm0)
02357 PAVGB(%%mm5, %%mm1)
02358 PAVGB(%%mm6, %%mm2)
02359 PAVGB(%%mm7, %%mm3)
02360 PAVGB(%%mm4, %%mm0)
02361 PAVGB(%%mm5, %%mm1)
02362 PAVGB(%%mm6, %%mm2)
02363 PAVGB(%%mm7, %%mm3)
02364 "movq %%mm0, (%1, %2, 4) \n\t"
02365 "movq %%mm1, (%1, %%"REG_d") \n\t"
02366 "movq %%mm2, (%1, %%"REG_a", 2) \n\t"
02367 "movq %%mm3, (%1, %%"REG_c") \n\t"
02368 "movq %%mm0, (%0, %2, 4) \n\t"
02369 "movq %%mm1, (%0, %%"REG_d") \n\t"
02370 "movq %%mm2, (%0, %%"REG_a", 2) \n\t"
02371 "movq %%mm3, (%0, %%"REG_c") \n\t"
02372
02373 "4: \n\t"
02374
02375 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
02376 : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
02377 );
02378 #else //HAVE_MMX2 || HAVE_AMD3DNOW
02379 {
02380 int y;
02381 int d=0;
02382
02383 int i;
02384
02385 for(y=0; y<8; y++){
02386 int x;
02387 for(x=0; x<8; x++){
02388 int ref= tempBlurred[ x + y*stride ];
02389 int cur= src[ x + y*stride ];
02390 int d1=ref - cur;
02391
02392
02393
02394 d+= d1*d1;
02395
02396 }
02397 }
02398 i=d;
02399 d= (
02400 4*d
02401 +(*(tempBlurredPast-256))
02402 +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
02403 +(*(tempBlurredPast+256))
02404 +4)>>3;
02405 *tempBlurredPast=i;
02406
02407
02408
02409
02410
02411
02412
02413
02414
02415 if(d > maxNoise[1]){
02416 if(d < maxNoise[2]){
02417 for(y=0; y<8; y++){
02418 int x;
02419 for(x=0; x<8; x++){
02420 int ref= tempBlurred[ x + y*stride ];
02421 int cur= src[ x + y*stride ];
02422 tempBlurred[ x + y*stride ]=
02423 src[ x + y*stride ]=
02424 (ref + cur + 1)>>1;
02425 }
02426 }
02427 }else{
02428 for(y=0; y<8; y++){
02429 int x;
02430 for(x=0; x<8; x++){
02431 tempBlurred[ x + y*stride ]= src[ x + y*stride ];
02432 }
02433 }
02434 }
02435 }else{
02436 if(d < maxNoise[0]){
02437 for(y=0; y<8; y++){
02438 int x;
02439 for(x=0; x<8; x++){
02440 int ref= tempBlurred[ x + y*stride ];
02441 int cur= src[ x + y*stride ];
02442 tempBlurred[ x + y*stride ]=
02443 src[ x + y*stride ]=
02444 (ref*7 + cur + 4)>>3;
02445 }
02446 }
02447 }else{
02448 for(y=0; y<8; y++){
02449 int x;
02450 for(x=0; x<8; x++){
02451 int ref= tempBlurred[ x + y*stride ];
02452 int cur= src[ x + y*stride ];
02453 tempBlurred[ x + y*stride ]=
02454 src[ x + y*stride ]=
02455 (ref*3 + cur + 2)>>2;
02456 }
02457 }
02458 }
02459 }
02460 }
02461 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
02462 }
02463 #endif //HAVE_ALTIVEC
02464
02465 #if HAVE_MMX
02466
/**
 * Filters one 8-byte-wide group of columns along the direction given by step,
 * using MMX inline assembly.
 *
 * After advancing src by 3*step, the ten rows src .. src+9*step are examined
 * and two per-byte-column masks are produced (dc_mask, eq_mask).
 *  - If any column is set in both masks, running sums are built in sums[] and
 *    eight rows are rewritten for exactly those columns.
 *  - If eq_mask is not all ones, a second correction is computed from eight
 *    rows and applied to two rows, only in columns whose eq_mask byte is clear.
 *
 * @param src    pointer into the pixel data; advanced locally
 * @param step   byte distance between consecutive rows handled by the asm
 * @param stride not referenced anywhere in this body
 * @param c      context; reads mmxDcOffset[], mmxDcThreshold[], nonBQP, pQPb
 *               and ppMode.flatnessThreshold
 *
 * NOTE(review): the asm statements read and write pixel memory through
 * register operands without a "memory" clobber -- confirm this is intended.
 */
02469 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
02470 int64_t dc_mask, eq_mask, both_masks;
// Scratch buffer for the running sums; the asm below stores ten 16-byte entries (byte offsets 0..159).
02471 int64_t sums[10*8*2];
02472 src+= step*3;
02473
// mm7 = mmxDcOffset[nonBQP], mm6 = mmxDcThreshold[nonBQP]; both are consumed by the next asm statement.
02474 __asm__ volatile(
02475 "movq %0, %%mm7 \n\t"
02476 "movq %1, %%mm6 \n\t"
02477 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
02478 );
02479
// Classification pass over the ten rows src .. src+9*step (8 bytes each).
// mm0 accumulates, per byte column, the signed compare (rowN - rowN+1 + mm7) > mm6
// for the nine adjacent row pairs (each hit adds 0xFF, i.e. -1).
// mm4 / mm3 track the per-column unsigned max / min of rows 1..8.
02480 __asm__ volatile(
02481 "lea (%2, %3), %%"REG_a" \n\t"
02482
02483
02484
02485 "movq (%2), %%mm0 \n\t"
02486 "movq (%%"REG_a"), %%mm1 \n\t"
02487 "movq %%mm1, %%mm3 \n\t"
02488 "movq %%mm1, %%mm4 \n\t"
02489 "psubb %%mm1, %%mm0 \n\t"
02490 "paddb %%mm7, %%mm0 \n\t"
02491 "pcmpgtb %%mm6, %%mm0 \n\t"
02492
02493 "movq (%%"REG_a",%3), %%mm2 \n\t"
02494 PMAXUB(%%mm2, %%mm4)
02495 PMINUB(%%mm2, %%mm3, %%mm5)
02496 "psubb %%mm2, %%mm1 \n\t"
02497 "paddb %%mm7, %%mm1 \n\t"
02498 "pcmpgtb %%mm6, %%mm1 \n\t"
02499 "paddb %%mm1, %%mm0 \n\t"
02500
02501 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
02502 PMAXUB(%%mm1, %%mm4)
02503 PMINUB(%%mm1, %%mm3, %%mm5)
02504 "psubb %%mm1, %%mm2 \n\t"
02505 "paddb %%mm7, %%mm2 \n\t"
02506 "pcmpgtb %%mm6, %%mm2 \n\t"
02507 "paddb %%mm2, %%mm0 \n\t"
02508
// REG_a moves from src+step to src+5*step.
02509 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
02510
02511 "movq (%2, %3, 4), %%mm2 \n\t"
02512 PMAXUB(%%mm2, %%mm4)
02513 PMINUB(%%mm2, %%mm3, %%mm5)
02514 "psubb %%mm2, %%mm1 \n\t"
02515 "paddb %%mm7, %%mm1 \n\t"
02516 "pcmpgtb %%mm6, %%mm1 \n\t"
02517 "paddb %%mm1, %%mm0 \n\t"
02518
02519 "movq (%%"REG_a"), %%mm1 \n\t"
02520 PMAXUB(%%mm1, %%mm4)
02521 PMINUB(%%mm1, %%mm3, %%mm5)
02522 "psubb %%mm1, %%mm2 \n\t"
02523 "paddb %%mm7, %%mm2 \n\t"
02524 "pcmpgtb %%mm6, %%mm2 \n\t"
02525 "paddb %%mm2, %%mm0 \n\t"
02526
02527 "movq (%%"REG_a", %3), %%mm2 \n\t"
02528 PMAXUB(%%mm2, %%mm4)
02529 PMINUB(%%mm2, %%mm3, %%mm5)
02530 "psubb %%mm2, %%mm1 \n\t"
02531 "paddb %%mm7, %%mm1 \n\t"
02532 "pcmpgtb %%mm6, %%mm1 \n\t"
02533 "paddb %%mm1, %%mm0 \n\t"
02534
02535 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
02536 PMAXUB(%%mm1, %%mm4)
02537 PMINUB(%%mm1, %%mm3, %%mm5)
02538 "psubb %%mm1, %%mm2 \n\t"
02539 "paddb %%mm7, %%mm2 \n\t"
02540 "pcmpgtb %%mm6, %%mm2 \n\t"
02541 "paddb %%mm2, %%mm0 \n\t"
02542
02543 "movq (%2, %3, 8), %%mm2 \n\t"
02544 PMAXUB(%%mm2, %%mm4)
02545 PMINUB(%%mm2, %%mm3, %%mm5)
02546 "psubb %%mm2, %%mm1 \n\t"
02547 "paddb %%mm7, %%mm1 \n\t"
02548 "pcmpgtb %%mm6, %%mm1 \n\t"
02549 "paddb %%mm1, %%mm0 \n\t"
02550
// Last row (src+9*step) only takes part in the pair compare, not in max/min.
02551 "movq (%%"REG_a", %3, 4), %%mm1 \n\t"
02552 "psubb %%mm1, %%mm2 \n\t"
02553 "paddb %%mm7, %%mm2 \n\t"
02554 "pcmpgtb %%mm6, %%mm2 \n\t"
02555 "paddb %%mm2, %%mm0 \n\t"
// mm4 = max - min per column.
02556 "psubusb %%mm3, %%mm4 \n\t"
02557
// dc_mask (operand %1): 0xFF in each column where saturated 2*pQPb exceeds max-min, else 0.
02558 "pxor %%mm6, %%mm6 \n\t"
02559 "movq %4, %%mm7 \n\t"
02560 "paddusb %%mm7, %%mm7 \n\t"
02561 "psubusb %%mm4, %%mm7 \n\t"
02562 "pcmpeqb %%mm6, %%mm7 \n\t"
02563 "pcmpeqb %%mm6, %%mm7 \n\t"
02564 "movq %%mm7, %1 \n\t"
02565
// eq_mask (operand %0): the low byte of flatnessThreshold is broadcast to all 8 bytes;
// a column gets 0xFF where its count of passing pair compares (0 - mm0) is greater (signed).
02566 "movq %5, %%mm7 \n\t"
02567 "punpcklbw %%mm7, %%mm7 \n\t"
02568 "punpcklbw %%mm7, %%mm7 \n\t"
02569 "punpcklbw %%mm7, %%mm7 \n\t"
02570 "psubb %%mm0, %%mm6 \n\t"
02571 "pcmpgtb %%mm7, %%mm6 \n\t"
02572 "movq %%mm6, %0 \n\t"
02573
02574 : "=m" (eq_mask), "=m" (dc_mask)
02575 : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
02576 : "%"REG_a
02577 );
02578
02579 both_masks = dc_mask & eq_mask;
02580
02581 if(both_masks){
// Negative start offset for the filter loop below, which counts up towards zero in increments of step.
02582 x86_reg offset= -8*step;
02583 int64_t *temp_sums= sums;
02584
// Build the running sums in sums[]. Each 16-byte entry holds eight 16-bit words
// (columns 0-3 from punpcklbw, columns 4-7 from punpckhbw).
02585 __asm__ volatile(
02586 "movq %2, %%mm0 \n\t"
02587 "pxor %%mm4, %%mm4 \n\t"
02588
// mm6 = per column: row 0 where |row0 - row1| < pQPb, otherwise row 1.
02589 "movq (%0), %%mm6 \n\t"
02590 "movq (%0, %1), %%mm5 \n\t"
02591 "movq %%mm5, %%mm1 \n\t"
02592 "movq %%mm6, %%mm2 \n\t"
02593 "psubusb %%mm6, %%mm5 \n\t"
02594 "psubusb %%mm1, %%mm2 \n\t"
02595 "por %%mm5, %%mm2 \n\t"
02596 "psubusb %%mm2, %%mm0 \n\t"
02597 "pcmpeqb %%mm4, %%mm0 \n\t"
02598
02599 "pxor %%mm6, %%mm1 \n\t"
02600 "pand %%mm0, %%mm1 \n\t"
02601 "pxor %%mm1, %%mm6 \n\t"
02602
02603
// mm7 = per column: row 9 where |row8 - row9| < pQPb, otherwise row 8.
// The "add" in between advances %0 by one row, so both loads use the same addressing form.
02604 "movq (%0, %1, 8), %%mm5 \n\t"
02605 "add %1, %0 \n\t"
02606 "movq (%0, %1, 8), %%mm7 \n\t"
02607 "movq %%mm5, %%mm1 \n\t"
02608 "movq %%mm7, %%mm2 \n\t"
02609 "psubusb %%mm7, %%mm5 \n\t"
02610 "psubusb %%mm1, %%mm2 \n\t"
02611 "por %%mm5, %%mm2 \n\t"
02612 "movq %2, %%mm0 \n\t"
02613 "psubusb %%mm2, %%mm0 \n\t"
02614 "pcmpeqb %%mm4, %%mm0 \n\t"
02615
02616 "pxor %%mm7, %%mm1 \n\t"
02617 "pand %%mm0, %%mm1 \n\t"
02618 "pxor %%mm1, %%mm7 \n\t"
02619
// Widen the selected first row to 16-bit words: mm5 = low half, mm6 = high half.
02620 "movq %%mm6, %%mm5 \n\t"
02621 "punpckhbw %%mm4, %%mm6 \n\t"
02622 "punpcklbw %%mm4, %%mm5 \n\t"
02623
02624
// Accumulators mm0/mm1 start at 4*first plus the constant w04 (defined outside this block).
02625 "movq %%mm5, %%mm0 \n\t"
02626 "movq %%mm6, %%mm1 \n\t"
02627 "psllw $2, %%mm0 \n\t"
02628 "psllw $2, %%mm1 \n\t"
02629 "paddw "MANGLE(w04)", %%mm0 \n\t"
02630 "paddw "MANGLE(w04)", %%mm1 \n\t"
02631
// NEXT: load the row at %0, advance %0 by one row, and add the widened row to mm0/mm1.
02632 #define NEXT\
02633 "movq (%0), %%mm2 \n\t"\
02634 "movq (%0), %%mm3 \n\t"\
02635 "add %1, %0 \n\t"\
02636 "punpcklbw %%mm4, %%mm2 \n\t"\
02637 "punpckhbw %%mm4, %%mm3 \n\t"\
02638 "paddw %%mm2, %%mm0 \n\t"\
02639 "paddw %%mm3, %%mm1 \n\t"
02640
// PREV: same load/advance as NEXT, but the widened row is subtracted from mm0/mm1.
02641 #define PREV\
02642 "movq (%0), %%mm2 \n\t"\
02643 "movq (%0), %%mm3 \n\t"\
02644 "add %1, %0 \n\t"\
02645 "punpcklbw %%mm4, %%mm2 \n\t"\
02646 "punpckhbw %%mm4, %%mm3 \n\t"\
02647 "psubw %%mm2, %%mm0 \n\t"\
02648 "psubw %%mm3, %%mm1 \n\t"
02649
02650
02651 NEXT
02652 NEXT
02653 NEXT
02654 "movq %%mm0, (%3) \n\t"
02655 "movq %%mm1, 8(%3) \n\t"
02656
// From here each step adds one more row and removes one copy of the first row.
02657 NEXT
02658 "psubw %%mm5, %%mm0 \n\t"
02659 "psubw %%mm6, %%mm1 \n\t"
02660 "movq %%mm0, 16(%3) \n\t"
02661 "movq %%mm1, 24(%3) \n\t"
02662
02663 NEXT
02664 "psubw %%mm5, %%mm0 \n\t"
02665 "psubw %%mm6, %%mm1 \n\t"
02666 "movq %%mm0, 32(%3) \n\t"
02667 "movq %%mm1, 40(%3) \n\t"
02668
02669 NEXT
02670 "psubw %%mm5, %%mm0 \n\t"
02671 "psubw %%mm6, %%mm1 \n\t"
02672 "movq %%mm0, 48(%3) \n\t"
02673 "movq %%mm1, 56(%3) \n\t"
02674
02675 NEXT
02676 "psubw %%mm5, %%mm0 \n\t"
02677 "psubw %%mm6, %%mm1 \n\t"
02678 "movq %%mm0, 64(%3) \n\t"
02679 "movq %%mm1, 72(%3) \n\t"
02680
// Widen the selected last row: mm6 = low half, mm7 = high half.
02681 "movq %%mm7, %%mm6 \n\t"
02682 "punpckhbw %%mm4, %%mm7 \n\t"
02683 "punpcklbw %%mm4, %%mm6 \n\t"
02684
// Rewind %0 to the original src (operand %4) plus one row, then drop leading rows
// with PREV while adding one copy of the last row per step.
02685 NEXT
02686 "mov %4, %0 \n\t"
02687 "add %1, %0 \n\t"
02688 PREV
02689 "movq %%mm0, 80(%3) \n\t"
02690 "movq %%mm1, 88(%3) \n\t"
02691
02692 PREV
02693 "paddw %%mm6, %%mm0 \n\t"
02694 "paddw %%mm7, %%mm1 \n\t"
02695 "movq %%mm0, 96(%3) \n\t"
02696 "movq %%mm1, 104(%3) \n\t"
02697
02698 PREV
02699 "paddw %%mm6, %%mm0 \n\t"
02700 "paddw %%mm7, %%mm1 \n\t"
02701 "movq %%mm0, 112(%3) \n\t"
02702 "movq %%mm1, 120(%3) \n\t"
02703
02704 PREV
02705 "paddw %%mm6, %%mm0 \n\t"
02706 "paddw %%mm7, %%mm1 \n\t"
02707 "movq %%mm0, 128(%3) \n\t"
02708 "movq %%mm1, 136(%3) \n\t"
02709
02710 PREV
02711 "paddw %%mm6, %%mm0 \n\t"
02712 "paddw %%mm7, %%mm1 \n\t"
02713 "movq %%mm0, 144(%3) \n\t"
02714 "movq %%mm1, 152(%3) \n\t"
02715
// Restore %0 so that src leaves the asm with the value it had on entry.
02716 "mov %4, %0 \n\t"
02717
02718 : "+&r"(src)
02719 : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
02720 );
02721
02722 src+= step;
02723
// Filter loop: mm6 = both_masks, mm5 = ~both_masks, mm7 = 0.
// Per iteration: (entry[k] + entry[k+2] + 2*pixel) >> 4 is packed back to bytes and
// written only into columns selected by both_masks; other columns keep their byte.
// The row address is (src - offset) + offset, so the first row written is src; with
// step > 0 the loop runs 8 times (offset climbs from -8*step to 0).
02724 __asm__ volatile(
02725 "movq %4, %%mm6 \n\t"
02726 "pcmpeqb %%mm5, %%mm5 \n\t"
02727 "pxor %%mm6, %%mm5 \n\t"
02728 "pxor %%mm7, %%mm7 \n\t"
02729
02730 "1: \n\t"
02731 "movq (%1), %%mm0 \n\t"
02732 "movq 8(%1), %%mm1 \n\t"
02733 "paddw 32(%1), %%mm0 \n\t"
02734 "paddw 40(%1), %%mm1 \n\t"
02735 "movq (%0, %3), %%mm2 \n\t"
02736 "movq %%mm2, %%mm3 \n\t"
02737 "movq %%mm2, %%mm4 \n\t"
02738 "punpcklbw %%mm7, %%mm2 \n\t"
02739 "punpckhbw %%mm7, %%mm3 \n\t"
02740 "paddw %%mm2, %%mm0 \n\t"
02741 "paddw %%mm3, %%mm1 \n\t"
02742 "paddw %%mm2, %%mm0 \n\t"
02743 "paddw %%mm3, %%mm1 \n\t"
02744 "psrlw $4, %%mm0 \n\t"
02745 "psrlw $4, %%mm1 \n\t"
02746 "packuswb %%mm1, %%mm0 \n\t"
02747 "pand %%mm6, %%mm0 \n\t"
02748 "pand %%mm5, %%mm4 \n\t"
02749 "por %%mm4, %%mm0 \n\t"
02750 "movq %%mm0, (%0, %3) \n\t"
02751 "add $16, %1 \n\t"
02752 "add %2, %0 \n\t"
02753 " js 1b \n\t"
02754
02755 : "+r"(offset), "+r"(temp_sums)
02756 : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
02757 );
02758 }else
// Keep src at the same position as after the branch above.
02759 src+= step;
02760
// Second filter: only needed when at least one column is clear in eq_mask.
02761 if(eq_mask != -1LL){
02762 uint8_t *temp_src= src;
// 32 bytes of scratch; the asm stores two pairs of intermediate word vectors at offsets 0/8 and 16/24.
02763 DECLARE_ALIGNED(8, uint64_t, tmp)[4];
// Reads rows temp_src .. temp_src+7*step, widened to 16-bit words with mm7 = 0,
// and finally modifies rows temp_src+3*step and temp_src+4*step.
02764 __asm__ volatile(
02765 "pxor %%mm7, %%mm7 \n\t"
02766
02767
02768
02769 "movq (%0), %%mm0 \n\t"
02770 "movq %%mm0, %%mm1 \n\t"
02771 "punpcklbw %%mm7, %%mm0 \n\t"
02772 "punpckhbw %%mm7, %%mm1 \n\t"
02773
02774 "movq (%0, %1), %%mm2 \n\t"
02775 "lea (%0, %1, 2), %%"REG_a" \n\t"
02776 "movq %%mm2, %%mm3 \n\t"
02777 "punpcklbw %%mm7, %%mm2 \n\t"
02778 "punpckhbw %%mm7, %%mm3 \n\t"
02779
02780 "movq (%%"REG_a"), %%mm4 \n\t"
02781 "movq %%mm4, %%mm5 \n\t"
02782 "punpcklbw %%mm7, %%mm4 \n\t"
02783 "punpckhbw %%mm7, %%mm5 \n\t"
02784
02785 "paddw %%mm0, %%mm0 \n\t"
02786 "paddw %%mm1, %%mm1 \n\t"
02787 "psubw %%mm4, %%mm2 \n\t"
02788 "psubw %%mm5, %%mm3 \n\t"
02789 "psubw %%mm2, %%mm0 \n\t"
02790 "psubw %%mm3, %%mm1 \n\t"
02791
02792 "psllw $2, %%mm2 \n\t"
02793 "psllw $2, %%mm3 \n\t"
02794 "psubw %%mm2, %%mm0 \n\t"
02795 "psubw %%mm3, %%mm1 \n\t"
02796
02797 "movq (%%"REG_a", %1), %%mm2 \n\t"
02798 "movq %%mm2, %%mm3 \n\t"
02799 "punpcklbw %%mm7, %%mm2 \n\t"
02800 "punpckhbw %%mm7, %%mm3 \n\t"
02801
02802 "psubw %%mm2, %%mm0 \n\t"
02803 "psubw %%mm3, %%mm1 \n\t"
02804 "psubw %%mm2, %%mm0 \n\t"
02805 "psubw %%mm3, %%mm1 \n\t"
// Park the first intermediate result in tmp[0..1]; it is reloaded further down.
02806 "movq %%mm0, (%4) \n\t"
02807 "movq %%mm1, 8(%4) \n\t"
02808
02809 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
02810 "movq %%mm0, %%mm1 \n\t"
02811 "punpcklbw %%mm7, %%mm0 \n\t"
02812 "punpckhbw %%mm7, %%mm1 \n\t"
02813
02814 "psubw %%mm0, %%mm2 \n\t"
02815 "psubw %%mm1, %%mm3 \n\t"
// tmp[2..3] = widened (row 3 - row 4); reloaded later as the bound for the correction.
02816 "movq %%mm2, 16(%4) \n\t"
02817 "movq %%mm3, 24(%4) \n\t"
02818 "paddw %%mm4, %%mm4 \n\t"
02819 "paddw %%mm5, %%mm5 \n\t"
02820 "psubw %%mm2, %%mm4 \n\t"
02821 "psubw %%mm3, %%mm5 \n\t"
02822
// %0 now points at temp_src + 3*step (REG_a is temp_src + 2*step).
02823 "lea (%%"REG_a", %1), %0 \n\t"
02824 "psllw $2, %%mm2 \n\t"
02825 "psllw $2, %%mm3 \n\t"
02826 "psubw %%mm2, %%mm4 \n\t"
02827 "psubw %%mm3, %%mm5 \n\t"
02828
02829 "movq (%0, %1, 2), %%mm2 \n\t"
02830 "movq %%mm2, %%mm3 \n\t"
02831 "punpcklbw %%mm7, %%mm2 \n\t"
02832 "punpckhbw %%mm7, %%mm3 \n\t"
02833 "psubw %%mm2, %%mm4 \n\t"
02834 "psubw %%mm3, %%mm5 \n\t"
02835 "psubw %%mm2, %%mm4 \n\t"
02836 "psubw %%mm3, %%mm5 \n\t"
02837
// The same row is loaded twice because mm6 is the only free register for both halves.
02838 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
02839 "punpcklbw %%mm7, %%mm6 \n\t"
02840 "psubw %%mm6, %%mm2 \n\t"
02841 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
02842 "punpckhbw %%mm7, %%mm6 \n\t"
02843 "psubw %%mm6, %%mm3 \n\t"
02844
02845 "paddw %%mm0, %%mm0 \n\t"
02846 "paddw %%mm1, %%mm1 \n\t"
02847 "psubw %%mm2, %%mm0 \n\t"
02848 "psubw %%mm3, %%mm1 \n\t"
02849
02850 "psllw $2, %%mm2 \n\t"
02851 "psllw $2, %%mm3 \n\t"
02852 "psubw %%mm2, %%mm0 \n\t"
02853 "psubw %%mm3, %%mm1 \n\t"
02854
02855 "movq (%0, %1, 4), %%mm2 \n\t"
02856 "movq %%mm2, %%mm3 \n\t"
02857 "punpcklbw %%mm7, %%mm2 \n\t"
02858 "punpckhbw %%mm7, %%mm3 \n\t"
02859
02860 "paddw %%mm2, %%mm2 \n\t"
02861 "paddw %%mm3, %%mm3 \n\t"
02862 "psubw %%mm2, %%mm0 \n\t"
02863 "psubw %%mm3, %%mm1 \n\t"
02864
// Reload the first intermediate result saved in tmp[0..1].
02865 "movq (%4), %%mm2 \n\t"
02866 "movq 8(%4), %%mm3 \n\t"
02867
// Replace mm0..mm3 by their per-word absolute values (two equivalent instruction sequences).
02868 #if HAVE_MMX2
02869 "movq %%mm7, %%mm6 \n\t"
02870 "psubw %%mm0, %%mm6 \n\t"
02871 "pmaxsw %%mm6, %%mm0 \n\t"
02872 "movq %%mm7, %%mm6 \n\t"
02873 "psubw %%mm1, %%mm6 \n\t"
02874 "pmaxsw %%mm6, %%mm1 \n\t"
02875 "movq %%mm7, %%mm6 \n\t"
02876 "psubw %%mm2, %%mm6 \n\t"
02877 "pmaxsw %%mm6, %%mm2 \n\t"
02878 "movq %%mm7, %%mm6 \n\t"
02879 "psubw %%mm3, %%mm6 \n\t"
02880 "pmaxsw %%mm6, %%mm3 \n\t"
02881 #else
02882 "movq %%mm7, %%mm6 \n\t"
02883 "pcmpgtw %%mm0, %%mm6 \n\t"
02884 "pxor %%mm6, %%mm0 \n\t"
02885 "psubw %%mm6, %%mm0 \n\t"
02886 "movq %%mm7, %%mm6 \n\t"
02887 "pcmpgtw %%mm1, %%mm6 \n\t"
02888 "pxor %%mm6, %%mm1 \n\t"
02889 "psubw %%mm6, %%mm1 \n\t"
02890 "movq %%mm7, %%mm6 \n\t"
02891 "pcmpgtw %%mm2, %%mm6 \n\t"
02892 "pxor %%mm6, %%mm2 \n\t"
02893 "psubw %%mm6, %%mm2 \n\t"
02894 "movq %%mm7, %%mm6 \n\t"
02895 "pcmpgtw %%mm3, %%mm6 \n\t"
02896 "pxor %%mm6, %%mm3 \n\t"
02897 "psubw %%mm6, %%mm3 \n\t"
02898 #endif
02899
// mm0 = min(mm0, mm2), mm1 = min(mm1, mm3) per word.
02900 #if HAVE_MMX2
02901 "pminsw %%mm2, %%mm0 \n\t"
02902 "pminsw %%mm3, %%mm1 \n\t"
02903 #else
02904 "movq %%mm0, %%mm6 \n\t"
02905 "psubusw %%mm2, %%mm6 \n\t"
02906 "psubw %%mm6, %%mm0 \n\t"
02907 "movq %%mm1, %%mm6 \n\t"
02908 "psubusw %%mm3, %%mm6 \n\t"
02909 "psubw %%mm6, %%mm1 \n\t"
02910 #endif
02911
// mm2 = low four bytes of pQPb widened to words (scaled by 8 a few lines below).
02912 "movd %2, %%mm2 \n\t"
02913 "punpcklbw %%mm7, %%mm2 \n\t"
02914
// Take |mm4| and |mm5| while keeping their sign masks in mm6 and mm7 for later.
02915 "movq %%mm7, %%mm6 \n\t"
02916 "pcmpgtw %%mm4, %%mm6 \n\t"
02917 "pxor %%mm6, %%mm4 \n\t"
02918 "psubw %%mm6, %%mm4 \n\t"
02919 "pcmpgtw %%mm5, %%mm7 \n\t"
02920 "pxor %%mm7, %%mm5 \n\t"
02921 "psubw %%mm7, %%mm5 \n\t"
02922
// Zero the words of mm4/mm5 that are not below 8*pQPb.
02923 "psllw $3, %%mm2 \n\t"
02924 "movq %%mm2, %%mm3 \n\t"
02925 "pcmpgtw %%mm4, %%mm2 \n\t"
02926 "pcmpgtw %%mm5, %%mm3 \n\t"
02927 "pand %%mm2, %%mm4 \n\t"
02928 "pand %%mm3, %%mm5 \n\t"
02929
02930
02931 "psubusw %%mm0, %%mm4 \n\t"
02932 "psubusw %%mm1, %%mm5 \n\t"
02933
02934
// Scale: (x * w05 + w20) >> 6; w05 and w20 are constants defined outside this block.
02935 "movq "MANGLE(w05)", %%mm2 \n\t"
02936 "pmullw %%mm2, %%mm4 \n\t"
02937 "pmullw %%mm2, %%mm5 \n\t"
02938 "movq "MANGLE(w20)", %%mm2 \n\t"
02939 "paddw %%mm2, %%mm4 \n\t"
02940 "paddw %%mm2, %%mm5 \n\t"
02941 "psrlw $6, %%mm4 \n\t"
02942 "psrlw $6, %%mm5 \n\t"
02943
// Reload the row difference saved in tmp[2..3]; mm0/mm1 become half its absolute value,
// mm2/mm3 its sign masks.
02944 "movq 16(%4), %%mm0 \n\t"
02945 "movq 24(%4), %%mm1 \n\t"
02946
02947 "pxor %%mm2, %%mm2 \n\t"
02948 "pxor %%mm3, %%mm3 \n\t"
02949
02950 "pcmpgtw %%mm0, %%mm2 \n\t"
02951 "pcmpgtw %%mm1, %%mm3 \n\t"
02952 "pxor %%mm2, %%mm0 \n\t"
02953 "pxor %%mm3, %%mm1 \n\t"
02954 "psubw %%mm2, %%mm0 \n\t"
02955 "psubw %%mm3, %%mm1 \n\t"
02956 "psrlw $1, %%mm0 \n\t"
02957 "psrlw $1, %%mm1 \n\t"
02958
// Keep the correction only where the two sign masks differ.
02959 "pxor %%mm6, %%mm2 \n\t"
02960 "pxor %%mm7, %%mm3 \n\t"
02961 "pand %%mm2, %%mm4 \n\t"
02962 "pand %%mm3, %%mm5 \n\t"
02963
// Clamp the correction to the halved difference (per-word minimum).
02964 #if HAVE_MMX2
02965 "pminsw %%mm0, %%mm4 \n\t"
02966 "pminsw %%mm1, %%mm5 \n\t"
02967 #else
02968 "movq %%mm4, %%mm2 \n\t"
02969 "psubusw %%mm0, %%mm2 \n\t"
02970 "psubw %%mm2, %%mm4 \n\t"
02971 "movq %%mm5, %%mm2 \n\t"
02972 "psubusw %%mm1, %%mm2 \n\t"
02973 "psubw %%mm2, %%mm5 \n\t"
02974 #endif
// Re-apply the sign kept in mm6/mm7, pack to signed bytes, and mask out columns set in eq_mask.
02975 "pxor %%mm6, %%mm4 \n\t"
02976 "pxor %%mm7, %%mm5 \n\t"
02977 "psubw %%mm6, %%mm4 \n\t"
02978 "psubw %%mm7, %%mm5 \n\t"
02979 "packsswb %%mm5, %%mm4 \n\t"
02980 "movq %3, %%mm1 \n\t"
02981 "pandn %%mm4, %%mm1 \n\t"
// Add the correction to the row at %0 and subtract it from the following row.
02982 "movq (%0), %%mm0 \n\t"
02983 "paddb %%mm1, %%mm0 \n\t"
02984 "movq %%mm0, (%0) \n\t"
02985 "movq (%0, %1), %%mm0 \n\t"
02986 "psubb %%mm1, %%mm0 \n\t"
02987 "movq %%mm0, (%0, %1) \n\t"
02988
02989 : "+r" (temp_src)
02990 : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
02991 : "%"REG_a
02992 );
02993 }
02994
02995
02996
02997
02998
02999
03000 }
03001 #endif //HAVE_MMX
03002
// Forward declaration; the definition of postProcess appears further down in this file.
03003 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
03004 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
03005
// Drop any earlier definitions; blockCopy defines its own variants of these macros below.
03010 #undef REAL_SCALED_CPY
03011 #undef SCALED_CPY
03012
/**
 * Copies a block of 8 rows from src to dst.
 *
 * The MMX paths move 8 bytes per row; the C fallback copies BLOCK_SIZE bytes
 * per row. With HAVE_MMX and a non-zero levelFix every byte is additionally
 * rescaled using the two 64-bit values at packedOffsetAndScale (the first is
 * loaded into mm2 and subtracted, the second into mm3 and used as multiplier).
 * Without HAVE_MMX both branches are the same plain memcpy loop, so levelFix
 * has no effect in that configuration.
 *
 * @param dst                  destination block
 * @param dstStride            byte distance between destination rows
 * @param src                  source block
 * @param srcStride            byte distance between source rows
 * @param levelFix             non-zero selects the rescaling copy (MMX builds only)
 * @param packedOffsetAndScale pointer to two consecutive 64-bit values; only
 *                             dereferenced in the MMX levelFix path
 *
 * NOTE(review): both asm statements store through dst without a "memory"
 * clobber -- confirm this is intended.
 */
03013 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
03014 int levelFix, int64_t *packedOffsetAndScale)
03015 {
03016 #if !HAVE_MMX
03017 int i;
03018 #endif
03019 if(levelFix){
03020 #if HAVE_MMX
// REG_a enters holding packedOffsetAndScale (constraint "a"); once mm2/mm3 are loaded
// it is reused as the pointer to the second source row, REG_d as the second destination row.
03021 __asm__ volatile(
03022 "movq (%%"REG_a"), %%mm2 \n\t"
03023 "movq 8(%%"REG_a"), %%mm3 \n\t"
03024 "lea (%2,%4), %%"REG_a" \n\t"
03025 "lea (%3,%5), %%"REG_d" \n\t"
03026 "pxor %%mm4, %%mm4 \n\t"
// MMX2 variant: each byte is duplicated into both halves of a word, multiplied with
// pmulhuw by mm3, then mm2 is subtracted and the result is packed with unsigned saturation.
03027 #if HAVE_MMX2
03028 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
03029 "movq " #src1 ", %%mm0 \n\t"\
03030 "movq " #src1 ", %%mm5 \n\t"\
03031 "movq " #src2 ", %%mm1 \n\t"\
03032 "movq " #src2 ", %%mm6 \n\t"\
03033 "punpcklbw %%mm0, %%mm0 \n\t"\
03034 "punpckhbw %%mm5, %%mm5 \n\t"\
03035 "punpcklbw %%mm1, %%mm1 \n\t"\
03036 "punpckhbw %%mm6, %%mm6 \n\t"\
03037 "pmulhuw %%mm3, %%mm0 \n\t"\
03038 "pmulhuw %%mm3, %%mm5 \n\t"\
03039 "pmulhuw %%mm3, %%mm1 \n\t"\
03040 "pmulhuw %%mm3, %%mm6 \n\t"\
03041 "psubw %%mm2, %%mm0 \n\t"\
03042 "psubw %%mm2, %%mm5 \n\t"\
03043 "psubw %%mm2, %%mm1 \n\t"\
03044 "psubw %%mm2, %%mm6 \n\t"\
03045 "packuswb %%mm5, %%mm0 \n\t"\
03046 "packuswb %%mm6, %%mm1 \n\t"\
03047 "movq %%mm0, " #dst1 " \n\t"\
03048 "movq %%mm1, " #dst2 " \n\t"\
03049
03050 #else //HAVE_MMX2
// Plain MMX variant: bytes are zero-extended to words, mm2 is subtracted first, the
// result is shifted left by 6 and multiplied with pmulhw by mm3 before packing.
03051 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
03052 "movq " #src1 ", %%mm0 \n\t"\
03053 "movq " #src1 ", %%mm5 \n\t"\
03054 "punpcklbw %%mm4, %%mm0 \n\t"\
03055 "punpckhbw %%mm4, %%mm5 \n\t"\
03056 "psubw %%mm2, %%mm0 \n\t"\
03057 "psubw %%mm2, %%mm5 \n\t"\
03058 "movq " #src2 ", %%mm1 \n\t"\
03059 "psllw $6, %%mm0 \n\t"\
03060 "psllw $6, %%mm5 \n\t"\
03061 "pmulhw %%mm3, %%mm0 \n\t"\
03062 "movq " #src2 ", %%mm6 \n\t"\
03063 "pmulhw %%mm3, %%mm5 \n\t"\
03064 "punpcklbw %%mm4, %%mm1 \n\t"\
03065 "punpckhbw %%mm4, %%mm6 \n\t"\
03066 "psubw %%mm2, %%mm1 \n\t"\
03067 "psubw %%mm2, %%mm6 \n\t"\
03068 "psllw $6, %%mm1 \n\t"\
03069 "psllw $6, %%mm6 \n\t"\
03070 "pmulhw %%mm3, %%mm1 \n\t"\
03071 "pmulhw %%mm3, %%mm6 \n\t"\
03072 "packuswb %%mm5, %%mm0 \n\t"\
03073 "packuswb %%mm6, %%mm1 \n\t"\
03074 "movq %%mm0, " #dst1 " \n\t"\
03075 "movq %%mm1, " #dst2 " \n\t"\
03076
03077 #endif //HAVE_MMX2
// Extra macro level so that arguments such as REGa are expanded before being stringified.
03078 #define SCALED_CPY(src1, src2, dst1, dst2)\
03079 REAL_SCALED_CPY(src1, src2, dst1, dst2)
03080
// Four invocations, two rows each: rows 0/1, 2/3, 4/5, then (after advancing
// REG_a/REG_d by four rows) 6/7.
03081 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
03082 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
03083 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
03084 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
03085 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
03086 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
03087
03088
// The pointer is both input ("0") and early-clobber output ("=&a") because the asm overwrites REG_a.
03089 : "=&a" (packedOffsetAndScale)
03090 : "0" (packedOffsetAndScale),
03091 "r"(src),
03092 "r"(dst),
03093 "r" ((x86_reg)srcStride),
03094 "r" ((x86_reg)dstStride)
03095 : "%"REG_d
03096 );
03097 #else //HAVE_MMX
// C fallback: identical to the non-levelFix branch below, i.e. no rescaling is applied here.
03098 for(i=0; i<8; i++)
03099 memcpy( &(dst[dstStride*i]),
03100 &(src[srcStride*i]), BLOCK_SIZE);
03101 #endif //HAVE_MMX
03102 }else{
03103 #if HAVE_MMX
// Plain 8x8 byte copy, two rows per SIMPLE_CPY, same row addressing as the scaled path.
03104 __asm__ volatile(
03105 "lea (%0,%2), %%"REG_a" \n\t"
03106 "lea (%1,%3), %%"REG_d" \n\t"
03107
03108 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
03109 "movq " #src1 ", %%mm0 \n\t"\
03110 "movq " #src2 ", %%mm1 \n\t"\
03111 "movq %%mm0, " #dst1 " \n\t"\
03112 "movq %%mm1, " #dst2 " \n\t"\
03113
03114 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
03115 REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
03116
03117 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
03118 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
03119 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
03120 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
03121 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
03122 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
03123
03124 : : "r" (src),
03125 "r" (dst),
03126 "r" ((x86_reg)srcStride),
03127 "r" ((x86_reg)dstStride)
03128 : "%"REG_a, "%"REG_d
03129 );
03130 #else //HAVE_MMX
03131 for(i=0; i<8; i++)
03132 memcpy( &(dst[dstStride*i]),
03133 &(src[srcStride*i]), BLOCK_SIZE);
03134 #endif //HAVE_MMX
03135 }
03136 }
03137
03141 static inline void RENAME(duplicate)(uint8_t src[], int stride)
03142 {
03143 #if HAVE_MMX
03144 __asm__ volatile(
03145 "movq (%0), %%mm0 \n\t"
03146 "add %1, %0 \n\t"
03147 "movq %%mm0, (%0) \n\t"
03148 "movq %%mm0, (%0, %1) \n\t"
03149 "movq %%mm0, (%0, %1, 2) \n\t"
03150 : "+r" (src)
03151 : "r" ((x86_reg)-stride)
03152 );
03153 #else
03154 int i;
03155 uint8_t *p=src;
03156 for(i=0; i<3; i++){
03157 p-= stride;
03158 memcpy(p, src, 8);
03159 }
03160 #endif
03161 }
03162
03166 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
03167 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
03168 {
03169 DECLARE_ALIGNED(8, PPContext, c)= *c2;
03170 int x,y;
03171 #ifdef COMPILE_TIME_MODE
03172 const int mode= COMPILE_TIME_MODE;
03173 #else
03174 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
03175 #endif
03176 int black=0, white=255;
03177 int QPCorrecture= 256*256;
03178
03179 int copyAhead;
03180 #if HAVE_MMX
03181 int i;
03182 #endif
03183
03184 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
03185 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
03186
03187
03188 uint64_t * const yHistogram= c.yHistogram;
03189 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
03190 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
03191
03192
03193 #if HAVE_MMX
03194 for(i=0; i<57; i++){
03195 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
03196 int threshold= offset*2 + 1;
03197 c.mmxDcOffset[i]= 0x7F - offset;
03198 c.mmxDcThreshold[i]= 0x7F - threshold;
03199 c.mmxDcOffset[i]*= 0x0101010101010101LL;
03200 c.mmxDcThreshold[i]*= 0x0101010101010101LL;
03201 }
03202 #endif
03203
03204 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
03205 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
03206 || (mode & FFMPEG_DEINT_FILTER)
03207 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
03208 else if( (mode & V_DEBLOCK)
03209 || (mode & LINEAR_IPOL_DEINT_FILTER)
03210 || (mode & MEDIAN_DEINT_FILTER)
03211 || (mode & V_A_DEBLOCK)) copyAhead=13;
03212 else if(mode & V_X1_FILTER) copyAhead=11;
03213
03214 else if(mode & DERING) copyAhead=9;
03215 else copyAhead=8;
03216
03217 copyAhead-= 8;
03218
03219 if(!isColor){
03220 uint64_t sum= 0;
03221 int i;
03222 uint64_t maxClipped;
03223 uint64_t clipped;
03224 double scale;
03225
03226 c.frameNum++;
03227
03228 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
03229
03230 for(i=0; i<256; i++){
03231 sum+= yHistogram[i];
03232 }
03233
03234
03235 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
03236
03237 clipped= sum;
03238 for(black=255; black>0; black--){
03239 if(clipped < maxClipped) break;
03240 clipped-= yHistogram[black];
03241 }
03242
03243 clipped= sum;
03244 for(white=0; white<256; white++){
03245 if(clipped < maxClipped) break;
03246 clipped-= yHistogram[white];
03247 }
03248
03249 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
03250
03251 #if HAVE_MMX2
03252 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
03253 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
03254 #else
03255 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
03256 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
03257 #endif
03258
03259 c.packedYOffset|= c.packedYOffset<<32;
03260 c.packedYOffset|= c.packedYOffset<<16;
03261
03262 c.packedYScale|= c.packedYScale<<32;
03263 c.packedYScale|= c.packedYScale<<16;
03264
03265 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
03266 else QPCorrecture= 256*256;
03267 }else{
03268 c.packedYScale= 0x0100010001000100LL;
03269 c.packedYOffset= 0;
03270 QPCorrecture= 256*256;
03271 }
03272
03273
03274 y=-BLOCK_SIZE;
03275 {
03276 const uint8_t *srcBlock= &(src[y*srcStride]);
03277 uint8_t *dstBlock= tempDst + dstStride;
03278
03279
03280
03281
03282 for(x=0; x<width; x+=BLOCK_SIZE){
03283
03284 #if HAVE_MMX2
03285
03286
03287
03288
03289
03290
03291
03292 __asm__(
03293 "mov %4, %%"REG_a" \n\t"
03294 "shr $2, %%"REG_a" \n\t"
03295 "and $6, %%"REG_a" \n\t"
03296 "add %5, %%"REG_a" \n\t"
03297 "mov %%"REG_a", %%"REG_d" \n\t"
03298 "imul %1, %%"REG_a" \n\t"
03299 "imul %3, %%"REG_d" \n\t"
03300 "prefetchnta 32(%%"REG_a", %0) \n\t"
03301 "prefetcht0 32(%%"REG_d", %2) \n\t"
03302 "add %1, %%"REG_a" \n\t"
03303 "add %3, %%"REG_d" \n\t"
03304 "prefetchnta 32(%%"REG_a", %0) \n\t"
03305 "prefetcht0 32(%%"REG_d", %2) \n\t"
03306 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
03307 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
03308 : "%"REG_a, "%"REG_d
03309 );
03310
03311 #elif HAVE_AMD3DNOW
03312
03313
03314
03315
03316
03317
03318 #endif
03319
03320 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
03321 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
03322
03323 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
03324
03325 if(mode & LINEAR_IPOL_DEINT_FILTER)
03326 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
03327 else if(mode & LINEAR_BLEND_DEINT_FILTER)
03328 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
03329 else if(mode & MEDIAN_DEINT_FILTER)
03330 RENAME(deInterlaceMedian)(dstBlock, dstStride);
03331 else if(mode & CUBIC_IPOL_DEINT_FILTER)
03332 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
03333 else if(mode & FFMPEG_DEINT_FILTER)
03334 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
03335 else if(mode & LOWPASS5_DEINT_FILTER)
03336 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
03337
03338
03339
03340 dstBlock+=8;
03341 srcBlock+=8;
03342 }
03343 if(width==FFABS(dstStride))
03344 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
03345 else{
03346 int i;
03347 for(i=0; i<copyAhead; i++){
03348 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
03349 }
03350 }
03351 }
03352
03353 for(y=0; y<height; y+=BLOCK_SIZE){
03354
03355 const uint8_t *srcBlock= &(src[y*srcStride]);
03356 uint8_t *dstBlock= &(dst[y*dstStride]);
03357 #if HAVE_MMX
03358 uint8_t *tempBlock1= c.tempBlocks;
03359 uint8_t *tempBlock2= c.tempBlocks + 8;
03360 #endif
03361 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
03362 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
03363 int QP=0;
03364
03365
03366 if(y+15 >= height){
03367 int i;
03368
03369
03370 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
03371 FFMAX(height-y-copyAhead, 0), srcStride);
03372
03373
03374 for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
03375 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
03376
03377
03378 linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
03379
03380
03381 for(i=height-y+1; i<=copyAhead; i++)
03382 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
03383
03384 dstBlock= tempDst + dstStride;
03385 srcBlock= tempSrc;
03386 }
03387
03388
03389
03390
03391 for(x=0; x<width; x+=BLOCK_SIZE){
03392 const int stride= dstStride;
03393 #if HAVE_MMX
03394 uint8_t *tmpXchg;
03395 #endif
03396 if(isColor){
03397 QP= QPptr[x>>qpHShift];
03398 c.nonBQP= nonBQPptr[x>>qpHShift];
03399 }else{
03400 QP= QPptr[x>>4];
03401 QP= (QP* QPCorrecture + 256*128)>>16;
03402 c.nonBQP= nonBQPptr[x>>4];
03403 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
03404 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
03405 }
03406 c.QP= QP;
03407 #if HAVE_MMX
03408 __asm__ volatile(
03409 "movd %1, %%mm7 \n\t"
03410 "packuswb %%mm7, %%mm7 \n\t"
03411 "packuswb %%mm7, %%mm7 \n\t"
03412 "packuswb %%mm7, %%mm7 \n\t"
03413 "movq %%mm7, %0 \n\t"
03414 : "=m" (c.pQPb)
03415 : "r" (QP)
03416 );
03417 #endif
03418
03419
03420 #if HAVE_MMX2
03421
03422
03423
03424
03425
03426
03427
03428 __asm__(
03429 "mov %4, %%"REG_a" \n\t"
03430 "shr $2, %%"REG_a" \n\t"
03431 "and $6, %%"REG_a" \n\t"
03432 "add %5, %%"REG_a" \n\t"
03433 "mov %%"REG_a", %%"REG_d" \n\t"
03434 "imul %1, %%"REG_a" \n\t"
03435 "imul %3, %%"REG_d" \n\t"
03436 "prefetchnta 32(%%"REG_a", %0) \n\t"
03437 "prefetcht0 32(%%"REG_d", %2) \n\t"
03438 "add %1, %%"REG_a" \n\t"
03439 "add %3, %%"REG_d" \n\t"
03440 "prefetchnta 32(%%"REG_a", %0) \n\t"
03441 "prefetcht0 32(%%"REG_d", %2) \n\t"
03442 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
03443 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
03444 : "%"REG_a, "%"REG_d
03445 );
03446
03447 #elif HAVE_AMD3DNOW
03448
03449
03450
03451
03452
03453
03454 #endif
03455
03456 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
03457 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
03458
03459 if(mode & LINEAR_IPOL_DEINT_FILTER)
03460 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
03461 else if(mode & LINEAR_BLEND_DEINT_FILTER)
03462 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
03463 else if(mode & MEDIAN_DEINT_FILTER)
03464 RENAME(deInterlaceMedian)(dstBlock, dstStride);
03465 else if(mode & CUBIC_IPOL_DEINT_FILTER)
03466 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
03467 else if(mode & FFMPEG_DEINT_FILTER)
03468 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
03469 else if(mode & LOWPASS5_DEINT_FILTER)
03470 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
03471
03472
03473
03474
03475
03476 if(y + 8 < height){
03477 if(mode & V_X1_FILTER)
03478 RENAME(vertX1Filter)(dstBlock, stride, &c);
03479 else if(mode & V_DEBLOCK){
03480 const int t= RENAME(vertClassify)(dstBlock, stride, &c);
03481
03482 if(t==1)
03483 RENAME(doVertLowPass)(dstBlock, stride, &c);
03484 else if(t==2)
03485 RENAME(doVertDefFilter)(dstBlock, stride, &c);
03486 }else if(mode & V_A_DEBLOCK){
03487 RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
03488 }
03489 }
03490
03491 #if HAVE_MMX
03492 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
03493 #endif
03494
03495 if(x - 8 >= 0){
03496 #if HAVE_MMX
03497 if(mode & H_X1_FILTER)
03498 RENAME(vertX1Filter)(tempBlock1, 16, &c);
03499 else if(mode & H_DEBLOCK){
03500
03501 const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
03502
03503 if(t==1)
03504 RENAME(doVertLowPass)(tempBlock1, 16, &c);
03505 else if(t==2)
03506 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
03507 }else if(mode & H_A_DEBLOCK){
03508 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
03509 }
03510
03511 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
03512
03513 #else
03514 if(mode & H_X1_FILTER)
03515 horizX1Filter(dstBlock-4, stride, QP);
03516 else if(mode & H_DEBLOCK){
03517 #if HAVE_ALTIVEC
03518 DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
03519 int t;
03520 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
03521
03522 t = vertClassify_altivec(tempBlock-48, 16, &c);
03523 if(t==1) {
03524 doVertLowPass_altivec(tempBlock-48, 16, &c);
03525 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
03526 }
03527 else if(t==2) {
03528 doVertDefFilter_altivec(tempBlock-48, 16, &c);
03529 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
03530 }
03531 #else
03532 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
03533
03534 if(t==1)
03535 RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
03536 else if(t==2)
03537 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
03538 #endif
03539 }else if(mode & H_A_DEBLOCK){
03540 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
03541 }
03542 #endif //HAVE_MMX
03543 if(mode & DERING){
03544
03545 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
03546 }
03547
03548 if(mode & TEMP_NOISE_FILTER)
03549 {
03550 RENAME(tempNoiseReducer)(dstBlock-8, stride,
03551 c.tempBlurred[isColor] + y*dstStride + x,
03552 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
03553 c.ppMode.maxTmpNoise);
03554 }
03555 }
03556
03557 dstBlock+=8;
03558 srcBlock+=8;
03559
03560 #if HAVE_MMX
03561 tmpXchg= tempBlock1;
03562 tempBlock1= tempBlock2;
03563 tempBlock2 = tmpXchg;
03564 #endif
03565 }
03566
03567 if(mode & DERING){
03568 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
03569 }
03570
03571 if((mode & TEMP_NOISE_FILTER)){
03572 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
03573 c.tempBlurred[isColor] + y*dstStride + x,
03574 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
03575 c.ppMode.maxTmpNoise);
03576 }
03577
03578
03579 if(y+15 >= height){
03580 uint8_t *dstBlock= &(dst[y*dstStride]);
03581 if(width==FFABS(dstStride))
03582 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
03583 else{
03584 int i;
03585 for(i=0; i<height-y; i++){
03586 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
03587 }
03588 }
03589 }
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599 }
03600 #if HAVE_AMD3DNOW
03601 __asm__ volatile("femms");
03602 #elif HAVE_MMX
03603 __asm__ volatile("emms");
03604 #endif
03605
03606 #ifdef DEBUG_BRIGHTNESS
03607 if(!isColor){
03608 int max=1;
03609 int i;
03610 for(i=0; i<256; i++)
03611 if(yHistogram[i] > max) max=yHistogram[i];
03612
03613 for(i=1; i<256; i++){
03614 int x;
03615 int start=yHistogram[i-1]/(max/256+1);
03616 int end=yHistogram[i]/(max/256+1);
03617 int inc= end > start ? 1 : -1;
03618 for(x=start; x!=end+inc; x+=inc)
03619 dst[ i*dstStride + x]+=128;
03620 }
03621
03622 for(i=0; i<100; i+=2){
03623 dst[ (white)*dstStride + i]+=128;
03624 dst[ (black)*dstStride + i]+=128;
03625 }
03626 }
03627 #endif
03628
03629 *c2= c;
03630
03631 }