00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
/*
 * put_pixels8_x2: for every row, combine the 8 source bytes at offsets 0..7
 * with the 8 source bytes at offsets 1..8 using PAVGB and store the 8 result
 * bytes to block.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows are handled per loop pass and the loop
 *             only exits when the counter reaches exactly zero, so h has to
 *             be a positive multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 * NOTE(review): PAVGB is a string macro defined outside this chunk;
 * presumably a packed unsigned-byte average instruction - confirm which
 * one and how it rounds.
 */
00030 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00031 {
00032 __asm__ volatile(
/* REG_a = line_size + line_size: the two-row pointer step */
00033 "lea (%3, %3), %%"REG_a" \n\t"
00034 "1: \n\t"
/* rows 0 and 1 of this pass */
00035 "movq (%1), %%mm0 \n\t"
00036 "movq (%1, %3), %%mm1 \n\t"
00037 PAVGB" 1(%1), %%mm0 \n\t"
00038 PAVGB" 1(%1, %3), %%mm1 \n\t"
00039 "movq %%mm0, (%2) \n\t"
00040 "movq %%mm1, (%2, %3) \n\t"
/* move source and destination down two rows */
00041 "add %%"REG_a", %1 \n\t"
00042 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 of this pass */
00043 "movq (%1), %%mm0 \n\t"
00044 "movq (%1, %3), %%mm1 \n\t"
00045 PAVGB" 1(%1), %%mm0 \n\t"
00046 PAVGB" 1(%1, %3), %%mm1 \n\t"
00047 "add %%"REG_a", %1 \n\t"
00048 "movq %%mm0, (%2) \n\t"
00049 "movq %%mm1, (%2, %3) \n\t"
00050 "add %%"REG_a", %2 \n\t"
/* four rows done; repeat until the counter is exactly zero */
00051 "subl $4, %0 \n\t"
00052 "jnz 1b \n\t"
00053 :"+g"(h), "+S"(pixels), "+D"(block)
00054 :"r" ((x86_reg)line_size)
00055 :"%"REG_a, "memory");
00056 }
00057
00058 #ifndef SKIP_FOR_3DNOW
/*
 * put_pixels8_l2: each 8-byte destination row is PAVGB of the matching
 * src1 row and src2 row.
 *
 * dst, dstStride   - destination and its byte stride
 * src1, src1Stride - first source and its byte stride
 * src2             - second source; it has no stride argument and is
 *                    stepped by 8 bytes per row (rows stored back to back)
 * h                - row count. If bit 0 of h is set, one row is processed
 *                    first and h is decremented. The main loop then does
 *                    four rows per pass and exits only when h reaches
 *                    exactly zero. There is no test between the decrement
 *                    and the loop entry, so the loop always runs at least
 *                    once.
 *
 * asm operands: %0 = h, %1 = src1 (a), %2 = src2 (c), %3 = dst (d),
 *               %4 = src1Stride (S), %5 = dstStride (D).
 * h lives in memory when HAVE_EBX_AVAILABLE is 0, otherwise in the b register.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00059 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00060 {
00061 __asm__ volatile(
/* odd h: peel off a single row, otherwise jump straight to the loop */
00062 "testl $1, %0 \n\t"
00063 " jz 1f \n\t"
00064 "movq (%1), %%mm0 \n\t"
00065 "movq (%2), %%mm1 \n\t"
00066 "add %4, %1 \n\t"
00067 "add $8, %2 \n\t"
00068 PAVGB" %%mm1, %%mm0 \n\t"
00069 "movq %%mm0, (%3) \n\t"
00070 "add %5, %3 \n\t"
00071 "decl %0 \n\t"
00072 "1: \n\t"
/* rows 0 and 1: src2 rows at byte offsets 0 and 8 */
00073 "movq (%1), %%mm0 \n\t"
00074 "add %4, %1 \n\t"
00075 "movq (%1), %%mm1 \n\t"
00076 "add %4, %1 \n\t"
00077 PAVGB" (%2), %%mm0 \n\t"
00078 PAVGB" 8(%2), %%mm1 \n\t"
00079 "movq %%mm0, (%3) \n\t"
00080 "add %5, %3 \n\t"
00081 "movq %%mm1, (%3) \n\t"
00082 "add %5, %3 \n\t"
/* rows 2 and 3: src2 rows at byte offsets 16 and 24 */
00083 "movq (%1), %%mm0 \n\t"
00084 "add %4, %1 \n\t"
00085 "movq (%1), %%mm1 \n\t"
00086 "add %4, %1 \n\t"
00087 PAVGB" 16(%2), %%mm0 \n\t"
00088 PAVGB" 24(%2), %%mm1 \n\t"
00089 "movq %%mm0, (%3) \n\t"
00090 "add %5, %3 \n\t"
00091 "movq %%mm1, (%3) \n\t"
00092 "add %5, %3 \n\t"
/* src2 consumed 4 rows * 8 bytes */
00093 "add $32, %2 \n\t"
00094 "subl $4, %0 \n\t"
00095 "jnz 1b \n\t"
00096 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00097 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00098 #else
00099 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00100 #endif
00101 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00102 :"memory");
00103
00104
00105
00106
00107 }
00108
/*
 * put_no_rnd_pixels8_l2: same data flow as the plain 8-wide l2 put (src1 row
 * combined with src2 row, 8 bytes per row, src2 stepped by 8 bytes per row),
 * but every input is bit-inverted before PAVGB and the result is inverted
 * again before the store, i.e. dst = ~PAVGB(~src1, ~src2).
 * NOTE(review): assuming PAVGB (macro defined outside this chunk) is a
 * byte average that rounds up, the double inversion yields an average that
 * rounds down, which would match "no_rnd" in the name - confirm.
 *
 * dst, dstStride   - destination and its byte stride
 * src1, src1Stride - first source and its byte stride
 * src2             - second source, 8 bytes per row, rows back to back
 * h                - row count; an odd h gets one row peeled off first, then
 *                    the loop handles four rows per pass and exits only when
 *                    h reaches exactly zero (the loop always runs at least
 *                    once)
 *
 * asm operands: %0 = h, %1 = src1 (a), %2 = src2 (c), %3 = dst (d),
 *               %4 = src1Stride (S), %5 = dstStride (D).
 * mm6 holds the all-ones mask for the whole function.
 */
00109 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00110 {
00111 __asm__ volatile(
/* comparing mm6 with itself sets every bit: mm6 = inversion mask */
00112 "pcmpeqb %%mm6, %%mm6 \n\t"
/* odd h: handle one row before entering the 4-row loop */
00113 "testl $1, %0 \n\t"
00114 " jz 1f \n\t"
00115 "movq (%1), %%mm0 \n\t"
00116 "movq (%2), %%mm1 \n\t"
00117 "add %4, %1 \n\t"
00118 "add $8, %2 \n\t"
00119 "pxor %%mm6, %%mm0 \n\t"
00120 "pxor %%mm6, %%mm1 \n\t"
00121 PAVGB" %%mm1, %%mm0 \n\t"
00122 "pxor %%mm6, %%mm0 \n\t"
00123 "movq %%mm0, (%3) \n\t"
00124 "add %5, %3 \n\t"
00125 "decl %0 \n\t"
00126 "1: \n\t"
/* rows 0 and 1: src1 in mm0/mm1, src2 (offsets 0 and 8) in mm2/mm3 */
00127 "movq (%1), %%mm0 \n\t"
00128 "add %4, %1 \n\t"
00129 "movq (%1), %%mm1 \n\t"
00130 "add %4, %1 \n\t"
00131 "movq (%2), %%mm2 \n\t"
00132 "movq 8(%2), %%mm3 \n\t"
/* invert inputs, combine, invert result */
00133 "pxor %%mm6, %%mm0 \n\t"
00134 "pxor %%mm6, %%mm1 \n\t"
00135 "pxor %%mm6, %%mm2 \n\t"
00136 "pxor %%mm6, %%mm3 \n\t"
00137 PAVGB" %%mm2, %%mm0 \n\t"
00138 PAVGB" %%mm3, %%mm1 \n\t"
00139 "pxor %%mm6, %%mm0 \n\t"
00140 "pxor %%mm6, %%mm1 \n\t"
00141 "movq %%mm0, (%3) \n\t"
00142 "add %5, %3 \n\t"
00143 "movq %%mm1, (%3) \n\t"
00144 "add %5, %3 \n\t"
/* rows 2 and 3: src2 rows at byte offsets 16 and 24 */
00145 "movq (%1), %%mm0 \n\t"
00146 "add %4, %1 \n\t"
00147 "movq (%1), %%mm1 \n\t"
00148 "add %4, %1 \n\t"
00149 "movq 16(%2), %%mm2 \n\t"
00150 "movq 24(%2), %%mm3 \n\t"
00151 "pxor %%mm6, %%mm0 \n\t"
00152 "pxor %%mm6, %%mm1 \n\t"
00153 "pxor %%mm6, %%mm2 \n\t"
00154 "pxor %%mm6, %%mm3 \n\t"
00155 PAVGB" %%mm2, %%mm0 \n\t"
00156 PAVGB" %%mm3, %%mm1 \n\t"
00157 "pxor %%mm6, %%mm0 \n\t"
00158 "pxor %%mm6, %%mm1 \n\t"
00159 "movq %%mm0, (%3) \n\t"
00160 "add %5, %3 \n\t"
00161 "movq %%mm1, (%3) \n\t"
00162 "add %5, %3 \n\t"
/* src2 consumed 4 rows * 8 bytes */
00163 "add $32, %2 \n\t"
00164 "subl $4, %0 \n\t"
00165 "jnz 1b \n\t"
00166 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00167 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00168 #else
00169 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00170 #endif
00171 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00172 :"memory");
00173
00174
00175
00176
00177 }
00178
/*
 * avg_pixels8_l2: for each 8-byte row, combine the src1 row with the src2
 * row using PAVGB, then combine that with the bytes already present in dst
 * using PAVGB again, and write the result back to dst.
 *
 * dst, dstStride   - destination (read and written) and its byte stride
 * src1, src1Stride - first source and its byte stride
 * src2             - second source, 8 bytes per row, rows back to back
 * h                - row count; an odd h gets one row peeled off first, then
 *                    the loop handles four rows per pass and exits only when
 *                    h reaches exactly zero (the loop always runs at least
 *                    once)
 *
 * asm operands: %0 = h, %1 = src1 (a), %2 = src2 (c), %3 = dst (d),
 *               %4 = src1Stride (S), %5 = dstStride (D).
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00179 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00180 {
00181 __asm__ volatile(
/* odd h: handle one row before entering the 4-row loop */
00182 "testl $1, %0 \n\t"
00183 " jz 1f \n\t"
00184 "movq (%1), %%mm0 \n\t"
00185 "movq (%2), %%mm1 \n\t"
00186 "add %4, %1 \n\t"
00187 "add $8, %2 \n\t"
00188 PAVGB" %%mm1, %%mm0 \n\t"
/* fold in the existing destination bytes */
00189 PAVGB" (%3), %%mm0 \n\t"
00190 "movq %%mm0, (%3) \n\t"
00191 "add %5, %3 \n\t"
00192 "decl %0 \n\t"
00193 "1: \n\t"
/* rows 0 and 1: src2 rows at byte offsets 0 and 8 */
00194 "movq (%1), %%mm0 \n\t"
00195 "add %4, %1 \n\t"
00196 "movq (%1), %%mm1 \n\t"
00197 "add %4, %1 \n\t"
00198 PAVGB" (%2), %%mm0 \n\t"
00199 PAVGB" 8(%2), %%mm1 \n\t"
00200 PAVGB" (%3), %%mm0 \n\t"
00201 "movq %%mm0, (%3) \n\t"
00202 "add %5, %3 \n\t"
00203 PAVGB" (%3), %%mm1 \n\t"
00204 "movq %%mm1, (%3) \n\t"
00205 "add %5, %3 \n\t"
/* rows 2 and 3: src2 rows at byte offsets 16 and 24 */
00206 "movq (%1), %%mm0 \n\t"
00207 "add %4, %1 \n\t"
00208 "movq (%1), %%mm1 \n\t"
00209 "add %4, %1 \n\t"
00210 PAVGB" 16(%2), %%mm0 \n\t"
00211 PAVGB" 24(%2), %%mm1 \n\t"
00212 PAVGB" (%3), %%mm0 \n\t"
00213 "movq %%mm0, (%3) \n\t"
00214 "add %5, %3 \n\t"
00215 PAVGB" (%3), %%mm1 \n\t"
00216 "movq %%mm1, (%3) \n\t"
00217 "add %5, %3 \n\t"
/* src2 consumed 4 rows * 8 bytes */
00218 "add $32, %2 \n\t"
00219 "subl $4, %0 \n\t"
00220 "jnz 1b \n\t"
00221 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00222 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00223 #else
00224 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00225 #endif
00226 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00227 :"memory");
00228
00229
00230
00231
00232 }
00233 #endif
00234
/*
 * put_pixels16_x2: 16-byte-wide version of the horizontal (x2) put. For
 * every row, source bytes 0..15 are combined with source bytes 1..16 using
 * PAVGB (as two 8-byte halves) and the 16 result bytes are stored to block.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00235 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00236 {
00237 __asm__ volatile(
/* REG_a = 2 * line_size */
00238 "lea (%3, %3), %%"REG_a" \n\t"
00239 "1: \n\t"
/* rows 0 and 1: mm0/mm1 = left halves, mm2/mm3 = right halves */
00240 "movq (%1), %%mm0 \n\t"
00241 "movq (%1, %3), %%mm1 \n\t"
00242 "movq 8(%1), %%mm2 \n\t"
00243 "movq 8(%1, %3), %%mm3 \n\t"
00244 PAVGB" 1(%1), %%mm0 \n\t"
00245 PAVGB" 1(%1, %3), %%mm1 \n\t"
00246 PAVGB" 9(%1), %%mm2 \n\t"
00247 PAVGB" 9(%1, %3), %%mm3 \n\t"
00248 "movq %%mm0, (%2) \n\t"
00249 "movq %%mm1, (%2, %3) \n\t"
00250 "movq %%mm2, 8(%2) \n\t"
00251 "movq %%mm3, 8(%2, %3) \n\t"
/* move source and destination down two rows */
00252 "add %%"REG_a", %1 \n\t"
00253 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
00254 "movq (%1), %%mm0 \n\t"
00255 "movq (%1, %3), %%mm1 \n\t"
00256 "movq 8(%1), %%mm2 \n\t"
00257 "movq 8(%1, %3), %%mm3 \n\t"
00258 PAVGB" 1(%1), %%mm0 \n\t"
00259 PAVGB" 1(%1, %3), %%mm1 \n\t"
00260 PAVGB" 9(%1), %%mm2 \n\t"
00261 PAVGB" 9(%1, %3), %%mm3 \n\t"
00262 "add %%"REG_a", %1 \n\t"
00263 "movq %%mm0, (%2) \n\t"
00264 "movq %%mm1, (%2, %3) \n\t"
00265 "movq %%mm2, 8(%2) \n\t"
00266 "movq %%mm3, 8(%2, %3) \n\t"
00267 "add %%"REG_a", %2 \n\t"
00268 "subl $4, %0 \n\t"
00269 "jnz 1b \n\t"
00270 :"+g"(h), "+S"(pixels), "+D"(block)
00271 :"r" ((x86_reg)line_size)
00272 :"%"REG_a, "memory");
00273 }
00274
00275 #ifndef SKIP_FOR_3DNOW
/*
 * put_pixels16_l2: each 16-byte destination row is PAVGB of the matching
 * src1 row and src2 row (processed as two 8-byte halves).
 *
 * dst, dstStride   - destination and its byte stride
 * src1, src1Stride - first source and its byte stride
 * src2             - second source; no stride argument, stepped by 16 bytes
 *                    per row (rows stored back to back)
 * h                - row count. If bit 0 of h is set, one row is processed
 *                    first and h is decremented. The main loop then does two
 *                    rows per pass and exits only when h reaches exactly
 *                    zero; it is entered unconditionally, so it always runs
 *                    at least once.
 *
 * asm operands: %0 = h, %1 = src1 (a), %2 = src2 (c), %3 = dst (d),
 *               %4 = src1Stride (S), %5 = dstStride (D).
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00276 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00277 {
00278 __asm__ volatile(
/* odd h: handle one 16-byte row before entering the 2-row loop */
00279 "testl $1, %0 \n\t"
00280 " jz 1f \n\t"
00281 "movq (%1), %%mm0 \n\t"
00282 "movq 8(%1), %%mm1 \n\t"
00283 PAVGB" (%2), %%mm0 \n\t"
00284 PAVGB" 8(%2), %%mm1 \n\t"
00285 "add %4, %1 \n\t"
00286 "add $16, %2 \n\t"
00287 "movq %%mm0, (%3) \n\t"
00288 "movq %%mm1, 8(%3) \n\t"
00289 "add %5, %3 \n\t"
00290 "decl %0 \n\t"
00291 "1: \n\t"
/* row 0: src2 bytes 0..15 */
00292 "movq (%1), %%mm0 \n\t"
00293 "movq 8(%1), %%mm1 \n\t"
00294 "add %4, %1 \n\t"
00295 PAVGB" (%2), %%mm0 \n\t"
00296 PAVGB" 8(%2), %%mm1 \n\t"
00297 "movq %%mm0, (%3) \n\t"
00298 "movq %%mm1, 8(%3) \n\t"
00299 "add %5, %3 \n\t"
/* row 1: src2 bytes 16..31 */
00300 "movq (%1), %%mm0 \n\t"
00301 "movq 8(%1), %%mm1 \n\t"
00302 "add %4, %1 \n\t"
00303 PAVGB" 16(%2), %%mm0 \n\t"
00304 PAVGB" 24(%2), %%mm1 \n\t"
00305 "movq %%mm0, (%3) \n\t"
00306 "movq %%mm1, 8(%3) \n\t"
00307 "add %5, %3 \n\t"
/* src2 consumed 2 rows * 16 bytes */
00308 "add $32, %2 \n\t"
00309 "subl $2, %0 \n\t"
00310 "jnz 1b \n\t"
00311 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00312 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00313 #else
00314 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00315 #endif
00316 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00317 :"memory");
00318
00319
00320
00321
00322 }
00323
/*
 * avg_pixels16_l2: for each 16-byte row, combine the src1 row with the src2
 * row using PAVGB, then combine that with the bytes already present in dst
 * using PAVGB again, and write the result back to dst.
 *
 * dst, dstStride   - destination (read and written) and its byte stride
 * src1, src1Stride - first source and its byte stride
 * src2             - second source, 16 bytes per row, rows back to back
 * h                - row count; an odd h gets one row peeled off first, then
 *                    the loop handles two rows per pass and exits only when
 *                    h reaches exactly zero (the loop always runs at least
 *                    once)
 *
 * asm operands: %0 = h, %1 = src1 (a), %2 = src2 (c), %3 = dst (d),
 *               %4 = src1Stride (S), %5 = dstStride (D).
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00324 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00325 {
00326 __asm__ volatile(
/* odd h: handle one 16-byte row before entering the 2-row loop */
00327 "testl $1, %0 \n\t"
00328 " jz 1f \n\t"
00329 "movq (%1), %%mm0 \n\t"
00330 "movq 8(%1), %%mm1 \n\t"
00331 PAVGB" (%2), %%mm0 \n\t"
00332 PAVGB" 8(%2), %%mm1 \n\t"
00333 "add %4, %1 \n\t"
00334 "add $16, %2 \n\t"
/* fold in the existing destination bytes */
00335 PAVGB" (%3), %%mm0 \n\t"
00336 PAVGB" 8(%3), %%mm1 \n\t"
00337 "movq %%mm0, (%3) \n\t"
00338 "movq %%mm1, 8(%3) \n\t"
00339 "add %5, %3 \n\t"
00340 "decl %0 \n\t"
00341 "1: \n\t"
/* row 0: src2 bytes 0..15 */
00342 "movq (%1), %%mm0 \n\t"
00343 "movq 8(%1), %%mm1 \n\t"
00344 "add %4, %1 \n\t"
00345 PAVGB" (%2), %%mm0 \n\t"
00346 PAVGB" 8(%2), %%mm1 \n\t"
00347 PAVGB" (%3), %%mm0 \n\t"
00348 PAVGB" 8(%3), %%mm1 \n\t"
00349 "movq %%mm0, (%3) \n\t"
00350 "movq %%mm1, 8(%3) \n\t"
00351 "add %5, %3 \n\t"
/* row 1: src2 bytes 16..31 */
00352 "movq (%1), %%mm0 \n\t"
00353 "movq 8(%1), %%mm1 \n\t"
00354 "add %4, %1 \n\t"
00355 PAVGB" 16(%2), %%mm0 \n\t"
00356 PAVGB" 24(%2), %%mm1 \n\t"
00357 PAVGB" (%3), %%mm0 \n\t"
00358 PAVGB" 8(%3), %%mm1 \n\t"
00359 "movq %%mm0, (%3) \n\t"
00360 "movq %%mm1, 8(%3) \n\t"
00361 "add %5, %3 \n\t"
/* src2 consumed 2 rows * 16 bytes */
00362 "add $32, %2 \n\t"
00363 "subl $2, %0 \n\t"
00364 "jnz 1b \n\t"
00365 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00366 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00367 #else
00368 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00369 #endif
00370 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00371 :"memory");
00372
00373
00374
00375
00376 }
00377
/*
 * put_no_rnd_pixels16_l2: 16-byte-wide l2 put where every input is
 * bit-inverted before PAVGB and the result is inverted again before the
 * store, i.e. dst = ~PAVGB(~src1, ~src2), done as two 8-byte halves per row.
 * NOTE(review): assuming PAVGB (macro defined outside this chunk) is a
 * byte average that rounds up, the double inversion yields an average that
 * rounds down, which would match "no_rnd" in the name - confirm.
 *
 * dst, dstStride   - destination and its byte stride
 * src1, src1Stride - first source and its byte stride
 * src2             - second source, 16 bytes per row, rows back to back
 * h                - row count; an odd h gets one row peeled off first, then
 *                    the loop handles two rows per pass and exits only when
 *                    h reaches exactly zero (the loop always runs at least
 *                    once)
 *
 * asm operands: %0 = h, %1 = src1 (a), %2 = src2 (c), %3 = dst (d),
 *               %4 = src1Stride (S), %5 = dstStride (D).
 * mm6 holds the all-ones mask for the whole function.
 */
00378 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00379 {
00380 __asm__ volatile(
/* comparing mm6 with itself sets every bit: mm6 = inversion mask */
00381 "pcmpeqb %%mm6, %%mm6 \n\t"
/* odd h: handle one 16-byte row before entering the 2-row loop */
00382 "testl $1, %0 \n\t"
00383 " jz 1f \n\t"
00384 "movq (%1), %%mm0 \n\t"
00385 "movq 8(%1), %%mm1 \n\t"
00386 "movq (%2), %%mm2 \n\t"
00387 "movq 8(%2), %%mm3 \n\t"
00388 "pxor %%mm6, %%mm0 \n\t"
00389 "pxor %%mm6, %%mm1 \n\t"
00390 "pxor %%mm6, %%mm2 \n\t"
00391 "pxor %%mm6, %%mm3 \n\t"
00392 PAVGB" %%mm2, %%mm0 \n\t"
00393 PAVGB" %%mm3, %%mm1 \n\t"
00394 "pxor %%mm6, %%mm0 \n\t"
00395 "pxor %%mm6, %%mm1 \n\t"
00396 "add %4, %1 \n\t"
00397 "add $16, %2 \n\t"
00398 "movq %%mm0, (%3) \n\t"
00399 "movq %%mm1, 8(%3) \n\t"
00400 "add %5, %3 \n\t"
00401 "decl %0 \n\t"
00402 "1: \n\t"
/* row 0: src1 in mm0/mm1, src2 bytes 0..15 in mm2/mm3 */
00403 "movq (%1), %%mm0 \n\t"
00404 "movq 8(%1), %%mm1 \n\t"
00405 "add %4, %1 \n\t"
00406 "movq (%2), %%mm2 \n\t"
00407 "movq 8(%2), %%mm3 \n\t"
00408 "pxor %%mm6, %%mm0 \n\t"
00409 "pxor %%mm6, %%mm1 \n\t"
00410 "pxor %%mm6, %%mm2 \n\t"
00411 "pxor %%mm6, %%mm3 \n\t"
00412 PAVGB" %%mm2, %%mm0 \n\t"
00413 PAVGB" %%mm3, %%mm1 \n\t"
00414 "pxor %%mm6, %%mm0 \n\t"
00415 "pxor %%mm6, %%mm1 \n\t"
00416 "movq %%mm0, (%3) \n\t"
00417 "movq %%mm1, 8(%3) \n\t"
00418 "add %5, %3 \n\t"
/* row 1: src2 bytes 16..31 */
00419 "movq (%1), %%mm0 \n\t"
00420 "movq 8(%1), %%mm1 \n\t"
00421 "add %4, %1 \n\t"
00422 "movq 16(%2), %%mm2 \n\t"
00423 "movq 24(%2), %%mm3 \n\t"
00424 "pxor %%mm6, %%mm0 \n\t"
00425 "pxor %%mm6, %%mm1 \n\t"
00426 "pxor %%mm6, %%mm2 \n\t"
00427 "pxor %%mm6, %%mm3 \n\t"
00428 PAVGB" %%mm2, %%mm0 \n\t"
00429 PAVGB" %%mm3, %%mm1 \n\t"
00430 "pxor %%mm6, %%mm0 \n\t"
00431 "pxor %%mm6, %%mm1 \n\t"
00432 "movq %%mm0, (%3) \n\t"
00433 "movq %%mm1, 8(%3) \n\t"
00434 "add %5, %3 \n\t"
/* src2 consumed 2 rows * 16 bytes */
00435 "add $32, %2 \n\t"
00436 "subl $2, %0 \n\t"
00437 "jnz 1b \n\t"
00438 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00439 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00440 #else
00441 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00442 #endif
00443 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00444 :"memory");
00445
00446
00447
00448
00449 }
00450 #endif
00451
00452
/*
 * put_no_rnd_pixels8_x2: horizontal (x2) 8-byte put in which the offset-0
 * operand has mm6 subtracted from it (unsigned saturating, per byte) before
 * it is combined with the offset-1 operand by PAVGB.
 * NOTE(review): MOVQ_BONE and PAVGB are macros defined outside this chunk.
 * Presumably MOVQ_BONE(mm6) fills mm6 with 0x01 bytes and PAVGB is a
 * rounding-up byte average, so the subtraction biases the result downward;
 * because the subtraction saturates at 0 this would be an approximation
 * (a sibling with an _exact suffix exists in this file) - confirm.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row; bytes 0..8 of each row
 *             are read
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 */
00453 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00454 {
00455 MOVQ_BONE(mm6);
00456 __asm__ volatile(
/* REG_a = 2 * line_size */
00457 "lea (%3, %3), %%"REG_a" \n\t"
00458 "1: \n\t"
/* rows 0 and 1: mm0/mm2 = bytes 0..7, mm1/mm3 = bytes 1..8 */
00459 "movq (%1), %%mm0 \n\t"
00460 "movq (%1, %3), %%mm2 \n\t"
00461 "movq 1(%1), %%mm1 \n\t"
00462 "movq 1(%1, %3), %%mm3 \n\t"
00463 "add %%"REG_a", %1 \n\t"
/* bias only the offset-0 operand, then combine */
00464 "psubusb %%mm6, %%mm0 \n\t"
00465 "psubusb %%mm6, %%mm2 \n\t"
00466 PAVGB" %%mm1, %%mm0 \n\t"
00467 PAVGB" %%mm3, %%mm2 \n\t"
00468 "movq %%mm0, (%2) \n\t"
00469 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3 */
00470 "movq (%1), %%mm0 \n\t"
00471 "movq 1(%1), %%mm1 \n\t"
00472 "movq (%1, %3), %%mm2 \n\t"
00473 "movq 1(%1, %3), %%mm3 \n\t"
00474 "add %%"REG_a", %2 \n\t"
00475 "add %%"REG_a", %1 \n\t"
00476 "psubusb %%mm6, %%mm0 \n\t"
00477 "psubusb %%mm6, %%mm2 \n\t"
00478 PAVGB" %%mm1, %%mm0 \n\t"
00479 PAVGB" %%mm3, %%mm2 \n\t"
00480 "movq %%mm0, (%2) \n\t"
00481 "movq %%mm2, (%2, %3) \n\t"
00482 "add %%"REG_a", %2 \n\t"
00483 "subl $4, %0 \n\t"
00484 "jnz 1b \n\t"
00485 :"+g"(h), "+S"(pixels), "+D"(block)
00486 :"r" ((x86_reg)line_size)
00487 :"%"REG_a, "memory");
00488 }
00489
/*
 * put_no_rnd_pixels8_x2_exact: horizontal (x2) 8-byte put computed as
 * dst = ~PAVGB(~src[x], ~src[x + 1]): both operands are bit-inverted, combined
 * with PAVGB, and the result is inverted again.
 * NOTE(review): assuming PAVGB (macro defined outside this chunk) is a
 * byte average that rounds up, this yields an average that rounds down
 * without the saturation error of a subtract-one bias - confirm.
 *
 * block     - destination; advanced by 4 * line_size per loop pass
 * pixels    - source; advanced by 4 * line_size per loop pass; bytes 0..8 of
 *             each row are read
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows are written per pass and the loop repeats
 *             while the counter is still greater than zero after subtracting
 *             4, so the body runs at least once and always in whole groups
 *             of four rows
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
 *               %4 = 3 * line_size (used to address the fourth row).
 * No scratch general-purpose register is needed; mm6 is the all-ones mask.
 */
00490 static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00491 {
00492 __asm__ volatile (
/* comparing mm6 with itself sets every bit: mm6 = inversion mask */
00493 "pcmpeqb %%mm6, %%mm6 \n\t"
00494 "1: \n\t"
/* rows 0 and 1 */
00495 "movq (%1), %%mm0 \n\t"
00496 "movq (%1, %3), %%mm2 \n\t"
00497 "movq 1(%1), %%mm1 \n\t"
00498 "movq 1(%1, %3), %%mm3 \n\t"
00499 "pxor %%mm6, %%mm0 \n\t"
00500 "pxor %%mm6, %%mm2 \n\t"
00501 "pxor %%mm6, %%mm1 \n\t"
00502 "pxor %%mm6, %%mm3 \n\t"
00503 PAVGB" %%mm1, %%mm0 \n\t"
00504 PAVGB" %%mm3, %%mm2 \n\t"
00505 "pxor %%mm6, %%mm0 \n\t"
00506 "pxor %%mm6, %%mm2 \n\t"
00507 "movq %%mm0, (%2) \n\t"
00508 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3, addressed as base + 2*line_size and base + 3*line_size */
00509 "movq (%1, %3,2), %%mm0 \n\t"
00510 "movq 1(%1, %3,2), %%mm1 \n\t"
00511 "movq (%1, %4), %%mm2 \n\t"
00512 "movq 1(%1, %4), %%mm3 \n\t"
00513 "pxor %%mm6, %%mm0 \n\t"
00514 "pxor %%mm6, %%mm1 \n\t"
00515 "pxor %%mm6, %%mm2 \n\t"
00516 "pxor %%mm6, %%mm3 \n\t"
00517 PAVGB" %%mm1, %%mm0 \n\t"
00518 PAVGB" %%mm3, %%mm2 \n\t"
00519 "pxor %%mm6, %%mm0 \n\t"
00520 "pxor %%mm6, %%mm2 \n\t"
00521 "movq %%mm0, (%2, %3,2) \n\t"
00522 "movq %%mm2, (%2, %4) \n\t"
/* advance both pointers by four rows */
00523 "lea (%1, %3,4), %1 \n\t"
00524 "lea (%2, %3,4), %2 \n\t"
00525 "subl $4, %0 \n\t"
00526 "jg 1b \n\t"
00527 : "+g"(h), "+r"(pixels), "+r"(block)
00528 : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
00529 : "memory"
00530 );
00531 }
00532
/*
 * put_pixels8_y2: vertical (y2) 8-byte put. Destination row r is PAVGB of
 * source row r and source row r + 1, so h + 1 source rows are read in total
 * (row 0 before the loop, four more per loop pass).
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 * The last source row loaded in a pass stays in mm0 and is reused as the
 * first row of the next pass.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00533 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00534 {
00535 __asm__ volatile(
/* REG_a = 2 * line_size */
00536 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = source row 0 */
00537 "movq (%1), %%mm0 \n\t"
/* bias block back one row so stores can use (%2,%3) and (%2,REG_a) */
00538 "sub %3, %2 \n\t"
00539 "1: \n\t"
/* mm1 = next row, mm2 = row after that */
00540 "movq (%1, %3), %%mm1 \n\t"
00541 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00542 "add %%"REG_a", %1 \n\t"
00543 PAVGB" %%mm1, %%mm0 \n\t"
00544 PAVGB" %%mm2, %%mm1 \n\t"
00545 "movq %%mm0, (%2, %3) \n\t"
00546 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* second pair: mm2 carries over, mm1 and mm0 are reloaded */
00547 "movq (%1, %3), %%mm1 \n\t"
00548 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00549 "add %%"REG_a", %2 \n\t"
00550 "add %%"REG_a", %1 \n\t"
00551 PAVGB" %%mm1, %%mm2 \n\t"
00552 PAVGB" %%mm0, %%mm1 \n\t"
00553 "movq %%mm2, (%2, %3) \n\t"
00554 "movq %%mm1, (%2, %%"REG_a") \n\t"
00555 "add %%"REG_a", %2 \n\t"
00556 "subl $4, %0 \n\t"
00557 "jnz 1b \n\t"
00558 :"+g"(h), "+S"(pixels), "+D" (block)
00559 :"r" ((x86_reg)line_size)
00560 :"%"REG_a, "memory");
00561 }
00562
00563
/*
 * put_no_rnd_pixels8_y2: vertical (y2) 8-byte put with the same row pairing
 * as the plain y2 put (destination row r from source rows r and r + 1), but
 * in each pair of output rows the shared middle source row (mm1) has mm6
 * subtracted from it (unsigned saturating, per byte) before PAVGB is applied.
 * NOTE(review): MOVQ_BONE and PAVGB are macros defined outside this chunk.
 * Presumably MOVQ_BONE(mm6) fills mm6 with 0x01 bytes and PAVGB is a
 * rounding-up byte average, making this an approximate round-down average
 * (a sibling with an _exact suffix exists in this file) - confirm.
 *
 * block     - destination; advanced by line_size per row
 * pixels    - source; advanced by line_size per row; h + 1 rows are read
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 */
00564 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00565 {
00566 MOVQ_BONE(mm6);
00567 __asm__ volatile(
/* REG_a = 2 * line_size */
00568 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = source row 0 */
00569 "movq (%1), %%mm0 \n\t"
/* bias block back one row so stores can use (%2,%3) and (%2,REG_a) */
00570 "sub %3, %2 \n\t"
00571 "1: \n\t"
00572 "movq (%1, %3), %%mm1 \n\t"
00573 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00574 "add %%"REG_a", %1 \n\t"
/* the biased middle row feeds both output rows of this pair */
00575 "psubusb %%mm6, %%mm1 \n\t"
00576 PAVGB" %%mm1, %%mm0 \n\t"
00577 PAVGB" %%mm2, %%mm1 \n\t"
00578 "movq %%mm0, (%2, %3) \n\t"
00579 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* second pair: mm2 carries over, mm1 and mm0 are reloaded */
00580 "movq (%1, %3), %%mm1 \n\t"
00581 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00582 "add %%"REG_a", %2 \n\t"
00583 "add %%"REG_a", %1 \n\t"
00584 "psubusb %%mm6, %%mm1 \n\t"
00585 PAVGB" %%mm1, %%mm2 \n\t"
00586 PAVGB" %%mm0, %%mm1 \n\t"
00587 "movq %%mm2, (%2, %3) \n\t"
00588 "movq %%mm1, (%2, %%"REG_a") \n\t"
00589 "add %%"REG_a", %2 \n\t"
00590 "subl $4, %0 \n\t"
00591 "jnz 1b \n\t"
00592 :"+g"(h), "+S"(pixels), "+D" (block)
00593 :"r" ((x86_reg)line_size)
00594 :"%"REG_a, "memory");
00595 }
00596
/*
 * put_no_rnd_pixels8_y2_exact: vertical (y2) 8-byte put computed as
 * dst[r] = ~PAVGB(~src[r], ~src[r + 1]). Source rows are kept in inverted
 * form in the MMX registers; results are inverted again before the store.
 * NOTE(review): assuming PAVGB (macro defined outside this chunk) is a
 * byte average that rounds up, this yields an average that rounds down -
 * confirm.
 *
 * block     - destination; advanced by 4 * line_size per loop pass
 * pixels    - source; h + 1 rows are read (row 0 before the loop, then four
 *             per pass)
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows are written per pass and the loop repeats
 *             while the counter is still greater than zero after subtracting
 *             4, so the body runs at least once and always in whole groups
 *             of four rows
 *
 * asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
 *               %4 = 3 * line_size.
 * The inverted last row of a pass stays in mm0 for the next pass.
 */
00597 static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00598 {
00599 __asm__ volatile (
/* mm0 = row 0; mm6 = all ones; step pixels to row 1; mm0 = ~row 0 */
00600 "movq (%1), %%mm0 \n\t"
00601 "pcmpeqb %%mm6, %%mm6 \n\t"
00602 "add %3, %1 \n\t"
00603 "pxor %%mm6, %%mm0 \n\t"
00604 "1: \n\t"
/* output rows 0 and 1 of this pass */
00605 "movq (%1), %%mm1 \n\t"
00606 "movq (%1, %3), %%mm2 \n\t"
00607 "pxor %%mm6, %%mm1 \n\t"
00608 "pxor %%mm6, %%mm2 \n\t"
00609 PAVGB" %%mm1, %%mm0 \n\t"
00610 PAVGB" %%mm2, %%mm1 \n\t"
00611 "pxor %%mm6, %%mm0 \n\t"
00612 "pxor %%mm6, %%mm1 \n\t"
00613 "movq %%mm0, (%2) \n\t"
00614 "movq %%mm1, (%2, %3) \n\t"
/* output rows 2 and 3; mm2 still holds the inverted previous row */
00615 "movq (%1, %3,2), %%mm1 \n\t"
00616 "movq (%1, %4), %%mm0 \n\t"
00617 "pxor %%mm6, %%mm1 \n\t"
00618 "pxor %%mm6, %%mm0 \n\t"
00619 PAVGB" %%mm1, %%mm2 \n\t"
00620 PAVGB" %%mm0, %%mm1 \n\t"
00621 "pxor %%mm6, %%mm2 \n\t"
00622 "pxor %%mm6, %%mm1 \n\t"
00623 "movq %%mm2, (%2, %3,2) \n\t"
00624 "movq %%mm1, (%2, %4) \n\t"
/* advance both pointers by four rows */
00625 "lea (%1, %3,4), %1 \n\t"
00626 "lea (%2, %3,4), %2 \n\t"
00627 "subl $4, %0 \n\t"
00628 "jg 1b \n\t"
00629 :"+g"(h), "+r"(pixels), "+r" (block)
00630 :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
00631 :"memory"
00632 );
00633 }
00634
/*
 * avg_pixels8: for every 8-byte row, load the current destination bytes,
 * combine them with the source bytes at the same position using PAVGB, and
 * write the result back to the destination.
 *
 * block     - destination (read and written); advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00635 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00636 {
00637 __asm__ volatile(
/* REG_a = 2 * line_size */
00638 "lea (%3, %3), %%"REG_a" \n\t"
00639 "1: \n\t"
/* rows 0 and 1: start from the destination, fold in the source */
00640 "movq (%2), %%mm0 \n\t"
00641 "movq (%2, %3), %%mm1 \n\t"
00642 PAVGB" (%1), %%mm0 \n\t"
00643 PAVGB" (%1, %3), %%mm1 \n\t"
00644 "movq %%mm0, (%2) \n\t"
00645 "movq %%mm1, (%2, %3) \n\t"
00646 "add %%"REG_a", %1 \n\t"
00647 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
00648 "movq (%2), %%mm0 \n\t"
00649 "movq (%2, %3), %%mm1 \n\t"
00650 PAVGB" (%1), %%mm0 \n\t"
00651 PAVGB" (%1, %3), %%mm1 \n\t"
00652 "add %%"REG_a", %1 \n\t"
00653 "movq %%mm0, (%2) \n\t"
00654 "movq %%mm1, (%2, %3) \n\t"
00655 "add %%"REG_a", %2 \n\t"
00656 "subl $4, %0 \n\t"
00657 "jnz 1b \n\t"
00658 :"+g"(h), "+S"(pixels), "+D"(block)
00659 :"r" ((x86_reg)line_size)
00660 :"%"REG_a, "memory");
00661 }
00662
/*
 * avg_pixels8_x2: for every 8-byte row, combine source bytes 0..7 with
 * source bytes 1..8 using PAVGB, combine that with the bytes already in the
 * destination using PAVGB again, and write the result to the destination.
 *
 * block     - destination (read and written); advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00663 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00664 {
00665 __asm__ volatile(
/* REG_a = 2 * line_size */
00666 "lea (%3, %3), %%"REG_a" \n\t"
00667 "1: \n\t"
/* rows 0 and 1: horizontal step, then fold in the destination */
00668 "movq (%1), %%mm0 \n\t"
00669 "movq (%1, %3), %%mm2 \n\t"
00670 PAVGB" 1(%1), %%mm0 \n\t"
00671 PAVGB" 1(%1, %3), %%mm2 \n\t"
00672 PAVGB" (%2), %%mm0 \n\t"
00673 PAVGB" (%2, %3), %%mm2 \n\t"
00674 "add %%"REG_a", %1 \n\t"
00675 "movq %%mm0, (%2) \n\t"
00676 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3 */
00677 "movq (%1), %%mm0 \n\t"
00678 "movq (%1, %3), %%mm2 \n\t"
00679 PAVGB" 1(%1), %%mm0 \n\t"
00680 PAVGB" 1(%1, %3), %%mm2 \n\t"
00681 "add %%"REG_a", %2 \n\t"
00682 "add %%"REG_a", %1 \n\t"
00683 PAVGB" (%2), %%mm0 \n\t"
00684 PAVGB" (%2, %3), %%mm2 \n\t"
00685 "movq %%mm0, (%2) \n\t"
00686 "movq %%mm2, (%2, %3) \n\t"
00687 "add %%"REG_a", %2 \n\t"
00688 "subl $4, %0 \n\t"
00689 "jnz 1b \n\t"
00690 :"+g"(h), "+S"(pixels), "+D"(block)
00691 :"r" ((x86_reg)line_size)
00692 :"%"REG_a, "memory");
00693 }
00694
/*
 * avg_pixels8_y2: vertical (y2) 8-byte average. For destination row r,
 * source rows r and r + 1 are combined with PAVGB, the result is combined
 * with the bytes already in destination row r using PAVGB again, and then
 * stored. h + 1 source rows are read in total.
 *
 * block     - destination (read and written); advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 * The last source row loaded in a pass stays in mm0 for the next pass;
 * mm3/mm4 hold the destination rows being blended.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00695 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00696 {
00697 __asm__ volatile(
/* REG_a = 2 * line_size */
00698 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = source row 0 */
00699 "movq (%1), %%mm0 \n\t"
/* bias block back one row so accesses can use (%2,%3) and (%2,REG_a) */
00700 "sub %3, %2 \n\t"
00701 "1: \n\t"
00702 "movq (%1, %3), %%mm1 \n\t"
00703 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00704 "add %%"REG_a", %1 \n\t"
00705 PAVGB" %%mm1, %%mm0 \n\t"
00706 PAVGB" %%mm2, %%mm1 \n\t"
/* blend with the two destination rows */
00707 "movq (%2, %3), %%mm3 \n\t"
00708 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00709 PAVGB" %%mm3, %%mm0 \n\t"
00710 PAVGB" %%mm4, %%mm1 \n\t"
00711 "movq %%mm0, (%2, %3) \n\t"
00712 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* second pair: mm2 carries over, mm1 and mm0 are reloaded */
00713 "movq (%1, %3), %%mm1 \n\t"
00714 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00715 PAVGB" %%mm1, %%mm2 \n\t"
00716 PAVGB" %%mm0, %%mm1 \n\t"
00717 "add %%"REG_a", %2 \n\t"
00718 "add %%"REG_a", %1 \n\t"
00719 "movq (%2, %3), %%mm3 \n\t"
00720 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00721 PAVGB" %%mm3, %%mm2 \n\t"
00722 PAVGB" %%mm4, %%mm1 \n\t"
00723 "movq %%mm2, (%2, %3) \n\t"
00724 "movq %%mm1, (%2, %%"REG_a") \n\t"
00725 "add %%"REG_a", %2 \n\t"
00726 "subl $4, %0 \n\t"
00727 "jnz 1b \n\t"
00728 :"+g"(h), "+S"(pixels), "+D"(block)
00729 :"r" ((x86_reg)line_size)
00730 :"%"REG_a, "memory");
00731 }
00732
00733
00734
/*
 * avg_pixels8_xy2: diagonal (xy2) 8-byte average built from cascaded PAVGB
 * steps. Each source row is first combined with itself shifted by one byte
 * (bytes 0..7 with bytes 1..8); two vertically adjacent results are then
 * combined; finally that is combined with the bytes already in the
 * destination row and stored. h + 1 source rows are read in total.
 *
 * Only one of the four rows loaded per loop pass (the one held in mm2 at the
 * top of the pass) has mm6 subtracted from it (unsigned saturating) before
 * the horizontal step.
 * NOTE(review): MOVQ_BONE and PAVGB are macros defined outside this chunk.
 * Presumably MOVQ_BONE(mm6) fills mm6 with 0x01 bytes and the subtraction
 * partly offsets the upward bias of cascaded rounding averages, so the
 * result would be an approximation of a true 4-tap average - confirm.
 *
 * block     - destination (read and written); advanced by line_size per row
 * pixels    - source; advanced by line_size per row
 * line_size - byte stride used for both source and destination
 * h         - row count; four rows per loop pass, loop exits only when the
 *             counter reaches exactly zero, so h has to be a positive
 *             multiple of 4
 *
 * asm operands: %0 = h, %1 = pixels (si), %2 = block (di), %3 = line_size.
 * REG_a is used as scratch (2 * line_size) and is declared as a clobber.
 */
00735 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00736 {
00737 MOVQ_BONE(mm6);
00738 __asm__ volatile(
/* REG_a = 2 * line_size */
00739 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = horizontal step of source row 0 */
00740 "movq (%1), %%mm0 \n\t"
00741 PAVGB" 1(%1), %%mm0 \n\t"
/* align the loop entry to an 8-byte boundary */
00742 ".p2align 3 \n\t"
00743 "1: \n\t"
/* mm1 = next row, mm2 = row after that (mm2 gets the mm6 bias) */
00744 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00745 "movq (%1, %3), %%mm1 \n\t"
00746 "psubusb %%mm6, %%mm2 \n\t"
00747 PAVGB" 1(%1, %3), %%mm1 \n\t"
00748 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
00749 "add %%"REG_a", %1 \n\t"
/* vertical step, then blend with the destination */
00750 PAVGB" %%mm1, %%mm0 \n\t"
00751 PAVGB" %%mm2, %%mm1 \n\t"
00752 PAVGB" (%2), %%mm0 \n\t"
00753 PAVGB" (%2, %3), %%mm1 \n\t"
00754 "movq %%mm0, (%2) \n\t"
00755 "movq %%mm1, (%2, %3) \n\t"
/* second pair: mm2 carries over, mm1 and mm0 are reloaded (no bias here) */
00756 "movq (%1, %3), %%mm1 \n\t"
00757 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00758 PAVGB" 1(%1, %3), %%mm1 \n\t"
00759 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
00760 "add %%"REG_a", %2 \n\t"
00761 "add %%"REG_a", %1 \n\t"
00762 PAVGB" %%mm1, %%mm2 \n\t"
00763 PAVGB" %%mm0, %%mm1 \n\t"
00764 PAVGB" (%2), %%mm2 \n\t"
00765 PAVGB" (%2, %3), %%mm1 \n\t"
00766 "movq %%mm2, (%2) \n\t"
00767 "movq %%mm1, (%2, %3) \n\t"
00768 "add %%"REG_a", %2 \n\t"
00769 "subl $4, %0 \n\t"
00770 "jnz 1b \n\t"
00771 :"+g"(h), "+S"(pixels), "+D"(block)
00772 :"r" ((x86_reg)line_size)
00773 :"%"REG_a, "memory");
00774 }
00775
00776
00777 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00778 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
00779 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
00780 }
00781 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00782 DEF(put_pixels8_y2)(block , pixels , line_size, h);
00783 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
00784 }
00785 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00786 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
00787 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
00788 }
00789 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00790 DEF(avg_pixels8)(block , pixels , line_size, h);
00791 DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
00792 }
00793 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00794 DEF(avg_pixels8_x2)(block , pixels , line_size, h);
00795 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
00796 }
00797 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00798 DEF(avg_pixels8_y2)(block , pixels , line_size, h);
00799 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
00800 }
00801 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00802 DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
00803 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
00804 }
00805
/*
 * QPEL_2TAP_L3(OPNAME) expands to two functions:
 *   DEF(OPNAME##2tap_qpel16_l3) - 16 bytes per row (two 8-byte halves)
 *   DEF(OPNAME##2tap_qpel8_l3)  -  8 bytes per row
 *
 * Per row, with m an MMX register:
 *   m = bytes at src + off1
 *   m = PAVGB(bytes at src + off2, m)
 *   m = PAVGB(bytes at src, m)
 *   STORE_OP(destination operand, m)   - whatever STORE_OP expands to at the
 *                                        point where this macro is used
 *   destination row = m
 *
 * dst, src - destination and source; the destination is addressed as
 *            src + (dst - src), so only src is advanced (by stride per row)
 * stride   - byte step applied to src for every row (and therefore to the
 *            destination address as well)
 * h        - row count; decremented once per row, the loop exits only when
 *            it reaches exactly zero, so h has to be positive
 * off1/off2 - byte offsets from src of the first two rows being combined
 *
 * asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 *               %5 = stride.
 * STORE_OP must be defined before the macro is expanded.
 * NOTE(review): PAVGB is a macro defined outside this chunk; presumably a
 * packed unsigned-byte average instruction - confirm.
 */
00806 #define QPEL_2TAP_L3(OPNAME) \
00807 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
00808 __asm__ volatile(\
00809 "1: \n\t"\
00810 "movq (%1,%2), %%mm0 \n\t"\
00811 "movq 8(%1,%2), %%mm1 \n\t"\
00812 PAVGB" (%1,%3), %%mm0 \n\t"\
00813 PAVGB" 8(%1,%3), %%mm1 \n\t"\
00814 PAVGB" (%1), %%mm0 \n\t"\
00815 PAVGB" 8(%1), %%mm1 \n\t"\
00816 STORE_OP( (%1,%4),%%mm0)\
00817 STORE_OP(8(%1,%4),%%mm1)\
00818 "movq %%mm0, (%1,%4) \n\t"\
00819 "movq %%mm1, 8(%1,%4) \n\t"\
00820 "add %5, %1 \n\t"\
00821 "decl %0 \n\t"\
00822 "jnz 1b \n\t"\
00823 :"+g"(h), "+r"(src)\
00824 :"r"((x86_reg)off1), "r"((x86_reg)off2),\
00825 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
00826 :"memory"\
00827 );\
00828 }\
00829 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
00830 __asm__ volatile(\
00831 "1: \n\t"\
00832 "movq (%1,%2), %%mm0 \n\t"\
00833 PAVGB" (%1,%3), %%mm0 \n\t"\
00834 PAVGB" (%1), %%mm0 \n\t"\
00835 STORE_OP((%1,%4),%%mm0)\
00836 "movq %%mm0, (%1,%4) \n\t"\
00837 "add %5, %1 \n\t"\
00838 "decl %0 \n\t"\
00839 "jnz 1b \n\t"\
00840 :"+g"(h), "+r"(src)\
00841 :"r"((x86_reg)off1), "r"((x86_reg)off2),\
00842 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
00843 :"memory"\
00844 );\
00845 }
00846
/*
 * Instantiate the 2-tap qpel l3 functions (skipped entirely when
 * SKIP_FOR_3DNOW is defined):
 *   avg_ - STORE_OP stringizes its two arguments into one extra PAVGB
 *          instruction, so the computed row is combined with the existing
 *          destination bytes before the store
 *   put_ - STORE_OP expands to nothing, so the computed row is stored as is
 * STORE_OP and QPEL_2TAP_L3 are undefined again afterwards.
 */
00847 #ifndef SKIP_FOR_3DNOW
00848 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
00849 QPEL_2TAP_L3(avg_)
00850 #undef STORE_OP
00851 #define STORE_OP(a,b)
00852 QPEL_2TAP_L3(put_)
00853 #undef STORE_OP
00854 #undef QPEL_2TAP_L3
00855 #endif