00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
/*
 * Instruction-selection helpers. Any earlier definitions are dropped first and
 * the macros are then redefined according to COMPILE_TEMPLATE_MMX2
 * (presumably because this template is included more than once -- confirm).
 */
00021 #undef REAL_MOVNTQ
00022 #undef MOVNTQ
00023 #undef PREFETCH
00024
/* PREFETCH: the mnemonic "prefetchnta" when COMPILE_TEMPLATE_MMX2 is set,
 * otherwise the text " # nop", which names no instruction. */
00025 #if COMPILE_TEMPLATE_MMX2
00026 #define PREFETCH "prefetchnta"
00027 #else
00028 #define PREFETCH " # nop"
00029 #endif
00030
/* REAL_MOVNTQ(a,b): builds the asm text of a 64-bit store of a to b, using
 * "movntq" when COMPILE_TEMPLATE_MMX2 is set and plain "movq" otherwise.
 * Both arguments are stringified with '#'. */
00031 #if COMPILE_TEMPLATE_MMX2
00032 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00033 #else
00034 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00035 #endif
/* MOVNTQ adds one level of expansion so that arguments which are themselves
 * macros (callers in this file pass e.g. %%REGa / %%REGc) are expanded before
 * REAL_MOVNTQ stringifies them. */
00036 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
00037
/*
 * YSCALEYUV2YV12X(offset, dest, end, pos)
 *
 * Expands to one complete asm statement (including the trailing ';') that
 * vertically filters 16-bit source lines into 8-bit output, 8 bytes per outer
 * iteration. It references the variable `c` of the expansion site.
 *
 * Operands: %0 = &c->redDither (base for DITHER16 and `offset`),
 *           %1 = dest, %2 = end, %3 = pos.
 *
 * The filter is read as a list starting at offset(%0): each 16-byte entry
 * holds a source-line pointer at +0 and a coefficient quadword at +8; a null
 * pointer ends the list. For every group of 8 samples the two accumulators
 * start from the words at DITHER16+0/+8, each tap adds the high 16 bits of
 * (sample * coefficient) (pmulhw), the sums are shifted right by 3 and packed
 * to bytes with unsigned saturation, then stored at dest + index.
 * The index starts at pos and advances by 8 while index < end (unsigned);
 * the body always runs at least once.
 *
 * The running index lives in REG_a, loaded from %3 once. %3 is an input-only
 * operand and GCC does not allow the asm to modify inputs: the compiler may
 * keep the same value (for instance a constant 0 used by several expansions)
 * in that register and reuse it after the statement. REG_a is therefore
 * listed as clobbered and pos only needs a general ("g") constraint.
 */
00038 #define YSCALEYUV2YV12X(offset, dest, end, pos) \
00039 __asm__ volatile(\
"mov %3, %%"REG_a" \n\t" /* working index := pos; %3 itself is never written */\
00040 "movq "DITHER16"+0(%0), %%mm3 \n\t"\
00041 "movq "DITHER16"+8(%0), %%mm4 \n\t"\
00042 "lea " offset "(%0), %%"REG_d" \n\t"\
00043 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00044 ".p2align 4 \n\t" \
00045 "1: \n\t"\
00046 "movq 8(%%"REG_d"), %%mm0 \n\t" /* coefficient of this tap */ \
00047 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* samples index..index+3 */ \
00048 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* samples index+4..index+7 */ \
00049 "add $16, %%"REG_d" \n\t"\
00050 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00051 "test %%"REG_S", %%"REG_S" \n\t" /* null pointer = end of filter list */\
00052 "pmulhw %%mm0, %%mm2 \n\t"\
00053 "pmulhw %%mm0, %%mm5 \n\t"\
00054 "paddw %%mm2, %%mm3 \n\t"\
00055 "paddw %%mm5, %%mm4 \n\t"\
00056 " jnz 1b \n\t"\
00057 "psraw $3, %%mm3 \n\t"\
00058 "psraw $3, %%mm4 \n\t"\
00059 "packuswb %%mm4, %%mm3 \n\t"\
00060 MOVNTQ(%%mm3, (%1, %%REGa))\
00061 "add $8, %%"REG_a" \n\t"\
00062 "cmp %2, %%"REG_a" \n\t" /* flags survive the moves below until jb */\
00063 "movq "DITHER16"+0(%0), %%mm3 \n\t"\
00064 "movq "DITHER16"+8(%0), %%mm4 \n\t"\
00065 "lea " offset "(%0), %%"REG_d" \n\t"\
00066 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00067 "jb 1b \n\t"\
00068 :: "r" (&c->redDither),\
00069 "r" (dest), "g" ((x86_reg)(end)), "g"((x86_reg)(pos))\
00070 : "%"REG_a, "%"REG_d, "%"REG_S\
00071 );
00072
/**
 * Expand eight 8-bit dither values into eight 16-bit words and store them in
 * the context at DITHER16+0 and DITHER16+8 (addressed relative to
 * &c->redDither). Each byte is zero-extended and shifted right by 4.
 *
 * @param c         context that receives the 16 bytes of output
 * @param srcDither address of the 8 source bytes (read with a single movq)
 * @param rot       when non-zero the 8 bytes are first rotated, as one 64-bit
 *                  value, right by 24 bits (psrlq 24 | psllq 40), so output
 *                  word k is taken from source byte (k + 3) % 8
 *
 * The definition is only compiled when COMPILE_TEMPLATE_MMX2 is 0.
 * NOTE(review): the asm statements use mm0/mm3/mm4 and write memory without
 * declaring any clobbers -- confirm this is acceptable for all callers.
 */
00073 #if !COMPILE_TEMPLATE_MMX2
00074 static av_always_inline void
00075 dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
00076 {
00077 if (rot) {
/* mm0 = 0 is the zero-extension source for the unpacks below */
00078 __asm__ volatile("pxor %%mm0, %%mm0\n\t"
00079 "movq (%0), %%mm3\n\t"
00080 "movq %%mm3, %%mm4\n\t"
00081 "psrlq $24, %%mm3\n\t"
00082 "psllq $40, %%mm4\n\t"
00083 "por %%mm4, %%mm3\n\t"
00084 "movq %%mm3, %%mm4\n\t"
00085 "punpcklbw %%mm0, %%mm3\n\t"
00086 "punpckhbw %%mm0, %%mm4\n\t"
00087 "psraw $4, %%mm3\n\t"
00088 "psraw $4, %%mm4\n\t"
00089 "movq %%mm3, "DITHER16"+0(%1)\n\t"
00090 "movq %%mm4, "DITHER16"+8(%1)\n\t"
00091 :: "r"(srcDither), "r"(&c->redDither)
00092 );
00093 } else {
/* same expansion without the 3-byte rotation */
00094 __asm__ volatile("pxor %%mm0, %%mm0\n\t"
00095 "movq (%0), %%mm3\n\t"
00096 "movq %%mm3, %%mm4\n\t"
00097 "punpcklbw %%mm0, %%mm3\n\t"
00098 "punpckhbw %%mm0, %%mm4\n\t"
00099 "psraw $4, %%mm3\n\t"
00100 "psraw $4, %%mm4\n\t"
00101 "movq %%mm3, "DITHER16"+0(%1)\n\t"
00102 "movq %%mm4, "DITHER16"+8(%1)\n\t"
00103 :: "r"(srcDither), "r"(&c->redDither)
00104 );
00105 }
00106 }
00107 #endif
00108
/**
 * Vertically filter into separate 8-bit output planes using the pmulhw-based
 * YSCALEYUV2YV12X macro.
 *
 * Of the parameters only c, dest, dstW and chrDstW are referenced by name in
 * this body; the filter/source arguments are unused here because the asm
 * reads its filter lists from the context at the *_MMX_FILTER_OFFSET offsets.
 *
 * @param dest    dest[0] = luma plane, dest[1] / dest[2] = chroma planes,
 *                dest[3] = alpha plane (only read when CONFIG_SWSCALE_ALPHA)
 * @param dstW    end index for the luma and alpha passes
 * @param chrDstW end index for the chroma passes
 */
00109 static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
00110 const int16_t **lumSrc, int lumFilterSize,
00111 const int16_t *chrFilter, const int16_t **chrUSrc,
00112 const int16_t **chrVSrc,
00113 int chrFilterSize, const int16_t **alpSrc,
00114 uint8_t *dest[4], int dstW, int chrDstW)
00115 {
00116 uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
00117 *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
00118 const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
00119
/* chroma is skipped entirely when there is no U plane */
00120 if (uDest) {
/* uv_offx2 halved: the macro scales the index by 2 when addressing sources */
00121 x86_reg uv_off = c->uv_offx2 >> 1;
00122 dither_8to16(c, chrDither, 0);
00123 YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
/* second chroma plane uses the rotated dither pattern */
00124 dither_8to16(c, chrDither, 1);
/* Same filter list as above, but the index runs from uv_off to
 * chrDstW + uv_off: source reads land uv_off samples further into each
 * line, while biasing the destination by -uv_off keeps stores at vDest. */
00125 YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
00126 }
/* the luma dither is shared by the alpha and luma passes below */
00127 dither_8to16(c, lumDither, 0);
00128 if (CONFIG_SWSCALE_ALPHA && aDest) {
00129 YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
00130 }
00131
00132 YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
00133 }
00134
/*
 * YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos)
 *
 * Same interface and output as YSCALEYUV2YV12X (one complete asm statement,
 * 8 output bytes per outer iteration, references `c` of the expansion site),
 * but the sums are kept in 32 bits.
 *
 * Operands: %0 = &c->redDither, %1 = dest, %2 = end, %3 = pos.
 *
 * The filter list at offset(%0) is walked in steps of APCK_SIZE bytes. Each
 * entry supplies two source-line pointers (at +0 and at APCK_PTR2) and a
 * coefficient quadword at APCK_COEF; samples of the two lines are interleaved
 * (punpck*wd) and multiplied-and-added pairwise with pmaddwd. The four dword
 * accumulators mm4..mm7 start from DITHER32+0..+24. The list ends when the
 * first pointer of the next entry is null. The sums are shifted right by 19,
 * packed to words (signed saturation) and to bytes (unsigned saturation) and
 * stored at dest + index.
 *
 * The running index lives in REG_a (already in the clobber list), loaded from
 * %3 once: %3 is an input-only operand and GCC does not allow the asm to
 * modify inputs, because the compiler may reuse the register's value (e.g. a
 * shared constant 0) after the statement. pos therefore only needs "g".
 */
00135 #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
00136 __asm__ volatile(\
"mov %3, %%"REG_a" \n\t" /* working index := pos; %3 itself is never written */\
00137 "lea " offset "(%0), %%"REG_d" \n\t"\
00138 "movq "DITHER32"+0(%0), %%mm4 \n\t"\
00139 "movq "DITHER32"+8(%0), %%mm5 \n\t"\
00140 "movq "DITHER32"+16(%0), %%mm6 \n\t"\
00141 "movq "DITHER32"+24(%0), %%mm7 \n\t"\
00142 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00143 ".p2align 4 \n\t"\
00144 "1: \n\t"\
00145 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* line 1, samples 0..3 */ \
00146 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* line 1, samples 4..7 */ \
00147 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
00148 "movq (%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* line 2, samples 0..3 */ \
00149 "movq %%mm0, %%mm3 \n\t"\
00150 "punpcklwd %%mm1, %%mm0 \n\t"\
00151 "punpckhwd %%mm1, %%mm3 \n\t"\
00152 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* coefficient pair */ \
00153 "pmaddwd %%mm1, %%mm0 \n\t"\
00154 "pmaddwd %%mm1, %%mm3 \n\t"\
00155 "paddd %%mm0, %%mm4 \n\t"\
00156 "paddd %%mm3, %%mm5 \n\t"\
00157 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* line 2, samples 4..7 */ \
00158 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
00159 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
00160 "test %%"REG_S", %%"REG_S" \n\t" /* MMX ops below leave the flags alone */\
00161 "movq %%mm2, %%mm0 \n\t"\
00162 "punpcklwd %%mm3, %%mm2 \n\t"\
00163 "punpckhwd %%mm3, %%mm0 \n\t"\
00164 "pmaddwd %%mm1, %%mm2 \n\t"\
00165 "pmaddwd %%mm1, %%mm0 \n\t"\
00166 "paddd %%mm2, %%mm6 \n\t"\
00167 "paddd %%mm0, %%mm7 \n\t"\
00168 " jnz 1b \n\t"\
00169 "psrad $19, %%mm4 \n\t"\
00170 "psrad $19, %%mm5 \n\t"\
00171 "psrad $19, %%mm6 \n\t"\
00172 "psrad $19, %%mm7 \n\t"\
00173 "packssdw %%mm5, %%mm4 \n\t"\
00174 "packssdw %%mm7, %%mm6 \n\t"\
00175 "packuswb %%mm6, %%mm4 \n\t"\
00176 MOVNTQ(%%mm4, (%1, %%REGa))\
00177 "add $8, %%"REG_a" \n\t"\
00178 "cmp %2, %%"REG_a" \n\t" /* flags survive the moves below until jb */\
00179 "lea " offset "(%0), %%"REG_d" \n\t"\
00180 "movq "DITHER32"+0(%0), %%mm4 \n\t"\
00181 "movq "DITHER32"+8(%0), %%mm5 \n\t"\
00182 "movq "DITHER32"+16(%0), %%mm6 \n\t"\
00183 "movq "DITHER32"+24(%0), %%mm7 \n\t"\
00184 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00185 "jb 1b \n\t"\
00186 :: "r" (&c->redDither),\
00187 "r" (dest), "g" ((x86_reg)(end)), "g"((x86_reg)(pos))\
00188 : "%"REG_a, "%"REG_d, "%"REG_S\
00189 );
00190
/**
 * Expand eight 8-bit dither values into eight 32-bit values and store them in
 * the context at DITHER32+0 .. DITHER32+24 (addressed relative to
 * &c->redDither). Each byte is zero-extended to 32 bits and shifted left
 * by 12.
 *
 * @param c         context that receives the 32 bytes of output
 * @param srcDither address of the 8 source bytes (read with a single movq)
 * @param rot       when non-zero the 8 bytes are first rotated, as one 64-bit
 *                  value, right by 24 bits (psrlq 24 | psllq 40), so output
 *                  element k is taken from source byte (k + 3) % 8
 *
 * The definition is only compiled when COMPILE_TEMPLATE_MMX2 is 0.
 * NOTE(review): the asm statements use mm0 and mm4..mm7 and write memory
 * without declaring any clobbers -- confirm this is acceptable for callers.
 */
00191 #if !COMPILE_TEMPLATE_MMX2
00192 static av_always_inline void
00193 dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot)
00194 {
00195 if (rot) {
/* mm0 = 0 is the zero-extension source for all unpacks below */
00196 __asm__ volatile("pxor %%mm0, %%mm0\n\t"
00197 "movq (%0), %%mm4\n\t"
00198 "movq %%mm4, %%mm5\n\t"
00199 "psrlq $24, %%mm4\n\t"
00200 "psllq $40, %%mm5\n\t"
00201 "por %%mm5, %%mm4\n\t"
00202 "movq %%mm4, %%mm6\n\t"
00203 "punpcklbw %%mm0, %%mm4\n\t"
00204 "punpckhbw %%mm0, %%mm6\n\t"
00205 "movq %%mm4, %%mm5\n\t"
00206 "movq %%mm6, %%mm7\n\t"
00207 "punpcklwd %%mm0, %%mm4\n\t"
00208 "punpckhwd %%mm0, %%mm5\n\t"
00209 "punpcklwd %%mm0, %%mm6\n\t"
00210 "punpckhwd %%mm0, %%mm7\n\t"
00211 "pslld $12, %%mm4\n\t"
00212 "pslld $12, %%mm5\n\t"
00213 "pslld $12, %%mm6\n\t"
00214 "pslld $12, %%mm7\n\t"
00215 "movq %%mm4, "DITHER32"+0(%1)\n\t"
00216 "movq %%mm5, "DITHER32"+8(%1)\n\t"
00217 "movq %%mm6, "DITHER32"+16(%1)\n\t"
00218 "movq %%mm7, "DITHER32"+24(%1)\n\t"
00219 :: "r"(srcDither), "r"(&c->redDither)
00220 );
00221 } else {
/* same expansion without the 3-byte rotation */
00222 __asm__ volatile("pxor %%mm0, %%mm0\n\t"
00223 "movq (%0), %%mm4\n\t"
00224 "movq %%mm4, %%mm6\n\t"
00225 "punpcklbw %%mm0, %%mm4\n\t"
00226 "punpckhbw %%mm0, %%mm6\n\t"
00227 "movq %%mm4, %%mm5\n\t"
00228 "movq %%mm6, %%mm7\n\t"
00229 "punpcklwd %%mm0, %%mm4\n\t"
00230 "punpckhwd %%mm0, %%mm5\n\t"
00231 "punpcklwd %%mm0, %%mm6\n\t"
00232 "punpckhwd %%mm0, %%mm7\n\t"
00233 "pslld $12, %%mm4\n\t"
00234 "pslld $12, %%mm5\n\t"
00235 "pslld $12, %%mm6\n\t"
00236 "pslld $12, %%mm7\n\t"
00237 "movq %%mm4, "DITHER32"+0(%1)\n\t"
00238 "movq %%mm5, "DITHER32"+8(%1)\n\t"
00239 "movq %%mm6, "DITHER32"+16(%1)\n\t"
00240 "movq %%mm7, "DITHER32"+24(%1)\n\t"
00241 :: "r"(srcDither), "r"(&c->redDither)
00242 );
00243 }
00244 }
00245 #endif
00246
/**
 * Vertically filter into separate 8-bit output planes using the
 * pmaddwd-based YSCALEYUV2YV12X_ACCURATE macro (32-bit sums, 32-bit dither
 * prepared by dither_8to32).
 *
 * Of the parameters only c, dest, dstW and chrDstW are referenced by name in
 * this body; the filter/source arguments are unused here because the asm
 * reads its filter lists from the context at the *_MMX_FILTER_OFFSET offsets.
 *
 * @param dest    dest[0] = luma plane, dest[1] / dest[2] = chroma planes,
 *                dest[3] = alpha plane (only read when CONFIG_SWSCALE_ALPHA)
 * @param dstW    end index for the luma and alpha passes
 * @param chrDstW end index for the chroma passes
 */
00247 static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
00248 const int16_t **lumSrc, int lumFilterSize,
00249 const int16_t *chrFilter, const int16_t **chrUSrc,
00250 const int16_t **chrVSrc,
00251 int chrFilterSize, const int16_t **alpSrc,
00252 uint8_t *dest[4], int dstW, int chrDstW)
00253 {
00254 uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
00255 *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
00256 const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
00257
/* chroma is skipped entirely when there is no U plane */
00258 if (uDest) {
/* uv_offx2 halved: the macro scales the index by 2 when addressing sources */
00259 x86_reg uv_off = c->uv_offx2 >> 1;
00260 dither_8to32(c, chrDither, 0);
00261 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
/* second chroma plane uses the rotated dither pattern */
00262 dither_8to32(c, chrDither, 1);
/* Same filter list, index running from uv_off to chrDstW + uv_off: sources
 * are read uv_off samples further into each line while the -uv_off bias on
 * the destination keeps the stores at vDest. */
00263 YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
00264 }
/* the luma dither is shared by the alpha and luma passes below */
00265 dither_8to32(c, lumDither, 0);
00266 if (CONFIG_SWSCALE_ALPHA && aDest) {
00267 YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
00268 }
00269
00270 YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0)
00271 }
00272
/**
 * Convert single 16-bit source lines to 8-bit output planes without
 * filtering: every sample is shifted right by 7 and packed to a byte with
 * unsigned saturation.
 *
 * Planes are handled in the order 3, 2, 1, 0 (alpha, chroma V, chroma U,
 * luma); a plane is skipped when its dst[] entry is NULL. c is not used.
 *
 * @param dstW    width used for planes 0 and 3
 * @param chrDstW width used for planes 1 and 2
 *
 * The asm always handles whole groups of 8 samples, so a width that is not a
 * multiple of 8 makes it read and write past the nominal end of the line.
 * NOTE(review): presumably the buffers are padded for this -- confirm.
 */
00273 static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
00274 const int16_t *chrUSrc, const int16_t *chrVSrc,
00275 const int16_t *alpSrc,
00276 uint8_t *dst[4], int dstW, int chrDstW)
00277 {
00278 int p= 4;
/* Source pointers are advanced to the END of each line; the asm indexes them
 * with a negative counter. src[3] is computed even if alpSrc is unused. */
00279 const int16_t *src[4]= {
00280 lumSrc + dstW, chrUSrc + chrDstW,
00281 chrVSrc + chrDstW, alpSrc + dstW
00282 };
00283 x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
00284
00285 while (p--) {
00286 if (dst[p]) {
/* REG_a starts at -width and is raised by 8 per iteration; the loop ends
 * when the addition carries, i.e. when the counter reaches or passes 0. */
00287 __asm__ volatile(
00288 "mov %2, %%"REG_a" \n\t"
00289 ".p2align 4 \n\t"
00290 "1: \n\t"
00291 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
00292 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
00293 "psraw $7, %%mm0 \n\t"
00294 "psraw $7, %%mm1 \n\t"
00295 "packuswb %%mm1, %%mm0 \n\t"
00296 MOVNTQ(%%mm0, (%1, %%REGa))
00297 "add $8, %%"REG_a" \n\t"
00298 "jnc 1b \n\t"
00299 :: "r" (src[p]), "r" (dst[p] + counter[p]),
00300 "g" (-counter[p])
00301 : "%"REG_a
00302 );
00303 }
00304 }
00305 }
00306
/**
 * Convert single 16-bit source lines to 8-bit output planes without
 * filtering, adding a dither value before the reduction: every sample gets
 * the matching dither word added with signed saturation, is shifted right
 * by 7 and packed to a byte with unsigned saturation.
 *
 * Planes are handled in the order 3, 2, 1, 0; following the src[] / counter[]
 * tables below these are alpha, chroma V, chroma U and luma. A plane is
 * skipped when its dst[] entry is NULL.
 *
 * @param c       supplies lumDither8 / chrDither8 and holds the dither16
 *                scratch words that the asm reads through DITHER16
 * @param dstW    width used for planes 0 and 3
 * @param chrDstW width used for planes 1 and 2
 *
 * The asm always handles whole groups of 8 samples, so a width that is not a
 * multiple of 8 makes it read and write past the nominal end of the line.
 * NOTE(review): presumably the buffers are padded for this -- confirm.
 */
00307 static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
00308 const int16_t *chrUSrc, const int16_t *chrVSrc,
00309 const int16_t *alpSrc,
00310 uint8_t *dst[4], int dstW, int chrDstW)
00311 {
00312 int p= 4;
/* Source pointers are advanced to the END of each line; the asm indexes them
 * with a negative counter. */
00313 const int16_t *src[4]= {
00314 lumSrc + dstW, chrUSrc + chrDstW,
00315 chrVSrc + chrDstW, alpSrc + dstW
00316 };
00317 x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
00318 const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
00319
00320 while (p--) {
00321 if (dst[p]) {
00322 int i;
/* Planes 0 (luma) and 3 (alpha) take the luma dither, planes 1 and 2
 * (chroma) the chroma dither -- the same pairing yuv2yuvX uses. The former
 * test (p == 2 || p == 3) gave chroma V the luma pattern and luma the
 * chroma pattern. */
00323 for(i=0; i<8; i++) c->dither16[i] = (p == 0 || p == 3) ? lumDither[i] : chrDither[i];
/* REG_a starts at -width and is raised by 8 per iteration; the loop ends
 * when the addition carries, i.e. when the counter reaches or passes 0. */
00324 __asm__ volatile(
00325 "mov %2, %%"REG_a" \n\t"
00326 "movq "DITHER16"+0(%3), %%mm6 \n\t"
00327 "movq "DITHER16"+8(%3), %%mm7 \n\t"
00328 ".p2align 4 \n\t"
00329 "1: \n\t"
00330 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"
00331 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
00332 "paddsw %%mm6, %%mm0 \n\t"
00333 "paddsw %%mm7, %%mm1 \n\t"
00334 "psraw $7, %%mm0 \n\t"
00335 "psraw $7, %%mm1 \n\t"
00336 "packuswb %%mm1, %%mm0 \n\t"
00337 MOVNTQ(%%mm0, (%1, %%REGa))
00338 "add $8, %%"REG_a" \n\t"
00339 "jnc 1b \n\t"
00340 :: "r" (src[p]), "r" (dst[p] + counter[p]),
00341 "g" (-counter[p]), "r"(&c->redDither)
/* "memory": the asm reads c->dither16, stored from C just above, and writes
 * the destination plane; neither access is expressed through an operand, so
 * the compiler must not move those stores across the statement. */
00342 : "%"REG_a, "memory"
00343 );
00344 }
00345 }
00346 }
00347
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * OPENS an asm statement ("__asm__ volatile(") and emits its first part; the
 * statement is completed by later macros / strings and must be closed with an
 * operand list such as YSCALEYUV2PACKEDX_END.
 *
 * Sets the pixel index REG_a to 0 and defines label "1:", the top of the
 * per-8-pixel outer loop (the WRITE* macros jump back to it). Then walks the
 * filter list at CHR_MMX_FILTER_OFFSET(%0) (16-byte entries: line pointer at
 * +0, coefficient at +8, null pointer terminates) and accumulates, starting
 * from the value at VROUNDER_OFFSET(%0):
 *   mm3 += pmulhw(4 words at line + index,      coefficient)
 *   mm4 += pmulhw(4 words at line + %6 + index, coefficient)
 * where %6 is the uv_off operand. Later macros treat mm3 as U and mm4 as V
 * (they subtract U_OFFSET / V_OFFSET from them).
 */
00348 #define YSCALEYUV2PACKEDX_UV \
00349 __asm__ volatile(\
00350 "xor %%"REG_a", %%"REG_a" \n\t"\
00351 ".p2align 4 \n\t"\
00352 "nop \n\t"\
00353 "1: \n\t"\
00354 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
00355 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00356 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
00357 "movq %%mm3, %%mm4 \n\t"\
00358 ".p2align 4 \n\t"\
00359 "2: \n\t"\
00360 "movq 8(%%"REG_d"), %%mm0 \n\t" /* coefficient of this tap */ \
00361 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* first chroma plane */ \
00362 "add %6, %%"REG_S" \n\t" /* step to the second chroma plane */ \
00363 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* second chroma plane */ \
00364 "add $16, %%"REG_d" \n\t"\
00365 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00366 "pmulhw %%mm0, %%mm2 \n\t"\
00367 "pmulhw %%mm0, %%mm5 \n\t"\
00368 "paddw %%mm2, %%mm3 \n\t"\
00369 "paddw %%mm5, %%mm4 \n\t"\
00370 "test %%"REG_S", %%"REG_S" \n\t"\
00371 " jnz 2b \n\t"\
00372
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm fragment (string literals only) for use inside the statement opened by
 * YSCALEYUV2PACKEDX_UV. Walks the filter list at offset(%0) (16-byte entries:
 * line pointer at +0, coefficient at +8, null pointer terminates) for the 8
 * samples starting at index REG_a (addresses scaled by 2) and accumulates,
 * starting from the value at VROUNDER_OFFSET(%0):
 *   dst1 += pmulhw(samples 0..3, coeff),  dst2 += pmulhw(samples 4..7, coeff)
 * coeff, src1 and src2 name the MMX registers used as temporaries; all five
 * register arguments are stringified. Reuses the local label "2:".
 */
00373 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
00374 "lea "offset"(%0), %%"REG_d" \n\t"\
00375 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00376 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
00377 "movq "#dst1", "#dst2" \n\t"\
00378 ".p2align 4 \n\t"\
00379 "2: \n\t"\
00380 "movq 8(%%"REG_d"), "#coeff" \n\t" \
00381 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" \
00382 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" \
00383 "add $16, %%"REG_d" \n\t"\
00384 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00385 "pmulhw "#coeff", "#src1" \n\t"\
00386 "pmulhw "#coeff", "#src2" \n\t"\
00387 "paddw "#src1", "#dst1" \n\t"\
00388 "paddw "#src2", "#dst2" \n\t"\
00389 "test %%"REG_S", %%"REG_S" \n\t"\
00390 " jnz 2b \n\t"\
00391
/*
 * YSCALEYUV2PACKEDX: chroma pass (mm3, mm4) followed by the pass over the
 * list at LUM_MMX_FILTER_OFFSET, which leaves samples 0..3 in mm1 and 4..7 in
 * mm7 and uses mm0/mm2/mm5 as temporaries. The asm statement it opens is
 * still unterminated afterwards.
 */
00392 #define YSCALEYUV2PACKEDX \
00393 YSCALEYUV2PACKEDX_UV \
00394 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
00395
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists that CLOSE the asm
 * statement opened by the YSCALEYUV2PACKEDX* macros. It references the
 * variables c, dummy, dest, dstW_reg and uv_off of the expansion site.
 *   %0 = &c->redDither   %1..%3 = dummy (placeholders keeping the numbering)
 *   %4 = dest            %5 = dstW_reg            %6 = uv_off
 * Clobbers REG_a, REG_d and REG_S.
 */
00396 #define YSCALEYUV2PACKEDX_END \
00397 :: "r" (&c->redDither), \
00398 "m" (dummy), "m" (dummy), "m" (dummy),\
00399 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
00400 : "%"REG_a, "%"REG_d, "%"REG_S \
00401 );
00402
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * 32-bit-accumulator counterpart of YSCALEYUV2PACKEDX_UV. OPENS an asm
 * statement, zeroes the pixel index REG_a and defines label "1:", the top of
 * the per-8-pixel outer loop.
 *
 * The filter list at CHR_MMX_FILTER_OFFSET(%0) is walked in steps of
 * APCK_SIZE bytes; each entry supplies two line pointers (+0 and APCK_PTR2)
 * and a coefficient quadword (APCK_COEF). For each line the 4 words at
 * line + index and the 4 words at line + %6 + index (%6 = uv_off operand) are
 * read; the two lines are interleaved and combined with pmaddwd into the
 * dword accumulators mm4/mm5 (first plane) and mm6/mm7 (second plane), which
 * start at 0. The list ends when the first pointer of the next entry is null.
 * Afterwards the sums are shifted right by 16, packed to words, the value at
 * VROUNDER_OFFSET(%0) is added, and the results are stored to U_TEMP(%0) and
 * V_TEMP(%0).
 */
00403 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
00404 __asm__ volatile(\
00405 "xor %%"REG_a", %%"REG_a" \n\t"\
00406 ".p2align 4 \n\t"\
00407 "nop \n\t"\
00408 "1: \n\t"\
00409 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
00410 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00411 "pxor %%mm4, %%mm4 \n\t"\
00412 "pxor %%mm5, %%mm5 \n\t"\
00413 "pxor %%mm6, %%mm6 \n\t"\
00414 "pxor %%mm7, %%mm7 \n\t"\
00415 ".p2align 4 \n\t"\
00416 "2: \n\t"\
00417 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* line 1, first plane */ \
00418 "add %6, %%"REG_S" \n\t" \
00419 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* line 1, second plane */ \
00420 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
00421 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* line 2, first plane */ \
00422 "movq %%mm0, %%mm3 \n\t"\
00423 "punpcklwd %%mm1, %%mm0 \n\t"\
00424 "punpckhwd %%mm1, %%mm3 \n\t"\
00425 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* coefficient pair */ \
00426 "pmaddwd %%mm1, %%mm0 \n\t"\
00427 "pmaddwd %%mm1, %%mm3 \n\t"\
00428 "paddd %%mm0, %%mm4 \n\t"\
00429 "paddd %%mm3, %%mm5 \n\t"\
00430 "add %6, %%"REG_S" \n\t" \
00431 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* line 2, second plane */ \
00432 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
00433 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
00434 "test %%"REG_S", %%"REG_S" \n\t"\
00435 "movq %%mm2, %%mm0 \n\t"\
00436 "punpcklwd %%mm3, %%mm2 \n\t"\
00437 "punpckhwd %%mm3, %%mm0 \n\t"\
00438 "pmaddwd %%mm1, %%mm2 \n\t"\
00439 "pmaddwd %%mm1, %%mm0 \n\t"\
00440 "paddd %%mm2, %%mm6 \n\t"\
00441 "paddd %%mm0, %%mm7 \n\t"\
00442 " jnz 2b \n\t"\
00443 "psrad $16, %%mm4 \n\t"\
00444 "psrad $16, %%mm5 \n\t"\
00445 "psrad $16, %%mm6 \n\t"\
00446 "psrad $16, %%mm7 \n\t"\
00447 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
00448 "packssdw %%mm5, %%mm4 \n\t"\
00449 "packssdw %%mm7, %%mm6 \n\t"\
00450 "paddw %%mm0, %%mm4 \n\t"\
00451 "paddw %%mm0, %%mm6 \n\t"\
00452 "movq %%mm4, "U_TEMP"(%0) \n\t"\
00453 "movq %%mm6, "V_TEMP"(%0) \n\t"\
00454
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Asm fragment (string literals only) for use inside the statement opened by
 * YSCALEYUV2PACKEDX_ACCURATE_UV. Filters the 8 samples starting at index
 * REG_a (addresses scaled by 2) with the list at offset(%0), using the same
 * APCK_SIZE / APCK_PTR2 / APCK_COEF entry layout and pmaddwd scheme.
 * Dword accumulators: mm1/mm5 (samples 0..3) and mm7/mm6 (samples 4..7).
 * Result: words for samples 0..3 in mm1 and 4..7 in mm7, after >>16, packing
 * and adding the value at VROUNDER_OFFSET(%0).
 * Finally reloads U_TEMP(%0) into mm3 and V_TEMP(%0) into mm4.
 * Uses mm0, mm2, mm3, mm4 as temporaries and reuses the local label "2:".
 */
00455 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
00456 "lea "offset"(%0), %%"REG_d" \n\t"\
00457 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00458 "pxor %%mm1, %%mm1 \n\t"\
00459 "pxor %%mm5, %%mm5 \n\t"\
00460 "pxor %%mm7, %%mm7 \n\t"\
00461 "pxor %%mm6, %%mm6 \n\t"\
00462 ".p2align 4 \n\t"\
00463 "2: \n\t"\
00464 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* line 1, samples 0..3 */ \
00465 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* line 1, samples 4..7 */ \
00466 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
00467 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* line 2, samples 0..3 */ \
00468 "movq %%mm0, %%mm3 \n\t"\
00469 "punpcklwd %%mm4, %%mm0 \n\t"\
00470 "punpckhwd %%mm4, %%mm3 \n\t"\
00471 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* coefficient pair */ \
00472 "pmaddwd %%mm4, %%mm0 \n\t"\
00473 "pmaddwd %%mm4, %%mm3 \n\t"\
00474 "paddd %%mm0, %%mm1 \n\t"\
00475 "paddd %%mm3, %%mm5 \n\t"\
00476 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* line 2, samples 4..7 */ \
00477 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
00478 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
00479 "test %%"REG_S", %%"REG_S" \n\t"\
00480 "movq %%mm2, %%mm0 \n\t"\
00481 "punpcklwd %%mm3, %%mm2 \n\t"\
00482 "punpckhwd %%mm3, %%mm0 \n\t"\
00483 "pmaddwd %%mm4, %%mm2 \n\t"\
00484 "pmaddwd %%mm4, %%mm0 \n\t"\
00485 "paddd %%mm2, %%mm7 \n\t"\
00486 "paddd %%mm0, %%mm6 \n\t"\
00487 " jnz 2b \n\t"\
00488 "psrad $16, %%mm1 \n\t"\
00489 "psrad $16, %%mm5 \n\t"\
00490 "psrad $16, %%mm7 \n\t"\
00491 "psrad $16, %%mm6 \n\t"\
00492 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
00493 "packssdw %%mm5, %%mm1 \n\t"\
00494 "packssdw %%mm6, %%mm7 \n\t"\
00495 "paddw %%mm0, %%mm1 \n\t"\
00496 "paddw %%mm0, %%mm7 \n\t"\
00497 "movq "U_TEMP"(%0), %%mm3 \n\t"\
00498 "movq "V_TEMP"(%0), %%mm4 \n\t"\
00499
/*
 * YSCALEYUV2PACKEDX_ACCURATE: accurate chroma pass followed by the accurate
 * pass over the list at LUM_MMX_FILTER_OFFSET. On completion mm3 / mm4 hold
 * the chroma words reloaded from U_TEMP / V_TEMP and mm1 / mm7 hold samples
 * 0..3 / 4..7 of the second pass. The asm statement is still unterminated.
 */
00500 #define YSCALEYUV2PACKEDX_ACCURATE \
00501 YSCALEYUV2PACKEDX_ACCURATE_UV \
00502 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
00503
/*
 * YSCALEYUV2RGBX: asm fragment converting filtered YUV words to 8 RGB pixels.
 *
 * In:  mm3 = U words, mm4 = V words (4 each), mm1 = Y words 0..3,
 *      mm7 = Y words 4..7. All constants are read relative to %0.
 * Steps: subtract U_OFFSET / V_OFFSET / Y_OFFSET, multiply (pmulhw) by
 *      UB_COEFF, UG_COEFF, VG_COEFF, VR_COEFF and Y_COEFF, duplicate every
 *      chroma term (punpck?wd reg,reg) so that each one serves two
 *      neighbouring pixels, add the luma term, pack with unsigned saturation.
 * Out: mm2 = 8 bytes built from the UB term (blue), mm4 = 8 bytes built from
 *      the UG+VG terms (green), mm5 = 8 bytes built from the VR term (red).
 *      mm0, mm1, mm3, mm6 and mm7 are overwritten.
 */
00504 #define YSCALEYUV2RGBX \
00505 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
00506 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
00507 "movq %%mm3, %%mm2 \n\t" \
00508 "movq %%mm4, %%mm5 \n\t" \
00509 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
00510 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
00511 \
00512 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
00513 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
00514 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
00515 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
00516 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
00517 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
00518 \
00519 "paddw %%mm3, %%mm4 \n\t" /* combined green chroma term */\
00520 "movq %%mm2, %%mm0 \n\t"\
00521 "movq %%mm5, %%mm6 \n\t"\
00522 "movq %%mm4, %%mm3 \n\t"\
00523 "punpcklwd %%mm2, %%mm2 \n\t"\
00524 "punpcklwd %%mm5, %%mm5 \n\t"\
00525 "punpcklwd %%mm4, %%mm4 \n\t"\
00526 "paddw %%mm1, %%mm2 \n\t"\
00527 "paddw %%mm1, %%mm5 \n\t"\
00528 "paddw %%mm1, %%mm4 \n\t"\
00529 "punpckhwd %%mm0, %%mm0 \n\t"\
00530 "punpckhwd %%mm6, %%mm6 \n\t"\
00531 "punpckhwd %%mm3, %%mm3 \n\t"\
00532 "paddw %%mm7, %%mm0 \n\t"\
00533 "paddw %%mm7, %%mm6 \n\t"\
00534 "paddw %%mm7, %%mm3 \n\t"\
00535 \
00536 "packuswb %%mm0, %%mm2 \n\t"\
00537 "packuswb %%mm6, %%mm5 \n\t"\
00538 "packuswb %%mm3, %%mm4 \n\t"\
00539
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 *
 * Asm fragment that interleaves four registers of 8 bytes each (b, g, r, a)
 * into 8 four-byte pixels and stores them with MOVNTQ at dst + index*4.
 * Byte order in memory per pixel is b, g, r, a. q0, q2, q3 and t name scratch
 * MMX registers; b and r are overwritten as well.
 * Afterwards index is advanced by 8 and control jumps back to label "1:"
 * while index < dstw (unsigned compare).
 */
00540 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
00541 "movq "#b", "#q2" \n\t" \
00542 "movq "#r", "#t" \n\t" \
00543 "punpcklbw "#g", "#b" \n\t" \
00544 "punpcklbw "#a", "#r" \n\t" \
00545 "punpckhbw "#g", "#q2" \n\t" \
00546 "punpckhbw "#a", "#t" \n\t" \
00547 "movq "#b", "#q0" \n\t" \
00548 "movq "#q2", "#q3" \n\t" \
00549 "punpcklwd "#r", "#q0" \n\t" /* pixels 0,1 */ \
00550 "punpckhwd "#r", "#b" \n\t" /* pixels 2,3 */ \
00551 "punpcklwd "#t", "#q2" \n\t" /* pixels 4,5 */ \
00552 "punpckhwd "#t", "#q3" \n\t" /* pixels 6,7 */ \
00553 \
00554 MOVNTQ( q0, (dst, index, 4))\
00555 MOVNTQ( b, 8(dst, index, 4))\
00556 MOVNTQ( q2, 16(dst, index, 4))\
00557 MOVNTQ( q3, 24(dst, index, 4))\
00558 \
00559 "add $8, "#index" \n\t"\
00560 "cmp "#dstw", "#index" \n\t"\
00561 " jb 1b \n\t"
/* Extra expansion level so that macro arguments are expanded before the
 * REAL_ variant stringifies them. */
00562 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
00563
/**
 * Vertically filter and convert to packed 4-byte pixels (memory order
 * B, G, R, A; 8 pixels per loop iteration) using the pmaddwd-based
 * "accurate" filter macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 *
 * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL the alpha
 * byte is filtered from the list at ALP_MMX_FILTER_OFFSET; otherwise every
 * alpha byte is 0xFF.
 */
00564 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
00565 const int16_t **lumSrc, int lumFilterSize,
00566 const int16_t *chrFilter, const int16_t **chrUSrc,
00567 const int16_t **chrVSrc,
00568 int chrFilterSize, const int16_t **alpSrc,
00569 uint8_t *dest, int dstW, int dstY)
00570 {
00571 x86_reg dummy=0;
00572 x86_reg dstW_reg = dstW;
00573 x86_reg uv_off = c->uv_offx2;
00574
00575 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00576 YSCALEYUV2PACKEDX_ACCURATE
00577 YSCALEYUV2RGBX
/* The alpha pass needs every MMX register: park B, G and R in the temp
 * slots. It reloads U_TEMP into mm3 and V_TEMP into mm4 by itself, so B
 * comes back in mm3 and G in mm4; R is reloaded explicitly into mm5. */
00578 "movq %%mm2, "U_TEMP"(%0) \n\t"
00579 "movq %%mm4, "V_TEMP"(%0) \n\t"
00580 "movq %%mm5, "Y_TEMP"(%0) \n\t"
00581 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
00582 "movq "Y_TEMP"(%0), %%mm5 \n\t"
00583 "psraw $3, %%mm1 \n\t"
00584 "psraw $3, %%mm7 \n\t"
00585 "packuswb %%mm7, %%mm1 \n\t"
00586 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
00587 YSCALEYUV2PACKEDX_END
00588 } else {
00589 YSCALEYUV2PACKEDX_ACCURATE
00590 YSCALEYUV2RGBX
/* all bits set -> alpha bytes of 0xFF */
00591 "pcmpeqd %%mm7, %%mm7 \n\t"
00592 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00593 YSCALEYUV2PACKEDX_END
00594 }
00595 }
00596
/**
 * Vertically filter and convert to packed 4-byte pixels (memory order
 * B, G, R, A; 8 pixels per loop iteration) using the pmulhw-based filter
 * macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 *
 * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL the alpha
 * byte is filtered from the list at ALP_MMX_FILTER_OFFSET; otherwise every
 * alpha byte is 0xFF.
 */
00597 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
00598 const int16_t **lumSrc, int lumFilterSize,
00599 const int16_t *chrFilter, const int16_t **chrUSrc,
00600 const int16_t **chrVSrc,
00601 int chrFilterSize, const int16_t **alpSrc,
00602 uint8_t *dest, int dstW, int dstY)
00603 {
00604 x86_reg dummy=0;
00605 x86_reg dstW_reg = dstW;
00606 x86_reg uv_off = c->uv_offx2;
00607
00608 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00609 YSCALEYUV2PACKEDX
00610 YSCALEYUV2RGBX
/* alpha pass uses mm0/mm3/mm6 as temporaries and mm1/mm7 as sums, leaving
 * B/G/R in mm2/mm4/mm5 untouched */
00611 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
00612 "psraw $3, %%mm1 \n\t"
00613 "psraw $3, %%mm7 \n\t"
00614 "packuswb %%mm7, %%mm1 \n\t"
00615 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00616 YSCALEYUV2PACKEDX_END
00617 } else {
00618 YSCALEYUV2PACKEDX
00619 YSCALEYUV2RGBX
/* all bits set -> alpha bytes of 0xFF */
00620 "pcmpeqd %%mm7, %%mm7 \n\t"
00621 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00622 YSCALEYUV2PACKEDX_END
00623 }
00624 }
00625
/*
 * REAL_WRITERGB16(dst, dstw, index)
 *
 * Asm fragment packing 8 pixels into 16 bits each and storing them with
 * MOVNTQ at dst + index*2.
 * In: mm2, mm4, mm5 = 8 bytes each (blue, green, red as produced by
 * YSCALEYUV2RGBX), mm7 = 0 (callers clear it first).
 * mm2 and mm5 are masked with the constant bF8 and mm4 with bFC (presumably
 * 0xF8 / 0xFC in every byte, i.e. 5, 6 and 5 significant bits -- confirm
 * against their definitions); mm2 is then moved down by 3 bits, the
 * zero-extended mm4 words are moved up by 3 bits, and mm5 supplies the high
 * byte of each 16-bit pixel. mm1 and mm3 are scratch.
 * Afterwards index is advanced by 8 and control jumps back to label "1:"
 * while index < dstw (unsigned compare).
 */
00626 #define REAL_WRITERGB16(dst, dstw, index) \
00627 "pand "MANGLE(bF8)", %%mm2 \n\t" \
00628 "pand "MANGLE(bFC)", %%mm4 \n\t" \
00629 "pand "MANGLE(bF8)", %%mm5 \n\t" \
00630 "psrlq $3, %%mm2 \n\t"\
00631 \
00632 "movq %%mm2, %%mm1 \n\t"\
00633 "movq %%mm4, %%mm3 \n\t"\
00634 \
00635 "punpcklbw %%mm7, %%mm3 \n\t"\
00636 "punpcklbw %%mm5, %%mm2 \n\t"\
00637 "punpckhbw %%mm7, %%mm4 \n\t"\
00638 "punpckhbw %%mm5, %%mm1 \n\t"\
00639 \
00640 "psllq $3, %%mm3 \n\t"\
00641 "psllq $3, %%mm4 \n\t"\
00642 \
00643 "por %%mm3, %%mm2 \n\t"\
00644 "por %%mm4, %%mm1 \n\t"\
00645 \
00646 MOVNTQ(%%mm2, (dst, index, 2))\
00647 MOVNTQ(%%mm1, 8(dst, index, 2))\
00648 \
00649 "add $8, "#index" \n\t"\
00650 "cmp "#dstw", "#index" \n\t"\
00651 " jb 1b \n\t"
/* Extra expansion level so that macro arguments are expanded before the
 * REAL_ variant stringifies them. */
00652 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
00653
/**
 * Vertically filter and convert to packed 16-bit pixels via WRITERGB16,
 * 8 pixels per loop iteration, using the pmaddwd-based "accurate" filter
 * macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 */
00654 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
00655 const int16_t **lumSrc, int lumFilterSize,
00656 const int16_t *chrFilter, const int16_t **chrUSrc,
00657 const int16_t **chrVSrc,
00658 int chrFilterSize, const int16_t **alpSrc,
00659 uint8_t *dest, int dstW, int dstY)
00660 {
00661 x86_reg dummy=0;
00662 x86_reg dstW_reg = dstW;
00663 x86_reg uv_off = c->uv_offx2;
00664
00665 YSCALEYUV2PACKEDX_ACCURATE
00666 YSCALEYUV2RGBX
/* WRITERGB16 expects mm7 == 0 */
00667 "pxor %%mm7, %%mm7 \n\t"
00668
/* optional per-channel dither, added with unsigned byte saturation */
00669 #ifdef DITHER1XBPP
00670 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
00671 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
00672 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
00673 #endif
00674 WRITERGB16(%4, %5, %%REGa)
00675 YSCALEYUV2PACKEDX_END
00676 }
00677
/**
 * Vertically filter and convert to packed 16-bit pixels via WRITERGB16,
 * 8 pixels per loop iteration, using the pmulhw-based filter macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 */
00678 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
00679 const int16_t **lumSrc, int lumFilterSize,
00680 const int16_t *chrFilter, const int16_t **chrUSrc,
00681 const int16_t **chrVSrc,
00682 int chrFilterSize, const int16_t **alpSrc,
00683 uint8_t *dest, int dstW, int dstY)
00684 {
00685 x86_reg dummy=0;
00686 x86_reg dstW_reg = dstW;
00687 x86_reg uv_off = c->uv_offx2;
00688
00689 YSCALEYUV2PACKEDX
00690 YSCALEYUV2RGBX
/* WRITERGB16 expects mm7 == 0 */
00691 "pxor %%mm7, %%mm7 \n\t"
00692
/* optional per-channel dither, added with unsigned byte saturation */
00693 #ifdef DITHER1XBPP
00694 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
00695 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
00696 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
00697 #endif
00698 WRITERGB16(%4, %5, %%REGa)
00699 YSCALEYUV2PACKEDX_END
00700 }
00701
/*
 * REAL_WRITERGB15(dst, dstw, index)
 *
 * Asm fragment packing 8 pixels into 16 bits each and storing them with
 * MOVNTQ at dst + index*2. Same scheme as REAL_WRITERGB16 except that all
 * three channels are masked with bF8 (presumably 0xF8 in every byte, i.e. 5
 * significant bits -- confirm), mm5 is additionally moved down by 1 bit and
 * the zero-extended mm4 words are moved up by 2 bits instead of 3.
 * In: mm2, mm4, mm5 = 8 bytes each (blue, green, red as produced by
 * YSCALEYUV2RGBX), mm7 = 0. mm1 and mm3 are scratch.
 * Afterwards index is advanced by 8 and control jumps back to label "1:"
 * while index < dstw (unsigned compare).
 */
00702 #define REAL_WRITERGB15(dst, dstw, index) \
00703 "pand "MANGLE(bF8)", %%mm2 \n\t" \
00704 "pand "MANGLE(bF8)", %%mm4 \n\t" \
00705 "pand "MANGLE(bF8)", %%mm5 \n\t" \
00706 "psrlq $3, %%mm2 \n\t"\
00707 "psrlq $1, %%mm5 \n\t"\
00708 \
00709 "movq %%mm2, %%mm1 \n\t"\
00710 "movq %%mm4, %%mm3 \n\t"\
00711 \
00712 "punpcklbw %%mm7, %%mm3 \n\t"\
00713 "punpcklbw %%mm5, %%mm2 \n\t"\
00714 "punpckhbw %%mm7, %%mm4 \n\t"\
00715 "punpckhbw %%mm5, %%mm1 \n\t"\
00716 \
00717 "psllq $2, %%mm3 \n\t"\
00718 "psllq $2, %%mm4 \n\t"\
00719 \
00720 "por %%mm3, %%mm2 \n\t"\
00721 "por %%mm4, %%mm1 \n\t"\
00722 \
00723 MOVNTQ(%%mm2, (dst, index, 2))\
00724 MOVNTQ(%%mm1, 8(dst, index, 2))\
00725 \
00726 "add $8, "#index" \n\t"\
00727 "cmp "#dstw", "#index" \n\t"\
00728 " jb 1b \n\t"
/* Extra expansion level so that macro arguments are expanded before the
 * REAL_ variant stringifies them. */
00729 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
00730
/**
 * Vertically filter and convert to packed 16-bit pixels via WRITERGB15,
 * 8 pixels per loop iteration, using the pmaddwd-based "accurate" filter
 * macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 */
00731 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
00732 const int16_t **lumSrc, int lumFilterSize,
00733 const int16_t *chrFilter, const int16_t **chrUSrc,
00734 const int16_t **chrVSrc,
00735 int chrFilterSize, const int16_t **alpSrc,
00736 uint8_t *dest, int dstW, int dstY)
00737 {
00738 x86_reg dummy=0;
00739 x86_reg dstW_reg = dstW;
00740 x86_reg uv_off = c->uv_offx2;
00741
00742 YSCALEYUV2PACKEDX_ACCURATE
00743 YSCALEYUV2RGBX
/* WRITERGB15 expects mm7 == 0 */
00744 "pxor %%mm7, %%mm7 \n\t"
00745
/* optional per-channel dither, added with unsigned byte saturation */
00746 #ifdef DITHER1XBPP
00747 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
00748 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
00749 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
00750 #endif
00751 WRITERGB15(%4, %5, %%REGa)
00752 YSCALEYUV2PACKEDX_END
00753 }
00754
/**
 * Vertically filter and convert to packed 16-bit pixels via WRITERGB15,
 * 8 pixels per loop iteration, using the pmulhw-based filter macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 */
00755 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
00756 const int16_t **lumSrc, int lumFilterSize,
00757 const int16_t *chrFilter, const int16_t **chrUSrc,
00758 const int16_t **chrVSrc,
00759 int chrFilterSize, const int16_t **alpSrc,
00760 uint8_t *dest, int dstW, int dstY)
00761 {
00762 x86_reg dummy=0;
00763 x86_reg dstW_reg = dstW;
00764 x86_reg uv_off = c->uv_offx2;
00765
00766 YSCALEYUV2PACKEDX
00767 YSCALEYUV2RGBX
/* WRITERGB15 expects mm7 == 0 */
00768 "pxor %%mm7, %%mm7 \n\t"
00769
/* optional per-channel dither, added with unsigned byte saturation */
00770 #ifdef DITHER1XBPP
00771 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
00772 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
00773 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
00774 #endif
00775 WRITERGB15(%4, %5, %%REGa)
00776 YSCALEYUV2PACKEDX_END
00777 }
00778
/*
 * WRITEBGR24MMX(dst, dstw, index)
 *
 * Asm fragment packing 8 pixels into 3 bytes each (24 bytes total, memory
 * order B, G, R per pixel) using only shifts, unpacks and ORs.
 * In: mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, mm7 = 0.
 * The bytes are first interleaved into four registers holding two 4-byte
 * (B, G, R, 0) pixels each; the zero bytes are then squeezed out and the
 * result is stored as three quadwords at dst, dst+8 and dst+16.
 * All of mm0..mm7 are overwritten. Afterwards dst is advanced by 24, index by
 * 8, and control jumps back to label "1:" while index < dstw (unsigned).
 * Unlike the other WRITE* macros, dst is used as a plain pointer register
 * here (it is not indexed by `index`).
 */
00779 #define WRITEBGR24MMX(dst, dstw, index) \
00780 \
00781 "movq %%mm2, %%mm1 \n\t" \
00782 "movq %%mm5, %%mm6 \n\t" \
00783 "punpcklbw %%mm4, %%mm2 \n\t" \
00784 "punpcklbw %%mm7, %%mm5 \n\t" \
00785 "punpckhbw %%mm4, %%mm1 \n\t" \
00786 "punpckhbw %%mm7, %%mm6 \n\t" \
00787 "movq %%mm2, %%mm0 \n\t" \
00788 "movq %%mm1, %%mm3 \n\t" \
00789 "punpcklwd %%mm5, %%mm0 \n\t" /* pixels 0,1 as B G R 0 */ \
00790 "punpckhwd %%mm5, %%mm2 \n\t" /* pixels 2,3 */ \
00791 "punpcklwd %%mm6, %%mm1 \n\t" /* pixels 4,5 */ \
00792 "punpckhwd %%mm6, %%mm3 \n\t" /* pixels 6,7 */ \
00793 \
00794 "movq %%mm0, %%mm4 \n\t" \
00795 "movq %%mm2, %%mm6 \n\t" \
00796 "movq %%mm1, %%mm5 \n\t" \
00797 "movq %%mm3, %%mm7 \n\t" \
00798 \
00799 "psllq $40, %%mm0 \n\t" \
00800 "psllq $40, %%mm2 \n\t" \
00801 "psllq $40, %%mm1 \n\t" \
00802 "psllq $40, %%mm3 \n\t" \
00803 \
00804 "punpckhdq %%mm4, %%mm0 \n\t" \
00805 "punpckhdq %%mm6, %%mm2 \n\t" \
00806 "punpckhdq %%mm5, %%mm1 \n\t" \
00807 "punpckhdq %%mm7, %%mm3 \n\t" \
00808 \
00809 "psrlq $8, %%mm0 \n\t" \
00810 "movq %%mm2, %%mm6 \n\t" \
00811 "psllq $40, %%mm2 \n\t" \
00812 "por %%mm2, %%mm0 \n\t" \
00813 MOVNTQ(%%mm0, (dst))\
00814 \
00815 "psrlq $24, %%mm6 \n\t" \
00816 "movq %%mm1, %%mm5 \n\t" \
00817 "psllq $24, %%mm1 \n\t" \
00818 "por %%mm1, %%mm6 \n\t" \
00819 MOVNTQ(%%mm6, 8(dst))\
00820 \
00821 "psrlq $40, %%mm5 \n\t" \
00822 "psllq $8, %%mm3 \n\t" \
00823 "por %%mm3, %%mm5 \n\t" \
00824 MOVNTQ(%%mm5, 16(dst))\
00825 \
00826 "add $24, "#dst" \n\t"\
00827 \
00828 "add $8, "#index" \n\t"\
00829 "cmp "#dstw", "#index" \n\t"\
00830 " jb 1b \n\t"
00831
/*
 * WRITEBGR24MMX2(dst, dstw, index)
 *
 * pshufw-based variant of WRITEBGR24MMX with the same interface: packs 8
 * pixels into 24 bytes stored as three quadwords at dst, dst+8 and dst+16.
 * In: mm2, mm4, mm5 = 8 bytes each of B, G, R.
 * For every output quadword the needed words of each channel are replicated
 * with pshufw, the wanted byte positions are selected with the masks ff_M24A,
 * ff_M24B and ff_M24C (defined elsewhere; mm0 caches ff_M24A and mm7 caches
 * ff_M24C) and the three channels are ORed together. The G register is
 * shifted by one byte where the byte phase requires it.
 * mm0, mm1, mm3, mm4, mm6 and mm7 are overwritten. Afterwards dst is advanced
 * by 24, index by 8, and control jumps back to label "1:" while
 * index < dstw (unsigned compare).
 */
00832 #define WRITEBGR24MMX2(dst, dstw, index) \
00833 \
00834 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
00835 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
00836 "pshufw $0x50, %%mm2, %%mm1 \n\t" \
00837 "pshufw $0x50, %%mm4, %%mm3 \n\t" \
00838 "pshufw $0x00, %%mm5, %%mm6 \n\t" \
00839 \
00840 "pand %%mm0, %%mm1 \n\t" \
00841 "pand %%mm0, %%mm3 \n\t" \
00842 "pand %%mm7, %%mm6 \n\t" \
00843 \
00844 "psllq $8, %%mm3 \n\t" \
00845 "por %%mm1, %%mm6 \n\t"\
00846 "por %%mm3, %%mm6 \n\t"\
00847 MOVNTQ(%%mm6, (dst))\
00848 \
00849 "psrlq $8, %%mm4 \n\t" \
00850 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \
00851 "pshufw $0x55, %%mm4, %%mm3 \n\t" \
00852 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \
00853 \
00854 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \
00855 "pand %%mm7, %%mm3 \n\t" \
00856 "pand %%mm0, %%mm6 \n\t" \
00857 \
00858 "por %%mm1, %%mm3 \n\t" \
00859 "por %%mm3, %%mm6 \n\t"\
00860 MOVNTQ(%%mm6, 8(dst))\
00861 \
00862 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \
00863 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \
00864 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \
00865 \
00866 "pand %%mm7, %%mm1 \n\t" \
00867 "pand %%mm0, %%mm3 \n\t" \
00868 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \
00869 \
00870 "por %%mm1, %%mm3 \n\t"\
00871 "por %%mm3, %%mm6 \n\t"\
00872 MOVNTQ(%%mm6, 16(dst))\
00873 \
00874 "add $24, "#dst" \n\t"\
00875 \
00876 "add $8, "#index" \n\t"\
00877 "cmp "#dstw", "#index" \n\t"\
00878 " jb 1b \n\t"
00879
/* WRITEBGR24 selects the 24-bit writer: the pshufw-based variant when
 * COMPILE_TEMPLATE_MMX2 is set, the plain MMX variant otherwise. Any earlier
 * definition is dropped first. */
00880 #if COMPILE_TEMPLATE_MMX2
00881 #undef WRITEBGR24
00882 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
00883 #else
00884 #undef WRITEBGR24
00885 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
00886 #endif
00887
/**
 * Vertically filter and convert to packed 3-byte pixels (memory order
 * B, G, R; 8 pixels / 24 bytes per loop iteration) using the pmaddwd-based
 * "accurate" filter macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. The operand list is written out here instead
 * of using YSCALEYUV2PACKEDX_END because REG_c is clobbered in addition.
 */
00888 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
00889 const int16_t **lumSrc, int lumFilterSize,
00890 const int16_t *chrFilter, const int16_t **chrUSrc,
00891 const int16_t **chrVSrc,
00892 int chrFilterSize, const int16_t **alpSrc,
00893 uint8_t *dest, int dstW, int dstY)
00894 {
00895 x86_reg dummy=0;
00896 x86_reg dstW_reg = dstW;
00897 x86_reg uv_off = c->uv_offx2;
00898
00899 YSCALEYUV2PACKEDX_ACCURATE
00900 YSCALEYUV2RGBX
/* WRITEBGR24MMX expects mm7 == 0 */
00901 "pxor %%mm7, %%mm7 \n\t"
/* REG_c = dest + 3 * index. This sits inside the outer loop (label "1:" is
 * in the filter macro above), so the pointer is recomputed on every
 * iteration and the "add $24" done by WRITEBGR24 is overwritten. */
00902 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t"
00903 "add %4, %%"REG_c" \n\t"
00904 WRITEBGR24(%%REGc, %5, %%REGa)
00905 :: "r" (&c->redDither),
00906 "m" (dummy), "m" (dummy), "m" (dummy),
00907 "r" (dest), "m" (dstW_reg), "m"(uv_off)
00908 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
00909 );
00910 }
00911
/**
 * Vertically filter and convert to packed 3-byte pixels (memory order
 * B, G, R; 8 pixels / 24 bytes per loop iteration) using the pmulhw-based
 * filter macros.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. The operand list is written out here instead
 * of using YSCALEYUV2PACKEDX_END because REG_c is clobbered in addition.
 */
00912 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
00913 const int16_t **lumSrc, int lumFilterSize,
00914 const int16_t *chrFilter, const int16_t **chrUSrc,
00915 const int16_t **chrVSrc,
00916 int chrFilterSize, const int16_t **alpSrc,
00917 uint8_t *dest, int dstW, int dstY)
00918 {
00919 x86_reg dummy=0;
00920 x86_reg dstW_reg = dstW;
00921 x86_reg uv_off = c->uv_offx2;
00922
00923 YSCALEYUV2PACKEDX
00924 YSCALEYUV2RGBX
/* WRITEBGR24MMX expects mm7 == 0 */
00925 "pxor %%mm7, %%mm7 \n\t"
/* REG_c = dest + 3 * index. This sits inside the outer loop (label "1:" is
 * in the filter macro above), so the pointer is recomputed on every
 * iteration and the "add $24" done by WRITEBGR24 is overwritten. */
00926 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t"
00927 "add %4, %%"REG_c" \n\t"
00928 WRITEBGR24(%%REGc, %5, %%REGa)
00929 :: "r" (&c->redDither),
00930 "m" (dummy), "m" (dummy), "m" (dummy),
00931 "r" (dest), "m" (dstW_reg), "m"(uv_off)
00932 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
00933 );
00934 }
00935
/*
 * REAL_WRITEYUY2(dst, dstw, index)
 *
 * Asm fragment interleaving luma and chroma into 2 bytes per pixel and
 * storing 16 bytes with MOVNTQ at dst + index*2.
 * In: mm1 = Y words 0..3, mm7 = Y words 4..7, mm3 = 4 U words, mm4 = 4 V
 * words. Everything is packed to bytes with unsigned saturation; U and V are
 * interleaved (U0 V0 U1 V1 ...) and then interleaved with the luma bytes,
 * giving the memory order Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
 * Afterwards index is advanced by 8 and control jumps back to label "1:"
 * while index < dstw (unsigned compare).
 */
00936 #define REAL_WRITEYUY2(dst, dstw, index) \
00937 "packuswb %%mm3, %%mm3 \n\t"\
00938 "packuswb %%mm4, %%mm4 \n\t"\
00939 "packuswb %%mm7, %%mm1 \n\t"\
00940 "punpcklbw %%mm4, %%mm3 \n\t"\
00941 "movq %%mm1, %%mm7 \n\t"\
00942 "punpcklbw %%mm3, %%mm1 \n\t"\
00943 "punpckhbw %%mm3, %%mm7 \n\t"\
00944 \
00945 MOVNTQ(%%mm1, (dst, index, 2))\
00946 MOVNTQ(%%mm7, 8(dst, index, 2))\
00947 \
00948 "add $8, "#index" \n\t"\
00949 "cmp "#dstw", "#index" \n\t"\
00950 " jb 1b \n\t"
/* Extra expansion level so that macro arguments are expanded before the
 * REAL_ variant stringifies them. */
00951 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
00952
/**
 * Vertically filter and write interleaved Y/U/Y/V bytes (2 bytes per pixel,
 * 8 pixels per loop iteration) using the pmaddwd-based "accurate" filter
 * macros. No colour conversion is done.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 */
00953 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
00954 const int16_t **lumSrc, int lumFilterSize,
00955 const int16_t *chrFilter, const int16_t **chrUSrc,
00956 const int16_t **chrVSrc,
00957 int chrFilterSize, const int16_t **alpSrc,
00958 uint8_t *dest, int dstW, int dstY)
00959 {
00960 x86_reg dummy=0;
00961 x86_reg dstW_reg = dstW;
00962 x86_reg uv_off = c->uv_offx2;
00963
00964 YSCALEYUV2PACKEDX_ACCURATE
00965
/* scale the filtered words (U, V, Y low, Y high) down by 3 bits before
 * WRITEYUY2 packs them to bytes */
00966 "psraw $3, %%mm3 \n\t"
00967 "psraw $3, %%mm4 \n\t"
00968 "psraw $3, %%mm1 \n\t"
00969 "psraw $3, %%mm7 \n\t"
00970 WRITEYUY2(%4, %5, %%REGa)
00971 YSCALEYUV2PACKEDX_END
00972 }
00973
/**
 * Vertically filter and write interleaved Y/U/Y/V bytes (2 bytes per pixel,
 * 8 pixels per loop iteration) using the pmulhw-based filter macros. No
 * colour conversion is done.
 *
 * Only c, dest and dstW are referenced by name in this body; the filter and
 * source arguments are unused here because the asm reads everything through
 * offsets from &c->redDither. dummy, dstW_reg and uv_off are consumed by
 * YSCALEYUV2PACKEDX_END and must keep these names.
 */
00974 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
00975 const int16_t **lumSrc, int lumFilterSize,
00976 const int16_t *chrFilter, const int16_t **chrUSrc,
00977 const int16_t **chrVSrc,
00978 int chrFilterSize, const int16_t **alpSrc,
00979 uint8_t *dest, int dstW, int dstY)
00980 {
00981 x86_reg dummy=0;
00982 x86_reg dstW_reg = dstW;
00983 x86_reg uv_off = c->uv_offx2;
00984
00985 YSCALEYUV2PACKEDX
00986
/* scale the filtered words (U, V, Y low, Y high) down by 3 bits before
 * WRITEYUY2 packs them to bytes */
00987 "psraw $3, %%mm3 \n\t"
00988 "psraw $3, %%mm4 \n\t"
00989 "psraw $3, %%mm1 \n\t"
00990 "psraw $3, %%mm7 \n\t"
00991 WRITEYUY2(%4, %5, %%REGa)
00992 YSCALEYUV2PACKEDX_END
00993 }
00994
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): chroma half of the two-line bilinear
 * YUV->RGB kernel. Opens the loop: zeroes `index` and defines local label
 * "1" that the trailing WRITE* fragment jumps back to.
 *
 * Expects asm operands %2 and %3 to point at the two chroma source lines and
 * `c` to be a register holding the base address used with the *_OFFSET /
 * *_COEFF displacement macros (defined elsewhere).
 *
 * Per iteration it loads four words from each line at `index` (mm2, mm3) and
 * again at index + UV_OFFx2(c) (mm5, mm4), then blends each pair as
 *     (line3 >> 4) + pmulhw(line2 - line3, filter)
 * with the filter word vector read from CHR_MMX_FILTER_OFFSET+8(c).
 * U_OFFSET(c) / V_OFFSET(c) are subtracted, and the results are left as:
 *     mm2, mm5 : offset-corrected first / second chroma component
 *     mm3, mm4 : the same values multiplied (pmulhw) by UG_COEFF / VG_COEFF
 * `index` is restored to its original value before the fragment ends.
 */
00995 #define REAL_YSCALEYUV2RGB_UV(index, c) \
00996 "xor "#index", "#index" \n\t"\
00997 ".p2align 4 \n\t"\
00998 "1: \n\t"\
00999 "movq (%2, "#index"), %%mm2 \n\t" \
01000 "movq (%3, "#index"), %%mm3 \n\t" \
01001 "add "UV_OFFx2"("#c"), "#index" \n\t" \
01002 "movq (%2, "#index"), %%mm5 \n\t" \
01003 "movq (%3, "#index"), %%mm4 \n\t" \
01004 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
01005 "psubw %%mm3, %%mm2 \n\t" \
01006 "psubw %%mm4, %%mm5 \n\t" \
01007 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
01008 "pmulhw %%mm0, %%mm2 \n\t" \
01009 "pmulhw %%mm0, %%mm5 \n\t" \
01010 "psraw $4, %%mm3 \n\t" \
01011 "psraw $4, %%mm4 \n\t" \
01012 "paddw %%mm2, %%mm3 \n\t" \
01013 "paddw %%mm5, %%mm4 \n\t" \
01014 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
01015 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
01016 "movq %%mm3, %%mm2 \n\t" \
01017 "movq %%mm4, %%mm5 \n\t" \
01018 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
01019 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
01020 \
01021
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): blends eight 16-bit samples from
 * two source lines b1 and b2 (addressed as b + 2*index).
 *
 * Result, four words each in mm1 (samples 0-3) and mm7 (samples 4-7):
 *     (b2 >> 4) + pmulhw(b1 - b2, filter)
 * with the filter word vector read from LUM_MMX_FILTER_OFFSET+8(c).
 * Clobbers mm0 and mm6. In this file it is used both for luma and, in the
 * alpha-enabled RGB32 path, for the alpha lines.
 */
01022 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
01023 "movq ("#b1", "#index", 2), %%mm0 \n\t" \
01024 "movq ("#b2", "#index", 2), %%mm1 \n\t" \
01025 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \
01026 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \
01027 "psubw %%mm1, %%mm0 \n\t" \
01028 "psubw %%mm7, %%mm6 \n\t" \
01029 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
01030 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
01031 "psraw $4, %%mm1 \n\t" \
01032 "psraw $4, %%mm7 \n\t" \
01033 "paddw %%mm0, %%mm1 \n\t" \
01034 "paddw %%mm6, %%mm7 \n\t" \
01035
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): colour-matrix stage shared by the two-line
 * RGB kernels.
 *
 * Inputs: mm2 / mm5 = offset-corrected chroma words, mm3 / mm4 = chroma
 * already scaled by UG_COEFF / VG_COEFF, mm1 / mm7 = eight luma words.
 *
 * Steps:
 *   - mm2 *= UB_COEFF(c), mm5 *= VR_COEFF(c) (pmulhw);
 *   - luma: subtract Y_OFFSET(c), multiply by Y_COEFF(c);
 *   - mm4 = mm3 + mm4 (sum of the two UG/VG-scaled chroma terms);
 *   - every chroma word is duplicated (punpck?wd x, x) so one chroma sample
 *     covers two luma samples, then luma is added;
 *   - each channel is saturated to eight unsigned bytes.
 * Outputs: mm2, mm4, mm5 hold eight bytes each (going by the coefficient
 * names: blue, green and red respectively). Clobbers mm0, mm3, mm6.
 */
01036 #define REAL_YSCALEYUV2RGB_COEFF(c) \
01037 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
01038 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
01039 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
01040 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
01041 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
01042 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
01043 \
01044 "paddw %%mm3, %%mm4 \n\t"\
01045 "movq %%mm2, %%mm0 \n\t"\
01046 "movq %%mm5, %%mm6 \n\t"\
01047 "movq %%mm4, %%mm3 \n\t"\
01048 "punpcklwd %%mm2, %%mm2 \n\t"\
01049 "punpcklwd %%mm5, %%mm5 \n\t"\
01050 "punpcklwd %%mm4, %%mm4 \n\t"\
01051 "paddw %%mm1, %%mm2 \n\t"\
01052 "paddw %%mm1, %%mm5 \n\t"\
01053 "paddw %%mm1, %%mm4 \n\t"\
01054 "punpckhwd %%mm0, %%mm0 \n\t"\
01055 "punpckhwd %%mm6, %%mm6 \n\t"\
01056 "punpckhwd %%mm3, %%mm3 \n\t"\
01057 "paddw %%mm7, %%mm0 \n\t"\
01058 "paddw %%mm7, %%mm6 \n\t"\
01059 "paddw %%mm7, %%mm3 \n\t"\
01060 \
01061 "packuswb %%mm0, %%mm2 \n\t"\
01062 "packuswb %%mm6, %%mm5 \n\t"\
01063 "packuswb %%mm3, %%mm4 \n\t"\
01064
/*
 * YSCALEYUV2RGB_YA: expansion wrapper so that arguments are macro-expanded
 * before REAL_YSCALEYUV2RGB_YA stringifies them.
 *
 * YSCALEYUV2RGB(index, c): complete two-line YUV->RGB loop head, i.e. the
 * chroma blend, the luma blend taken from asm operands %0 and %1, and the
 * colour-matrix stage, in that order. It must be followed by a WRITE*
 * fragment that stores the pixels and closes the loop started at label "1".
 */
01065 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
01066
01067 #define YSCALEYUV2RGB(index, c) \
01068 REAL_YSCALEYUV2RGB_UV(index, c) \
01069 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
01070 REAL_YSCALEYUV2RGB_COEFF(c)
01071
/*
 * Two-line (bilinear) vertical scale + YUV->RGB conversion, written out
 * through WRITEBGR32 (macro defined elsewhere in the file).
 *
 * buf[0]/buf[1] and ubuf[0]/ubuf[1] are the two luma and two chroma source
 * lines. vbuf, dstW, yalpha, uvalpha and y are not referenced by name in
 * this body: the second chroma component is read from the ubuf lines at
 * byte offset UV_OFFx2(c), and blend weights / loop bound are read through
 * displacements from &c->redDither (the loop bound is the 8280(%5) operand
 * handed to WRITEBGR32 -- presumably the destination width stored in the
 * context, TODO confirm).
 *
 * Three code paths:
 *   - alpha present, x86-64: r8 is the loop index, dest and the two alpha
 *     lines are ordinary register operands; alpha is blended with the same
 *     YSCALEYUV2RGB_YA fragment as luma, shifted right by 3 and packed into
 *     mm1.
 *   - alpha present, 32-bit: there are not enough registers, so the alpha
 *     line pointers are parked in c->u_temp / c->v_temp and temporarily
 *     swapped into operand registers %0 / %1 (saved and restored with
 *     push/pop around the alpha blend).
 *   - no alpha: mm7 is set to all ones (pcmpeqd) and passed as the alpha
 *     channel.
 * In the latter two paths REG_b is saved to ESP_OFFSET(%5) and used for
 * dest, and REG_BP is pushed and used as the loop index; both are restored
 * before the asm statement ends. None of the asm statements declares a
 * "memory" clobber even though they store to dest (and to the context).
 */
01075 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
01076 const int16_t *ubuf[2], const int16_t *vbuf[2],
01077 const int16_t *abuf[2], uint8_t *dest,
01078 int dstW, int yalpha, int uvalpha, int y)
01079 {
01080 const int16_t *buf0 = buf[0], *buf1 = buf[1],
01081 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01082
01083 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01084 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
01085 #if ARCH_X86_64
01086 __asm__ volatile(
01087 YSCALEYUV2RGB(%%r8, %5)
/* Blend the two alpha lines (%6, %7) into mm1/mm7, then pack to bytes. */
01088 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
01089 "psraw $3, %%mm1 \n\t"
01090 "psraw $3, %%mm7 \n\t"
01091 "packuswb %%mm7, %%mm1 \n\t"
01092 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
01093 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
01094 "a" (&c->redDither),
01095 "r" (abuf0), "r" (abuf1)
01096 : "%r8"
01097 );
01098 #else
/* Park the alpha line pointers in the context; the asm reloads them. */
01099 c->u_temp=(intptr_t)abuf0;
01100 c->v_temp=(intptr_t)abuf1;
01101 __asm__ volatile(
01102 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01103 "mov %4, %%"REG_b" \n\t"
01104 "push %%"REG_BP" \n\t"
01105 YSCALEYUV2RGB(%%REGBP, %5)
/* Borrow %0/%1 for the alpha lines; the luma pointers are restored below. */
01106 "push %0 \n\t"
01107 "push %1 \n\t"
01108 "mov "U_TEMP"(%5), %0 \n\t"
01109 "mov "V_TEMP"(%5), %1 \n\t"
01110 YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
01111 "psraw $3, %%mm1 \n\t"
01112 "psraw $3, %%mm7 \n\t"
01113 "packuswb %%mm7, %%mm1 \n\t"
01114 "pop %1 \n\t"
01115 "pop %0 \n\t"
01116 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
01117 "pop %%"REG_BP" \n\t"
01118 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01119 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01120 "a" (&c->redDither)
01121 );
01122 #endif
01123 } else {
01124 __asm__ volatile(
01125 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01126 "mov %4, %%"REG_b" \n\t"
01127 "push %%"REG_BP" \n\t"
01128 YSCALEYUV2RGB(%%REGBP, %5)
/* No alpha plane: use all-ones as the alpha channel. */
01129 "pcmpeqd %%mm7, %%mm7 \n\t"
01130 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01131 "pop %%"REG_BP" \n\t"
01132 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01133 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01134 "a" (&c->redDither)
01135 );
01136 }
01137 }
01138
/*
 * Two-line (bilinear) vertical scale + YUV->RGB conversion, written out
 * through WRITEBGR24 (macro defined elsewhere in the file).
 *
 * buf[0]/buf[1] and ubuf[0]/ubuf[1] are the two luma and two chroma source
 * lines. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by name
 * in this body; everything else the asm needs is read through displacements
 * from &c->redDither (operand %5).
 *
 * REG_b is saved to ESP_OFFSET(%5) and used for dest, REG_BP is pushed and
 * used as the loop index; both are restored before the asm ends. mm7 is
 * zeroed before WRITEBGR24 (presumably required by that macro).
 */
01139 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
01140 const int16_t *ubuf[2], const int16_t *vbuf[2],
01141 const int16_t *abuf[2], uint8_t *dest,
01142 int dstW, int yalpha, int uvalpha, int y)
01143 {
01144 const int16_t *buf0 = buf[0], *buf1 = buf[1],
01145 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01146
01147
01148 __asm__ volatile(
01149 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01150 "mov %4, %%"REG_b" \n\t"
01151 "push %%"REG_BP" \n\t"
01152 YSCALEYUV2RGB(%%REGBP, %5)
01153 "pxor %%mm7, %%mm7 \n\t"
01154 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01155 "pop %%"REG_BP" \n\t"
01156 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01157 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01158 "a" (&c->redDither)
01159 );
01160 }
01161
/*
 * Two-line (bilinear) vertical scale + YUV->RGB conversion, written out
 * through WRITERGB15 (macro defined elsewhere in the file).
 *
 * buf[0]/buf[1] and ubuf[0]/ubuf[1] are the two luma and two chroma source
 * lines. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by name
 * in this body; everything else the asm needs is read through displacements
 * from &c->redDither (operand %5).
 *
 * When DITHER1XBPP is defined, the per-channel dither values stored at
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER(%5) are added with unsigned
 * saturation to the packed channel bytes before the write-out.
 *
 * REG_b is saved to ESP_OFFSET(%5) and used for dest, REG_BP is pushed and
 * used as the loop index; both are restored before the asm ends.
 */
01162 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
01163 const int16_t *ubuf[2], const int16_t *vbuf[2],
01164 const int16_t *abuf[2], uint8_t *dest,
01165 int dstW, int yalpha, int uvalpha, int y)
01166 {
01167 const int16_t *buf0 = buf[0], *buf1 = buf[1],
01168 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01169
01170
01171 __asm__ volatile(
01172 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01173 "mov %4, %%"REG_b" \n\t"
01174 "push %%"REG_BP" \n\t"
01175 YSCALEYUV2RGB(%%REGBP, %5)
01176 "pxor %%mm7, %%mm7 \n\t"
01177
01178 #ifdef DITHER1XBPP
01179 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
01180 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
01181 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
01182 #endif
01183 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01184 "pop %%"REG_BP" \n\t"
01185 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01186 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01187 "a" (&c->redDither)
01188 );
01189 }
01190
/*
 * Two-line (bilinear) vertical scale + YUV->RGB conversion, written out
 * through WRITERGB16 (macro defined elsewhere in the file).
 *
 * buf[0]/buf[1] and ubuf[0]/ubuf[1] are the two luma and two chroma source
 * lines. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by name
 * in this body; everything else the asm needs is read through displacements
 * from &c->redDither (operand %5).
 *
 * When DITHER1XBPP is defined, the per-channel dither values stored at
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER(%5) are added with unsigned
 * saturation to the packed channel bytes before the write-out.
 *
 * REG_b is saved to ESP_OFFSET(%5) and used for dest, REG_BP is pushed and
 * used as the loop index; both are restored before the asm ends.
 */
01191 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
01192 const int16_t *ubuf[2], const int16_t *vbuf[2],
01193 const int16_t *abuf[2], uint8_t *dest,
01194 int dstW, int yalpha, int uvalpha, int y)
01195 {
01196 const int16_t *buf0 = buf[0], *buf1 = buf[1],
01197 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01198
01199
01200 __asm__ volatile(
01201 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01202 "mov %4, %%"REG_b" \n\t"
01203 "push %%"REG_BP" \n\t"
01204 YSCALEYUV2RGB(%%REGBP, %5)
01205 "pxor %%mm7, %%mm7 \n\t"
01206
01207 #ifdef DITHER1XBPP
01208 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
01209 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
01210 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
01211 #endif
01212 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01213 "pop %%"REG_BP" \n\t"
01214 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01215 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01216 "a" (&c->redDither)
01217 );
01218 }
01219
/*
 * REAL_YSCALEYUV2PACKED(index, c): two-line bilinear blend for packed YUV
 * output (no colour conversion).
 *
 * Before the loop it loads the word vectors at CHR_MMX_FILTER_OFFSET+8(c)
 * and LUM_MMX_FILTER_OFFSET+8(c), shifts them right by 3 and WRITES THEM
 * BACK to the same locations -- i.e. it modifies memory reached through
 * `c`, not just registers.
 *
 * It then zeroes `index` and opens the loop at local label "1". Per
 * iteration:
 *   mm3 / mm4 : four blended chroma words each, read from operands %2 and %3
 *               at `index` and at index + UV_OFFx2(c):
 *               (line3 >> 7) + pmulhw(line2 - line3, chroma filter)
 *   mm1 / mm7 : eight blended luma words from operands %0 and %1:
 *               (line1 >> 7) + pmulhw(line0 - line1, luma filter)
 * Clobbers mm0, mm2, mm5, mm6. Must be followed by a fragment that stores
 * the result and jumps back to "1" (WRITEYUY2 in this file).
 *
 * YSCALEYUV2PACKED is the usual expansion wrapper ahead of stringification.
 */
01220 #define REAL_YSCALEYUV2PACKED(index, c) \
01221 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
01222 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
01223 "psraw $3, %%mm0 \n\t"\
01224 "psraw $3, %%mm1 \n\t"\
01225 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
01226 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
01227 "xor "#index", "#index" \n\t"\
01228 ".p2align 4 \n\t"\
01229 "1: \n\t"\
01230 "movq (%2, "#index"), %%mm2 \n\t" \
01231 "movq (%3, "#index"), %%mm3 \n\t" \
01232 "add "UV_OFFx2"("#c"), "#index" \n\t" \
01233 "movq (%2, "#index"), %%mm5 \n\t" \
01234 "movq (%3, "#index"), %%mm4 \n\t" \
01235 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
01236 "psubw %%mm3, %%mm2 \n\t" \
01237 "psubw %%mm4, %%mm5 \n\t" \
01238 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
01239 "pmulhw %%mm0, %%mm2 \n\t" \
01240 "pmulhw %%mm0, %%mm5 \n\t" \
01241 "psraw $7, %%mm3 \n\t" \
01242 "psraw $7, %%mm4 \n\t" \
01243 "paddw %%mm2, %%mm3 \n\t" \
01244 "paddw %%mm5, %%mm4 \n\t" \
01245 "movq (%0, "#index", 2), %%mm0 \n\t" \
01246 "movq (%1, "#index", 2), %%mm1 \n\t" \
01247 "movq 8(%0, "#index", 2), %%mm6 \n\t" \
01248 "movq 8(%1, "#index", 2), %%mm7 \n\t" \
01249 "psubw %%mm1, %%mm0 \n\t" \
01250 "psubw %%mm7, %%mm6 \n\t" \
01251 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
01252 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
01253 "psraw $7, %%mm1 \n\t" \
01254 "psraw $7, %%mm7 \n\t" \
01255 "paddw %%mm0, %%mm1 \n\t" \
01256 "paddw %%mm6, %%mm7 \n\t" \
01257
01258 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
01259
/*
 * Two-line (bilinear) vertical scale producing packed 4:2:2 output via
 * YSCALEYUV2PACKED + WRITEYUY2.
 *
 * buf[0]/buf[1] and ubuf[0]/ubuf[1] are the two luma and two chroma source
 * lines. vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by name
 * in this body; blend weights and the loop bound (8280(%5)) are read through
 * displacements from &c->redDither (operand %5).
 *
 * Side effect: YSCALEYUV2PACKED rewrites the filter words it reads from the
 * context (shifted right by 3) before the loop starts.
 *
 * REG_b is saved to ESP_OFFSET(%5) and used for dest, REG_BP is pushed and
 * used as the loop index; both are restored before the asm ends.
 */
01260 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
01261 const int16_t *ubuf[2], const int16_t *vbuf[2],
01262 const int16_t *abuf[2], uint8_t *dest,
01263 int dstW, int yalpha, int uvalpha, int y)
01264 {
01265 const int16_t *buf0 = buf[0], *buf1 = buf[1],
01266 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01267
01268
01269 __asm__ volatile(
01270 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01271 "mov %4, %%"REG_b" \n\t"
01272 "push %%"REG_BP" \n\t"
01273 YSCALEYUV2PACKED(%%REGBP, %5)
01274 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01275 "pop %%"REG_BP" \n\t"
01276 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01277 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01278 "a" (&c->redDither)
01279 );
01280 }
01281
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-line YUV->RGB loop head (no vertical
 * blending). Zeroes `index` and opens the loop at local label "1".
 *
 * Per iteration:
 *   - chroma: four words from operand %2 at `index` (mm3) and at
 *     index + UV_OFFx2(c) (mm4), shifted right by 4, minus U_OFFSET(c) /
 *     V_OFFSET(c); operand %3 is not read;
 *   - luma: eight words from operand %0 at 2*index (mm1, mm7), shifted right
 *     by 4, minus Y_OFFSET(c), times Y_COEFF(c);
 *   - the same colour-matrix / duplicate / pack sequence as the two-line
 *     kernel.
 * Outputs: mm2, mm4, mm5 hold eight saturated bytes each (going by the
 * coefficient names: blue, green, red). Clobbers mm0, mm1, mm3, mm6, mm7.
 *
 * YSCALEYUV2RGB1 is the usual expansion wrapper ahead of stringification.
 */
01282 #define REAL_YSCALEYUV2RGB1(index, c) \
01283 "xor "#index", "#index" \n\t"\
01284 ".p2align 4 \n\t"\
01285 "1: \n\t"\
01286 "movq (%2, "#index"), %%mm3 \n\t" \
01287 "add "UV_OFFx2"("#c"), "#index" \n\t" \
01288 "movq (%2, "#index"), %%mm4 \n\t" \
01289 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
01290 "psraw $4, %%mm3 \n\t" \
01291 "psraw $4, %%mm4 \n\t" \
01292 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
01293 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
01294 "movq %%mm3, %%mm2 \n\t" \
01295 "movq %%mm4, %%mm5 \n\t" \
01296 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
01297 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
01298 \
01299 "movq (%0, "#index", 2), %%mm1 \n\t" \
01300 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
01301 "psraw $4, %%mm1 \n\t" \
01302 "psraw $4, %%mm7 \n\t" \
01303 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
01304 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
01305 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
01306 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
01307 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
01308 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
01309 \
01310 "paddw %%mm3, %%mm4 \n\t"\
01311 "movq %%mm2, %%mm0 \n\t"\
01312 "movq %%mm5, %%mm6 \n\t"\
01313 "movq %%mm4, %%mm3 \n\t"\
01314 "punpcklwd %%mm2, %%mm2 \n\t"\
01315 "punpcklwd %%mm5, %%mm5 \n\t"\
01316 "punpcklwd %%mm4, %%mm4 \n\t"\
01317 "paddw %%mm1, %%mm2 \n\t"\
01318 "paddw %%mm1, %%mm5 \n\t"\
01319 "paddw %%mm1, %%mm4 \n\t"\
01320 "punpckhwd %%mm0, %%mm0 \n\t"\
01321 "punpckhwd %%mm6, %%mm6 \n\t"\
01322 "punpckhwd %%mm3, %%mm3 \n\t"\
01323 "paddw %%mm7, %%mm0 \n\t"\
01324 "paddw %%mm7, %%mm6 \n\t"\
01325 "paddw %%mm7, %%mm3 \n\t"\
01326 \
01327 "packuswb %%mm0, %%mm2 \n\t"\
01328 "packuswb %%mm6, %%mm5 \n\t"\
01329 "packuswb %%mm3, %%mm4 \n\t"\
01330
01331 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
01332
01333
/*
 * REAL_YSCALEYUV2RGB1b(index, c): single luma line, chroma averaged over two
 * lines. Zeroes `index` and opens the loop at local label "1".
 *
 * Differs from REAL_YSCALEYUV2RGB1 only in the chroma load: words from
 * operands %2 and %3 are added and the sum is shifted right LOGICALLY by 5
 * (psrlw), i.e. the average of the two lines shifted right by 4. The
 * U_OFFSET / V_OFFSET subtraction, luma handling from operand %0 and the
 * colour-matrix / pack stage are the same.
 * Outputs: mm2, mm4, mm5 hold eight saturated bytes each (going by the
 * coefficient names: blue, green, red). Clobbers mm0, mm1, mm3, mm6, mm7.
 *
 * YSCALEYUV2RGB1b is the usual expansion wrapper ahead of stringification.
 */
01334 #define REAL_YSCALEYUV2RGB1b(index, c) \
01335 "xor "#index", "#index" \n\t"\
01336 ".p2align 4 \n\t"\
01337 "1: \n\t"\
01338 "movq (%2, "#index"), %%mm2 \n\t" \
01339 "movq (%3, "#index"), %%mm3 \n\t" \
01340 "add "UV_OFFx2"("#c"), "#index" \n\t" \
01341 "movq (%2, "#index"), %%mm5 \n\t" \
01342 "movq (%3, "#index"), %%mm4 \n\t" \
01343 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
01344 "paddw %%mm2, %%mm3 \n\t" \
01345 "paddw %%mm5, %%mm4 \n\t" \
01346 "psrlw $5, %%mm3 \n\t" \
01347 "psrlw $5, %%mm4 \n\t" \
01348 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
01349 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
01350 "movq %%mm3, %%mm2 \n\t" \
01351 "movq %%mm4, %%mm5 \n\t" \
01352 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
01353 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
01354 \
01355 "movq (%0, "#index", 2), %%mm1 \n\t" \
01356 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
01357 "psraw $4, %%mm1 \n\t" \
01358 "psraw $4, %%mm7 \n\t" \
01359 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
01360 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
01361 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
01362 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
01363 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
01364 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
01365 \
01366 "paddw %%mm3, %%mm4 \n\t"\
01367 "movq %%mm2, %%mm0 \n\t"\
01368 "movq %%mm5, %%mm6 \n\t"\
01369 "movq %%mm4, %%mm3 \n\t"\
01370 "punpcklwd %%mm2, %%mm2 \n\t"\
01371 "punpcklwd %%mm5, %%mm5 \n\t"\
01372 "punpcklwd %%mm4, %%mm4 \n\t"\
01373 "paddw %%mm1, %%mm2 \n\t"\
01374 "paddw %%mm1, %%mm5 \n\t"\
01375 "paddw %%mm1, %%mm4 \n\t"\
01376 "punpckhwd %%mm0, %%mm0 \n\t"\
01377 "punpckhwd %%mm6, %%mm6 \n\t"\
01378 "punpckhwd %%mm3, %%mm3 \n\t"\
01379 "paddw %%mm7, %%mm0 \n\t"\
01380 "paddw %%mm7, %%mm6 \n\t"\
01381 "paddw %%mm7, %%mm3 \n\t"\
01382 \
01383 "packuswb %%mm0, %%mm2 \n\t"\
01384 "packuswb %%mm6, %%mm5 \n\t"\
01385 "packuswb %%mm3, %%mm4 \n\t"\
01386
01387 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
01388
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index): loads eight 16-bit samples from asm
 * operand %1 at 2*index, shifts them right by 7 and packs them with unsigned
 * saturation into eight bytes in mm7. Clobbers mm1. In this file operand %1
 * is bound to the alpha line wherever this macro is used.
 *
 * YSCALEYUV2RGB1_ALPHA is the usual expansion wrapper ahead of
 * stringification.
 */
01389 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
01390 "movq (%1, "#index", 2), %%mm7 \n\t" \
01391 "movq 8(%1, "#index", 2), %%mm1 \n\t" \
01392 "psraw $7, %%mm7 \n\t" \
01393 "psraw $7, %%mm1 \n\t" \
01394 "packuswb %%mm1, %%mm7 \n\t"
01395 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
01396
/*
 * Single-luma-line YUV->RGB conversion, written out through WRITEBGR32
 * (macro defined elsewhere in the file).
 *
 * uvalpha selects the chroma handling:
 *   uvalpha <  2048 : YSCALEYUV2RGB1  -- chroma taken from ubuf[0] only;
 *   uvalpha >= 2048 : YSCALEYUV2RGB1b -- chroma averaged over ubuf[0] and
 *                     ubuf[1].
 * Within each case, when CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is
 * non-NULL, operand %1 ("d") is bound to abuf0 and YSCALEYUV2RGB1_ALPHA
 * packs it into mm7; otherwise operand %1 is bound to buf1 (an alias of
 * buf0 that the RGB1 fragments never read) and mm7 is set to all ones.
 *
 * The third parameter is spelled `bguf` here (the two-line siblings call it
 * `vbuf`); it is not referenced, nor are dstW and y. The loop bound is the
 * 8280(%5) operand handed to WRITEBGR32 -- presumably the destination width
 * stored in the context, TODO confirm.
 *
 * In every path REG_b is saved to ESP_OFFSET(%5) and used for dest, and
 * REG_BP is pushed and used as the loop index; both are restored before the
 * asm statement ends.
 */
01400 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
01401 const int16_t *ubuf[2], const int16_t *bguf[2],
01402 const int16_t *abuf0, uint8_t *dest,
01403 int dstW, int uvalpha, int y)
01404 {
01405 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01406 const int16_t *buf1= buf0;
01407
01408 if (uvalpha < 2048) {
01409 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01410 __asm__ volatile(
01411 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01412 "mov %4, %%"REG_b" \n\t"
01413 "push %%"REG_BP" \n\t"
01414 YSCALEYUV2RGB1(%%REGBP, %5)
01415 YSCALEYUV2RGB1_ALPHA(%%REGBP)
01416 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01417 "pop %%"REG_BP" \n\t"
01418 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01419 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01420 "a" (&c->redDither)
01421 );
01422 } else {
01423 __asm__ volatile(
01424 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01425 "mov %4, %%"REG_b" \n\t"
01426 "push %%"REG_BP" \n\t"
01427 YSCALEYUV2RGB1(%%REGBP, %5)
01428 "pcmpeqd %%mm7, %%mm7 \n\t"
01429 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01430 "pop %%"REG_BP" \n\t"
01431 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01432 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01433 "a" (&c->redDither)
01434 );
01435 }
01436 } else {
01437 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01438 __asm__ volatile(
01439 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01440 "mov %4, %%"REG_b" \n\t"
01441 "push %%"REG_BP" \n\t"
01442 YSCALEYUV2RGB1b(%%REGBP, %5)
01443 YSCALEYUV2RGB1_ALPHA(%%REGBP)
01444 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01445 "pop %%"REG_BP" \n\t"
01446 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01447 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01448 "a" (&c->redDither)
01449 );
01450 } else {
01451 __asm__ volatile(
01452 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01453 "mov %4, %%"REG_b" \n\t"
01454 "push %%"REG_BP" \n\t"
01455 YSCALEYUV2RGB1b(%%REGBP, %5)
01456 "pcmpeqd %%mm7, %%mm7 \n\t"
01457 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01458 "pop %%"REG_BP" \n\t"
01459 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01460 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01461 "a" (&c->redDither)
01462 );
01463 }
01464 }
01465 }
01466
/*
 * Single-luma-line YUV->RGB conversion, written out through WRITEBGR24
 * (macro defined elsewhere in the file).
 *
 * uvalpha < 2048 uses YSCALEYUV2RGB1 (chroma from ubuf[0] only); otherwise
 * YSCALEYUV2RGB1b averages the chroma of ubuf[0] and ubuf[1]. The two asm
 * statements are otherwise identical.
 *
 * bguf, abuf0, dstW and y are not referenced. buf1 is an alias of buf0 used
 * only to fill operand %1. REG_b is saved to ESP_OFFSET(%5) and used for
 * dest, REG_BP is pushed and used as the loop index; both are restored
 * before the asm ends. mm7 is zeroed before WRITEBGR24 (presumably required
 * by that macro).
 */
01467 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
01468 const int16_t *ubuf[2], const int16_t *bguf[2],
01469 const int16_t *abuf0, uint8_t *dest,
01470 int dstW, int uvalpha, int y)
01471 {
01472 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01473 const int16_t *buf1= buf0;
01474
01475 if (uvalpha < 2048) {
01476 __asm__ volatile(
01477 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01478 "mov %4, %%"REG_b" \n\t"
01479 "push %%"REG_BP" \n\t"
01480 YSCALEYUV2RGB1(%%REGBP, %5)
01481 "pxor %%mm7, %%mm7 \n\t"
01482 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01483 "pop %%"REG_BP" \n\t"
01484 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01485 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01486 "a" (&c->redDither)
01487 );
01488 } else {
01489 __asm__ volatile(
01490 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01491 "mov %4, %%"REG_b" \n\t"
01492 "push %%"REG_BP" \n\t"
01493 YSCALEYUV2RGB1b(%%REGBP, %5)
01494 "pxor %%mm7, %%mm7 \n\t"
01495 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01496 "pop %%"REG_BP" \n\t"
01497 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01498 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01499 "a" (&c->redDither)
01500 );
01501 }
01502 }
01503
/*
 * Single-luma-line YUV->RGB conversion, written out through WRITERGB15
 * (macro defined elsewhere in the file).
 *
 * uvalpha < 2048 uses YSCALEYUV2RGB1 (chroma from ubuf[0] only); otherwise
 * YSCALEYUV2RGB1b averages the chroma of ubuf[0] and ubuf[1]. The two asm
 * statements are otherwise identical.
 *
 * When DITHER1XBPP is defined, the values at BLUE_DITHER / GREEN_DITHER /
 * RED_DITHER(%5) are added with unsigned saturation to the packed channel
 * bytes before the write-out.
 *
 * bguf, abuf0, dstW and y are not referenced. buf1 is an alias of buf0 used
 * only to fill operand %1. REG_b is saved to ESP_OFFSET(%5) and used for
 * dest, REG_BP is pushed and used as the loop index; both are restored
 * before the asm ends.
 */
01504 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
01505 const int16_t *ubuf[2], const int16_t *bguf[2],
01506 const int16_t *abuf0, uint8_t *dest,
01507 int dstW, int uvalpha, int y)
01508 {
01509 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01510 const int16_t *buf1= buf0;
01511
01512 if (uvalpha < 2048) {
01513 __asm__ volatile(
01514 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01515 "mov %4, %%"REG_b" \n\t"
01516 "push %%"REG_BP" \n\t"
01517 YSCALEYUV2RGB1(%%REGBP, %5)
01518 "pxor %%mm7, %%mm7 \n\t"
01519
01520 #ifdef DITHER1XBPP
01521 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
01522 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
01523 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
01524 #endif
01525 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01526 "pop %%"REG_BP" \n\t"
01527 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01528 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01529 "a" (&c->redDither)
01530 );
01531 } else {
01532 __asm__ volatile(
01533 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01534 "mov %4, %%"REG_b" \n\t"
01535 "push %%"REG_BP" \n\t"
01536 YSCALEYUV2RGB1b(%%REGBP, %5)
01537 "pxor %%mm7, %%mm7 \n\t"
01538
01539 #ifdef DITHER1XBPP
01540 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
01541 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
01542 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
01543 #endif
01544 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01545 "pop %%"REG_BP" \n\t"
01546 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01547 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01548 "a" (&c->redDither)
01549 );
01550 }
01551 }
01552
/*
 * Single-luma-line YUV->RGB conversion, written out through WRITERGB16
 * (macro defined elsewhere in the file).
 *
 * uvalpha < 2048 uses YSCALEYUV2RGB1 (chroma from ubuf[0] only); otherwise
 * YSCALEYUV2RGB1b averages the chroma of ubuf[0] and ubuf[1]. The two asm
 * statements are otherwise identical.
 *
 * When DITHER1XBPP is defined, the values at BLUE_DITHER / GREEN_DITHER /
 * RED_DITHER(%5) are added with unsigned saturation to the packed channel
 * bytes before the write-out.
 *
 * bguf, abuf0, dstW and y are not referenced. buf1 is an alias of buf0 used
 * only to fill operand %1. REG_b is saved to ESP_OFFSET(%5) and used for
 * dest, REG_BP is pushed and used as the loop index; both are restored
 * before the asm ends.
 */
01553 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
01554 const int16_t *ubuf[2], const int16_t *bguf[2],
01555 const int16_t *abuf0, uint8_t *dest,
01556 int dstW, int uvalpha, int y)
01557 {
01558 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01559 const int16_t *buf1= buf0;
01560
01561 if (uvalpha < 2048) {
01562 __asm__ volatile(
01563 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01564 "mov %4, %%"REG_b" \n\t"
01565 "push %%"REG_BP" \n\t"
01566 YSCALEYUV2RGB1(%%REGBP, %5)
01567 "pxor %%mm7, %%mm7 \n\t"
01568
01569 #ifdef DITHER1XBPP
01570 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
01571 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
01572 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
01573 #endif
01574 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01575 "pop %%"REG_BP" \n\t"
01576 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01577 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01578 "a" (&c->redDither)
01579 );
01580 } else {
01581 __asm__ volatile(
01582 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01583 "mov %4, %%"REG_b" \n\t"
01584 "push %%"REG_BP" \n\t"
01585 YSCALEYUV2RGB1b(%%REGBP, %5)
01586 "pxor %%mm7, %%mm7 \n\t"
01587
01588 #ifdef DITHER1XBPP
01589 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
01590 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
01591 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
01592 #endif
01593 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01594 "pop %%"REG_BP" \n\t"
01595 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01596 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01597 "a" (&c->redDither)
01598 );
01599 }
01600 }
01601
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line loop head for packed YUV
 * output (no blending, no colour conversion). Zeroes `index` and opens the
 * loop at local label "1".
 *
 * Per iteration:
 *   mm3 : four words from operand %2 at `index`, >> 7 (arithmetic)
 *   mm4 : four words from operand %2 at index + UV_OFFx2(c), >> 7
 *   mm1, mm7 : eight words from operand %0 at 2*index, >> 7
 * Operands %1 and %3 are not read. Must be followed by a fragment that
 * stores the result and jumps back to "1" (WRITEYUY2 in this file).
 *
 * YSCALEYUV2PACKED1 is the usual expansion wrapper ahead of stringification.
 */
01602 #define REAL_YSCALEYUV2PACKED1(index, c) \
01603 "xor "#index", "#index" \n\t"\
01604 ".p2align 4 \n\t"\
01605 "1: \n\t"\
01606 "movq (%2, "#index"), %%mm3 \n\t" \
01607 "add "UV_OFFx2"("#c"), "#index" \n\t" \
01608 "movq (%2, "#index"), %%mm4 \n\t" \
01609 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
01610 "psraw $7, %%mm3 \n\t" \
01611 "psraw $7, %%mm4 \n\t" \
01612 "movq (%0, "#index", 2), %%mm1 \n\t" \
01613 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
01614 "psraw $7, %%mm1 \n\t" \
01615 "psraw $7, %%mm7 \n\t" \
01616
01617 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
01618
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like REAL_YSCALEYUV2PACKED1 but with
 * the chroma averaged over two lines. Zeroes `index` and opens the loop at
 * local label "1".
 *
 * Per iteration:
 *   mm3 : (operand %2 + operand %3) at `index`, >> 8 (logical)
 *   mm4 : the same at index + UV_OFFx2(c)
 *   mm1, mm7 : eight words from operand %0 at 2*index, >> 7 (arithmetic)
 * Clobbers mm2 and mm5. Must be followed by a fragment that stores the
 * result and jumps back to "1" (WRITEYUY2 in this file).
 *
 * YSCALEYUV2PACKED1b is the usual expansion wrapper ahead of
 * stringification.
 */
01619 #define REAL_YSCALEYUV2PACKED1b(index, c) \
01620 "xor "#index", "#index" \n\t"\
01621 ".p2align 4 \n\t"\
01622 "1: \n\t"\
01623 "movq (%2, "#index"), %%mm2 \n\t" \
01624 "movq (%3, "#index"), %%mm3 \n\t" \
01625 "add "UV_OFFx2"("#c"), "#index" \n\t" \
01626 "movq (%2, "#index"), %%mm5 \n\t" \
01627 "movq (%3, "#index"), %%mm4 \n\t" \
01628 "sub "UV_OFFx2"("#c"), "#index" \n\t" \
01629 "paddw %%mm2, %%mm3 \n\t" \
01630 "paddw %%mm5, %%mm4 \n\t" \
01631 "psrlw $8, %%mm3 \n\t" \
01632 "psrlw $8, %%mm4 \n\t" \
01633 "movq (%0, "#index", 2), %%mm1 \n\t" \
01634 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
01635 "psraw $7, %%mm1 \n\t" \
01636 "psraw $7, %%mm7 \n\t"
01637 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
01638
/*
 * Single-luma-line packed 4:2:2 output via WRITEYUY2.
 *
 * uvalpha < 2048 uses YSCALEYUV2PACKED1 (chroma from ubuf[0] only);
 * otherwise YSCALEYUV2PACKED1b averages the chroma of ubuf[0] and ubuf[1].
 * The two asm statements are otherwise identical.
 *
 * bguf, abuf0, dstW and y are not referenced. buf1 is an alias of buf0 used
 * only to fill operand %1. The loop bound is the 8280(%5) operand handed to
 * WRITEYUY2 -- presumably the destination width stored in the context, TODO
 * confirm. REG_b is saved to ESP_OFFSET(%5) and used for dest, REG_BP is
 * pushed and used as the loop index; both are restored before the asm ends.
 */
01639 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
01640 const int16_t *ubuf[2], const int16_t *bguf[2],
01641 const int16_t *abuf0, uint8_t *dest,
01642 int dstW, int uvalpha, int y)
01643 {
01644 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01645 const int16_t *buf1= buf0;
01646
01647 if (uvalpha < 2048) {
01648 __asm__ volatile(
01649 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01650 "mov %4, %%"REG_b" \n\t"
01651 "push %%"REG_BP" \n\t"
01652 YSCALEYUV2PACKED1(%%REGBP, %5)
01653 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01654 "pop %%"REG_BP" \n\t"
01655 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01656 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01657 "a" (&c->redDither)
01658 );
01659 } else {
01660 __asm__ volatile(
01661 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
01662 "mov %4, %%"REG_b" \n\t"
01663 "push %%"REG_BP" \n\t"
01664 YSCALEYUV2PACKED1b(%%REGBP, %5)
01665 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01666 "pop %%"REG_BP" \n\t"
01667 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
01668 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01669 "a" (&c->redDither)
01670 );
01671 }
01672 }
01673
01674 #if !COMPILE_TEMPLATE_MMX2
01675
01676
/*
 * Extracts the even-numbered bytes of a packed source line into dst (for
 * YUY2 input these are the luma samples).
 *
 * The loop index starts at -width and counts up in steps of 8 until it is
 * no longer negative; src and dst are pre-advanced by 2*width and width so
 * that index 0 is the end of each line. Each iteration reads 16 source bytes,
 * masks every 16-bit word with bm01010101 (defined elsewhere; presumably
 * 0x00FF in each word) and packs the result to 8 output bytes.
 *
 * The loop is bottom-tested, so it always runs at least once and works in
 * whole groups of 8 output bytes: when width is not a multiple of 8 (or is
 * 0) it reads and writes past `width` samples.
 */
01677 static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
01678 int width, uint32_t *unused)
01679 {
01680 __asm__ volatile(
01681 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
01682 "mov %0, %%"REG_a" \n\t"
01683 "1: \n\t"
01684 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
01685 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
01686 "pand %%mm2, %%mm0 \n\t"
01687 "pand %%mm2, %%mm1 \n\t"
01688 "packuswb %%mm1, %%mm0 \n\t"
01689 "movq %%mm0, (%2, %%"REG_a") \n\t"
01690 "add $8, %%"REG_a" \n\t"
01691 " js 1b \n\t"
01692 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
01693 : "%"REG_a
01694 );
01695 }
01696
/*
 * Splits the odd-numbered bytes of a packed source line into two planes:
 * of every four source bytes, byte 1 goes to dstU and byte 3 goes to dstV
 * (the U and V samples of YUY2 input).
 *
 * The loop index starts at -width and counts up in steps of 4 until it is
 * no longer negative; src1, dstU and dstV are pre-advanced by 4*width, width
 * and width. Each iteration reads 16 source bytes and stores 4 bytes to each
 * destination. bm01010101 is defined elsewhere (presumably 0x00FF in each
 * word). The loop is bottom-tested, so it always runs at least once and
 * works in whole groups of 4 output samples.
 *
 * src2 is not read; it is only checked against src1 by the assert.
 * NOTE(review): that assert runs after the conversion, so a violated
 * precondition is only reported once the work has been done.
 */
01697 static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
01698 const uint8_t *src1, const uint8_t *src2,
01699 int width, uint32_t *unused)
01700 {
01701 __asm__ volatile(
01702 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
01703 "mov %0, %%"REG_a" \n\t"
01704 "1: \n\t"
01705 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
01706 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
/* Keep the high byte of each word, then pack to 8 bytes. */
01707 "psrlw $8, %%mm0 \n\t"
01708 "psrlw $8, %%mm1 \n\t"
01709 "packuswb %%mm1, %%mm0 \n\t"
/* Split again: high bytes go to operand %3 (dstV), low bytes to %2 (dstU). */
01710 "movq %%mm0, %%mm1 \n\t"
01711 "psrlw $8, %%mm0 \n\t"
01712 "pand %%mm4, %%mm1 \n\t"
01713 "packuswb %%mm0, %%mm0 \n\t"
01714 "packuswb %%mm1, %%mm1 \n\t"
01715 "movd %%mm0, (%3, %%"REG_a") \n\t"
01716 "movd %%mm1, (%2, %%"REG_a") \n\t"
01717 "add $4, %%"REG_a" \n\t"
01718 " js 1b \n\t"
01719 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
01720 : "%"REG_a
01721 );
01722 assert(src1 == src2);
01723 }
01724
01725
01726
/*
 * Extracts the odd-numbered bytes of a packed source line into dst (for
 * UYVY input these are the luma samples).
 *
 * The loop index starts at -width and counts up in steps of 8 until it is
 * no longer negative; src and dst are pre-advanced by 2*width and width.
 * Each iteration reads 16 source bytes, shifts every 16-bit word right by 8
 * to keep its high byte, and packs the result to 8 output bytes.
 *
 * The loop is bottom-tested, so it always runs at least once and works in
 * whole groups of 8 output bytes: when width is not a multiple of 8 (or is
 * 0) it reads and writes past `width` samples.
 */
01727 static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
01728 int width, uint32_t *unused)
01729 {
01730 __asm__ volatile(
01731 "mov %0, %%"REG_a" \n\t"
01732 "1: \n\t"
01733 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
01734 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
01735 "psrlw $8, %%mm0 \n\t"
01736 "psrlw $8, %%mm1 \n\t"
01737 "packuswb %%mm1, %%mm0 \n\t"
01738 "movq %%mm0, (%2, %%"REG_a") \n\t"
01739 "add $8, %%"REG_a" \n\t"
01740 " js 1b \n\t"
01741 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
01742 : "%"REG_a
01743 );
01744 }
01745
/*
 * Splits the even-numbered bytes of a packed source line into two planes:
 * of every four source bytes, byte 0 goes to dstU and byte 2 goes to dstV
 * (the U and V samples of UYVY input).
 *
 * The loop index starts at -width and counts up in steps of 4 until it is
 * no longer negative; src1, dstU and dstV are pre-advanced by 4*width, width
 * and width. Each iteration reads 16 source bytes and stores 4 bytes to each
 * destination. bm01010101 is defined elsewhere (presumably 0x00FF in each
 * word). The loop is bottom-tested, so it always runs at least once and
 * works in whole groups of 4 output samples.
 *
 * src2 is not read; it is only checked against src1 by the assert.
 * NOTE(review): that assert runs after the conversion, so a violated
 * precondition is only reported once the work has been done.
 */
01746 static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
01747 const uint8_t *src1, const uint8_t *src2,
01748 int width, uint32_t *unused)
01749 {
01750 __asm__ volatile(
01751 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
01752 "mov %0, %%"REG_a" \n\t"
01753 "1: \n\t"
01754 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
01755 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
/* Keep the low byte of each word, then pack to 8 bytes. */
01756 "pand %%mm4, %%mm0 \n\t"
01757 "pand %%mm4, %%mm1 \n\t"
01758 "packuswb %%mm1, %%mm0 \n\t"
/* Split again: high bytes go to operand %3 (dstV), low bytes to %2 (dstU). */
01759 "movq %%mm0, %%mm1 \n\t"
01760 "psrlw $8, %%mm0 \n\t"
01761 "pand %%mm4, %%mm1 \n\t"
01762 "packuswb %%mm0, %%mm0 \n\t"
01763 "packuswb %%mm1, %%mm1 \n\t"
01764 "movd %%mm0, (%3, %%"REG_a") \n\t"
01765 "movd %%mm1, (%2, %%"REG_a") \n\t"
01766 "add $4, %%"REG_a" \n\t"
01767 " js 1b \n\t"
01768 : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
01769 : "%"REG_a
01770 );
01771 assert(src1 == src2);
01772 }
01773
/*
 * De-interleaves a byte-interleaved source line: even-numbered source bytes
 * go to dst1, odd-numbered source bytes go to dst2.
 *
 * The loop index starts at -width and counts up in steps of 8 until it is
 * no longer negative; src, dst1 and dst2 are pre-advanced by 2*width, width
 * and width. Each iteration reads 16 source bytes and stores 8 bytes to each
 * destination. bm01010101 is defined elsewhere (presumably 0x00FF in each
 * word).
 *
 * The loop is bottom-tested, so it always runs at least once and works in
 * whole groups of 8 output bytes: when width is not a multiple of 8 (or is
 * 0) it reads and writes past `width` samples.
 */
01774 static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
01775 const uint8_t *src, int width)
01776 {
01777 __asm__ volatile(
01778 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
01779 "mov %0, %%"REG_a" \n\t"
01780 "1: \n\t"
01781 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
01782 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
01783 "movq %%mm0, %%mm2 \n\t"
01784 "movq %%mm1, %%mm3 \n\t"
/* mm0/mm1: low byte of each word; mm2/mm3: high byte of each word. */
01785 "pand %%mm4, %%mm0 \n\t"
01786 "pand %%mm4, %%mm1 \n\t"
01787 "psrlw $8, %%mm2 \n\t"
01788 "psrlw $8, %%mm3 \n\t"
01789 "packuswb %%mm1, %%mm0 \n\t"
01790 "packuswb %%mm3, %%mm2 \n\t"
01791 "movq %%mm0, (%2, %%"REG_a") \n\t"
01792 "movq %%mm2, (%3, %%"REG_a") \n\t"
01793 "add $8, %%"REG_a" \n\t"
01794 " js 1b \n\t"
01795 : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
01796 : "%"REG_a
01797 );
01798 }
01799
/*
 * De-interleaves a chroma line through nvXXtoUV with even-numbered source
 * bytes going to dstU and odd-numbered ones to dstV. src2 and unused are
 * ignored.
 */
01800 static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
01801 const uint8_t *src1, const uint8_t *src2,
01802 int width, uint32_t *unused)
01803 {
01804 RENAME(nvXXtoUV)(dstU, dstV, src1, width);
01805 }
01806
/*
 * De-interleaves a chroma line through nvXXtoUV with the destinations
 * swapped relative to nv12ToUV: even-numbered source bytes go to dstV and
 * odd-numbered ones to dstU. src2 and unused are ignored.
 */
01807 static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
01808 const uint8_t *src1, const uint8_t *src2,
01809 int width, uint32_t *unused)
01810 {
01811 RENAME(nvXXtoUV)(dstV, dstU, src1, width);
01812 }
01813 #endif
01814
/*
 * Converts a line of packed 24-bit pixels to 16-bit luma samples.
 *
 * srcFormat selects the coefficient pair: ff_bgr24toY1Coeff /
 * ff_bgr24toY2Coeff for PIX_FMT_BGR24, ff_rgb24toY1Coeff /
 * ff_rgb24toY2Coeff for anything else. The coefficients are loaded into
 * mm5 / mm6 by a separate asm statement, so the main loop relies on the
 * compiler leaving the MMX registers untouched between the two statements.
 *
 * Main loop: each iteration consumes 12 source bytes (four pixels) using
 * 4-byte loads at offsets 0, 2, 6 and 8, widens the bytes to words,
 * multiply-accumulates them against mm5 / mm6, adds ff_bgr24toYOffset,
 * shifts right by 9 and stores four int16 results. REG_a is a byte offset
 * into dst that starts at -2*width (dst is pre-advanced by `width` int16
 * elements) and advances by 8 until it is no longer negative. The loop is
 * bottom-tested, so it works in whole groups of four pixels and always runs
 * at least once.
 */
01815 static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
01816 int width, enum PixelFormat srcFormat)
01817 {
01818
01819 if(srcFormat == PIX_FMT_BGR24) {
01820 __asm__ volatile(
01821 "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
01822 "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
01823 :
01824 );
01825 } else {
01826 __asm__ volatile(
01827 "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
01828 "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
01829 :
01830 );
01831 }
01832
01833 __asm__ volatile(
01834 "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
01835 "mov %2, %%"REG_a" \n\t"
01836 "pxor %%mm7, %%mm7 \n\t"
01837 "1: \n\t"
01838 PREFETCH" 64(%0) \n\t"
01839 "movd (%0), %%mm0 \n\t"
01840 "movd 2(%0), %%mm1 \n\t"
01841 "movd 6(%0), %%mm2 \n\t"
01842 "movd 8(%0), %%mm3 \n\t"
01843 "add $12, %0 \n\t"
01844 "punpcklbw %%mm7, %%mm0 \n\t"
01845 "punpcklbw %%mm7, %%mm1 \n\t"
01846 "punpcklbw %%mm7, %%mm2 \n\t"
01847 "punpcklbw %%mm7, %%mm3 \n\t"
01848 "pmaddwd %%mm5, %%mm0 \n\t"
01849 "pmaddwd %%mm6, %%mm1 \n\t"
01850 "pmaddwd %%mm5, %%mm2 \n\t"
01851 "pmaddwd %%mm6, %%mm3 \n\t"
01852 "paddd %%mm1, %%mm0 \n\t"
01853 "paddd %%mm3, %%mm2 \n\t"
01854 "paddd %%mm4, %%mm0 \n\t"
01855 "paddd %%mm4, %%mm2 \n\t"
01856 "psrad $9, %%mm0 \n\t"
01857 "psrad $9, %%mm2 \n\t"
01858 "packssdw %%mm2, %%mm0 \n\t"
01859 "movq %%mm0, (%1, %%"REG_a") \n\t"
01860 "add $8, %%"REG_a" \n\t"
01861 " js 1b \n\t"
01862 : "+r" (src)
01863 : "r" (dst+width), "g" ((x86_reg)-2*width)
01864 : "%"REG_a
01865 );
01866 }
01867
/*
 * Luma conversion for a line of packed 24-bit pixels: forwards to
 * bgr24ToY_mmx with PIX_FMT_BGR24, which selects the ff_bgr24toY*Coeff
 * tables. `unused` is ignored.
 */
01868 static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src,
01869 int width, uint32_t *unused)
01870 {
01871 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
01872 }
01873
/*
 * Luma conversion for a line of packed 24-bit pixels: forwards to
 * bgr24ToY_mmx with PIX_FMT_RGB24, which selects the ff_rgb24toY*Coeff
 * tables. `unused` is ignored.
 */
01874 static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src,
01875 int width, uint32_t *unused)
01876 {
01877 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
01878 }
01879
/*
 * Converts a line of packed 24-bit pixels to two lines of 16-bit chroma
 * samples (dstU and dstV), without horizontal subsampling.
 *
 * Operand %4 points at the coefficient table
 * ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]; four 8-byte vectors are read
 * from it at offsets 0, 8, 16 and 24. Offsets 0 and 8 feed the dstU result,
 * offsets 16 and 24 (the latter cached in mm6) feed the dstV result.
 *
 * Each iteration consumes 12 source bytes (four pixels) in two halves,
 * using 4-byte loads at offsets 0 / 2 and then 6 / 8. The widened bytes are
 * multiply-accumulated against the table, ff_bgr24toUVOffset is added, the
 * sums are shifted right by 9 and four int16 results are stored to each
 * destination. REG_a is a byte offset that starts at -2*width (both
 * destinations are pre-advanced by `width` int16 elements) and advances by
 * 8 until it is no longer negative. The loop is bottom-tested, so it works
 * in whole groups of four pixels and always runs at least once.
 */
01880 static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
01881 const uint8_t *src, int width,
01882 enum PixelFormat srcFormat)
01883 {
01884 __asm__ volatile(
01885 "movq 24(%4), %%mm6 \n\t"
01886 "mov %3, %%"REG_a" \n\t"
01887 "pxor %%mm7, %%mm7 \n\t"
01888 "1: \n\t"
01889 PREFETCH" 64(%0) \n\t"
/* First two pixels: mm0 accumulates the dstU sums, mm2 the dstV sums. */
01890 "movd (%0), %%mm0 \n\t"
01891 "movd 2(%0), %%mm1 \n\t"
01892 "punpcklbw %%mm7, %%mm0 \n\t"
01893 "punpcklbw %%mm7, %%mm1 \n\t"
01894 "movq %%mm0, %%mm2 \n\t"
01895 "movq %%mm1, %%mm3 \n\t"
01896 "pmaddwd (%4), %%mm0 \n\t"
01897 "pmaddwd 8(%4), %%mm1 \n\t"
01898 "pmaddwd 16(%4), %%mm2 \n\t"
01899 "pmaddwd %%mm6, %%mm3 \n\t"
01900 "paddd %%mm1, %%mm0 \n\t"
01901 "paddd %%mm3, %%mm2 \n\t"
01902
/* Next two pixels: mm1 accumulates the dstU sums, mm4 the dstV sums. */
01903 "movd 6(%0), %%mm1 \n\t"
01904 "movd 8(%0), %%mm3 \n\t"
01905 "add $12, %0 \n\t"
01906 "punpcklbw %%mm7, %%mm1 \n\t"
01907 "punpcklbw %%mm7, %%mm3 \n\t"
01908 "movq %%mm1, %%mm4 \n\t"
01909 "movq %%mm3, %%mm5 \n\t"
01910 "pmaddwd (%4), %%mm1 \n\t"
01911 "pmaddwd 8(%4), %%mm3 \n\t"
01912 "pmaddwd 16(%4), %%mm4 \n\t"
01913 "pmaddwd %%mm6, %%mm5 \n\t"
01914 "paddd %%mm3, %%mm1 \n\t"
01915 "paddd %%mm5, %%mm4 \n\t"
01916
01917 "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
01918 "paddd %%mm3, %%mm0 \n\t"
01919 "paddd %%mm3, %%mm2 \n\t"
01920 "paddd %%mm3, %%mm1 \n\t"
01921 "paddd %%mm3, %%mm4 \n\t"
01922 "psrad $9, %%mm0 \n\t"
01923 "psrad $9, %%mm2 \n\t"
01924 "psrad $9, %%mm1 \n\t"
01925 "psrad $9, %%mm4 \n\t"
01926 "packssdw %%mm1, %%mm0 \n\t"
01927 "packssdw %%mm4, %%mm2 \n\t"
01928 "movq %%mm0, (%1, %%"REG_a") \n\t"
01929 "movq %%mm2, (%2, %%"REG_a") \n\t"
01930 "add $8, %%"REG_a" \n\t"
01931 " js 1b \n\t"
01932 : "+r" (src)
01933 : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
01934 : "%"REG_a
01935 );
01936 }
01937
/*
 * Chroma conversion for a line of packed 24-bit pixels: forwards to
 * bgr24ToUV_mmx with PIX_FMT_BGR24, which selects ff_bgr24toUV[0].
 *
 * Only src1 is read; callers must pass the same pointer as src2. That
 * precondition is asserted on entry, before any conversion work is done,
 * matching the order used by rgb24ToUV. `unused` is ignored.
 */
01938 static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
01939 const uint8_t *src1, const uint8_t *src2,
01940 int width, uint32_t *unused)
01941 {
01942 assert(src1 == src2);
01943 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
01944 }
01945
/*
 * Chroma conversion for a line of packed 24-bit pixels: forwards to
 * bgr24ToUV_mmx with PIX_FMT_RGB24, which selects ff_bgr24toUV[1].
 *
 * Only src1 is read; callers must pass the same pointer as src2, which is
 * asserted on entry. `unused` is ignored.
 */
01946 static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
01947 const uint8_t *src1, const uint8_t *src2,
01948 int width, uint32_t *unused)
01949 {
01950 assert(src1==src2);
01951 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
01952 }
01953
01954 #if !COMPILE_TEMPLATE_MMX2
01955
/**
 * Horizontally scale one line of 8-bit samples into 16-bit values (MMX).
 *
 * For each output sample, filterSize consecutive source bytes starting at
 * the index read from filterPos are multiplied with the corresponding run
 * of filterSize coefficients from filter; the products are summed, the sum
 * is shifted right by 7 and stored to dst with signed 16-bit saturation
 * (packssdw).
 *
 * There are unrolled variants for filterSize 4 and 8 and a generic loop for
 * every other size. All three produce two output samples per iteration.
 *
 * @param c          scaler context (not referenced in this function)
 * @param dst        destination line, one int16_t per output sample
 * @param dstW       number of output samples
 * @param src        source line, one byte per sample
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first source index of each output sample; fetched with a
 *                   zero-extending 16-bit load (movzwl)
 * @param filterSize taps per output sample, asserted to be a positive
 *                   multiple of 4
 *
 * NOTE(review): the asm stores through dst, but dst is only passed as an
 * input operand and there is no "memory" clobber - confirm that this is safe
 * for all call sites.
 * NOTE(review): because two samples are produced per iteration, an odd dstW
 * appears to make the last iteration access one element past dstW in
 * filterPos, filter and dst - presumably the buffers are padded; verify.
 * NOTE(review): the unrolled variants push/pop inside the asm; confirm this
 * cannot corrupt data the compiler keeps below the stack pointer.
 */
01956 static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
01957 const uint8_t *src, const int16_t *filter,
01958 const int16_t *filterPos, int filterSize)
01959 {
01960 assert(filterSize % 4 == 0 && filterSize>0);
01961 if (filterSize==4) {
/* counter is a negative byte offset into the int16_t arrays. The pointers
 * are advanced to the end of their data (filter by 4 coefficients per
 * sample, filterPos and dst by dstW elements) so that pointer+counter walks
 * forward while counter runs up towards zero. */
01962 x86_reg counter= -2*dstW;
01963 filter-= counter*2;
01964 filterPos-= counter/2;
01965 dst-= counter/2;
/* Operands: %0/REG_a = counter, %1 = filter, %2 = filterPos, %3 = src,
 * %4 = dst. */
01966 __asm__ volatile(
01967 #if defined(PIC)
/* Under PIC, REG_b is saved and restored by hand instead of being listed
 * as a clobber. */
01968 "push %%"REG_b" \n\t"
01969 #endif
/* mm7 = 0, used to widen bytes to words */
01970 "pxor %%mm7, %%mm7 \n\t"
/* The counter is moved into REG_BP; REG_a and REG_b then receive the two
 * source positions of each iteration. */
01971 "push %%"REG_BP" \n\t"
01972 "mov %%"REG_a", %%"REG_BP" \n\t"
01973 ".p2align 4 \n\t"
01974 "1: \n\t"
/* source positions of the two output samples handled in this iteration */
01975 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
01976 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* four coefficients for each of the two samples */
01977 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
01978 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
/* four source bytes for each sample, widened to 16 bit */
01979 "movd (%3, %%"REG_a"), %%mm0 \n\t"
01980 "movd (%3, %%"REG_b"), %%mm2 \n\t"
01981 "punpcklbw %%mm7, %%mm0 \n\t"
01982 "punpcklbw %%mm7, %%mm2 \n\t"
/* multiply-accumulate, leaving two partial sums per sample */
01983 "pmaddwd %%mm1, %%mm0 \n\t"
01984 "pmaddwd %%mm2, %%mm3 \n\t"
/* add the partial sums so that mm0 holds one 32-bit total per sample */
01985 "movq %%mm0, %%mm4 \n\t"
01986 "punpckldq %%mm3, %%mm0 \n\t"
01987 "punpckhdq %%mm3, %%mm4 \n\t"
01988 "paddd %%mm4, %%mm0 \n\t"
/* scale down, saturate to int16_t and store both samples */
01989 "psrad $7, %%mm0 \n\t"
01990 "packssdw %%mm0, %%mm0 \n\t"
01991 "movd %%mm0, (%4, %%"REG_BP") \n\t"
/* advance by two int16_t; the carry is set once the offset wraps past zero */
01992 "add $4, %%"REG_BP" \n\t"
01993 " jnc 1b \n\t"
01994
01995 "pop %%"REG_BP" \n\t"
01996 #if defined(PIC)
01997 "pop %%"REG_b" \n\t"
01998 #endif
01999 : "+a" (counter)
02000 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
02001 #if !defined(PIC)
02002 : "%"REG_b
02003 #endif
02004 );
02005 } else if (filterSize==8) {
/* Same scheme as above with 8 coefficients per sample: filter is advanced
 * by 8*dstW elements and indexed with a scale of 8 inside the loop. */
02006 x86_reg counter= -2*dstW;
02007 filter-= counter*4;
02008 filterPos-= counter/2;
02009 dst-= counter/2;
/* Operands: %0/REG_a = counter, %1 = filter, %2 = filterPos, %3 = src,
 * %4 = dst. */
02010 __asm__ volatile(
02011 #if defined(PIC)
02012 "push %%"REG_b" \n\t"
02013 #endif
02014 "pxor %%mm7, %%mm7 \n\t"
02015 "push %%"REG_BP" \n\t"
02016 "mov %%"REG_a", %%"REG_BP" \n\t"
02017 ".p2align 4 \n\t"
02018 "1: \n\t"
/* source positions of the two output samples */
02019 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
02020 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* taps 0-3 of both samples */
02021 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
02022 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
02023 "movd (%3, %%"REG_a"), %%mm0 \n\t"
02024 "movd (%3, %%"REG_b"), %%mm2 \n\t"
02025 "punpcklbw %%mm7, %%mm0 \n\t"
02026 "punpcklbw %%mm7, %%mm2 \n\t"
02027 "pmaddwd %%mm1, %%mm0 \n\t"
02028 "pmaddwd %%mm2, %%mm3 \n\t"
02029
/* taps 4-7 of both samples, accumulated onto the sums above */
02030 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
02031 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
02032 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
02033 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
02034 "punpcklbw %%mm7, %%mm4 \n\t"
02035 "punpcklbw %%mm7, %%mm2 \n\t"
02036 "pmaddwd %%mm1, %%mm4 \n\t"
02037 "pmaddwd %%mm2, %%mm5 \n\t"
02038 "paddd %%mm4, %%mm0 \n\t"
02039 "paddd %%mm5, %%mm3 \n\t"
/* horizontal add, shift, saturate and store as in the 4-tap variant */
02040 "movq %%mm0, %%mm4 \n\t"
02041 "punpckldq %%mm3, %%mm0 \n\t"
02042 "punpckhdq %%mm3, %%mm4 \n\t"
02043 "paddd %%mm4, %%mm0 \n\t"
02044 "psrad $7, %%mm0 \n\t"
02045 "packssdw %%mm0, %%mm0 \n\t"
02046 "movd %%mm0, (%4, %%"REG_BP") \n\t"
02047 "add $4, %%"REG_BP" \n\t"
02048 " jnc 1b \n\t"
02049
02050 "pop %%"REG_BP" \n\t"
02051 #if defined(PIC)
02052 "pop %%"REG_b" \n\t"
02053 #endif
02054 : "+a" (counter)
02055 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
02056 #if !defined(PIC)
02057 : "%"REG_b
02058 #endif
02059 );
02060 } else {
/* Generic variant. offset marks the end of the source window that starts at
 * src; the inner loop runs until the moving source pointer reaches it. */
02061 const uint8_t *offset = src+filterSize;
02062 x86_reg counter= -2*dstW;
02063
02064 filterPos-= counter/2;
02065 dst-= counter/2;
/* Operands: %0 = counter, %1 = filter (advanced by the asm),
 * %2 = filterPos, %3 = dst, %4 = offset, %5 = src,
 * %6 = filterSize*2, i.e. the size in bytes of one sample's coefficients. */
02066 __asm__ volatile(
02067 "pxor %%mm7, %%mm7 \n\t"
02068 ".p2align 4 \n\t"
02069 "1: \n\t"
/* fetch the source positions of the two output samples into eax/edx */
02070 "mov %2, %%"REG_c" \n\t"
02071 "movzwl (%%"REG_c", %0), %%eax \n\t"
02072 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
/* REG_c = moving source pointer; mm4/mm5 = accumulators of the two samples */
02073 "mov %5, %%"REG_c" \n\t"
02074 "pxor %%mm4, %%mm4 \n\t"
02075 "pxor %%mm5, %%mm5 \n\t"
02076 "2: \n\t"
/* four coefficients of the first sample and, filterSize*2 bytes further on,
 * of the second sample */
02077 "movq (%1), %%mm1 \n\t"
02078 "movq (%1, %6), %%mm3 \n\t"
02079 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
02080 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
02081 "punpcklbw %%mm7, %%mm0 \n\t"
02082 "punpcklbw %%mm7, %%mm2 \n\t"
02083 "pmaddwd %%mm1, %%mm0 \n\t"
02084 "pmaddwd %%mm2, %%mm3 \n\t"
02085 "paddd %%mm3, %%mm5 \n\t"
02086 "paddd %%mm0, %%mm4 \n\t"
/* next four taps: 8 bytes of coefficients, 4 source bytes */
02087 "add $8, %1 \n\t"
02088 "add $4, %%"REG_c" \n\t"
02089 "cmp %4, %%"REG_c" \n\t"
02090 " jb 2b \n\t"
/* skip the second sample's coefficients, which were consumed via (%1, %6) */
02091 "add %6, %1 \n\t"
/* horizontal add, shift, saturate and store both samples at dst+counter */
02092 "movq %%mm4, %%mm0 \n\t"
02093 "punpckldq %%mm5, %%mm4 \n\t"
02094 "punpckhdq %%mm5, %%mm0 \n\t"
02095 "paddd %%mm0, %%mm4 \n\t"
02096 "psrad $7, %%mm4 \n\t"
02097 "packssdw %%mm4, %%mm4 \n\t"
02098 "mov %3, %%"REG_a" \n\t"
02099 "movd %%mm4, (%%"REG_a", %0) \n\t"
02100 "add $4, %0 \n\t"
02101 " jnc 1b \n\t"
02102
02103 : "+r" (counter), "+r" (filter)
02104 : "m" (filterPos), "m" (dst), "m"(offset),
02105 "m" (src), "r" ((x86_reg)filterSize*2)
02106 : "%"REG_a, "%"REG_c, "%"REG_d
02107 );
02108 }
02109 }
02110 #endif
02111
/**
 * Horizontally scale one line of 16-bit samples into 16-bit values.
 *
 * For each output sample, filterSize consecutive source samples starting at
 * the index read from filterPos are multiplied with the corresponding run of
 * filterSize coefficients from filter; the products are summed and the sum
 * is shifted right by shift.
 *
 * When shift is below 15 one of three MMX variants is used (unrolled for
 * filterSize 4 and 8, generic otherwise); they store the result with signed
 * 16-bit saturation (packssdw) and produce two output samples per iteration.
 * For any other shift a plain C loop is used, which only limits the result
 * to at most (1<<15)-1.
 *
 * @param dst        destination line, one int16_t per output sample
 * @param dstW       number of output samples
 * @param src        source line, one uint16_t per sample
 * @param srcW       not referenced in this function
 * @param xInc       not referenced in this function
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first source index of each output sample; the MMX
 *                   variants fetch it with a zero-extending 16-bit load
 * @param filterSize taps per output sample, asserted to be a positive
 *                   multiple of 4
 * @param shift      right shift applied to each sum
 *
 * NOTE(review): the asm stores through dst, but dst is only passed as an
 * input operand and there is no "memory" clobber - confirm that this is safe
 * for all call sites.
 * NOTE(review): because two samples are produced per iteration, an odd dstW
 * appears to make the last MMX iteration access one element past dstW -
 * presumably the buffers are padded; verify.
 */
02112 static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
02113 const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
02114 {
/* i and j are used by the C fallback at the end only */
02115 int i, j;
02116
02117 assert(filterSize % 4 == 0 && filterSize>0);
02118 if (filterSize==4 && shift<15) {
/* counter is a negative byte offset into the int16_t arrays; the pointers
 * are advanced to the end of their data so that pointer+counter walks
 * forward while counter runs up towards zero. */
02119 x86_reg counter= -2*dstW;
02120 filter-= counter*2;
02121 filterPos-= counter/2;
02122 dst-= counter/2;
/* Operands: %0/REG_a = counter, %1 = filter, %2 = filterPos, %3 = src,
 * %4 = dst, %5 = shift (memory). */
02123 __asm__ volatile(
/* mm7 = shift count; it is read before the pushes below, presumably because
 * the "m" operand may be addressed relative to the stack pointer */
02124 "movd %5, %%mm7 \n\t"
02125 #if defined(PIC)
/* Under PIC, REG_b is saved and restored by hand instead of being listed
 * as a clobber. */
02126 "push %%"REG_b" \n\t"
02127 #endif
/* the counter is moved into REG_BP; REG_a/REG_b receive the source positions */
02128 "push %%"REG_BP" \n\t"
02129 "mov %%"REG_a", %%"REG_BP" \n\t"
02130 ".p2align 4 \n\t"
02131 "1: \n\t"
/* source positions of the two output samples */
02132 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
02133 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* four coefficients for each of the two samples */
02134 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
02135 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
/* four 16-bit source samples each; the index is scaled by 2 bytes */
02136 "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
02137 "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
02138 "pmaddwd %%mm1, %%mm0 \n\t"
02139 "pmaddwd %%mm2, %%mm3 \n\t"
/* add the partial sums so that mm0 holds one 32-bit total per sample */
02140 "movq %%mm0, %%mm4 \n\t"
02141 "punpckldq %%mm3, %%mm0 \n\t"
02142 "punpckhdq %%mm3, %%mm4 \n\t"
02143 "paddd %%mm4, %%mm0 \n\t"
/* shift by the run-time amount, saturate to int16_t and store both samples */
02144 "psrad %%mm7, %%mm0 \n\t"
02145 "packssdw %%mm0, %%mm0 \n\t"
02146 "movd %%mm0, (%4, %%"REG_BP") \n\t"
/* advance by two int16_t; the carry is set once the offset wraps past zero */
02147 "add $4, %%"REG_BP" \n\t"
02148 " jnc 1b \n\t"
02149
02150 "pop %%"REG_BP" \n\t"
02151 #if defined(PIC)
02152 "pop %%"REG_b" \n\t"
02153 #endif
02154 : "+a" (counter)
02155 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
02156 #if !defined(PIC)
02157 : "%"REG_b
02158 #endif
02159 );
02160 } else if (filterSize==8 && shift<15) {
/* Same scheme with 8 coefficients per sample: filter is advanced by 8*dstW
 * elements and indexed with a scale of 8 inside the loop. */
02161 x86_reg counter= -2*dstW;
02162 filter-= counter*4;
02163 filterPos-= counter/2;
02164 dst-= counter/2;
/* Operands: %0/REG_a = counter, %1 = filter, %2 = filterPos, %3 = src,
 * %4 = dst, %5 = shift (memory). */
02165 __asm__ volatile(
02166 "movd %5, %%mm7 \n\t"
02167 #if defined(PIC)
02168 "push %%"REG_b" \n\t"
02169 #endif
02170 "push %%"REG_BP" \n\t"
02171 "mov %%"REG_a", %%"REG_BP" \n\t"
02172 ".p2align 4 \n\t"
02173 "1: \n\t"
/* source positions of the two output samples */
02174 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
02175 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
/* taps 0-3 of both samples */
02176 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
02177 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
02178 "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
02179 "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
02180 "pmaddwd %%mm1, %%mm0 \n\t"
02181 "pmaddwd %%mm2, %%mm3 \n\t"
02182
/* taps 4-7 of both samples, accumulated onto the sums above */
02183 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
02184 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
02185 "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
02186 "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
02187 "pmaddwd %%mm1, %%mm4 \n\t"
02188 "pmaddwd %%mm2, %%mm5 \n\t"
02189 "paddd %%mm4, %%mm0 \n\t"
02190 "paddd %%mm5, %%mm3 \n\t"
/* horizontal add, shift, saturate and store as in the 4-tap variant */
02191 "movq %%mm0, %%mm4 \n\t"
02192 "punpckldq %%mm3, %%mm0 \n\t"
02193 "punpckhdq %%mm3, %%mm4 \n\t"
02194 "paddd %%mm4, %%mm0 \n\t"
02195 "psrad %%mm7, %%mm0 \n\t"
02196 "packssdw %%mm0, %%mm0 \n\t"
02197 "movd %%mm0, (%4, %%"REG_BP") \n\t"
02198 "add $4, %%"REG_BP" \n\t"
02199 " jnc 1b \n\t"
02200
02201 "pop %%"REG_BP" \n\t"
02202 #if defined(PIC)
02203 "pop %%"REG_b" \n\t"
02204 #endif
02205 : "+a" (counter)
02206 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
02207 #if !defined(PIC)
02208 : "%"REG_b
02209 #endif
02210 );
02211 } else if (shift<15){
/* Generic variant. offset marks the end of the source window that starts at
 * src; the inner loop runs until the moving source pointer reaches it. */
02212 const uint16_t *offset = src+filterSize;
02213 x86_reg counter= -2*dstW;
02214
02215 filterPos-= counter/2;
02216 dst-= counter/2;
/* Operands: %0 = counter, %1 = filter (advanced by the asm),
 * %2 = filterPos, %3 = dst, %4 = offset, %5 = src,
 * %6 = filterSize*2 (bytes of coefficients per sample), %7 = shift. */
02217 __asm__ volatile(
02218 "movd %7, %%mm7 \n\t"
02219 ".p2align 4 \n\t"
02220 "1: \n\t"
/* fetch the source positions of the two output samples into eax/edx */
02221 "mov %2, %%"REG_c" \n\t"
02222 "movzwl (%%"REG_c", %0), %%eax \n\t"
02223 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
/* REG_c = moving source pointer; mm4/mm5 = accumulators of the two samples */
02224 "mov %5, %%"REG_c" \n\t"
02225 "pxor %%mm4, %%mm4 \n\t"
02226 "pxor %%mm5, %%mm5 \n\t"
02227 "2: \n\t"
/* four coefficients of the first sample and, filterSize*2 bytes further on,
 * of the second sample */
02228 "movq (%1), %%mm1 \n\t"
02229 "movq (%1, %6), %%mm3 \n\t"
02230 "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
02231 "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
02232 "pmaddwd %%mm1, %%mm0 \n\t"
02233 "pmaddwd %%mm2, %%mm3 \n\t"
02234 "paddd %%mm3, %%mm5 \n\t"
02235 "paddd %%mm0, %%mm4 \n\t"
/* next four taps: 8 bytes of coefficients, 8 bytes of 16-bit source samples */
02236 "add $8, %1 \n\t"
02237 "add $8, %%"REG_c" \n\t"
02238 "cmp %4, %%"REG_c" \n\t"
02239 " jb 2b \n\t"
/* skip the second sample's coefficients, which were consumed via (%1, %6) */
02240 "add %6, %1 \n\t"
/* horizontal add, shift, saturate and store both samples at dst+counter */
02241 "movq %%mm4, %%mm0 \n\t"
02242 "punpckldq %%mm5, %%mm4 \n\t"
02243 "punpckhdq %%mm5, %%mm0 \n\t"
02244 "paddd %%mm0, %%mm4 \n\t"
02245 "psrad %%mm7, %%mm4 \n\t"
02246 "packssdw %%mm4, %%mm4 \n\t"
02247 "mov %3, %%"REG_a" \n\t"
02248 "movd %%mm4, (%%"REG_a", %0) \n\t"
02249 "add $4, %0 \n\t"
02250 " jnc 1b \n\t"
02251
02252 : "+r" (counter), "+r" (filter)
02253 : "m" (filterPos), "m" (dst), "m"(offset),
02254 "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
02255 : "%"REG_a, "%"REG_c, "%"REG_d
02256 );
02257 } else
/* C fallback for shift >= 15: one output sample per iteration */
02258 for (i=0; i<dstW; i++) {
02259 int srcPos= filterPos[i];
02260 int val=0;
02261 for (j=0; j<filterSize; j++) {
02262 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
02263 }
/* only the upper bound is limited here */
02264 dst[i] = FFMIN(val>>shift, (1<<15)-1);
02265 }
02266 }
02267
02268
02269 #if COMPILE_TEMPLATE_MMX2
/**
 * Fast horizontal luma scaling that runs the code stored in
 * c->lumMmx2FilterCode (MMX2 only).
 *
 * The asm statement loads src, dst, c->hLumFilter and c->hLumFilterPos into
 * registers and invokes the filter code eight times, advancing the source
 * and destination registers after every call. Afterwards a C loop overwrites
 * the trailing outputs whose source position (i*xInc)>>16 is at or beyond
 * srcW-1 with the last source sample multiplied by 128.
 *
 * @param c        scaler context providing hLumFilter, hLumFilterPos and
 *                 lumMmx2FilterCode
 * @param dst      destination line
 * @param dstWidth number of output samples
 * @param src      source line, one byte per sample
 * @param srcW     number of source samples
 * @param xInc     horizontal increment; (i*xInc)>>16 is used as the source
 *                 position of output i
 *
 * NOTE(review): what the called code reads and writes is not visible here -
 * presumably it consumes REG_c/REG_d/REG_S/mm7 and leaves a byte count in
 * REG_a; confirm against the code generator.
 * NOTE(review): under PIC the asm writes REG_b to ebxsave although ebxsave
 * is declared as an input operand; confirm that this is intended.
 * NOTE(review): the asm executes call instructions and declares no "memory"
 * clobber; confirm that the compiler keeps nothing live just below the stack
 * pointer on x86-64 and that dst is not cached across the statement.
 */
02270 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
02271 int dstWidth, const uint8_t *src,
02272 int srcW, int xInc)
02273 {
02274 int16_t *filterPos = c->hLumFilterPos;
02275 int16_t *filter = c->hLumFilter;
02276 void *mmx2FilterCode= c->lumMmx2FilterCode;
02277 int i;
02278 #if defined(PIC)
/* scratch slot used to preserve REG_b across the asm statement */
02279 DECLARE_ALIGNED(8, uint64_t, ebxsave);
02280 #endif
02281
/* Operands: %0 = src, %1 = dst, %2 = filter, %3 = filterPos,
 * %4 = mmx2FilterCode, %5 = ebxsave (PIC only). */
02282 __asm__ volatile(
02283 #if defined(PIC)
02284 "mov %%"REG_b", %5 \n\t"
02285 #endif
/* mm7 = 0; REG_c = src, REG_D = dst, REG_d = filter, REG_b = filterPos,
 * REG_a = 0 */
02286 "pxor %%mm7, %%mm7 \n\t"
02287 "mov %0, %%"REG_c" \n\t"
02288 "mov %1, %%"REG_D" \n\t"
02289 "mov %2, %%"REG_d" \n\t"
02290 "mov %3, %%"REG_b" \n\t"
02291 "xor %%"REG_a", %%"REG_a" \n\t"
02292 PREFETCH" (%%"REG_c") \n\t"
02293 PREFETCH" 32(%%"REG_c") \n\t"
02294 PREFETCH" 64(%%"REG_c") \n\t"
02295
/* One step of the unrolled loop: load the 32-bit value at (REG_b) into esi,
 * call the filter code, then add the 32-bit value at (REG_b + REG_a) to the
 * source register REG_c, add REG_a to the destination register REG_D and
 * clear REG_a. The x86-64 variant goes through esi/REG_S for the source
 * update, the 32-bit variant adds from memory directly. */
02296 #if ARCH_X86_64
02297 #define CALL_MMX2_FILTER_CODE \
02298 "movl (%%"REG_b"), %%esi \n\t"\
02299 "call *%4 \n\t"\
02300 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
02301 "add %%"REG_S", %%"REG_c" \n\t"\
02302 "add %%"REG_a", %%"REG_D" \n\t"\
02303 "xor %%"REG_a", %%"REG_a" \n\t"\
02304
02305 #else
02306 #define CALL_MMX2_FILTER_CODE \
02307 "movl (%%"REG_b"), %%esi \n\t"\
02308 "call *%4 \n\t"\
02309 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
02310 "add %%"REG_a", %%"REG_D" \n\t"\
02311 "xor %%"REG_a", %%"REG_a" \n\t"\
02312
02313 #endif
02314
/* eight invocations of the filter code */
02315 CALL_MMX2_FILTER_CODE
02316 CALL_MMX2_FILTER_CODE
02317 CALL_MMX2_FILTER_CODE
02318 CALL_MMX2_FILTER_CODE
02319 CALL_MMX2_FILTER_CODE
02320 CALL_MMX2_FILTER_CODE
02321 CALL_MMX2_FILTER_CODE
02322 CALL_MMX2_FILTER_CODE
02323
02324 #if defined(PIC)
/* restore REG_b from the scratch slot */
02325 "mov %5, %%"REG_b" \n\t"
02326 #endif
02327 :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
02328 "m" (mmx2FilterCode)
02329 #if defined(PIC)
02330 ,"m" (ebxsave)
02331 #endif
02332 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
02333 #if !defined(PIC)
02334 ,"%"REG_b
02335 #endif
02336 );
02337
/* Walk backwards from the last output and replace every sample whose source
 * position is at or beyond the last source sample. */
02338 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
02339 dst[i] = src[srcW-1]*128;
02340 }
02341
/**
 * Fast horizontal chroma scaling that runs the code stored in
 * c->chrMmx2FilterCode (MMX2 only).
 *
 * The asm statement processes src1 into dst1 with four invocations of the
 * filter code, then reloads the source and destination registers with src2
 * and dst2 and performs four more invocations. The filter and filterPos
 * registers are loaded once and are not reloaded for the second plane.
 * Afterwards a C loop overwrites the trailing outputs of both planes whose
 * source position (i*xInc)>>16 is at or beyond srcW-1 with the respective
 * last source sample multiplied by 128.
 *
 * The per-call sequence is the CALL_MMX2_FILTER_CODE macro that is defined
 * inside hyscale_fast earlier in this file.
 *
 * @param c        scaler context providing hChrFilter, hChrFilterPos and
 *                 chrMmx2FilterCode
 * @param dst1     destination line of the first plane
 * @param dst2     destination line of the second plane
 * @param dstWidth number of output samples per plane
 * @param src1     source line of the first plane, one byte per sample
 * @param src2     source line of the second plane, one byte per sample
 * @param srcW     number of source samples per plane
 * @param xInc     horizontal increment; (i*xInc)>>16 is used as the source
 *                 position of output i
 *
 * NOTE(review): under PIC the asm writes REG_b to ebxsave although ebxsave
 * is declared as an input operand; confirm that this is intended.
 * NOTE(review): the asm executes call instructions and declares no "memory"
 * clobber; confirm that the compiler keeps nothing live just below the stack
 * pointer on x86-64 and that dst1/dst2 are not cached across the statement.
 */
02342 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
02343 int dstWidth, const uint8_t *src1,
02344 const uint8_t *src2, int srcW, int xInc)
02345 {
02346 int16_t *filterPos = c->hChrFilterPos;
02347 int16_t *filter = c->hChrFilter;
02348 void *mmx2FilterCode= c->chrMmx2FilterCode;
02349 int i;
02350 #if defined(PIC)
/* scratch slot used to preserve REG_b across the asm statement */
02351 DECLARE_ALIGNED(8, uint64_t, ebxsave);
02352 #endif
02353
/* Operands: %0 = src1, %1 = dst1, %2 = filter, %3 = filterPos,
 * %4 = mmx2FilterCode, %5 = src2, %6 = dst2, %7 = ebxsave (PIC only). */
02354 __asm__ volatile(
02355 #if defined(PIC)
02356 "mov %%"REG_b", %7 \n\t"
02357 #endif
/* mm7 = 0; REG_c = src1, REG_D = dst1, REG_d = filter, REG_b = filterPos,
 * REG_a = 0 */
02358 "pxor %%mm7, %%mm7 \n\t"
02359 "mov %0, %%"REG_c" \n\t"
02360 "mov %1, %%"REG_D" \n\t"
02361 "mov %2, %%"REG_d" \n\t"
02362 "mov %3, %%"REG_b" \n\t"
02363 "xor %%"REG_a", %%"REG_a" \n\t"
02364 PREFETCH" (%%"REG_c") \n\t"
02365 PREFETCH" 32(%%"REG_c") \n\t"
02366 PREFETCH" 64(%%"REG_c") \n\t"
02367
/* first plane: four invocations of the filter code */
02368 CALL_MMX2_FILTER_CODE
02369 CALL_MMX2_FILTER_CODE
02370 CALL_MMX2_FILTER_CODE
02371 CALL_MMX2_FILTER_CODE
/* second plane: restart with REG_a = 0, REG_c = src2, REG_D = dst2 */
02372 "xor %%"REG_a", %%"REG_a" \n\t"
02373 "mov %5, %%"REG_c" \n\t"
02374 "mov %6, %%"REG_D" \n\t"
02375 PREFETCH" (%%"REG_c") \n\t"
02376 PREFETCH" 32(%%"REG_c") \n\t"
02377 PREFETCH" 64(%%"REG_c") \n\t"
02378
02379 CALL_MMX2_FILTER_CODE
02380 CALL_MMX2_FILTER_CODE
02381 CALL_MMX2_FILTER_CODE
02382 CALL_MMX2_FILTER_CODE
02383
02384 #if defined(PIC)
/* restore REG_b from the scratch slot */
02385 "mov %7, %%"REG_b" \n\t"
02386 #endif
02387 :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
02388 "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
02389 #if defined(PIC)
02390 ,"m" (ebxsave)
02391 #endif
02392 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
02393 #if !defined(PIC)
02394 ,"%"REG_b
02395 #endif
02396 );
02397
/* Walk backwards from the last output and replace, in both planes, every
 * sample whose source position is at or beyond the last source sample. */
02398 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
02399 dst1[i] = src1[srcW-1]*128;
02400 dst2[i] = src2[srcW-1]*128;
02401 }
02402 }
02403 #endif
02404
02405 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
02406 {
02407 enum PixelFormat srcFormat = c->srcFormat,
02408 dstFormat = c->dstFormat;
02409
02410 if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
02411 && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
02412 if (c->flags & SWS_ACCURATE_RND) {
02413 c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
02414 c->yuv2yuvX = RENAME(yuv2yuvX_ar );
02415 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
02416 switch (c->dstFormat) {
02417 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
02418 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
02419 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
02420 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
02421 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
02422 default: break;
02423 }
02424 }
02425 } else {
02426 int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
02427 c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 );
02428 c->yuv2yuvX = RENAME(yuv2yuvX );
02429 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
02430 switch (c->dstFormat) {
02431 case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
02432 case PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
02433 case PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
02434 case PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
02435 case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
02436 default: break;
02437 }
02438 }
02439 }
02440 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
02441 switch (c->dstFormat) {
02442 case PIX_FMT_RGB32:
02443 c->yuv2packed1 = RENAME(yuv2rgb32_1);
02444 c->yuv2packed2 = RENAME(yuv2rgb32_2);
02445 break;
02446 case PIX_FMT_BGR24:
02447 c->yuv2packed1 = RENAME(yuv2bgr24_1);
02448 c->yuv2packed2 = RENAME(yuv2bgr24_2);
02449 break;
02450 case PIX_FMT_RGB555:
02451 c->yuv2packed1 = RENAME(yuv2rgb555_1);
02452 c->yuv2packed2 = RENAME(yuv2rgb555_2);
02453 break;
02454 case PIX_FMT_RGB565:
02455 c->yuv2packed1 = RENAME(yuv2rgb565_1);
02456 c->yuv2packed2 = RENAME(yuv2rgb565_2);
02457 break;
02458 case PIX_FMT_YUYV422:
02459 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
02460 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
02461 break;
02462 default:
02463 break;
02464 }
02465 }
02466 }
02467
02468 if (c->srcBpc == 8 && c->dstBpc <= 10) {
02469 #if !COMPILE_TEMPLATE_MMX2
02470 c->hyScale = c->hcScale = RENAME(hScale );
02471 #endif
02472
02473
02474 #if COMPILE_TEMPLATE_MMX2
02475 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
02476 {
02477 c->hyscale_fast = RENAME(hyscale_fast);
02478 c->hcscale_fast = RENAME(hcscale_fast);
02479 } else {
02480 #endif
02481 c->hyscale_fast = NULL;
02482 c->hcscale_fast = NULL;
02483 #if COMPILE_TEMPLATE_MMX2
02484 }
02485 #endif
02486 }
02487
02488 #if !COMPILE_TEMPLATE_MMX2
02489 switch(srcFormat) {
02490 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
02491 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
02492 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
02493 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
02494 case PIX_FMT_YUV420P9LE:
02495 case PIX_FMT_YUV422P10LE:
02496 case PIX_FMT_YUV420P10LE: c->hScale16= RENAME(hScale16); break;
02497 default: break;
02498 }
02499 #endif
02500 if (!c->chrSrcHSubSample) {
02501 switch(srcFormat) {
02502 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
02503 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
02504 default: break;
02505 }
02506 }
02507
02508 switch (srcFormat) {
02509 #if !COMPILE_TEMPLATE_MMX2
02510 case PIX_FMT_YUYV422 :
02511 case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break;
02512 case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
02513 #endif
02514 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
02515 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
02516 default: break;
02517 }
02518 #if !COMPILE_TEMPLATE_MMX2
02519 if (c->alpPixBuf) {
02520 switch (srcFormat) {
02521 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
02522 default: break;
02523 }
02524 }
02525 #endif
02526 if(isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
02527 c->hScale16= RENAME(hScale16);
02528 if(c->dstBpc > 10)
02529 c->hScale16 = NULL;
02530 }