00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #include "libavutil/x86_cpu.h"
00028 #include "libavcodec/dsputil.h"
00029 #include "dsputil_mmx.h"
00030
/* Bias the two word accumulators mm3/mm4 with the rounder preloaded in mm7,
 * then arithmetic-shift both right by SHIFT (an asm operand string, e.g. "$7"). */
#define NORMALIZE_MMX(SHIFT)                 \
     "paddw     %%mm7, %%mm3      \n\t" /* +rounder */ \
     "paddw     %%mm7, %%mm4      \n\t" /* +rounder */ \
     "psraw     "SHIFT", %%mm3    \n\t"                \
     "psraw     "SHIFT", %%mm4    \n\t"
00037
/* Saturate mm3/mm4 to unsigned bytes and store the packed 8 pixels at (%2). */
#define TRANSFER_DO_PACK                     \
     "packuswb  %%mm4, %%mm3      \n\t"      \
     "movq      %%mm3, (%2)       \n\t"
00041
/* Store mm3/mm4 as eight unclamped 16-bit intermediates at (%2). */
#define TRANSFER_DONT_PACK                   \
     "movq      %%mm3, 0(%2)      \n\t"      \
     "movq      %%mm4, 8(%2)      \n\t"
00045
/* DO_UNPACK widens the low 4 bytes of reg to words (requires mm0 == 0);
 * DONT_UNPACK is its no-op counterpart for input that is already 16-bit. */
#define DO_UNPACK(reg)  "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)
00049
/* Broadcast the 16-bit rounder ROUND (a memory operand string, e.g. "%5")
 * into all four words of mm7. */
#define LOAD_ROUNDER_MMX(ROUND)              \
     "movd      "ROUND", %%mm7    \n\t"      \
     "punpcklwd %%mm7, %%mm7      \n\t"      \
     "punpckldq %%mm7, %%mm7      \n\t"
00055
/* One output line of the vertical 1/2-shift filter, software-pipelined over
 * the rotating registers R0..R3: adds R2 into R1 to form the centre-tap sum,
 * multiplies it by 9 (mm6), subtracts the two outer taps freshly loaded into
 * R0 (from (%0,%3) = src-2*stride) and R3 (from (%0,%2) = src+stride), adds
 * the rounder (mm7), shifts by %4 and stores the 4 words at OFF(%1); finally
 * src (%0) advances one line.  Requires mm0 == 0, mm6 == 9, mm7 == rounder. */
#define SHIFT2_LINE(OFF, R0,R1,R2,R3)            \
     "paddw     %%mm"#R2", %%mm"#R1"  \n\t"      \
     "movd      (%0,%3), %%mm"#R0"    \n\t"      \
     "pmullw    %%mm6, %%mm"#R1"      \n\t"      \
     "punpcklbw %%mm0, %%mm"#R0"      \n\t"      \
     "movd      (%0,%2), %%mm"#R3"    \n\t"      \
     "psubw     %%mm"#R0", %%mm"#R1"  \n\t"      \
     "punpcklbw %%mm0, %%mm"#R3"      \n\t"      \
     "paddw     %%mm7, %%mm"#R1"      \n\t"      \
     "psubw     %%mm"#R3", %%mm"#R1"  \n\t"      \
     "psraw     %4, %%mm"#R1"         \n\t"      \
     "movq      %%mm"#R1", "#OFF"(%1) \n\t"      \
     "add       %2, %0                \n\t"
00069
/* Four packed 16-bit 9s: centre-tap weight of the 1/2-shift (-1 9 9 -1) filter. */
DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
00071
/**
 * Vertical pass of the 1/2-shift (-1 9 9 -1) filter, keeping full 16-bit
 * precision for a later horizontal pass.  Fills 8 rows of 12 int16 in dst
 * (row stride 24 bytes), working in three strips of 4 columns (%ecx counts
 * the strips).  For each output sample:
 *   out = (9*(s[0] + s[stride]) - s[-stride] - s[2*stride] + rnd) >> shift
 * Requires mm0 == 0 on entry (set by the caller).
 *
 * @param dst    16-bit intermediate buffer (12x8 words)
 * @param src    source pixels
 * @param stride source line stride
 * @param rnd    rounder added before the shift
 * @param shift  right-shift applied to each sum
 */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov       $3, %%"REG_c"            \n\t" /* 3 strips of 4 columns */
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1:                                 \n\t"
        "movd      (%0), %%mm2              \n\t" /* prime the pipeline with */
        "add       %2, %0                   \n\t" /* the first two lines     */
        "movd      (%0), %%mm3              \n\t"
        "punpcklbw %%mm0, %%mm2             \n\t"
        "punpcklbw %%mm0, %%mm3             \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub       %6, %0                   \n\t" /* 9 lines were consumed:     */
        "add       $8, %1                   \n\t" /* rewind src, step 4 columns */
        "dec       %%"REG_c"                \n\t"
        "jnz       1b                       \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}
00105
/**
 * Horizontal pass of the 1/2-shift filter over the 16-bit intermediates of a
 * vertical pass; computes 9*(s[1]+s[2]) - s[0] - s[3], rounds (>>7), adds
 * back 128 (ff_pw_128 in mm6) and saturates to bytes.  8 rows of 8 pixels.
 *
 * The rnd adjustment folds -(tap sum 16)<<10 == -(128<<7) into the rounder:
 * this pulls the 9*sum products, which could exceed +32767, back into signed
 * 16-bit range before the arithmetic shift, and is cancelled by the +128
 * added afterwards (16384 >> 7 == 128) — NOTE(review): bias reasoning
 * inferred from the arithmetic, confirm against the VC-1 reference.
 */
static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
                                       const int16_t *src, int rnd)
{
    int h = 8;

    src -= 1;                    /* back up one tap */
    rnd -= (-1+9+9-1)*1024;      /* fold the -128<<7 bias into the rounder */
    __asm__ volatile(
        LOAD_ROUNDER_MMX("%4")
        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"
        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"
        "1:                            \n\t"
        "movq      2*0+0(%1), %%mm1    \n\t" /* mm1/mm2 = s[0] */
        "movq      2*0+8(%1), %%mm2    \n\t"
        "movq      2*1+0(%1), %%mm3    \n\t" /* mm3/mm4 = s[1] */
        "movq      2*1+8(%1), %%mm4    \n\t"
        "paddw     2*3+0(%1), %%mm1    \n\t" /* + s[3] */
        "paddw     2*3+8(%1), %%mm2    \n\t"
        "paddw     2*2+0(%1), %%mm3    \n\t" /* + s[2] */
        "paddw     2*2+8(%1), %%mm4    \n\t"
        "pmullw    %%mm5, %%mm3        \n\t" /* 9*(s[1]+s[2]) */
        "pmullw    %%mm5, %%mm4        \n\t"
        "psubw     %%mm1, %%mm3        \n\t" /* - (s[0]+s[3]) */
        "psubw     %%mm2, %%mm4        \n\t"
        NORMALIZE_MMX("$7")
        /* remove the -128 bias before the unsigned saturating pack */
        "paddw     %%mm6, %%mm3        \n\t"
        "paddw     %%mm6, %%mm4        \n\t"
        TRANSFER_DO_PACK
        "add       $24, %1             \n\t" /* next intermediate row */
        "add       %3, %2              \n\t"
        "decl      %0                  \n\t"
        "jnz       1b                  \n\t"
        : "+r"(h), "+r" (src), "+r" (dst)
        : "r"(stride), "m"(rnd)
        : "memory"
    );
}
00148
00149
/**
 * Direct 1/2-shift interpolation on 8-bit pixels along one direction for an
 * 8x8 block:
 *   dst = clip8((9*(s[0] + s[off]) - s[-off] - s[2*off] + 8 - rnd) >> 4)
 * offset selects the direction (1 = horizontal, stride = vertical).
 * Requires mm0 == 0 on entry.
 */
static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
                               x86_reg stride, int rnd, x86_reg offset)
{
    rnd = 8-rnd;    /* final rounder for the >>4 */
    __asm__ volatile(
        "mov       $8, %%"REG_c"       \n\t" /* 8 rows */
        LOAD_ROUNDER_MMX("%5")
        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"
        "1:                            \n\t"
        "movd      0(%0   ), %%mm3     \n\t" /* mm3/mm4 = s[0] */
        "movd      4(%0   ), %%mm4     \n\t"
        "movd      0(%0,%2), %%mm1     \n\t" /* mm1/mm2 = s[off] */
        "movd      4(%0,%2), %%mm2     \n\t"
        "add       %2, %0              \n\t" /* %0 now at s[off] */
        "punpcklbw %%mm0, %%mm3        \n\t"
        "punpcklbw %%mm0, %%mm4        \n\t"
        "punpcklbw %%mm0, %%mm1        \n\t"
        "punpcklbw %%mm0, %%mm2        \n\t"
        "paddw     %%mm1, %%mm3        \n\t" /* centre sum s[0]+s[off] */
        "paddw     %%mm2, %%mm4        \n\t"
        "movd      0(%0,%3), %%mm1     \n\t" /* s[-off] (%3 == -2*offset) */
        "movd      4(%0,%3), %%mm2     \n\t"
        "pmullw    %%mm6, %%mm3        \n\t" /* 9*(centre sum) */
        "pmullw    %%mm6, %%mm4        \n\t"
        "punpcklbw %%mm0, %%mm1        \n\t"
        "punpcklbw %%mm0, %%mm2        \n\t"
        "psubw     %%mm1, %%mm3        \n\t" /* - s[-off] */
        "psubw     %%mm2, %%mm4        \n\t"
        "movd      0(%0,%2), %%mm1     \n\t" /* s[2*off] */
        "movd      4(%0,%2), %%mm2     \n\t"
        "punpcklbw %%mm0, %%mm1        \n\t"
        "punpcklbw %%mm0, %%mm2        \n\t"
        "psubw     %%mm1, %%mm3        \n\t" /* - s[2*off] */
        "psubw     %%mm2, %%mm4        \n\t"
        NORMALIZE_MMX("$4")
        "packuswb  %%mm4, %%mm3        \n\t"
        "movq      %%mm3, (%1)         \n\t"
        "add       %6, %0              \n\t" /* advance to next source row */
        "add       %4, %1              \n\t"
        "dec       %%"REG_c"           \n\t"
        "jnz       1b                  \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),
          "g"(stride-offset)
        : "%"REG_c, "memory"
    );
}
00201
/* Inner-tap weights of the 1/4- and 3/4-shift bicubic filter (-4 53 18 -3). */
DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL;
00208
/* Core of the 1/4- and 3/4-shift bicubic filters.  Leaves in mm3/mm4 the
 * eight 16-bit sums
 *     mm6*A2 + mm5*A3 - 3*A1 - 4*A4
 * where A1..A4 are the four tap address strings, and mm5/mm6 hold the 53/18
 * weights — their load order in the caller selects shift1 versus shift3.
 * MOVQ is "movd 1" (8-bit input, UNPACK = DO_UNPACK, mm0 must be 0) or
 * "movq 2" (16-bit input, UNPACK = DONT_UNPACK): the trailing digit is the
 * element byte size multiplied into the "*0"/"*4" offsets. */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)   \
     MOVQ "*0+"A1", %%mm1           \n\t"                   \
     MOVQ "*4+"A1", %%mm2           \n\t"                   \
     UNPACK("%%mm1")                                        \
     UNPACK("%%mm2")                                        \
     "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t" /* 3*A1 */    \
     "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"               \
     MOVQ "*0+"A2", %%mm3           \n\t"                   \
     MOVQ "*4+"A2", %%mm4           \n\t"                   \
     UNPACK("%%mm3")                                        \
     UNPACK("%%mm4")                                        \
     "pmullw    %%mm6, %%mm3        \n\t" /* mm6*A2 */      \
     "pmullw    %%mm6, %%mm4        \n\t"                   \
     "psubw     %%mm1, %%mm3        \n\t" /* - 3*A1 */      \
     "psubw     %%mm2, %%mm4        \n\t"                   \
     MOVQ "*0+"A4", %%mm1           \n\t"                   \
     MOVQ "*4+"A4", %%mm2           \n\t"                   \
     UNPACK("%%mm1")                                        \
     UNPACK("%%mm2")                                        \
     "psllw     $2, %%mm1           \n\t" /* 4*A4 */        \
     "psllw     $2, %%mm2           \n\t"                   \
     "psubw     %%mm1, %%mm3        \n\t" /* - 4*A4 */      \
     "psubw     %%mm2, %%mm4        \n\t"                   \
     MOVQ "*0+"A3", %%mm1           \n\t"                   \
     MOVQ "*4+"A3", %%mm2           \n\t"                   \
     UNPACK("%%mm1")                                        \
     UNPACK("%%mm2")                                        \
     "pmullw    %%mm5, %%mm1        \n\t" /* mm5*A3 */      \
     "pmullw    %%mm5, %%mm2        \n\t"                   \
     "paddw     %%mm1, %%mm3        \n\t"                   \
     "paddw     %%mm2, %%mm4        \n\t"
00250
/* Builds the vertical 1/4- or 3/4-shift filter vc1_put_ver_16b_<NAME>_mmx:
 * filters 8-bit source into 8 rows of 12 unclamped 16-bit intermediates
 * (row stride 24 bytes).  Columns 0-7 go through the shared CORE; columns
 * 8-11 are filtered by the explicit tail code (3*A1 built with two adds).
 * A1..A4 address the four taps in terms of %1 (src), %3 (src_stride) and
 * %4 (3*src_stride).  Requires mm0 == 0 on entry. */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                    \
static void                                                             \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
                                 x86_reg src_stride,                    \
                                 int rnd, int64_t shift)                \
{                                                                       \
    int h = 8;                                                          \
    src -= src_stride;  /* back up one tap row */                       \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMX("%5")                                          \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"                       \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"                       \
        ASMALIGN(3)                                                     \
        "1:                            \n\t"                            \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4)        \
        NORMALIZE_MMX("%6")                                             \
        TRANSFER_DONT_PACK                                              \
        /* columns 8-11: same kernel without the CORE macro */          \
        "movd      8+"A1", %%mm1       \n\t"                            \
        DO_UNPACK("%%mm1")                                              \
        "movq      %%mm1, %%mm3        \n\t"                            \
        "paddw     %%mm1, %%mm1        \n\t"                            \
        "paddw     %%mm3, %%mm1        \n\t" /* 3* */                   \
        "movd      8+"A2", %%mm3       \n\t"                            \
        DO_UNPACK("%%mm3")                                              \
        "pmullw    %%mm6, %%mm3        \n\t" /* *18 */                  \
        "psubw     %%mm1, %%mm3        \n\t" /* -3* */                  \
        "movd      8+"A3", %%mm1       \n\t"                            \
        DO_UNPACK("%%mm1")                                              \
        "pmullw    %%mm5, %%mm1        \n\t" /* *53 */                  \
        "paddw     %%mm1, %%mm3        \n\t"                            \
        "movd      8+"A4", %%mm1       \n\t"                            \
        DO_UNPACK("%%mm1")                                              \
        "psllw     $2, %%mm1           \n\t" /* 4* */                   \
        "psubw     %%mm1, %%mm3        \n\t"                            \
        "paddw     %%mm7, %%mm3        \n\t" /* +rounder */             \
        "psraw     %6, %%mm3           \n\t"                            \
        "movq      %%mm3, 16(%2)       \n\t"                            \
        "add       %3, %1              \n\t"                            \
        "add       $24, %2             \n\t" /* next intermediate row */\
        "decl      %0                  \n\t"                            \
        "jnz       1b                  \n\t"                            \
        : "+r"(h), "+r" (src), "+r" (dst)                               \
        : "r"(src_stride), "r"(3*src_stride),                           \
          "m"(rnd), "m"(shift)                                          \
        : "memory"                                                      \
    );                                                                  \
}
00307
/* Builds the horizontal 1/4- or 3/4-shift filter vc1_put_hor_16b_<NAME>_mmx
 * over the 16-bit intermediates of a vertical pass: rounds (>>7), adds back
 * 128 and packs 8 bytes per row, 8 rows.  A1..A4 are word offsets from %1.
 * rnd is pre-adjusted by -(tap sum 64)<<8 == -(128<<7), keeping the weighted
 * sums in signed 16-bit range before the shift; the +ff_pw_128 afterwards
 * cancels it (16384 >> 7 == 128). */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4)                    \
static void                                                             \
vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,          \
                                 const int16_t *src, int rnd)           \
{                                                                       \
    int h = 8;                                                          \
    src -= 1;  /* back up one tap */                                    \
    rnd -= (-4+58+13-3)*256;  /* fold -128<<7 into the rounder */       \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMX("%4")                                          \
        "movq      "MANGLE(ff_pw_18)", %%mm6 \n\t"                      \
        "movq      "MANGLE(ff_pw_53)", %%mm5 \n\t"                      \
        ASMALIGN(3)                                                     \
        "1:                            \n\t"                            \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4)      \
        NORMALIZE_MMX("$7")                                             \
        /* remove the -128 bias before the unsigned saturating pack */  \
        "paddw     "MANGLE(ff_pw_128)", %%mm3 \n\t"                     \
        "paddw     "MANGLE(ff_pw_128)", %%mm4 \n\t"                     \
        TRANSFER_DO_PACK                                                \
        "add       $24, %1             \n\t" /* next intermediate row */\
        "add       %3, %2              \n\t"                            \
        "decl      %0                  \n\t"                            \
        "jnz       1b                  \n\t"                            \
        : "+r"(h), "+r" (src), "+r" (dst)                               \
        : "r"(stride), "m"(rnd)                                         \
        : "memory"                                                      \
    );                                                                  \
}
00344
/* Builds the direct 8-bit 1/4- or 3/4-shift filter vc1_put_<NAME>_mmx along
 * one direction (offset = 1 for horizontal, stride for vertical) over an 8x8
 * block: out = clip8((weighted taps + 32 - rnd) >> 6).  src is backed up by
 * one tap so A1..A4 (in terms of %1, %3 = offset, %4 = 3*offset) address
 * s[-off]..s[2*off].  Requires mm0 == 0 on entry. */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4)                         \
static void                                                             \
vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,               \
                        x86_reg stride, int rnd, x86_reg offset)        \
{                                                                       \
    int h = 8;                                                          \
    src -= offset;  /* back up one tap */                               \
    rnd = 32-rnd;   /* rounder for the >>6 */                           \
    __asm__ volatile (                                                  \
        LOAD_ROUNDER_MMX("%6")                                          \
        "movq      "MANGLE(ff_pw_53)", %%mm5 \n\t"                      \
        "movq      "MANGLE(ff_pw_18)", %%mm6 \n\t"                      \
        ASMALIGN(3)                                                     \
        "1:                            \n\t"                            \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4)        \
        NORMALIZE_MMX("$6")                                             \
        TRANSFER_DO_PACK                                                \
        "add       %5, %1              \n\t"                            \
        "add       %5, %2              \n\t"                            \
        "decl      %0                  \n\t"                            \
        "jnz       1b                  \n\t"                            \
        : "+r"(h), "+r" (src), "+r" (dst)                               \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
        : "memory"                                                      \
    );                                                                  \
}
00379
/* 1/4-shift: tap addresses walk from s[3*off] down to s[0], applying the
 * (-4 53 18 -3) kernel with its heavy tap nearest the sample. */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)")

/* 3/4-shift: same kernel with the tap order mirrored. */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)")
00389
/* Stage 1: vertical filter from 8-bit pixels to 16-bit intermediates. */
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
/* Stage 2: horizontal filter from 16-bit intermediates to 8-bit pixels. */
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
/* Single-pass 8-bit filter along one direction (offset = 1 or stride). */
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
00393
00405 static void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,
00406 int hmode, int vmode, int rnd)
00407 {
00408 static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =
00409 { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };
00410 static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =
00411 { NULL, vc1_put_hor_16b_shift1_mmx, vc1_put_hor_16b_shift2_mmx, vc1_put_hor_16b_shift3_mmx };
00412 static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =
00413 { NULL, vc1_put_shift1_mmx, vc1_put_shift2_mmx, vc1_put_shift3_mmx };
00414
00415 __asm__ volatile(
00416 "pxor %%mm0, %%mm0 \n\t"
00417 ::: "memory"
00418 );
00419
00420 if (vmode) {
00421 if (hmode) {
00422 static const int shift_value[] = { 0, 5, 1, 5 };
00423 int shift = (shift_value[hmode]+shift_value[vmode])>>1;
00424 int r;
00425 DECLARE_ALIGNED_16(int16_t, tmp[12*8]);
00426
00427 r = (1<<(shift-1)) + rnd-1;
00428 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);
00429
00430 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);
00431 return;
00432 }
00433 else {
00434 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);
00435 return;
00436 }
00437 }
00438
00439
00440 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);
00441 }
00442
/* Plain 8x8 copy for the zero-shift position (defined elsewhere). */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);

/* Define the put function for fractional position (a = hmode, b = vmode). */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
     vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}
00450
/* Instantiate wrappers for the 15 fractional (hmode, vmode) positions;
 * (0,0) is the plain copy declared above. */
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
00469
00470 void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
00471 dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
00472 dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
00473 dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
00474 dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
00475
00476 dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
00477 dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
00478 dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
00479 dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
00480
00481 dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
00482 dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
00483 dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
00484 dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
00485
00486 dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
00487 dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
00488 dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
00489 dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
00490 }