00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041 #include "libavcodec/dsputil.h"
00042 #include "idct_xvid.h"
00043
/* Expands to eight comma-separated copies of x. Used below to fill all
 * eight lanes of the broadcast constants. The argument is pasted without
 * parentheses, so pass a single literal or identifier. */
00049 #define X8(x) x,x,x,x,x,x,x,x
00050
/* Shift amounts of the two passes. The asm visible in this file does not
 * expand these names: it spells the same values as literals ("psrad $11"
 * in iMTX_MULT, "psraw $6" in the iLLM passes), so the two must be kept in
 * sync by hand. */
00051 #define ROW_SHIFT 11
00052 #define COL_SHIFT 6
00053
/*
 * Broadcast constants referenced from the asm through MANGLE().
 * DECLARE_ASM_CONST is defined outside this file; its first argument is
 * presumably the requested alignment in bytes (the 16-byte ones are read
 * with movdqa, m127 with movq).
 *
 * tan1/tan2/tan3 equal round(tan(k*pi/16) * 2^16) for k = 1, 2, 3.
 * 43790 does not fit in int16_t, so tan3 relies on the conversion wrapping
 * to 43790 - 65536; the iLLM passes compensate by adding the multiplicand
 * back after each pmulhw with TAN3.
 * sqrt2 holds 23170, about 2^15 / sqrt(2); the passes double the pmulhw
 * result (paddsw x, x) after multiplying by it.
 */
00054 DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)};
00055 DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)};
00056 DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)};
00057 DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)};
/* 127 in each of eight bytes: loaded into mm0 and added with unsigned
 * saturation by the TEST_* macros, which sets a byte's top bit exactly when
 * that byte is non-zero. */
00058 DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)};
00059
/*
 * Row-pass coefficient table: 32 int16_t values read by iMTX_MULT as four
 * 16-byte groups (pmaddwd operands at byte offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2 uses this table for rows 0 and 4.
 * Entries written as 0x8000 and above are out of int16_t range; they rely on
 * the (implementation-defined) narrowing conversion keeping the same 16-bit
 * pattern, i.e. they stand for negative coefficients.
 * Do not reorder: the asm addresses the groups by fixed offsets.
 */
00060 DECLARE_ASM_CONST(16, int16_t, iTab1[]) = {
00061 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00062 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00063 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00064 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00065 };
00066
/*
 * Row-pass coefficient table: 32 int16_t values read by iMTX_MULT as four
 * 16-byte groups (pmaddwd operands at byte offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2 uses this table for rows 1 and 7.
 * Entries written as 0x8000 and above are out of int16_t range; they rely on
 * the (implementation-defined) narrowing conversion keeping the same 16-bit
 * pattern, i.e. they stand for negative coefficients.
 * Do not reorder: the asm addresses the groups by fixed offsets.
 */
00067 DECLARE_ASM_CONST(16, int16_t, iTab2[]) = {
00068 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00069 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00070 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00071 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00072 };
00073
/*
 * Row-pass coefficient table: 32 int16_t values read by iMTX_MULT as four
 * 16-byte groups (pmaddwd operands at byte offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2 uses this table for rows 2 and 6.
 * Entries written as 0x8000 and above are out of int16_t range; they rely on
 * the (implementation-defined) narrowing conversion keeping the same 16-bit
 * pattern, i.e. they stand for negative coefficients.
 * Do not reorder: the asm addresses the groups by fixed offsets.
 */
00074 DECLARE_ASM_CONST(16, int16_t, iTab3[]) = {
00075 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00076 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00077 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00078 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00079 };
00080
/*
 * Row-pass coefficient table: 32 int16_t values read by iMTX_MULT as four
 * 16-byte groups (pmaddwd operands at byte offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2 uses this table for rows 3 and 5.
 * Entries written as 0x8000 and above are out of int16_t range; they rely on
 * the (implementation-defined) narrowing conversion keeping the same 16-bit
 * pattern, i.e. they stand for negative coefficients.
 * Do not reorder: the asm addresses the groups by fixed offsets.
 */
00081 DECLARE_ASM_CONST(16, int16_t, iTab4[]) = {
00082 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00083 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00084 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00085 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00086 };
00087
/*
 * Rounding constants added (paddd) to the 32-bit row sums in iMTX_MULT
 * before the ">> 11". Six 16-byte groups, each one value repeated in four
 * int32_t lanes; ff_idct_xvid_sse2 selects a group by byte offset k*16:
 *   +0*16 -> row 0      +1*16 -> row 1      +2*16 -> row 2
 *   +3*16 -> row 3      +4*16 -> row 5      +5*16 -> rows 6 and 7
 * Row 4 has no entry: its iMTX_MULT call passes "#" instead of a rounder.
 */
00088 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders[]) = {
00089 65536, 65536, 65536, 65536,
00090 3597, 3597, 3597, 3597,
00091 2260, 2260, 2260, 2260,
00092 1203, 1203, 1203, 1203,
00093 120, 120, 120, 120,
00094 512, 512, 512, 512
00095 };
00096
00097
/*
 * Where each transformed row lives between the row pass and the column
 * pass. All operands are asm-template string fragments ("%%" is the escaped
 * '%' of GCC extended asm).
 *
 * Odd rows always stay in registers: row 1 = xmm6, row 3 = xmm4,
 * row 5 = xmm5, row 7 = xmm7.
 */
00098 #define ROW1 "%%xmm6"
00099 #define ROW3 "%%xmm4"
00100 #define ROW5 "%%xmm5"
00101 #define ROW7 "%%xmm7"
00102
/* CLEAR_ODD zeroes a register row (pxor r,r). PUT_ODD copies the row result
 * from xmm2 into dst; pshufhw $0x1B reverses the order of the four high
 * words on the way, leaving the low four words as they are. */
00103 #define CLEAR_ODD(r) "pxor "r","r" \n\t"
00104 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
00105
00106 #if ARCH_X86_64
00107
/* 64-bit: xmm8-xmm14 are available, so even rows also stay in registers.
 * ROWn and REGn name the same register, CLEAR_EVEN/PUT_EVEN are the odd-row
 * versions, and MOV_32_ONLY expands to "#": pasted at the start of an asm
 * line it is meant to turn that line into an assembler comment (assumes the
 * assembler treats '#' as a comment start), dropping the 32-bit-only
 * load/store. */
00108 # define ROW0 "%%xmm8"
00109 # define REG0 ROW0
00110 # define ROW2 "%%xmm9"
00111 # define REG2 ROW2
00112 # define ROW4 "%%xmm10"
00113 # define REG4 ROW4
00114 # define ROW6 "%%xmm11"
00115 # define REG6 ROW6
00116 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00117 # define PUT_EVEN(dst) PUT_ODD(dst)
00118 # define XMMS "%%xmm12"
00119 # define MOV_32_ONLY "#"
00120 # define SREG2 REG2
00121 # define TAN3 "%%xmm13"
00122 # define TAN1 "%%xmm14"
00123
00124 #else
00125
/* 32-bit: only xmm0-xmm7 exist, so even rows are kept in the block itself.
 * ROWn is a memory operand relative to %0 and REGn is the register the
 * column pass loads it into via MOV_32_ONLY ("movdqa "). REG0/REG2 share
 * xmm4 and REG4/REG6 share xmm6. CLEAR_EVEN is empty (nothing is emitted for
 * a skipped even row) and PUT_EVEN shuffles xmm2 in place and stores it.
 * XMMS (scratch), TAN3 and TAN1 overlap registers that iMTX_MULT also
 * writes (xmm0, xmm2). */
00126 # define ROW0 "(%0)"
00127 # define REG0 "%%xmm4"
00128 # define ROW2 "2*16(%0)"
00129 # define REG2 "%%xmm4"
00130 # define ROW4 "4*16(%0)"
00131 # define REG4 "%%xmm6"
00132 # define ROW6 "6*16(%0)"
00133 # define REG6 "%%xmm6"
00134 # define CLEAR_EVEN(r)
00135 # define PUT_EVEN(dst) \
00136 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
00137 "movdqa %%xmm2, "dst" \n\t"
00138 # define XMMS "%%xmm2"
00139 # define MOV_32_ONLY "movdqa "
00140 # define SREG2 "%%xmm7"
00141 # define TAN3 "%%xmm0"
00142 # define TAN1 "%%xmm2"
00143
00144 #endif
00145
/* Builds the front half of a rounding instruction, "paddd <symbol>", where
 * the symbol is produced by MANGLE() (defined outside this file).
 * iMTX_MULT appends the destination ", %%xmm0". */
00146 #define ROUND(x) "paddd "MANGLE(x)
00147
/* Emits "testl reg,reg" followed by "jz to": jump to label `to` when the
 * 32-bit register `reg` is zero. Both arguments are asm string fragments. */
00148 #define JZ(reg, to) \
00149 "testl "reg","reg" \n\t" \
00150 "jz "to" \n\t"
00151
/* Emits "testl reg,reg" followed by "jnz to": jump to label `to` when the
 * 32-bit register `reg` is non-zero. Both arguments are asm string
 * fragments. */
00152 #define JNZ(reg, to) \
00153 "testl "reg","reg" \n\t" \
00154 "jnz "to" \n\t"
00155
/*
 * Tests whether one 16-byte row is entirely zero.
 *   src   - memory operand string for the row, e.g. "7*16(%0)"; "8+" is
 *           pasted in front of it to address the row's upper 8 bytes
 *   reg   - 32-bit register string that receives the byte mask
 *   clear - asm fragment emitted first (a CLEAR_* expansion, possibly empty)
 * The two 8-byte halves are ORed in mm1, mm0 is added with unsigned
 * saturation, and pmovmskb collects the top bit of each byte. With mm0
 * holding 127 in every byte (the caller loads m127), `reg` ends up non-zero
 * exactly when the row has a non-zero byte. Overwrites mm1.
 */
00156 #define TEST_ONE_ROW(src, reg, clear) \
00157 clear \
00158 "movq "src", %%mm1 \n\t" \
00159 "por 8+"src", %%mm1 \n\t" \
00160 "paddusb %%mm0, %%mm1 \n\t" \
00161 "pmovmskb %%mm1, "reg" \n\t"
00162
/*
 * Two-row version of TEST_ONE_ROW with the instructions interleaved.
 *   row1, row2     - memory operand strings of the two rows
 *   reg1, reg2     - 32-bit register strings receiving each row's byte mask
 *   clear1, clear2 - asm fragments emitted first (CLEAR_* expansions,
 *                    possibly empty)
 * Expects mm0 to hold 127 in every byte; each mask is then non-zero exactly
 * when the corresponding row has a non-zero byte. Overwrites mm1 and mm2.
 */
00163 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00164 clear1 \
00165 clear2 \
00166 "movq "row1", %%mm1 \n\t" \
00167 "por 8+"row1", %%mm1 \n\t" \
00168 "movq "row2", %%mm2 \n\t" \
00169 "por 8+"row2", %%mm2 \n\t" \
00170 "paddusb %%mm0, %%mm1 \n\t" \
00171 "paddusb %%mm0, %%mm2 \n\t" \
00172 "pmovmskb %%mm1, "reg1" \n\t" \
00173 "pmovmskb %%mm2, "reg2" \n\t"
00174
/*
 * Row pass for a single row of eight int16_t coefficients.
 *   src     - memory operand string of the row (read with movdqa)
 *   table   - operand string of a 64-byte coefficient table (iTab1..iTab4);
 *             "16+", "32+" and "48+" are pasted in front of it to reach the
 *             second to fourth 16-byte group
 *   rounder - front half of an instruction, normally ROUND(...); ", %%xmm0"
 *             is appended. Passing "#" is meant to turn that line into an
 *             assembler comment so that no rounding is added.
 *   put     - asm fragment that stores the result from xmm2
 *             (a PUT_EVEN / PUT_ODD expansion)
 *
 * The row is shuffled into four operands, each is multiplied against one
 * table group with pmaddwd, and the products are summed into two 32-bit
 * vectors (xmm0 and xmm2). After rounding, their sum and difference are
 * shifted right by 11 and packed with signed saturation into xmm2: sum in
 * the low four words, difference in the high four words.
 *
 * Overwrites xmm0-xmm3. Ends with the local label "1:" so that a preceding
 * JZ(reg, "1f") skips the whole expansion.
 */
00176 #define iMTX_MULT(src, table, rounder, put) \
00177 "movdqa "src", %%xmm3 \n\t" \
00178 "movdqa %%xmm3, %%xmm0 \n\t" \
00179 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" \
00180 "punpcklqdq %%xmm0, %%xmm0 \n\t" \
00181 "pmaddwd "table", %%xmm0 \n\t" \
00182 "pmaddwd 16+"table", %%xmm1 \n\t" \
00183 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" \
00184 "punpckhqdq %%xmm3, %%xmm3 \n\t" \
00185 "pmaddwd 32+"table", %%xmm2 \n\t" \
00186 "pmaddwd 48+"table", %%xmm3 \n\t" \
00187 "paddd %%xmm1, %%xmm0 \n\t" \
00188 "paddd %%xmm3, %%xmm2 \n\t" \
00189 rounder", %%xmm0 \n\t" \
00190 "movdqa %%xmm2, %%xmm3 \n\t" \
00191 "paddd %%xmm0, %%xmm2 \n\t" \
00192 "psubd %%xmm3, %%xmm0 \n\t" \
00193 "psrad $11, %%xmm2 \n\t" \
00194 "psrad $11, %%xmm0 \n\t" \
00195 "packssdw %%xmm0, %%xmm2 \n\t" \
00196 put \
00197 "1: \n\t"
00198
/* Loads the tan3 and tan1 constants into the TAN3 and TAN1 registers, as
 * the column passes expect. On 32-bit builds those registers are xmm0 and
 * xmm2, which iMTX_MULT overwrites, so the load has to be repeated after any
 * later row pass. The last line ends in a backslash, so the definition
 * also takes in the line that follows it. */
00199 #define iLLM_HEAD \
00200 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
00201 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
00202
/*
 * Full column pass over all eight rows.
 *   dct - base-register string for the output; the eight result rows are
 *         stored with movdqa at k*16(dct), k = 0..7
 *
 * Inputs expected on entry:
 *   - odd rows in registers: row 1 = xmm6, row 3 = xmm4, row 5 = xmm5,
 *     row 7 = xmm7
 *   - even rows in ROW0/ROW2/ROW4/ROW6 (registers on x86-64, memory relative
 *     to %0 on 32-bit, where MOV_32_ONLY loads them into REGn)
 *   - TAN3 and TAN1 preloaded by iLLM_HEAD
 *
 * The odd rows are combined first (tan3/tan1 products, then the sqrt2
 * scaling with a doubling paddsw), then the even rows (tan2), and finally
 * the two halves are added/subtracted, shifted right by 6 and stored.
 * Arithmetic is 16-bit with signed saturation (paddsw/psubsw).
 *
 * On 32-bit builds TAN1 is spilled to (%0) while XMMS, which is the same
 * register there, is used as scratch, and is reloaded before the final
 * butterflies. The spill happens after row 0 has been loaded into REG0 and
 * is addressed through %0 directly, not through `dct`.
 *
 * Overwrites xmm0-xmm7 (plus xmm8-xmm14 on x86-64).
 */
00204 #define iLLM_PASS(dct) \
00205 "movdqa "TAN3", %%xmm1 \n\t" \
00206 "movdqa "TAN1", %%xmm3 \n\t" \
00207 "pmulhw %%xmm4, "TAN3" \n\t" \
00208 "pmulhw %%xmm5, %%xmm1 \n\t" \
00209 "paddsw %%xmm4, "TAN3" \n\t" \
00210 "paddsw %%xmm5, %%xmm1 \n\t" \
00211 "psubsw %%xmm5, "TAN3" \n\t" \
00212 "paddsw %%xmm4, %%xmm1 \n\t" \
00213 "pmulhw %%xmm7, %%xmm3 \n\t" \
00214 "pmulhw %%xmm6, "TAN1" \n\t" \
00215 "paddsw %%xmm6, %%xmm3 \n\t" \
00216 "psubsw %%xmm7, "TAN1" \n\t" \
00217 "movdqa %%xmm3, %%xmm7 \n\t" \
00218 "movdqa "TAN1", %%xmm6 \n\t" \
00219 "psubsw %%xmm1, %%xmm3 \n\t" \
00220 "psubsw "TAN3", "TAN1" \n\t" \
00221 "paddsw %%xmm7, %%xmm1 \n\t" \
00222 "paddsw %%xmm6, "TAN3" \n\t" \
00223 "movdqa %%xmm3, %%xmm6 \n\t" \
00224 "psubsw "TAN3", %%xmm3 \n\t" \
00225 "paddsw %%xmm6, "TAN3" \n\t" \
00226 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
00227 "pmulhw %%xmm4, %%xmm3 \n\t" \
00228 "pmulhw %%xmm4, "TAN3" \n\t" \
00229 "paddsw "TAN3", "TAN3" \n\t" \
00230 "paddsw %%xmm3, %%xmm3 \n\t" \
00231 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
00232 MOV_32_ONLY ROW2", "REG2" \n\t" \
00233 MOV_32_ONLY ROW6", "REG6" \n\t" \
00234 "movdqa %%xmm7, %%xmm5 \n\t" \
00235 "pmulhw "REG6", %%xmm7 \n\t" \
00236 "pmulhw "REG2", %%xmm5 \n\t" \
00237 "paddsw "REG2", %%xmm7 \n\t" \
00238 "psubsw "REG6", %%xmm5 \n\t" \
00239 MOV_32_ONLY ROW0", "REG0" \n\t" \
00240 MOV_32_ONLY ROW4", "REG4" \n\t" \
00241 MOV_32_ONLY" "TAN1", (%0) \n\t" \
00242 "movdqa "REG0", "XMMS" \n\t" \
00243 "psubsw "REG4", "REG0" \n\t" \
00244 "paddsw "XMMS", "REG4" \n\t" \
00245 "movdqa "REG4", "XMMS" \n\t" \
00246 "psubsw %%xmm7, "REG4" \n\t" \
00247 "paddsw "XMMS", %%xmm7 \n\t" \
00248 "movdqa "REG0", "XMMS" \n\t" \
00249 "psubsw %%xmm5, "REG0" \n\t" \
00250 "paddsw "XMMS", %%xmm5 \n\t" \
00251 "movdqa %%xmm5, "XMMS" \n\t" \
00252 "psubsw "TAN3", %%xmm5 \n\t" \
00253 "paddsw "XMMS", "TAN3" \n\t" \
00254 "movdqa "REG0", "XMMS" \n\t" \
00255 "psubsw %%xmm3, "REG0" \n\t" \
00256 "paddsw "XMMS", %%xmm3 \n\t" \
00257 MOV_32_ONLY" (%0), "TAN1" \n\t" \
00258 "psraw $6, %%xmm5 \n\t" \
00259 "psraw $6, "REG0" \n\t" \
00260 "psraw $6, "TAN3" \n\t" \
00261 "psraw $6, %%xmm3 \n\t" \
00262 "movdqa "TAN3", 1*16("dct") \n\t" \
00263 "movdqa %%xmm3, 2*16("dct") \n\t" \
00264 "movdqa "REG0", 5*16("dct") \n\t" \
00265 "movdqa %%xmm5, 6*16("dct") \n\t" \
00266 "movdqa %%xmm7, %%xmm0 \n\t" \
00267 "movdqa "REG4", %%xmm4 \n\t" \
00268 "psubsw %%xmm1, %%xmm7 \n\t" \
00269 "psubsw "TAN1", "REG4" \n\t" \
00270 "paddsw %%xmm0, %%xmm1 \n\t" \
00271 "paddsw %%xmm4, "TAN1" \n\t" \
00272 "psraw $6, %%xmm1 \n\t" \
00273 "psraw $6, %%xmm7 \n\t" \
00274 "psraw $6, "TAN1" \n\t" \
00275 "psraw $6, "REG4" \n\t" \
00276 "movdqa %%xmm1, ("dct") \n\t" \
00277 "movdqa "TAN1", 3*16("dct") \n\t" \
00278 "movdqa "REG4", 4*16("dct") \n\t" \
00279 "movdqa %%xmm7, 7*16("dct") \n\t"
00280
/*
 * Reduced column pass that reads only rows 0-3. ff_idct_xvid_sse2 reaches
 * it only when the zero tests for rows 4, 5, 6 and 7 all came back zero.
 *   dct - base-register string for the output; all eight result rows are
 *         still stored with movdqa at k*16(dct), k = 0..7
 *
 * Inputs expected on entry:
 *   - row 1 in xmm6 and row 3 in xmm4
 *   - rows 0 and 2 in ROW0/ROW2 (registers on x86-64, memory relative to %0
 *     on 32-bit, where MOV_32_ONLY loads them into REG0 and SREG2)
 *   - TAN3 and TAN1 preloaded by iLLM_HEAD
 *
 * Same structure as iLLM_PASS with the terms that would involve rows 4-7
 * left out. On 32-bit builds TAN1 is likewise spilled to (%0) around the
 * section that uses XMMS as scratch, addressed through %0 directly.
 *
 * Overwrites xmm0-xmm7 (on x86-64 also the registers behind
 * ROW0/ROW2/XMMS/TAN3/TAN1).
 */
00282 #define iLLM_PASS_SPARSE(dct) \
00283 "pmulhw %%xmm4, "TAN3" \n\t" \
00284 "paddsw %%xmm4, "TAN3" \n\t" \
00285 "movdqa %%xmm6, %%xmm3 \n\t" \
00286 "pmulhw %%xmm6, "TAN1" \n\t" \
00287 "movdqa %%xmm4, %%xmm1 \n\t" \
00288 "psubsw %%xmm1, %%xmm3 \n\t" \
00289 "paddsw %%xmm6, %%xmm1 \n\t" \
00290 "movdqa "TAN1", %%xmm6 \n\t" \
00291 "psubsw "TAN3", "TAN1" \n\t" \
00292 "paddsw %%xmm6, "TAN3" \n\t" \
00293 "movdqa %%xmm3, %%xmm6 \n\t" \
00294 "psubsw "TAN3", %%xmm3 \n\t" \
00295 "paddsw %%xmm6, "TAN3" \n\t" \
00296 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
00297 "pmulhw %%xmm4, %%xmm3 \n\t" \
00298 "pmulhw %%xmm4, "TAN3" \n\t" \
00299 "paddsw "TAN3", "TAN3" \n\t" \
00300 "paddsw %%xmm3, %%xmm3 \n\t" \
00301 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
00302 MOV_32_ONLY ROW2", "SREG2" \n\t" \
00303 "pmulhw "SREG2", %%xmm5 \n\t" \
00304 MOV_32_ONLY ROW0", "REG0" \n\t" \
00305 "movdqa "REG0", %%xmm6 \n\t" \
00306 "psubsw "SREG2", %%xmm6 \n\t" \
00307 "paddsw "REG0", "SREG2" \n\t" \
00308 MOV_32_ONLY" "TAN1", (%0) \n\t" \
00309 "movdqa "REG0", "XMMS" \n\t" \
00310 "psubsw %%xmm5, "REG0" \n\t" \
00311 "paddsw "XMMS", %%xmm5 \n\t" \
00312 "movdqa %%xmm5, "XMMS" \n\t" \
00313 "psubsw "TAN3", %%xmm5 \n\t" \
00314 "paddsw "XMMS", "TAN3" \n\t" \
00315 "movdqa "REG0", "XMMS" \n\t" \
00316 "psubsw %%xmm3, "REG0" \n\t" \
00317 "paddsw "XMMS", %%xmm3 \n\t" \
00318 MOV_32_ONLY" (%0), "TAN1" \n\t" \
00319 "psraw $6, %%xmm5 \n\t" \
00320 "psraw $6, "REG0" \n\t" \
00321 "psraw $6, "TAN3" \n\t" \
00322 "psraw $6, %%xmm3 \n\t" \
00323 "movdqa "TAN3", 1*16("dct") \n\t" \
00324 "movdqa %%xmm3, 2*16("dct") \n\t" \
00325 "movdqa "REG0", 5*16("dct") \n\t" \
00326 "movdqa %%xmm5, 6*16("dct") \n\t" \
00327 "movdqa "SREG2", %%xmm0 \n\t" \
00328 "movdqa %%xmm6, %%xmm4 \n\t" \
00329 "psubsw %%xmm1, "SREG2" \n\t" \
00330 "psubsw "TAN1", %%xmm6 \n\t" \
00331 "paddsw %%xmm0, %%xmm1 \n\t" \
00332 "paddsw %%xmm4, "TAN1" \n\t" \
00333 "psraw $6, %%xmm1 \n\t" \
00334 "psraw $6, "SREG2" \n\t" \
00335 "psraw $6, "TAN1" \n\t" \
00336 "psraw $6, %%xmm6 \n\t" \
00337 "movdqa %%xmm1, ("dct") \n\t" \
00338 "movdqa "TAN1", 3*16("dct") \n\t" \
00339 "movdqa %%xmm6, 4*16("dct") \n\t" \
00340 "movdqa "SREG2", 7*16("dct") \n\t"
00341
/**
 * In-place 8x8 inverse DCT implemented as one inline-asm statement.
 *
 * @param block eight rows of eight int16_t coefficients (128 bytes),
 *              overwritten with the result. Rows are accessed with movdqa,
 *              so the buffer must be 16-byte aligned.
 *
 * Structure: a row pass (iMTX_MULT) per row, then one column pass
 * (iLLM_PASS, or iLLM_PASS_SPARSE when rows 4-7 are all zero). Rows 0-2 are
 * always transformed; rows 3-7 are first tested for being all zero and
 * their row pass is skipped where the branches below allow it.
 *
 * NOTE(review): the clobber list names only eax/ecx/edx/esi and "memory".
 * The MMX registers mm0-mm2 and the XMM registers written by the macros are
 * not declared, and no emms is issued here after the MMX use. Presumably
 * callers and the build setup account for this; confirm before reusing the
 * function in another context.
 */
00342 inline void ff_idct_xvid_sse2(short *block)
00343 {
00344 __asm__ volatile(
/* mm0 = 127 in every byte, the constant the TEST_* macros rely on. */
00345 "movq "MANGLE(m127)", %%mm0 \n\t"
/* Rows 0-2: unconditional row pass. */
00346 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
00347 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00348 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00349
/* Zero masks: eax = row 3, ecx = row 4. The row registers are cleared first
 * so that a skipped row reads as zero in the column pass. */
00350 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
/* Row 3 all zero: jump to the "1:" that ends the next iMTX_MULT. */
00351 JZ("%%eax", "1f")
00352 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00353
/* Zero masks: eax = row 5 (reused), edx = row 6, esi = row 7. */
00354 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
00355 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
00356 iLLM_HEAD
00357 ASMALIGN(4)
/* Enter the row-pass chain at the first non-zero row among 4..7. */
00358 JNZ("%%ecx", "2f")
00359 JNZ("%%eax", "3f")
00360 JNZ("%%edx", "4f")
00361 JNZ("%%esi", "5f")
/* Rows 4-7 are all zero: reduced column pass, then done. */
00362 iLLM_PASS_SPARSE("%0")
00363 "jmp 6f \n\t"
00364 "2: \n\t"
/* Row 4: "#" in place of a rounder, so no rounding constant is added. */
00365 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
00366 "3: \n\t"
/* Row 5 is transformed whenever the chain was entered at 2 or 3. */
00367 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
00368 JZ("%%edx", "1f")
00369 "4: \n\t"
00370 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
00371 JZ("%%esi", "1f")
00372 "5: \n\t"
/* Row 7 uses the same rounder group (+5*16) as row 6. */
00373 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
/* On 32-bit, TAN3/TAN1 are xmm0/xmm2, which the row passes above have
 * overwritten, so the constants are loaded again. */
00374 #if !ARCH_X86_64
00375 iLLM_HEAD
00376 #endif
00377 iLLM_PASS("%0")
00378 "6: \n\t"
00379 : "+r"(block)
00380 :
00381 : "%eax", "%ecx", "%edx", "%esi", "memory");
00382 }
00383
/**
 * Runs ff_idct_xvid_sse2() on block in place, then passes the result to
 * put_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination pointer handed to put_pixels_clamped_mmx()
 * @param line_size handed to put_pixels_clamped_mmx(); presumably the
 *                  destination stride in bytes (helper defined elsewhere)
 * @param block     coefficient block; overwritten by the IDCT
 */
00384 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00385 {
00386 ff_idct_xvid_sse2(block);
00387 put_pixels_clamped_mmx(block, dest, line_size);
00388 }
00389
/**
 * Runs ff_idct_xvid_sse2() on block in place, then passes the result to
 * add_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination pointer handed to add_pixels_clamped_mmx()
 * @param line_size handed to add_pixels_clamped_mmx(); presumably the
 *                  destination stride in bytes (helper defined elsewhere)
 * @param block     coefficient block; overwritten by the IDCT
 */
00390 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00391 {
00392 ff_idct_xvid_sse2(block);
00393 add_pixels_clamped_mmx(block, dest, line_size);
00394 }