#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

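/*
 * Every lowpass kernel below implements the H.264 6-tap luma interpolation
 * filter. As a reading aid, here is a minimal scalar sketch of one output
 * sample (illustrative only; it is not wired into any function table, and
 * av_unused/av_clip_uint8() are assumed to be visible via dsputil_mmx.h):
 */
static av_unused uint8_t h264_qpel_lowpass_ref(int a, int b, int c,
                                               int d, int e, int f)
{
    /* out = clip((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5) */
    return av_clip_uint8((a - 5*b + 20*(c + d) - 5*e + f + 16) >> 5);
}
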
/* One output row of the vertical 6-tap filter. A..E hold five source rows
 * unpacked to 16-bit words, F receives the sixth row (loaded and unpacked
 * here), T is a scratch register and Z must be zero. Computes
 * (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5, packs to bytes and stores
 * through OP; d and q select the load/store widths (movd/movq for the MMX
 * variant, movq/movdqa for the XMM one). */
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
        "mov"#q" "#C", "#T" \n\t"\
        "mov"#d" (%0), "#F" \n\t"\
        "paddw "#D", "#T" \n\t"\
        "psllw $2, "#T" \n\t"\
        "psubw "#B", "#T" \n\t"\
        "psubw "#E", "#T" \n\t"\
        "punpcklbw "#Z", "#F" \n\t"\
        "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
        "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
        "add %2, %0 \n\t"\
        "paddw "#F", "#A" \n\t"\
        "paddw "#A", "#T" \n\t"\
        "psraw $5, "#T" \n\t"\
        "packuswb "#T", "#T" \n\t"\
        OP(T, (%1), A, d)\
        "add %3, %1 \n\t"

/* Same vertical filter step, but without the final scaling: the biased
 * 16-bit intermediate A - 5*B + 20*C + 20*D - 5*E + F + 16 is stored to
 * the scratch buffer at byte offset OF for the second (horizontal) pass. */
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
        "mov"#q" "#C", "#T" \n\t"\
        "mov"#d" (%0), "#F" \n\t"\
        "paddw "#D", "#T" \n\t"\
        "psllw $2, "#T" \n\t"\
        "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
        "psubw "#B", "#T" \n\t"\
        "psubw "#E", "#T" \n\t"\
        "punpcklbw "#Z", "#F" \n\t"\
        "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
        "paddw "#F", "#A" \n\t"\
        "add %2, %0 \n\t"\
        "paddw "#A", "#T" \n\t"\
        "mov"#q" "#T", "#OF"(%1) \n\t"

/* The MMX variants process 4 pixels per step, the XMM variants 8. */
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)

/* Expands the put/avg 4-, 8- and 16-wide horizontal, vertical and 2D (hv)
 * lowpass primitives for one MMX-class instruction set. OP is the store
 * macro (PUT_OP or an AVG_*_OP) and PAVGB the byte-average instruction of
 * the target ISA. */
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=4;\
\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "1: \n\t"\
        "movd -1(%0), %%mm1 \n\t"\
        "movd (%0), %%mm2 \n\t"\
        "movd 1(%0), %%mm3 \n\t"\
        "movd 2(%0), %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "paddw %%mm0, %%mm1 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "movd -2(%0), %%mm0 \n\t"\
        "movd 3(%0), %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm3, %%mm0 \n\t"\
        "psllw $2, %%mm2 \n\t"\
        "psubw %%mm1, %%mm2 \n\t"\
        "pmullw %%mm4, %%mm2 \n\t"\
        "paddw %%mm5, %%mm0 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm6, d)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=4;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq %0, %%mm4 \n\t"\
        "movq %1, %%mm5 \n\t"\
        :: "m"(ff_pw_5), "m"(ff_pw_16)\
    );\
    do{\
        __asm__ volatile(\
            "movd -1(%0), %%mm1 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "movd 1(%0), %%mm3 \n\t"\
            "movd 2(%0), %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "paddw %%mm0, %%mm1 \n\t"\
            "paddw %%mm3, %%mm2 \n\t"\
            "movd -2(%0), %%mm0 \n\t"\
            "movd 3(%0), %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "paddw %%mm3, %%mm0 \n\t"\
            "psllw $2, %%mm2 \n\t"\
            "psubw %%mm1, %%mm2 \n\t"\
            "pmullw %%mm4, %%mm2 \n\t"\
            "paddw %%mm5, %%mm0 \n\t"\
            "paddw %%mm2, %%mm0 \n\t"\
            "movd (%2), %%mm3 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "packuswb %%mm0, %%mm0 \n\t"\
            PAVGB" %%mm3, %%mm0 \n\t"\
            OP(%%mm0, (%1),%%mm6, d)\
            "add %4, %0 \n\t"\
            "add %4, %1 \n\t"\
            "add %3, %2 \n\t"\
            : "+a"(src), "+c"(dst), "+d"(src2)\
            : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    src -= 2*srcStride;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movd (%0), %%mm0 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm1 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm3 \n\t"\
        "add %2, %0 \n\t"\
        "movd (%0), %%mm4 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpcklbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int h=4;\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - 9*srcStride;\
    }\
    tmp -= 3*4;\
    __asm__ volatile(\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "paddw 10(%0), %%mm0 \n\t"\
        "movq 2(%0), %%mm1 \n\t"\
        "paddw 8(%0), %%mm1 \n\t"\
        "movq 4(%0), %%mm2 \n\t"\
        "paddw 6(%0), %%mm2 \n\t"\
        "psubw %%mm1, %%mm0 \n\t"\
        "psraw $2, %%mm0 \n\t"\
        "psubw %%mm1, %%mm0 \n\t"\
        "paddsw %%mm2, %%mm0 \n\t"\
        "psraw $2, %%mm0 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "psraw $6, %%mm0 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm7, d)\
        "add $24, %0 \n\t"\
        "add %3, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "psllw $2, %%mm0 \n\t"\
        "psllw $2, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movd -2(%0), %%mm2 \n\t"\
        "movd 7(%0), %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm4, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "psllw $2, %%mm0 \n\t"\
        "psllw $2, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movd -2(%0), %%mm2 \n\t"\
        "movd 7(%0), %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpcklbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm3, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
        "paddw %%mm5, %%mm2 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm4, %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "movq (%2), %%mm4 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        PAVGB" %%mm4, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : "memory"\
    );\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int w= 2;\
    src -= 2*srcStride;\
\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            "cmpl $16, %4 \n\t"\
            "jne 2f \n\t"\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            "2: \n\t"\
\
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
            : "memory"\
        );\
        src += 4-(h+5)*srcStride;\
        dst += 4-h*dstStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
            "cmpl $16, %3 \n\t"\
            "jne 2f \n\t"\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
            "2: \n\t"\
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\
            : "memory"\
        );\
        tmp += 4;\
        src += 4 - (size+5)*srcStride;\
    }\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        int h = size;\
        __asm__ volatile(\
            "1: \n\t"\
            "movq (%0), %%mm0 \n\t"\
            "movq 8(%0), %%mm3 \n\t"\
            "movq 2(%0), %%mm1 \n\t"\
            "movq 10(%0), %%mm4 \n\t"\
            "paddw %%mm4, %%mm0 \n\t"\
            "paddw %%mm3, %%mm1 \n\t"\
            "paddw 18(%0), %%mm3 \n\t"\
            "paddw 16(%0), %%mm4 \n\t"\
            "movq 4(%0), %%mm2 \n\t"\
            "movq 12(%0), %%mm5 \n\t"\
            "paddw 6(%0), %%mm2 \n\t"\
            "paddw 14(%0), %%mm5 \n\t"\
            "psubw %%mm1, %%mm0 \n\t"\
            "psubw %%mm4, %%mm3 \n\t"\
            "psraw $2, %%mm0 \n\t"\
            "psraw $2, %%mm3 \n\t"\
            "psubw %%mm1, %%mm0 \n\t"\
            "psubw %%mm4, %%mm3 \n\t"\
            "paddsw %%mm2, %%mm0 \n\t"\
            "paddsw %%mm5, %%mm3 \n\t"\
            "psraw $2, %%mm0 \n\t"\
            "psraw $2, %%mm3 \n\t"\
            "paddw %%mm2, %%mm0 \n\t"\
            "paddw %%mm5, %%mm3 \n\t"\
            "psraw $6, %%mm0 \n\t"\
            "psraw $6, %%mm3 \n\t"\
            "packuswb %%mm3, %%mm0 \n\t"\
            OP(%%mm0, (%1),%%mm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
        tmp += 8 - size*24;\
        dst += 8 - size*dstStride;\
    }while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    __asm__ volatile(\
        "movq (%1), %%mm0 \n\t"\
        "movq 24(%1), %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"\
        PAVGB" (%0), %%mm0 \n\t"\
        PAVGB" (%0,%3), %%mm1 \n\t"\
        OP(%%mm0, (%2), %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        "lea (%0,%3,2), %0 \n\t"\
        "lea (%2,%4,2), %2 \n\t"\
        "movq 48(%1), %%mm0 \n\t"\
        "movq 72(%1), %%mm1 \n\t"\
        "psraw $5, %%mm0 \n\t"\
        "psraw $5, %%mm1 \n\t"\
        "packuswb %%mm0, %%mm0 \n\t"\
        "packuswb %%mm1, %%mm1 \n\t"\
        PAVGB" (%0), %%mm0 \n\t"\
        PAVGB" (%0,%3), %%mm1 \n\t"\
        OP(%%mm0, (%2), %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        :"+a"(src8), "+c"(src16), "+d"(dst)\
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
        :"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    do{\
        __asm__ volatile(\
            "movq (%1), %%mm0 \n\t"\
            "movq 8(%1), %%mm1 \n\t"\
            "movq 48(%1), %%mm2 \n\t"\
            "movq 8+48(%1), %%mm3 \n\t"\
            "psraw $5, %%mm0 \n\t"\
            "psraw $5, %%mm1 \n\t"\
            "psraw $5, %%mm2 \n\t"\
            "psraw $5, %%mm3 \n\t"\
            "packuswb %%mm1, %%mm0 \n\t"\
            "packuswb %%mm3, %%mm2 \n\t"\
            PAVGB" (%0), %%mm0 \n\t"\
            PAVGB" (%0,%3), %%mm2 \n\t"\
            OP(%%mm0, (%2), %%mm5, q)\
            OP(%%mm2, (%2,%4), %%mm5, q)\
            ::"a"(src8), "c"(src16), "d"(dst),\
              "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
            :"memory");\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\


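/*
 * QPEL_H264(put_, PUT_OP, mmx2) further below expands the block above into
 * concrete primitives such as put_h264_qpel8_h_lowpass_mmx2(). A purely
 * illustrative call (buffer names hypothetical), filtering one 8x8 block
 * with 16-byte strides:
 *
 *     put_h264_qpel8_h_lowpass_mmx2(dst, src, 16, 16);
 */
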
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=16;\
    __asm__ volatile(\
        "pxor %%xmm15, %%xmm15 \n\t"\
        "movdqa %6, %%xmm14 \n\t"\
        "movdqa %7, %%xmm13 \n\t"\
        "1: \n\t"\
        "lddqu 6(%0), %%xmm1 \n\t"\
        "lddqu -2(%0), %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm15, %%xmm1 \n\t"\
        "punpcklbw %%xmm15, %%xmm0 \n\t"\
        "punpcklbw %%xmm15, %%xmm7 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm0, %%xmm6 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm0, %%xmm8 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm0, %%xmm9 \n\t"\
        "movdqa %%xmm0, %%xmm12 \n\t"\
        "movdqa %%xmm1, %%xmm11 \n\t"\
        "palignr $10,%%xmm0, %%xmm11\n\t"\
        "palignr $10,%%xmm7, %%xmm12\n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $2, %%xmm7, %%xmm9 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $4, %%xmm7, %%xmm8 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $6, %%xmm7, %%xmm6 \n\t"\
        "paddw %%xmm0 ,%%xmm11 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $8, %%xmm7, %%xmm0 \n\t"\
        "paddw %%xmm12,%%xmm7 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm8, %%xmm6 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "paddw %%xmm9, %%xmm0 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psllw $2, %%xmm6 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "psubw %%xmm0, %%xmm6 \n\t"\
        "paddw %%xmm13,%%xmm11 \n\t"\
        "paddw %%xmm13,%%xmm7 \n\t"\
        "pmullw %%xmm14,%%xmm2 \n\t"\
        "pmullw %%xmm14,%%xmm6 \n\t"\
        "lddqu (%2), %%xmm3 \n\t"\
        "paddw %%xmm11,%%xmm2 \n\t"\
        "paddw %%xmm7, %%xmm6 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "psraw $5, %%xmm6 \n\t"\
        "packuswb %%xmm2,%%xmm6 \n\t"\
        "pavgb %%xmm3, %%xmm6 \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
                       "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
                       "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
                       "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
          "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
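/*
 * With the 16 XMM registers of x86-64, the 16-wide filter-and-average runs
 * as a single pass per row; the 32-bit fallback above simply issues four
 * 8-wide calls instead.
 */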

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1: \n\t"\
        "lddqu -2(%0), %%xmm1 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw %%xmm5, %%xmm0 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "movq (%2), %%xmm3 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw %%xmm6, %%xmm2 \n\t"\
        "paddw %%xmm0, %%xmm2 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "packuswb %%xmm2, %%xmm2 \n\t"\
        "pavgb %%xmm3, %%xmm2 \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %5, %0 \n\t"\
        "add %5, %1 \n\t"\
        "add %4, %2 \n\t"\
        "decl %3 \n\t"\
        "jg 1b \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
        "1: \n\t"\
        "lddqu -2(%0), %%xmm1 \n\t"\
        "movdqa %%xmm1, %%xmm0 \n\t"\
        "punpckhbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "movdqa %%xmm1, %%xmm2 \n\t"\
        "movdqa %%xmm1, %%xmm3 \n\t"\
        "movdqa %%xmm1, %%xmm4 \n\t"\
        "movdqa %%xmm1, %%xmm5 \n\t"\
        "palignr $2, %%xmm0, %%xmm4 \n\t"\
        "palignr $4, %%xmm0, %%xmm3 \n\t"\
        "palignr $6, %%xmm0, %%xmm2 \n\t"\
        "palignr $8, %%xmm0, %%xmm1 \n\t"\
        "palignr $10,%%xmm0, %%xmm5 \n\t"\
        "paddw %%xmm5, %%xmm0 \n\t"\
        "paddw %%xmm3, %%xmm2 \n\t"\
        "paddw %%xmm4, %%xmm1 \n\t"\
        "psllw $2, %%xmm2 \n\t"\
        "psubw %%xmm1, %%xmm2 \n\t"\
        "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
        "pmullw %%xmm6, %%xmm2 \n\t"\
        "paddw %%xmm0, %%xmm2 \n\t"\
        "psraw $5, %%xmm2 \n\t"\
        "packuswb %%xmm2, %%xmm2 \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7 \n\t"\
        "movq (%0), %%xmm0 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm1 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm2 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm3 \n\t"\
        "add %2, %0 \n\t"\
        "movq (%0), %%xmm4 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm1 \n\t"\
        "punpcklbw %%xmm7, %%xmm2 \n\t"\
        "punpcklbw %%xmm7, %%xmm3 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        "cmpl $16, %4 \n\t"\
        "jne 2f \n\t"\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        "2: \n\t"\
\
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
          "memory"\
    );\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7 \n\t"
            "movq (%0), %%xmm0 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm1 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm2 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm3 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm4 \n\t"
            "add %2, %0 \n\t"
            "punpcklbw %%xmm7, %%xmm0 \n\t"
            "punpcklbw %%xmm7, %%xmm1 \n\t"
            "punpcklbw %%xmm7, %%xmm2 \n\t"
            "punpcklbw %%xmm7, %%xmm3 \n\t"
            "punpcklbw %%xmm7, %%xmm4 \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            "cmpl $16, %3 \n\t"
            "jne 2f \n\t"
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
            "2: \n\t"
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
              "memory"
        );
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}

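/*
 * The two-pass hv pipeline works on a 16-bit scratch buffer with rows 48
 * bytes (24 int16_t) apart: hv1 stores the vertical intermediates
 * t = a - 5b + 20c + 20d - 5e + f + 16, then hv2 applies the horizontal
 * 6-tap to those and shifts right by 10 in stages (>>2, >>2, >>6); the
 * per-sample +16 biases sum to exactly the +512 rounding term, since the
 * filter taps 1-5+20+20-5+1 add up to 32.
 */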
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 32(%0), %%xmm4 \n\t"\
            "movdqa 16(%0), %%xmm5 \n\t"\
            "movdqa (%0), %%xmm7 \n\t"\
            "movdqa %%xmm4, %%xmm3 \n\t"\
            "movdqa %%xmm4, %%xmm2 \n\t"\
            "movdqa %%xmm4, %%xmm1 \n\t"\
            "movdqa %%xmm4, %%xmm0 \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr $8, %%xmm5, %%xmm1 \n\t"\
            "palignr $6, %%xmm5, %%xmm2 \n\t"\
            "palignr $4, %%xmm5, %%xmm3 \n\t"\
            "palignr $2, %%xmm5, %%xmm4 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "movdqa %%xmm5, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm3 \n\t"\
            "palignr $8, %%xmm7, %%xmm4 \n\t"\
            "palignr $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm4 \n\t"\
            "movdqa %%xmm5, %%xmm6 \n\t"\
            "palignr $6, %%xmm7, %%xmm5 \n\t"\
            "palignr $4, %%xmm7, %%xmm6 \n\t"\
            "paddw %%xmm7, %%xmm3 \n\t"\
            "paddw %%xmm6, %%xmm5 \n\t"\
\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psubw %%xmm4, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psraw $2, %%xmm3 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "paddw %%xmm5, %%xmm3 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "psraw $6, %%xmm3 \n\t"\
            "packuswb %%xmm0, %%xmm3 \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
              "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1: \n\t"\
            "movdqa 16(%0), %%xmm1 \n\t"\
            "movdqa (%0), %%xmm0 \n\t"\
            "movdqa %%xmm1, %%xmm2 \n\t"\
            "movdqa %%xmm1, %%xmm3 \n\t"\
            "movdqa %%xmm1, %%xmm4 \n\t"\
            "movdqa %%xmm1, %%xmm5 \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr $8, %%xmm0, %%xmm4 \n\t"\
            "palignr $6, %%xmm0, %%xmm3 \n\t"\
            "palignr $4, %%xmm0, %%xmm2 \n\t"\
            "palignr $2, %%xmm0, %%xmm1 \n\t"\
            "paddw %%xmm5, %%xmm0 \n\t"\
            "paddw %%xmm4, %%xmm1 \n\t"\
            "paddw %%xmm3, %%xmm2 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "psubw %%xmm1, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $2, %%xmm0 \n\t"\
            "paddw %%xmm2, %%xmm0 \n\t"\
            "psraw $6, %%xmm0 \n\t"\
            "packuswb %%xmm0, %%xmm0 \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0 \n\t"\
            "add %3, %1 \n\t"\
            "decl %2 \n\t"\
            " jnz 1b \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                           "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
              "memory"\
        );\
    }\
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

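/* Where a newer instruction set would not improve on an existing version
 * (narrow blocks, plain averaging), the name is simply aliased to that
 * version instead of generating new code: ssse3 -> sse2, sse2 -> mmx2. */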
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2

#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

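/*
 * Each H264_MC_* block expands one entry point per quarter-pel position
 * mcXY (X = horizontal, Y = vertical fraction). For example, mc12 stores
 * the 16-bit vertical-filter intermediates (already +16 biased) in halfV
 * and the 2D halfpel in halfHV, then pixels*_l2_shift5 averages
 * clip(halfV >> 5) with halfHV using pavgb.
 */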
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

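/* Instantiation helpers: the MMX-class ISAs get all of the 4x4, 8x8 and
 * 16x16 entry points; the XMM paths instantiate only the 8 and 16 pixel
 * cases, and only for the pass types they actually override. */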
#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3_INLINE
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3_INLINE
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif

#endif /* HAVE_INLINE_ASM */

/* Prototypes for the 10-bit-depth motion-compensation entry points; the
 * definitions live in external assembly. */
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, uint8_t *src, int stride);

#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)

LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)

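/* At bit depth 10 each sample is 2 bytes wide, so the right half of a
 * 16-pixel block starts 16 bytes in, while strides stay in bytes. */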
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
    src += 8*stride;\
    dst += 8*stride;\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}

#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)

#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)

#if CONFIG_H264QPEL && ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+
QPEL16(mmxext)
#endif