00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
/**
 * Chroma motion compensation for an 8-pixel-wide block using MMX inline asm.
 *
 * Three paths are taken depending on the fractional offsets:
 *   - x == 0 && y == 0: delegated entirely to H264_CHROMA_MC8_MV0().
 *   - exactly one of x, y is zero: two-tap filter between src[0] and
 *     src[dxy], result = (sum + rnd_reg[1]) >> 3.
 *   - both non-zero: four-tap filter over src[0], src[1] of the current row
 *     and of the next row, result = (sum + rnd_reg[0]) >> 6.
 *
 * @param dst     destination; 8 bytes are stored per row with movq
 * @param src     source; the two-tap path reads 8 bytes at src[0] and at
 *                src[dxy] for each row, the four-tap path reads 8 bytes at
 *                src[0] and at src[1] for h+1 rows
 * @param stride  added to both src and dst after each row
 * @param h       number of rows produced
 * @param x       horizontal fraction, asserted to be in [0, 8)
 * @param y       vertical fraction, asserted to be in [0, 8)
 * @param rnd_reg rounding constants: rnd_reg[0] is added before the >>6,
 *                rnd_reg[1] before the >>3
 *
 * NOTE(review): the asm statements below hand values to each other in
 * mm0-mm7 without listing any MMX register as an operand or clobber, so the
 * statement order must not be changed and the compiler must not emit MMX
 * code of its own in between.
 * NOTE(review): memory operands such as "m"(src[0]) name a single uint8_t
 * while the instruction accesses 8 bytes.
 * NOTE(review): no emms is issued here; presumably the caller is responsible
 * for that - confirm.
 */
00028 static void H264_CHROMA_MC8_TMPL(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y, const uint64_t *rnd_reg)
00029 {
      /* Memory spill slots for two of the four-tap weights; the loop needs
       * more registers than are left once mm4 is reused as scratch. */
00030 DECLARE_ALIGNED_8(uint64_t, AA);
00031 DECLARE_ALIGNED_8(uint64_t, DD);
00032 int i;
00033
00034 if(y==0 && x==0) {
00035
      /* No fractional offset: hand over to the MV0 helper (defined elsewhere). */
00036 H264_CHROMA_MC8_MV0(dst, src, stride, h);
00037 return;
00038 }
00039
00040 assert(x<8 && y<8 && x>=0 && y>=0);
00041
00042 if(y==0 || x==0)
00043 {
00044
      /* Two-tap case. The second tap is the next pixel (horizontal filter,
       * x != 0) or the same pixel one row down (vertical filter, x == 0). */
00045 const int dxy = x ? 1 : stride;
00046
      /* Set up constants:
       *   mm5 = x+y broadcast to four words (one of x, y is zero here, so
       *         this is the non-zero fraction)
       *   mm4 = ff_pw_8 - mm5 (presumably 8 - fraction per word; ff_pw_8 is
       *         defined elsewhere)
       *   mm6 = rnd_reg[1]
       *   mm7 = 0, used to zero-extend bytes to words */
00047 __asm__ volatile(
00048 "movd %0, %%mm5\n\t"
00049 "movq %1, %%mm4\n\t"
00050 "movq %2, %%mm6\n\t"
00051 "punpcklwd %%mm5, %%mm5\n\t"
00052 "punpckldq %%mm5, %%mm5\n\t"
00053 "pxor %%mm7, %%mm7\n\t"
00054 "psubw %%mm5, %%mm4\n\t"
00055 :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
00056
00057 for(i=0; i<h; i++) {
00058 __asm__ volatile(
00059
      /* mm0 = 8 bytes at src[0], mm2 = 8 bytes at src[dxy] */
00060 "movq %0, %%mm0\n\t"
00061 "movq %1, %%mm2\n\t"
00062 :: "m"(src[0]), "m"(src[dxy]));
00063
00064 __asm__ volatile(
00065
00066
      /* Widen to words (mm0/mm2 = low four pixels, mm1/mm3 = high four)
       * and weight: first tap by mm4, second tap by mm5. */
00067 "movq %%mm0, %%mm1\n\t"
00068 "movq %%mm2, %%mm3\n\t"
00069 "punpcklbw %%mm7, %%mm0\n\t"
00070 "punpckhbw %%mm7, %%mm1\n\t"
00071 "punpcklbw %%mm7, %%mm2\n\t"
00072 "punpckhbw %%mm7, %%mm3\n\t"
00073 "pmullw %%mm4, %%mm0\n\t"
00074 "pmullw %%mm4, %%mm1\n\t"
00075 "pmullw %%mm5, %%mm2\n\t"
00076 "pmullw %%mm5, %%mm3\n\t"
00077
00078
      /* Add rounding (mm6) and both taps, shift right by 3, then pack the
       * eight words back to bytes with unsigned saturation. */
00079 "paddw %%mm6, %%mm0\n\t"
00080 "paddw %%mm6, %%mm1\n\t"
00081 "paddw %%mm2, %%mm0\n\t"
00082 "paddw %%mm3, %%mm1\n\t"
00083 "psrlw $3, %%mm0\n\t"
00084 "psrlw $3, %%mm1\n\t"
00085 "packuswb %%mm1, %%mm0\n\t"
      /* H264_CHROMA_OP is defined outside this view; presumably it selects
       * between plain store and averaging with dst - confirm.
       * NOTE(review): if it reads %0, the "=m" constraint below should be
       * "+m". */
00086 H264_CHROMA_OP(%0, %%mm0)
00087 "movq %%mm0, %0\n\t"
00088 : "=m" (dst[0]));
00089
00090 src += stride;
00091 dst += stride;
00092 }
00093 return;
00094 }
00095
00096
      /* Four-tap case: derive the weights from x and y (all four words of
       * each register hold the same value):
       *   DD  = x*y
       *   mm5 = 8*x - x*y
       *   mm6 = 8*y - x*y
       *   AA  = x*y + ff_pw_64 - (8*x + 8*y)
       *         (equals (8-x)*(8-y) assuming ff_pw_64 holds words of 64;
       *         it is defined elsewhere)
       *   mm7 = 0 for the byte->word unpacks that follow */
00097 __asm__ volatile("movd %2, %%mm4\n\t"
00098 "movd %3, %%mm6\n\t"
00099 "punpcklwd %%mm4, %%mm4\n\t"
00100 "punpcklwd %%mm6, %%mm6\n\t"
00101 "punpckldq %%mm4, %%mm4\n\t"
00102 "punpckldq %%mm6, %%mm6\n\t"
00103 "movq %%mm4, %%mm5\n\t"
00104 "pmullw %%mm6, %%mm4\n\t"
00105 "psllw $3, %%mm5\n\t"
00106 "psllw $3, %%mm6\n\t"
00107 "movq %%mm5, %%mm7\n\t"
00108 "paddw %%mm6, %%mm7\n\t"
00109 "movq %%mm4, %1\n\t"
00110 "psubw %%mm4, %%mm5\n\t"
00111 "psubw %%mm4, %%mm6\n\t"
00112 "paddw %4, %%mm4\n\t"
00113 "psubw %%mm7, %%mm4\n\t"
00114 "pxor %%mm7, %%mm7\n\t"
00115 "movq %%mm4, %0\n\t"
00116 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
00117
00118 __asm__ volatile(
00119
      /* Prime the pipeline with the first row: mm0 = src[0..7], mm1 = src[1..8] */
00120 "movq %0, %%mm0\n\t"
00121 "movq %1, %%mm1\n\t"
00122 : : "m" (src[0]), "m" (src[1]));
00123
00124 for(i=0; i<h; i++) {
      /* src now points at the row below; mm0/mm1 still hold the row above. */
00125 src += stride;
00126
00127 __asm__ volatile(
00128
00129
      /* Upper row contribution:
       *   mm2 = AA * upper[0..3] + mm5 * upper[1..4]   (low four pixels)
       *   mm3 = AA * upper[4..7] + mm5 * upper[5..8]   (high four pixels) */
00130 "movq %%mm0, %%mm2\n\t"
00131 "movq %%mm1, %%mm3\n\t"
00132 "punpckhbw %%mm7, %%mm0\n\t"
00133 "punpcklbw %%mm7, %%mm1\n\t"
00134 "punpcklbw %%mm7, %%mm2\n\t"
00135 "punpckhbw %%mm7, %%mm3\n\t"
00136 "pmullw %0, %%mm0\n\t"
00137 "pmullw %0, %%mm2\n\t"
00138 "pmullw %%mm5, %%mm1\n\t"
00139 "pmullw %%mm5, %%mm3\n\t"
00140 "paddw %%mm1, %%mm2\n\t"
00141 "paddw %%mm0, %%mm3\n\t"
00142 : : "m" (AA));
00143
00144 __asm__ volatile(
00145
      /* Lower row, first tap: mm2/mm3 += mm6 * lower[0..7] */
00146 "movq %0, %%mm0\n\t"
00147 "movq %%mm0, %%mm1\n\t"
00148 "punpcklbw %%mm7, %%mm0\n\t"
00149 "punpckhbw %%mm7, %%mm1\n\t"
00150 "pmullw %%mm6, %%mm0\n\t"
00151 "pmullw %%mm6, %%mm1\n\t"
00152 "paddw %%mm0, %%mm2\n\t"
00153 "paddw %%mm1, %%mm3\n\t"
00154 : : "m" (src[0]));
00155
00156 __asm__ volatile(
00157
      /* Lower row, second tap: mm2/mm3 += DD * lower[1..8] (mm4 is scratch).
       * mm1 keeps the raw lower[1..8] bytes and the final movq reloads
       * lower[0..7] into mm0, so both are ready as the "upper row" of the
       * next iteration. */
00158 "movq %1, %%mm1\n\t"
00159 "movq %%mm1, %%mm0\n\t"
00160 "movq %%mm1, %%mm4\n\t"
00161 "punpcklbw %%mm7, %%mm0\n\t"
00162 "punpckhbw %%mm7, %%mm4\n\t"
00163 "pmullw %2, %%mm0\n\t"
00164 "pmullw %2, %%mm4\n\t"
00165 "paddw %%mm0, %%mm2\n\t"
00166 "paddw %%mm4, %%mm3\n\t"
00167 "movq %0, %%mm0\n\t"
00168 : : "m" (src[0]), "m" (src[1]), "m" (DD));
00169
00170 __asm__ volatile(
00171
      /* Add rnd_reg[0], shift right by 6, pack to bytes with unsigned
       * saturation, apply H264_CHROMA_OP (defined elsewhere) and store.
       * NOTE(review): same "=m" vs "+m" question as in the two-tap path. */
00172 "paddw %1, %%mm2\n\t"
00173 "paddw %1, %%mm3\n\t"
00174 "psrlw $6, %%mm2\n\t"
00175 "psrlw $6, %%mm3\n\t"
00176 "packuswb %%mm3, %%mm2\n\t"
00177 H264_CHROMA_OP(%0, %%mm2)
00178 "movq %%mm2, %0\n\t"
00179 : "=m" (dst[0]) : "m" (*rnd_reg));
00180 dst+= stride;
00181 }
00182 }
00183
/**
 * Chroma motion compensation for a 4-pixel-wide block using MMX inline asm.
 *
 * The filter is applied separably: each source row is first filtered
 * horizontally as (ff_pw_8 - x) * row[0..3] + x * row[1..4], then two
 * consecutive filtered rows are combined vertically as
 * ((ff_pw_8 - y) * upper + y * lower + *rnd_reg) >> 6.
 * Every horizontally filtered row is computed once and reused as the upper
 * row of the next output line.
 *
 * @param dst     destination; 4 bytes are stored per row with movd
 * @param src     source; 4 bytes are read at src[0] and at src[1] for h+1
 *                rows
 * @param stride  added to both src and dst after each row
 * @param h       number of rows; the loop emits two rows per pass and only
 *                exits when the counter reaches exactly zero after
 *                "sub $2", so h must be a positive even number
 * @param x       horizontal fraction (no range check is performed here)
 * @param y       vertical fraction (no range check is performed here)
 * @param rnd_reg only rnd_reg[0] is read; it is added before the >>6
 *
 * NOTE(review): the asm stores through %0 and loads through %1 but declares
 * no "memory" clobber and no MMX register clobbers.
 */
00184 static void H264_CHROMA_MC4_TMPL(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y, const uint64_t *rnd_reg)
00185 {
00186 __asm__ volatile(
      /* Constants: mm7 = 0, mm2 = x and mm3 = y broadcast to four words,
       * mm4 = ff_pw_8 - x, mm5 = ff_pw_8 - y. */
00187 "pxor %%mm7, %%mm7 \n\t"
00188 "movd %5, %%mm2 \n\t"
00189 "movd %6, %%mm3 \n\t"
00190 "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
00191 "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
00192 "punpcklwd %%mm2, %%mm2 \n\t"
00193 "punpcklwd %%mm3, %%mm3 \n\t"
00194 "punpcklwd %%mm2, %%mm2 \n\t"
00195 "punpcklwd %%mm3, %%mm3 \n\t"
00196 "psubw %%mm2, %%mm4 \n\t"
00197 "psubw %%mm3, %%mm5 \n\t"
00198
      /* Horizontally filter the first row into mm6 and advance src. */
00199 "movd (%1), %%mm0 \n\t"
00200 "movd 1(%1), %%mm6 \n\t"
00201 "add %3, %1 \n\t"
00202 "punpcklbw %%mm7, %%mm0 \n\t"
00203 "punpcklbw %%mm7, %%mm6 \n\t"
00204 "pmullw %%mm4, %%mm0 \n\t"
00205 "pmullw %%mm2, %%mm6 \n\t"
00206 "paddw %%mm0, %%mm6 \n\t"
00207
00208 "1: \n\t"
      /* First half: upper filtered row is in mm6. Filter the next row into
       * mm1 and keep a copy in mm0 for the second half. */
00209 "movd (%1), %%mm0 \n\t"
00210 "movd 1(%1), %%mm1 \n\t"
00211 "add %3, %1 \n\t"
00212 "punpcklbw %%mm7, %%mm0 \n\t"
00213 "punpcklbw %%mm7, %%mm1 \n\t"
00214 "pmullw %%mm4, %%mm0 \n\t"
00215 "pmullw %%mm2, %%mm1 \n\t"
00216 "paddw %%mm0, %%mm1 \n\t"
00217 "movq %%mm1, %%mm0 \n\t"
      /* Vertical blend: mm1 = (mm5*upper + mm3*lower + rnd) >> 6, packed
       * to bytes. mm6 is consumed here and passed to H264_CHROMA_OP4 as its
       * third argument (macro defined elsewhere; presumably a scratch
       * register - confirm). */
00218 "pmullw %%mm5, %%mm6 \n\t"
00219 "pmullw %%mm3, %%mm1 \n\t"
00220 "paddw %4, %%mm6 \n\t"
00221 "paddw %%mm6, %%mm1 \n\t"
00222 "psrlw $6, %%mm1 \n\t"
00223 "packuswb %%mm1, %%mm1 \n\t"
00224 H264_CHROMA_OP4((%0), %%mm1, %%mm6)
00225 "movd %%mm1, (%0) \n\t"
00226 "add %3, %0 \n\t"
      /* Second half: same steps with the roles of mm0 and mm6 swapped, so
       * mm6 again holds the most recent filtered row when looping back. */
00227 "movd (%1), %%mm6 \n\t"
00228 "movd 1(%1), %%mm1 \n\t"
00229 "add %3, %1 \n\t"
00230 "punpcklbw %%mm7, %%mm6 \n\t"
00231 "punpcklbw %%mm7, %%mm1 \n\t"
00232 "pmullw %%mm4, %%mm6 \n\t"
00233 "pmullw %%mm2, %%mm1 \n\t"
00234 "paddw %%mm6, %%mm1 \n\t"
00235 "movq %%mm1, %%mm6 \n\t"
00236 "pmullw %%mm5, %%mm0 \n\t"
00237 "pmullw %%mm3, %%mm1 \n\t"
00238 "paddw %4, %%mm0 \n\t"
00239 "paddw %%mm0, %%mm1 \n\t"
00240 "psrlw $6, %%mm1 \n\t"
00241 "packuswb %%mm1, %%mm1 \n\t"
00242 H264_CHROMA_OP4((%0), %%mm1, %%mm0)
00243 "movd %%mm1, (%0) \n\t"
00244 "add %3, %0 \n\t"
      /* Two rows were produced; loop until the row counter hits zero. */
00245 "sub $2, %2 \n\t"
00246 "jnz 1b \n\t"
      /* %0 = dst, %1 = src, %2 = h (all updated in place),
       * %3 = stride, %4 = rnd_reg[0], %5 = x, %6 = y */
00247 : "+r"(dst), "+r"(src), "+r"(h)
00248 : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
00249 );
00250 }
00251
/* The 2-pixel-wide variant is only compiled when the including file defines
 * H264_CHROMA_MC2_TMPL. */
00252 #ifdef H264_CHROMA_MC2_TMPL
/**
 * Chroma motion compensation for a 2-pixel-wide block using MMX inline asm.
 *
 * Both output pixels of a row are produced at once with pmaddwd: the source
 * pixels are arranged as words {s0, s1, s1, s2} and multiplied against a
 * repeated weight pair, giving one dword sum per output pixel.
 *
 * @param dst     destination; 2 bytes are stored per row with movw
 * @param src     source; 4 bytes are read per row with movd, for h+1 rows
 * @param stride  added to both src and dst after each row
 * @param h       number of rows; the loop body runs before the counter is
 *                tested, so h must be at least 1
 * @param x       horizontal fraction (no range check is performed here;
 *                the weight packing below presumably relies on 0 <= x <= 8
 *                - confirm against callers)
 * @param y       vertical fraction (same caveat as x)
 *
 * Rounding uses ff_pw_32 directly; unlike the 8- and 4-wide variants there
 * is no rnd_reg parameter.
 *
 * NOTE(review): the second asm stores through %0 and loads through %1 but
 * declares no "memory" clobber and no MMX register clobbers; it also relies
 * on mm2, mm5, mm6 and mm7 surviving from the first asm statement.
 */
00253 static void H264_CHROMA_MC2_TMPL(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
00254 {
      /* Pack two 16-bit weights into one int:
       *   tmp = 65535*x + 8 = (x << 16) + (8 - x)  -> high word x, low word 8-x
       *   CD  = tmp * y                            -> high word x*y, low word (8-x)*y
       *   AB  = 8*tmp - CD = tmp * (8 - y)         -> high word x*(8-y), low word (8-x)*(8-y)
       * This holds as long as no low-word product carries into the high word. */
00255 int tmp = ((1<<16)-1)*x + 8;
00256 int CD= tmp*y;
00257 int AB= (tmp<<3) - CD;
00258 __asm__ volatile(
00259
00260
      /* mm5 = AB duplicated into both dwords, mm6 = CD duplicated likewise */
00261 "movd %0, %%mm5\n\t"
00262 "movd %1, %%mm6\n\t"
00263 "punpckldq %%mm5, %%mm5\n\t"
00264 "punpckldq %%mm6, %%mm6\n\t"
00265 "pxor %%mm7, %%mm7\n\t"
00266
      /* mm2 = first row as words {s0, s1, s1, s2}: pshufw selector 0x94
       * picks source words 0, 1, 1, 2. */
00267 "movd %2, %%mm2\n\t"
00268 "punpcklbw %%mm7, %%mm2\n\t"
00269 "pshufw $0x94, %%mm2, %%mm2\n\t"
00270 :: "r"(AB), "r"(CD), "m"(src[0]));
00271
00272
00273 __asm__ volatile(
00274 "1:\n\t"
      /* step src to the lower row */
00275 "add %4, %1\n\t"
00276
      /* mm1 = upper-row contribution: one dword per output pixel,
       * low-word weight * left pixel + high-word weight * right pixel (AB) */
00277 "movq %%mm2, %%mm1\n\t"
00278 "pmaddwd %%mm5, %%mm1\n\t"
00279
      /* mm0 = lower row arranged as {t0, t1, t1, t2} */
00280 "movd (%1), %%mm0\n\t"
00281 "punpcklbw %%mm7, %%mm0\n\t"
00282 "pshufw $0x94, %%mm0, %%mm0\n\t"
00283
      /* Keep the shuffled lower row in mm2 for the next iteration, then add
       * its contribution (CD weights) and the ff_pw_32 rounding term. */
00284 "movq %%mm0, %%mm2\n\t"
00285 "pmaddwd %%mm6, %%mm0\n\t"
00286 "paddw %3, %%mm1\n\t"
00287 "paddw %%mm0, %%mm1\n\t"
00288
      /* Shift right by 6 (per word), then narrow dword -> word -> byte so
       * the two results end up in the low two bytes of mm1. */
00289 "psrlw $6, %%mm1\n\t"
00290 "packssdw %%mm7, %%mm1\n\t"
00291 "packuswb %%mm7, %%mm1\n\t"
      /* H264_CHROMA_OP4 is defined outside this view; mm3 is passed as its
       * third argument (presumably a scratch register - confirm). */
00292 H264_CHROMA_OP4((%0), %%mm1, %%mm3)
      /* Only two bytes may be written, so go through esi and use a 16-bit
       * store; esi is declared as clobbered below. */
00293 "movd %%mm1, %%esi\n\t"
00294 "movw %%si, (%0)\n\t"
00295 "add %4, %0\n\t"
00296 "sub $1, %2\n\t"
00297 "jnz 1b\n\t"
      /* %0 = dst, %1 = src, %2 = h (all updated in place),
       * %3 = ff_pw_32, %4 = stride */
00298 : "+r" (dst), "+r"(src), "+r"(h)
00299 : "m" (ff_pw_32), "r"((x86_reg)stride)
00300 : "%esi");
00301
00302 }
00303 #endif
00304