#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"

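/* pixel operation constants */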
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    = 0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };

#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
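/* With PIC the constant pool cannot be addressed directly, so these
 * variants synthesize the constants in the register instead. */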
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "packuswb %%"#regd", %%"#regd" \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "psllw $1, %%"#regd" \n\t" ::)

#endif

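/* Byte-wise average of rega and regb into regr (regr also serves as the
 * temporary); rega is left unmodified, regb is trashed, and regfe must
 * hold 0xfefefefefefefefe (see MOVQ_BFE). */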
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "paddb "#regb", "#regr" \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t"

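/* %%mm6 is expected to hold 0xfefefefefefefefe here (loaded with MOVQ_BFE). */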
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pand "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "paddb "#regb", "#regr" \n\t" \
    "paddb "#regd", "#regp" \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "por "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"

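/***********************************/
/* MMX no rounding */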
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

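/***********************************/
/* MMX rounding */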
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

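/***********************************/
/* 3Dnow specific */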
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

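/***********************************/
/* MMXEXT (MMX2) specific */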
#define DEF(x) x ## _mmx2

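/* pavgb is only available with MMXEXT and later instruction sets. */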
#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx

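/***********************************/
/* standard MMX */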
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

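    /* read the pixels */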
    p   = block;
    pix = pixels;

    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

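    /* same sequence again for the remaining four rows */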
    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t" \
    "movq 16 + "#off"(%2), %%mm2 \n\t" \
    "movq 32 + "#off"(%2), %%mm3 \n\t" \
    "movq 48 + "#off"(%2), %%mm4 \n\t" \
    "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
    "paddb %%mm0, %%mm1 \n\t" \
    "paddb %%mm0, %%mm2 \n\t" \
    "paddb %%mm0, %%mm3 \n\t" \
    "paddb %%mm0, %%mm4 \n\t" \
    "movq %%mm1, (%0) \n\t" \
    "movq %%mm2, (%0, %3) \n\t" \
    "movq %%mm3, (%0, %3, 2) \n\t" \
    "movq %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

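    /* read the pixels */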
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq (%2), %%mm0 \n\t"
            "movq 8(%2), %%mm1 \n\t"
            "movq 16(%2), %%mm2 \n\t"
            "movq 24(%2), %%mm3 \n\t"
            "movq %0, %%mm4 \n\t"
            "movq %1, %%mm6 \n\t"
            "movq %%mm4, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0 \n\t"
            "paddsw %%mm5, %%mm1 \n\t"
            "movq %%mm6, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2 \n\t"
            "paddsw %%mm5, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %0 \n\t"
            "movq %%mm2, %1 \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

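/* Copy an h-row block from 'pixels' to 'block' (both with stride
 * 'line_size'), four rows per loop iteration. */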
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

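/* SSE2 versions: loads from 'pixels' are unaligned (movdqu), while stores
 * assume a 16-byte-aligned 'block' (movdqa). */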
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1, %3), %%xmm1 \n\t"
        "movdqu (%1, %3, 2), %%xmm2 \n\t"
        "movdqu (%1, %4), %%xmm3 \n\t"
        "lea (%1, %3, 4), %1 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2, %3) \n\t"
        "movdqa %%xmm2, (%2, %3, 2) \n\t"
        "movdqa %%xmm3, (%2, %4) \n\t"
        "subl $4, %0 \n\t"
        "lea (%2, %3, 4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1, %3), %%xmm1 \n\t"
        "movdqu (%1, %3, 2), %%xmm2 \n\t"
        "movdqu (%1, %4), %%xmm3 \n\t"
        "lea (%1, %3, 4), %1 \n\t"
        "pavgb (%2), %%xmm0 \n\t"
        "pavgb (%2, %3), %%xmm1 \n\t"
        "pavgb (%2, %3, 2), %%xmm2 \n\t"
        "pavgb (%2, %4), %%xmm3 \n\t"
        "movdqa %%xmm0, (%2) \n\t"
        "movdqa %%xmm1, (%2, %3) \n\t"
        "movdqa %%xmm2, (%2, %3, 2) \n\t"
        "movdqa %%xmm3, (%2, %4) \n\t"
        "subl $4, %0 \n\t"
        "lea (%2, %3, 4), %2 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}

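/* Zero n consecutive 64-coefficient (128-byte) DCT blocks. */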
#define CLEAR_BLOCKS(name, n) \
static void name(DCTELEM *blocks) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "mov %1, %%"REG_a" \n\t" \
        "1: \n\t" \
        "movq %%mm7, (%0, %%"REG_a") \n\t" \
        "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
        "add $32, %%"REG_a" \n\t" \
        "js 1b \n\t" \
        :: "r"(((uint8_t *)blocks) + 128 * n), \
           "i"(-128 * n) \
        : "%"REG_a \
        ); \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
        );
}

static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "1: \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        "js 1b \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}

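/* dst[i] += src[i] for w bytes; the scalar loop below the asm handles the
 * remaining (up to 15) bytes. */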
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "2: \n\t"
        "cmp %3, %0 \n\t"
        "js 1b \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

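/* HuffYUV median prediction implemented with cmov; needs seven
 * general-purpose registers, hence the HAVE_7REGS guard below. */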
#if HAVE_7REGS
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3, %4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6, %4), %b0 \n"
        "mov %b0, (%5, %4) \n"
        "inc %4 \n"
        "jl 1b \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = l;
    *left_top = tl;
}
#endif

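/* Transpose a 4x4 block of bytes (helper for the horizontal loop filter). */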
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile (
        "movd (%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1, %3, 1), %%mm2 \n\t"
        "movd (%1, %3, 2), %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        "movd %%mm0, (%0) \n\t"
        "add %2, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%0) \n\t"
        "movd %%mm1, (%0, %2, 1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%0, %2, 2) \n\t"
        : "+&r"(dst),
          "+&r"(src)
        : "r"(dst_stride),
          "r"(src_stride)
        : "memory"
        );
}

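/* H.263 loop-filter core: filters the four rows (or transposed columns)
 * passed in %0..%3, with 2 * strength in %4 and the ff_pb_FC mask in %5. */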
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t" \
    "movq %0, %%mm0 \n\t" \
    "movq %0, %%mm1 \n\t" \
    "movq %3, %%mm2 \n\t" \
    "movq %3, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpckhbw %%mm7, %%mm1 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "psubw %%mm2, %%mm0 \n\t" \
    "psubw %%mm3, %%mm1 \n\t" \
    "movq %1, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "movq %2, %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm5 \n\t" \
    "psubw %%mm2, %%mm4 \n\t" \
    "psubw %%mm3, %%mm5 \n\t" \
    "psllw $2, %%mm4 \n\t" \
    "psllw $2, %%mm5 \n\t" \
    "paddw %%mm0, %%mm4 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "pxor %%mm6, %%mm6 \n\t" \
    "pcmpgtw %%mm4, %%mm6 \n\t" \
    "pcmpgtw %%mm5, %%mm7 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "pxor %%mm7, %%mm5 \n\t" \
    "psubw %%mm6, %%mm4 \n\t" \
    "psubw %%mm7, %%mm5 \n\t" \
    "psrlw $3, %%mm4 \n\t" \
    "psrlw $3, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm4 \n\t" \
    "packsswb %%mm7, %%mm6 \n\t" \
    "pxor %%mm7, %%mm7 \n\t" \
    "movd %4, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "psubusb %%mm4, %%mm2 \n\t" \
    "movq %%mm2, %%mm3 \n\t" \
    "psubusb %%mm4, %%mm3 \n\t" \
    "psubb %%mm3, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm3 \n\t" \
    "psubusb %%mm2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm2 \n\t" \
    "packsswb %%mm1, %%mm0 \n\t" \
    "pcmpgtb %%mm0, %%mm7 \n\t" \
    "pxor %%mm7, %%mm0 \n\t" \
    "psubb %%mm7, %%mm0 \n\t" \
    "movq %%mm0, %%mm1 \n\t" \
    "psubusb %%mm2, %%mm0 \n\t" \
    "psubb %%mm0, %%mm1 \n\t" \
    "pand %5, %%mm1 \n\t" \
    "psrlw $2, %%mm1 \n\t" \
    "pxor %%mm7, %%mm1 \n\t" \
    "psubb %%mm7, %%mm1 \n\t" \
    "movq %0, %%mm5 \n\t" \
    "movq %3, %%mm6 \n\t" \
    "psubb %%mm1, %%mm5 \n\t" \
    "paddb %%mm1, %%mm6 \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER

            "movq %%mm3, %1 \n\t"
            "movq %%mm4, %2 \n\t"
            "movq %%mm5, %0 \n\t"
            "movq %%mm6, %3 \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER

            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq %%mm5, %%mm1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpcklbw %%mm6, %%mm4 \n\t"
            "punpckhbw %%mm3, %%mm1 \n\t"
            "punpckhbw %%mm6, %%mm0 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "movq %%mm1, %%mm6 \n\t"
            "punpcklwd %%mm4, %%mm5 \n\t"
            "punpcklwd %%mm0, %%mm1 \n\t"
            "punpckhwd %%mm4, %%mm3 \n\t"
            "punpckhwd %%mm0, %%mm6 \n\t"
            "movd %%mm5, (%0) \n\t"
            "punpckhdq %%mm5, %%mm5 \n\t"
            "movd %%mm5, (%0, %2) \n\t"
            "movd %%mm3, (%0, %2, 2) \n\t"
            "punpckhdq %%mm3, %%mm3 \n\t"
            "movd %%mm3, (%0, %3) \n\t"
            "movd %%mm1, (%1) \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%1, %2) \n\t"
            "movd %%mm6, (%1, %2, 2) \n\t"
            "punpckhdq %%mm6, %%mm6 \n\t"
            "movd %%mm6, (%1, %3) \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            );
    }
}

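/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */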
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

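    /* left and right */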
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

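    /* top and bottom (and hopefully also the corners) */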
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}

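/* MPEG-4 quarter-pel lowpass filter with taps (-1, 3, -6, 20, 20, -6, 3, -1),
 * rounded and shifted right by 5; compare the C reference code in the
 * 3DNow! fallbacks below. */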
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
                   in0, in1, in2, in7, out, OP) \
    "paddw "#m4", "#m3" \n\t" \
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
    "pmullw "#m3", %%mm4 \n\t" \
    "movq "#in7", "#m3" \n\t" \
    "movq "#in0", %%mm5 \n\t" \
    "paddw "#m3", %%mm5 \n\t" \
    "psubw %%mm5, %%mm4 \n\t" \
    "movq "#in1", %%mm5 \n\t" \
    "movq "#in2", %%mm6 \n\t" \
    "paddw "#m6", %%mm5 \n\t" \
    "paddw "#m5", %%mm6 \n\t" \
    "paddw %%mm6, %%mm6 \n\t" \
    "psubw %%mm6, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
    "paddw "#rnd", %%mm4 \n\t" \
    "paddw %%mm4, %%mm5 \n\t" \
    "psraw $5, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm5 \n\t" \
    OP(%%mm5, out, %%mm7, d)

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
                                                  uint8_t *src, \
                                                  int dstStride, \
                                                  int srcStride, \
                                                  int h) \
{ \
    uint64_t temp; \
\
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "1: \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm1 \n\t" \
        "movq %%mm0, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" \
        "movq %%mm2, %%mm3 \n\t" \
        "movq %%mm2, %%mm4 \n\t" \
        "psllq $8, %%mm2 \n\t" \
        "psllq $16, %%mm3 \n\t" \
        "psllq $24, %%mm4 \n\t" \
        "punpckhbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "punpckhbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm3, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm6 \n\t" \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "paddw %6, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        "movq %%mm0, %5 \n\t" \
        \
        "movq 5(%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm5 \n\t" \
        "movq %%mm0, %%mm6 \n\t" \
        "psrlq $8, %%mm0 \n\t" \
        "psrlq $16, %%mm5 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm5 \n\t" \
        "paddw %%mm0, %%mm2 \n\t" \
        "paddw %%mm5, %%mm3 \n\t" \
        "paddw %%mm2, %%mm2 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "movq %%mm6, %%mm2 \n\t" \
        "psrlq $24, %%mm6 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm6 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "paddw %%mm2, %%mm1 \n\t" \
        "paddw %%mm6, %%mm4 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "psubw %%mm4, %%mm3 \n\t" \
        "paddw %6, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" \
        "psraw $5, %%mm3 \n\t" \
        "movq %5, %%mm1 \n\t" \
        "packuswb %%mm3, %%mm1 \n\t" \
        OP_MMX2(%%mm1, (%1), %%mm4, q) \
        \
        "movq 9(%0), %%mm1 \n\t" \
        "movq %%mm1, %%mm4 \n\t" \
        "movq %%mm1, %%mm3 \n\t" \
        "psrlq $8, %%mm1 \n\t" \
        "psrlq $16, %%mm4 \n\t" \
        "punpcklbw %%mm7, %%mm1 \n\t" \
        "punpcklbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "movq %%mm3, %%mm5 \n\t" \
        "psrlq $24, %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm3 \n\t" \
        "paddw %%mm3, %%mm2 \n\t" \
        "psubw %%mm2, %%mm0 \n\t" \
        "movq %%mm5, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
        "paddw %6, %%mm0 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        \
        "paddw %%mm5, %%mm3 \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
        "paddw %%mm4, %%mm6 \n\t" \
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
        "paddw %%mm1, %%mm4 \n\t" \
        "paddw %%mm2, %%mm5 \n\t" \
        "paddw %%mm6, %%mm6 \n\t" \
        "psubw %%mm6, %%mm4 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
        "psubw %%mm5, %%mm3 \n\t" \
        "paddw %6, %%mm4 \n\t" \
        "paddw %%mm3, %%mm4 \n\t" \
        "psraw $5, %%mm4 \n\t" \
        "packuswb %%mm4, %%mm0 \n\t" \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q) \
        \
        "add %3, %0 \n\t" \
        "add %4, %1 \n\t" \
        "decl %2 \n\t" \
        "jnz 1b \n\t" \
        : "+a"(src), "+c"(dst), "+D"(h) \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
          "m"(temp), "m"(ROUNDER) \
        : "memory" \
        ); \
} \
\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \
                                                   uint8_t *src, \
                                                   int dstStride, \
                                                   int srcStride, \
                                                   int h) \
{ \
    int i; \
    int16_t temp[16]; \
\
    for (i = 0; i < h; i++) { \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
                   (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
                   (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
                   (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
                   (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
                   (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
                   (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
                   (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
                   (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
                   (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
                   (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
                   (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
                   (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
                   (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
                   (src[11] + src[16]) * 3 - (src[10] + src[16]); \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
                   (src[12] + src[16]) * 3 - (src[11] + src[15]); \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
                   (src[13] + src[15]) * 3 - (src[12] + src[14]); \
        __asm__ volatile ( \
            "movq (%0), %%mm0 \n\t" \
            "movq 8(%0), %%mm1 \n\t" \
            "paddw %2, %%mm0 \n\t" \
            "paddw %2, %%mm1 \n\t" \
            "psraw $5, %%mm0 \n\t" \
            "psraw $5, %%mm1 \n\t" \
            "packuswb %%mm1, %%mm0 \n\t" \
            OP_3DNOW(%%mm0, (%1), %%mm1, q) \
            "movq 16(%0), %%mm0 \n\t" \
            "movq 24(%0), %%mm1 \n\t" \
            "paddw %2, %%mm0 \n\t" \
            "paddw %2, %%mm1 \n\t" \
            "psraw $5, %%mm0 \n\t" \
            "psraw $5, %%mm1 \n\t" \
            "packuswb %%mm1, %%mm0 \n\t" \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q) \
            :: "r"(temp), "r"(dst), "m"(ROUNDER) \
            : "memory" \
            ); \
        dst += dstStride; \
        src += srcStride; \
    } \
} \
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \
                                                 uint8_t *src, \
                                                 int dstStride, \
                                                 int srcStride, \
                                                 int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "1: \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm1 \n\t" \
        "movq %%mm0, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" \
        "movq %%mm2, %%mm3 \n\t" \
        "movq %%mm2, %%mm4 \n\t" \
        "psllq $8, %%mm2 \n\t" \
        "psllq $16, %%mm3 \n\t" \
        "psllq $24, %%mm4 \n\t" \
        "punpckhbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "punpckhbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm3, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm6 \n\t" \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "paddw %5, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        \
        "movd 5(%0), %%mm5 \n\t" \
        "punpcklbw %%mm7, %%mm5 \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
        "paddw %%mm5, %%mm1 \n\t" \
        "paddw %%mm6, %%mm2 \n\t" \
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
        "paddw %%mm6, %%mm3 \n\t" \
        "paddw %%mm5, %%mm4 \n\t" \
        "paddw %%mm2, %%mm2 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "psubw %%mm4, %%mm3 \n\t" \
        "paddw %5, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" \
        "psraw $5, %%mm3 \n\t" \
        "packuswb %%mm3, %%mm0 \n\t" \
        OP_MMX2(%%mm0, (%1), %%mm4, q) \
        \
        "add %3, %0 \n\t" \
        "add %4, %1 \n\t" \
        "decl %2 \n\t" \
        "jnz 1b \n\t" \
        : "+a"(src), "+c"(dst), "+d"(h) \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
          "m"(ROUNDER) \
        : "memory" \
        ); \
} \
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, \
                                                  uint8_t *src, \
                                                  int dstStride, \
                                                  int srcStride, \
                                                  int h) \
{ \
    int i; \
    int16_t temp[8]; \
\
    for (i = 0; i < h; i++) { \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + \
                  (src[1] + src[3]) * 3 - (src[2] + src[4]); \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \
                  (src[0] + src[4]) * 3 - (src[1] + src[5]); \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \
                  (src[0] + src[5]) * 3 - (src[0] + src[6]); \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \
                  (src[1] + src[6]) * 3 - (src[0] + src[7]); \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \
                  (src[2] + src[7]) * 3 - (src[1] + src[8]); \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + \
                  (src[3] + src[8]) * 3 - (src[2] + src[8]); \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \
                  (src[4] + src[8]) * 3 - (src[3] + src[7]); \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \
                  (src[5] + src[7]) * 3 - (src[4] + src[6]); \
        __asm__ volatile ( \
            "movq (%0), %%mm0 \n\t" \
            "movq 8(%0), %%mm1 \n\t" \
            "paddw %2, %%mm0 \n\t" \
            "paddw %2, %%mm1 \n\t" \
            "psraw $5, %%mm0 \n\t" \
            "psraw $5, %%mm1 \n\t" \
            "packuswb %%mm1, %%mm0 \n\t" \
            OP_3DNOW(%%mm0, (%1), %%mm1, q) \
            :: "r"(temp), "r"(dst), "m"(ROUNDER) \
            : "memory" \
            ); \
        dst += dstStride; \
        src += srcStride; \
    } \
}

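/* Vertical lowpass filters and the full set of quarter-pel motion
 * compensation functions (mcXY) built from the lowpass primitives. */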
01254 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
01255 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
01256 uint8_t *src, \
01257 int dstStride, \
01258 int srcStride) \
01259 { \
01260 uint64_t temp[17 * 4]; \
01261 uint64_t *temp_ptr = temp; \
01262 int count = 17; \
01263 \
01264 \
01265 __asm__ volatile ( \
01266 "pxor %%mm7, %%mm7 \n\t" \
01267 "1: \n\t" \
01268 "movq (%0), %%mm0 \n\t" \
01269 "movq (%0), %%mm1 \n\t" \
01270 "movq 8(%0), %%mm2 \n\t" \
01271 "movq 8(%0), %%mm3 \n\t" \
01272 "punpcklbw %%mm7, %%mm0 \n\t" \
01273 "punpckhbw %%mm7, %%mm1 \n\t" \
01274 "punpcklbw %%mm7, %%mm2 \n\t" \
01275 "punpckhbw %%mm7, %%mm3 \n\t" \
01276 "movq %%mm0, (%1) \n\t" \
01277 "movq %%mm1, 17 * 8(%1) \n\t" \
01278 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
01279 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
01280 "add $8, %1 \n\t" \
01281 "add %3, %0 \n\t" \
01282 "decl %2 \n\t" \
01283 "jnz 1b \n\t" \
01284 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
01285 : "r"((x86_reg)srcStride) \
01286 : "memory" \
01287 ); \
01288 \
01289 temp_ptr = temp; \
01290 count = 4; \
01291 \
01292 \
01293 __asm__ volatile ( \
01294 \
01295 "1: \n\t" \
01296 "movq (%0), %%mm0 \n\t" \
01297 "movq 8(%0), %%mm1 \n\t" \
01298 "movq 16(%0), %%mm2 \n\t" \
01299 "movq 24(%0), %%mm3 \n\t" \
01300 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
01301 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
01302 "add %4, %1 \n\t" \
01303 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
01304 \
01305 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
01306 "add %4, %1 \n\t" \
01307 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
01308 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
01309 "add %4, %1 \n\t" \
01310 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
01311 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
01312 "add %4, %1 \n\t" \
01313 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
01314 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
01315 "add %4, %1 \n\t" \
01316 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
01317 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
01318 "add %4, %1 \n\t" \
01319 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
01320 \
01321 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
01322 "add %4, %1 \n\t" \
01323 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
01324 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
01325 \
01326 "add $136, %0 \n\t" \
01327 "add %6, %1 \n\t" \
01328 "decl %2 \n\t" \
01329 "jnz 1b \n\t" \
01330 \
01331 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
01332 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
01333 "m"(ROUNDER), \
01334 "g"(4 - 14 * (x86_reg)dstStride) \
01335 : "memory" \
01336 ); \
01337 } \
01338 \
01339 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
01340 uint8_t *src, \
01341 int dstStride, \
01342 int srcStride) \
01343 { \
01344 uint64_t temp[9 * 2]; \
01345 uint64_t *temp_ptr = temp; \
01346 int count = 9; \
01347 \
01348 \
01349 __asm__ volatile ( \
01350 "pxor %%mm7, %%mm7 \n\t" \
01351 "1: \n\t" \
01352 "movq (%0), %%mm0 \n\t" \
01353 "movq (%0), %%mm1 \n\t" \
01354 "punpcklbw %%mm7, %%mm0 \n\t" \
01355 "punpckhbw %%mm7, %%mm1 \n\t" \
01356 "movq %%mm0, (%1) \n\t" \
01357 "movq %%mm1, 9*8(%1) \n\t" \
01358 "add $8, %1 \n\t" \
01359 "add %3, %0 \n\t" \
01360 "decl %2 \n\t" \
01361 "jnz 1b \n\t" \
01362 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
01363 : "r"((x86_reg)srcStride) \
01364 : "memory" \
01365 ); \
01366 \
01367 temp_ptr = temp; \
01368 count = 2; \
01369 \
01370 \
01371 __asm__ volatile ( \
01372 \
01373 "1: \n\t" \
01374 "movq (%0), %%mm0 \n\t" \
01375 "movq 8(%0), %%mm1 \n\t" \
01376 "movq 16(%0), %%mm2 \n\t" \
01377 "movq 24(%0), %%mm3 \n\t" \
01378 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
01379 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
01380 "add %4, %1 \n\t" \
01381 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
01382 \
01383 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
01384 "add %4, %1 \n\t" \
01385 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
01386 \
01387 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
01388 "add %4, %1 \n\t" \
01389 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
01390 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
01391 \
01392 "add $72, %0 \n\t" \
01393 "add %6, %1 \n\t" \
01394 "decl %2 \n\t" \
01395 "jnz 1b \n\t" \
01396 \
01397 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
01398 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
01399 "m"(ROUNDER), \
01400 "g"(4 - 6 * (x86_reg)dstStride) \
01401 : "memory" \
01402 ); \
01403 } \
01404 \
01405 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
01406 int stride) \
01407 { \
01408 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
01409 } \
01410 \
01411 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
01412 int stride) \
01413 { \
01414 uint64_t temp[8]; \
01415 uint8_t * const half = (uint8_t*)temp; \
01416 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
01417 stride, 8); \
01418 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
01419 } \
01420 \
01421 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
01422 int stride) \
01423 { \
01424 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
01425 stride, 8); \
01426 } \
01427 \
01428 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
01429 int stride) \
01430 { \
01431 uint64_t temp[8]; \
01432 uint8_t * const half = (uint8_t*)temp; \
01433 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
01434 stride, 8); \
01435 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
01436 stride, 8); \
01437 } \
01438 \
01439 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
01440 int stride) \
01441 { \
01442 uint64_t temp[8]; \
01443 uint8_t * const half = (uint8_t*)temp; \
01444 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
01445 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
01446 } \
01447 \
01448 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
01449 int stride) \
01450 { \
01451 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
01452 } \
01453 \
01454 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
01455 int stride) \
01456 { \
01457 uint64_t temp[8]; \
01458 uint8_t * const half = (uint8_t*)temp; \
01459 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
01460 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
01461 stride, 8); \
01462 } \
01463 \
01464 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
01465 int stride) \
01466 { \
01467 uint64_t half[8 + 9]; \
01468 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01469 uint8_t * const halfHV = ((uint8_t*)half); \
01470 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01471 stride, 9); \
01472 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01473 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01474 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01475 } \
01476 \
01477 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
01478 int stride) \
01479 { \
01480 uint64_t half[8 + 9]; \
01481 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01482 uint8_t * const halfHV = ((uint8_t*)half); \
01483 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01484 stride, 9); \
01485 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01486 stride, 9); \
01487 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01488 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01489 } \
01490 \
01491 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
01492 int stride) \
01493 { \
01494 uint64_t half[8 + 9]; \
01495 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01496 uint8_t * const halfHV = ((uint8_t*)half); \
01497 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01498 stride, 9); \
01499 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01500 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01501 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01502 } \
01503 \
01504 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
01505 int stride) \
01506 { \
01507 uint64_t half[8 + 9]; \
01508 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01509 uint8_t * const halfHV = ((uint8_t*)half); \
01510 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01511 stride, 9); \
01512 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01513 stride, 9); \
01514 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01515 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01516 } \
01517 \
01518 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
01519 int stride) \
01520 { \
01521 uint64_t half[8 + 9]; \
01522 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01523 uint8_t * const halfHV = ((uint8_t*)half); \
01524 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01525 stride, 9); \
01526 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01527 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01528 } \
01529 \
01530 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
01531 int stride) \
01532 { \
01533 uint64_t half[8 + 9]; \
01534 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01535 uint8_t * const halfHV = ((uint8_t*)half); \
01536 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01537 stride, 9); \
01538 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01539 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01540 } \
01541 \
01542 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
01543 int stride) \
01544 { \
01545 uint64_t half[8 + 9]; \
01546 uint8_t * const halfH = ((uint8_t*)half); \
01547 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01548 stride, 9); \
01549 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01550 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01551 } \
01552 \
01553 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
01554 int stride) \
01555 { \
01556 uint64_t half[8 + 9]; \
01557 uint8_t * const halfH = ((uint8_t*)half); \
01558 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01559 stride, 9); \
01560 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01561 stride, 9); \
01562 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01563 } \
01564 \
01565 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
01566 int stride) \
01567 { \
01568 uint64_t half[9]; \
01569 uint8_t * const halfH = ((uint8_t*)half); \
01570 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01571 stride, 9); \
01572 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01573 } \
01574 \
01575 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
01576 int stride) \
01577 { \
01578 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
01579 } \
01580 \
01581 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
01582 int stride) \
01583 { \
01584 uint64_t temp[32]; \
01585 uint8_t * const half = (uint8_t*)temp; \
01586 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
01587 stride, 16); \
01588 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
01589 } \
01590 \
01591 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
01592 int stride) \
01593 { \
01594 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
01595 stride, stride, 16); \
01596 } \
01597 \
01598 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
01599 int stride) \
01600 { \
01601 uint64_t temp[32]; \
01602 uint8_t * const half = (uint8_t*)temp; \
01603 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
01604 stride, 16); \
01605 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
01606 stride, stride, 16); \
01607 } \
01608 \
01609 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
01610 int stride) \
01611 { \
01612 uint64_t temp[32]; \
01613 uint8_t * const half = (uint8_t*)temp; \
01614 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
01615 stride); \
01616 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
01617 } \
01618 \
01619 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
01620 int stride) \
01621 { \
01622 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
01623 } \
01624 \
01625 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
01626 int stride) \
01627 { \
01628 uint64_t temp[32]; \
01629 uint8_t * const half = (uint8_t*)temp; \
01630 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
01631 stride); \
01632 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
01633 stride, stride, 16); \
01634 } \
01635 \
01636 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
01637 int stride) \
01638 { \
01639 uint64_t half[16 * 2 + 17 * 2]; \
01640 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01641 uint8_t * const halfHV = ((uint8_t*)half); \
01642 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01643 stride, 17); \
01644 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01645 stride, 17); \
01646 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01647 16, 16); \
01648 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01649 } \
01650 \
01651 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
01652 int stride) \
01653 { \
01654 uint64_t half[16 * 2 + 17 * 2]; \
01655 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01656 uint8_t * const halfHV = ((uint8_t*)half); \
01657 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01658 stride, 17); \
01659 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01660 stride, 17); \
01661 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01662 16, 16); \
01663 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01664 } \
01665 \
01666 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
01667 int stride) \
01668 { \
01669 uint64_t half[16 * 2 + 17 * 2]; \
01670 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01671 uint8_t * const halfHV = ((uint8_t*)half); \
01672 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01673 stride, 17); \
01674 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01675 stride, 17); \
01676 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01677 16, 16); \
01678 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01679 16, 16); \
01680 } \
01681 \
01682 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
01683 int stride) \
01684 { \
01685 uint64_t half[16 * 2 + 17 * 2]; \
01686 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01687 uint8_t * const halfHV = ((uint8_t*)half); \
01688 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01689 stride, 17); \
01690 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01691 stride, 17); \
01692 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01693 16, 16); \
01694 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01695 16, 16); \
01696 } \
01697 \
01698 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
01699 int stride) \
01700 { \
01701 uint64_t half[16 * 2 + 17 * 2]; \
01702 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01703 uint8_t * const halfHV = ((uint8_t*)half); \
01704 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01705 stride, 17); \
01706 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01707 16, 16); \
01708 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01709 } \
01710 \
01711 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
01712 int stride) \
01713 { \
01714 uint64_t half[16 * 2 + 17 * 2]; \
01715 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01716 uint8_t * const halfHV = ((uint8_t*)half); \
01717 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01718 stride, 17); \
01719 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01720 16, 16); \
01721 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01722 16, 16); \
01723 } \
01724 \
01725 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
01726 int stride) \
01727 { \
01728 uint64_t half[17 * 2]; \
01729 uint8_t * const halfH = ((uint8_t*)half); \
01730 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01731 stride, 17); \
01732 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01733 stride, 17); \
01734 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01735 } \
01736 \
01737 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
01738 int stride) \
01739 { \
01740 uint64_t half[17 * 2]; \
01741 uint8_t * const halfH = ((uint8_t*)half); \
01742 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01743 stride, 17); \
01744 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01745 stride, 17); \
01746 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01747 } \
01748 \
01749 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
01750 int stride) \
01751 { \
01752 uint64_t half[17 * 2]; \
01753 uint8_t * const halfH = ((uint8_t*)half); \
01754 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01755 stride, 17); \
01756 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01757 }
01758
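/* Store/average micro-ops plugged into the QPEL macros above: PUT_OP is a
 * plain store, while the AVG variants average the freshly filtered result
 * with the bytes already at the destination (pavgusb on 3DNow!, pavgb on
 * MMXEXT). */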
01759 #define PUT_OP(a, b, temp, size) \
01760 "mov"#size" "#a", "#b" \n\t"
01761
01762 #define AVG_3DNOW_OP(a, b, temp, size) \
01763 "mov"#size" "#b", "#temp" \n\t" \
01764 "pavgusb "#temp", "#a" \n\t" \
01765 "mov"#size" "#a", "#b" \n\t"
01766
01767 #define AVG_MMX2_OP(a, b, temp, size) \
01768 "mov"#size" "#b", "#temp" \n\t" \
01769 "pavgb "#temp", "#a" \n\t" \
01770 "mov"#size" "#a", "#b" \n\t"
01771
01772 QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP)
01773 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP)
01774 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
01775 QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow)
01776 QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow)
01777 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
01778 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2)
01779 QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2)
01780 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
01781
01782
01783
01784
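/* Bilinear (2-tap) qpel: not compliant with any spec, intended only for fast
 * decoding. Each quarter-pel position is approximated with the half-pel
 * primitives or, for the mixed positions, the _l3_ averaging helpers
 * referenced by QPEL_2TAP_L3. */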
01785 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
01786 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
01787 uint8_t *src, \
01788 int stride) \
01789 { \
01790 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
01791 }
01792
01793 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
01794 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
01795 uint8_t *src, \
01796 int stride) \
01797 { \
01798 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
01799 S1, S2); \
01800 }
01801
01802 #define QPEL_2TAP(OPNAME, SIZE, MMX) \
01803 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
01804 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
01805 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
01806 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
01807 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
01808 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
01809 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
01810 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
01811 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
01812 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
01813 uint8_t *src, \
01814 int stride) \
01815 { \
01816 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
01817 } \
01818 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
01819 uint8_t *src, \
01820 int stride) \
01821 { \
01822 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
01823 stride, SIZE); \
01824 } \
01825 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
01826 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
01827 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
01828 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
01829 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
01830 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
01831 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
01832 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
01833
01834 QPEL_2TAP(put_, 16, mmx2)
01835 QPEL_2TAP(avg_, 16, mmx2)
01836 QPEL_2TAP(put_, 8, mmx2)
01837 QPEL_2TAP(avg_, 8, mmx2)
01838 QPEL_2TAP(put_, 16, 3dnow)
01839 QPEL_2TAP(avg_, 16, 3dnow)
01840 QPEL_2TAP(put_, 8, 3dnow)
01841 QPEL_2TAP(avg_, 8, 3dnow)
01842
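/* RV40 (3,3) qpel position: reduces to the generic xy2 half-pel average. */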
01843 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01844 {
01845 put_pixels8_xy2_mmx(dst, src, stride, 8);
01846 }
01847 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01848 {
01849 put_pixels16_xy2_mmx(dst, src, stride, 16);
01850 }
01851 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01852 {
01853 avg_pixels8_xy2_mmx(dst, src, stride, 8);
01854 }
01855 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
01856 {
01857 avg_pixels16_xy2_mmx(dst, src, stride, 16);
01858 }
01859
01860 #endif /* HAVE_INLINE_ASM */
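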
01861
01862 #if HAVE_YASM
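/* Edge emulation: when a motion vector points (partially) outside the
 * reference frame, the block is copied into a temporary buffer and the
 * border pixels are replicated. The inner loops live in yasm
 * (ff_emu_edge_core_*); the wrapper below only clamps the coordinates. */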
01863 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
01864 x86_reg linesize, x86_reg start_y,
01865 x86_reg end_y, x86_reg block_h,
01866 x86_reg start_x, x86_reg end_x,
01867 x86_reg block_w);
01868 extern emu_edge_core_func ff_emu_edge_core_mmx;
01869 extern emu_edge_core_func ff_emu_edge_core_sse;
01870
01871 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
01872 int linesize,
01873 int block_w, int block_h,
01874 int src_x, int src_y,
01875 int w, int h,
01876 emu_edge_core_func *core_fn)
01877 {
01878 int start_y, start_x, end_y, end_x, src_y_add = 0;
01879
01880 if (src_y >= h) {
01881 src_y_add = h - 1 - src_y;
01882 src_y = h - 1;
01883 } else if (src_y <= -block_h) {
01884 src_y_add = 1 - block_h - src_y;
01885 src_y = 1 - block_h;
01886 }
01887 if (src_x >= w) {
01888 src += w - 1 - src_x;
01889 src_x = w - 1;
01890 } else if (src_x <= -block_w) {
01891 src += 1 - block_w - src_x;
01892 src_x = 1 - block_w;
01893 }
01894
01895 start_y = FFMAX(0, -src_y);
01896 start_x = FFMAX(0, -src_x);
01897 end_y = FFMIN(block_h, h-src_y);
01898 end_x = FFMIN(block_w, w-src_x);
01899 assert(start_x < end_x && block_w > 0);
01900 assert(start_y < end_y && block_h > 0);
01901
01902
01903 src += (src_y_add + start_y) * linesize + start_x;
01904 buf += start_x;
01905 core_fn(buf, src, linesize, start_y, end_y,
01906 block_h, start_x, end_x, block_w);
01907 }
01908
01909 #if ARCH_X86_32
01910 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
01911 int linesize,
01912 int block_w, int block_h,
01913 int src_x, int src_y, int w, int h)
01914 {
01915 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01916 w, h, &ff_emu_edge_core_mmx);
01917 }
01918 #endif /* ARCH_X86_32 */
01919
01920 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
01921 int linesize,
01922 int block_w, int block_h,
01923 int src_x, int src_y, int w, int h)
01924 {
01925 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01926 w, h, &ff_emu_edge_core_sse);
01927 }
01928 #endif /* HAVE_YASM */
01929
01930 #if HAVE_INLINE_ASM
01931
01932 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
01933 int linesize, int block_w, int block_h,
01934 int src_x, int src_y, int w, int h);
01935
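/* MPEG-4 global motion compensation with a bilinear filter. ox/oy are the
 * start offsets and dxx/dxy/dyx/dyy the per-pixel increments, in 16.16 fixed
 * point scaled by 1 << shift. Blocks whose offsets overflow that range, or
 * whose increments need more than 16 bits of subpel precision, fall back to
 * ff_gmc_c(). */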
01936 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
01937 int stride, int h, int ox, int oy,
01938 int dxx, int dxy, int dyx, int dyy,
01939 int shift, int r, int width, int height,
01940 emulated_edge_mc_func *emu_edge_fn)
01941 {
01942 const int w = 8;
01943 const int ix = ox >> (16 + shift);
01944 const int iy = oy >> (16 + shift);
01945 const int oxs = ox >> 4;
01946 const int oys = oy >> 4;
01947 const int dxxs = dxx >> 4;
01948 const int dxys = dxy >> 4;
01949 const int dyxs = dyx >> 4;
01950 const int dyys = dyy >> 4;
01951 const uint16_t r4[4] = { r, r, r, r };
01952 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
01953 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
01954 const uint64_t shift2 = 2 * shift;
01955 uint8_t edge_buf[(h + 1) * stride];
01956 int x, y;
01957
01958 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
01959 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
01960 const int dxh = dxy * (h - 1);
01961 const int dyw = dyx * (w - 1);
01962 if (/* non-constant fullpel offset (3% of blocks) */
01963 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
01964 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
01965 /* uses more than 16 bits of subpel mv (only at huge resolutions) */
01966 || (dxx | dxy | dyx | dyy) & 15) {
01967 /* FIXME: could still use MMX for 2x2 blocks */
01968 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
01969 shift, r, width, height);
01970 return;
01971 }
01972
01973 src += ix + iy * stride;
01974 if ((unsigned)ix >= width - w ||
01975 (unsigned)iy >= height - h) {
01976 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
01977 src = edge_buf;
01978 }
01979
01980 __asm__ volatile (
01981 "movd %0, %%mm6 \n\t"
01982 "pxor %%mm7, %%mm7 \n\t"
01983 "punpcklwd %%mm6, %%mm6 \n\t"
01984 "punpcklwd %%mm6, %%mm6 \n\t"
01985 :: "r"(1<<shift)
01986 );
01987
01988 for (x = 0; x < w; x += 4) {
01989 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
01990 oxs - dxys + dxxs * (x + 1),
01991 oxs - dxys + dxxs * (x + 2),
01992 oxs - dxys + dxxs * (x + 3) };
01993 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
01994 oys - dyys + dyxs * (x + 1),
01995 oys - dyys + dyxs * (x + 2),
01996 oys - dyys + dyxs * (x + 3) };
01997
01998 for (y = 0; y < h; y++) {
01999 __asm__ volatile (
02000 "movq %0, %%mm4 \n\t"
02001 "movq %1, %%mm5 \n\t"
02002 "paddw %2, %%mm4 \n\t"
02003 "paddw %3, %%mm5 \n\t"
02004 "movq %%mm4, %0 \n\t"
02005 "movq %%mm5, %1 \n\t"
02006 "psrlw $12, %%mm4 \n\t"
02007 "psrlw $12, %%mm5 \n\t"
02008 : "+m"(*dx4), "+m"(*dy4)
02009 : "m"(*dxy4), "m"(*dyy4)
02010 );
02011
02012 __asm__ volatile (
02013 "movq %%mm6, %%mm2 \n\t"
02014 "movq %%mm6, %%mm1 \n\t"
02015 "psubw %%mm4, %%mm2 \n\t"
02016 "psubw %%mm5, %%mm1 \n\t"
02017 "movq %%mm2, %%mm0 \n\t"
02018 "movq %%mm4, %%mm3 \n\t"
02019 "pmullw %%mm1, %%mm0 \n\t"
02020 "pmullw %%mm5, %%mm3 \n\t"
02021 "pmullw %%mm5, %%mm2 \n\t"
02022 "pmullw %%mm4, %%mm1 \n\t"
02023
02024 "movd %4, %%mm5 \n\t"
02025 "movd %3, %%mm4 \n\t"
02026 "punpcklbw %%mm7, %%mm5 \n\t"
02027 "punpcklbw %%mm7, %%mm4 \n\t"
02028 "pmullw %%mm5, %%mm3 \n\t"
02029 "pmullw %%mm4, %%mm2 \n\t"
02030
02031 "movd %2, %%mm5 \n\t"
02032 "movd %1, %%mm4 \n\t"
02033 "punpcklbw %%mm7, %%mm5 \n\t"
02034 "punpcklbw %%mm7, %%mm4 \n\t"
02035 "pmullw %%mm5, %%mm1 \n\t"
02036 "pmullw %%mm4, %%mm0 \n\t"
02037 "paddw %5, %%mm1 \n\t"
02038 "paddw %%mm3, %%mm2 \n\t"
02039 "paddw %%mm1, %%mm0 \n\t"
02040 "paddw %%mm2, %%mm0 \n\t"
02041
02042 "psrlw %6, %%mm0 \n\t"
02043 "packuswb %%mm0, %%mm0 \n\t"
02044 "movd %%mm0, %0 \n\t"
02045
02046 : "=m"(dst[x + y * stride])
02047 : "m"(src[0]), "m"(src[1]),
02048 "m"(src[stride]), "m"(src[stride + 1]),
02049 "m"(*r4), "m"(shift2)
02050 );
02051 src += stride;
02052 }
02053 src += 4 - h * stride;
02054 }
02055 }
02056
02057 #if HAVE_YASM
02058 #if ARCH_X86_32
02059 static void gmc_mmx(uint8_t *dst, uint8_t *src,
02060 int stride, int h, int ox, int oy,
02061 int dxx, int dxy, int dyx, int dyy,
02062 int shift, int r, int width, int height)
02063 {
02064 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
02065 width, height, &emulated_edge_mc_mmx);
02066 }
02067 #endif /* ARCH_X86_32 */
02068 static void gmc_sse(uint8_t *dst, uint8_t *src,
02069 int stride, int h, int ox, int oy,
02070 int dxx, int dxy, int dyx, int dyy,
02071 int shift, int r, int width, int height)
02072 {
02073 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
02074 width, height, &emulated_edge_mc_sse);
02075 }
02076 #else
02077 static void gmc_mmx(uint8_t *dst, uint8_t *src,
02078 int stride, int h, int ox, int oy,
02079 int dxx, int dxy, int dyx, int dyy,
02080 int shift, int r, int width, int height)
02081 {
02082 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
02083 width, height, &ff_emulated_edge_mc_8);
02084 }
02085 #endif /* HAVE_YASM */
02086
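/* Generate one software-prefetch helper per CPU flavour: prefetcht0 on
 * MMXEXT, the 3DNow! prefetch instruction otherwise. Touches one address
 * per row for h rows. */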
02087 #define PREFETCH(name, op) \
02088 static void name(void *mem, int stride, int h) \
02089 { \
02090 const uint8_t *p = mem; \
02091 do { \
02092 __asm__ volatile (#op" %0" :: "m"(*p)); \
02093 p += stride; \
02094 } while (--h); \
02095 }
02096
02097 PREFETCH(prefetch_mmx2, prefetcht0)
02098 PREFETCH(prefetch_3dnow, prefetch)
02099 #undef PREFETCH
02100
02101 #endif /* HAVE_INLINE_ASM */
02102
02103 #include "h264_qpel.c"
02104
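/* The H.264 luma qpel bodies are textually included just above; what follows
 * are prototypes for the chroma motion-compensation routines implemented in
 * yasm (the _rnd variants differ only in rounding behaviour). */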
02105 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
02106 int stride, int h, int x, int y);
02107 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
02108 int stride, int h, int x, int y);
02109 void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
02110 int stride, int h, int x, int y);
02111
02112 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
02113 int stride, int h, int x, int y);
02114 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
02115 int stride, int h, int x, int y);
02116 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
02117 int stride, int h, int x, int y);
02118
02119 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
02120 int stride, int h, int x, int y);
02121 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
02122 int stride, int h, int x, int y);
02123
02124 void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
02125 int stride, int h, int x, int y);
02126 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
02127 int stride, int h, int x, int y);
02128
02129 void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
02130 int stride, int h, int x, int y);
02131 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
02132 int stride, int h, int x, int y);
02133
02134 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
02135 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
02136 (uint8_t *dst, uint8_t *src, \
02137 int stride, int h, int x, int y);
02138
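/* For example, CHROMA_MC(put, 2, 10, mmx2) declares
 * ff_put_h264_chroma_mc2_10_mmx2(). */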
02139 CHROMA_MC(put, 2, 10, mmx2)
02140 CHROMA_MC(avg, 2, 10, mmx2)
02141 CHROMA_MC(put, 4, 10, mmx2)
02142 CHROMA_MC(avg, 4, 10, mmx2)
02143 CHROMA_MC(put, 8, 10, sse2)
02144 CHROMA_MC(avg, 8, 10, sse2)
02145 CHROMA_MC(put, 8, 10, avx)
02146 CHROMA_MC(avg, 8, 10, avx)
02147
02148 #if HAVE_INLINE_ASM
02149
02150
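/* CAVS-specific: the (0,0) qpel position is a plain block copy/average. */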
02151 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02152 {
02153 put_pixels8_mmx(dst, src, stride, 8);
02154 }
02155
02156 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02157 {
02158 avg_pixels8_mmx(dst, src, stride, 8);
02159 }
02160
02161 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02162 {
02163 put_pixels16_mmx(dst, src, stride, 16);
02164 }
02165
02166 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
02167 {
02168 avg_pixels16_mmx(dst, src, stride, 16);
02169 }
02170
02171
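/* VC-1-specific: likewise, the (0,0) mspel position is a plain
 * copy/average. */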
02172 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
02173 int stride, int rnd)
02174 {
02175 put_pixels8_mmx(dst, src, stride, 8);
02176 }
02177
02178 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
02179 int stride, int rnd)
02180 {
02181 avg_pixels8_mmx2(dst, src, stride, 8);
02182 }
02183
02184
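/* No-rounding average of two 8-pixel-wide sources (exposed through
 * put_no_rnd_pixels_l2 below); processes four rows per iteration, so h must
 * be a multiple of 4. */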
02185 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
02186 {
02187
02188 MOVQ_BFE(mm6);
02189 __asm__ volatile(
02190 "1: \n\t"
02191 "movq (%1), %%mm0 \n\t"
02192 "movq (%2), %%mm1 \n\t"
02193 "movq (%1,%4), %%mm2 \n\t"
02194 "movq (%2,%4), %%mm3 \n\t"
02195 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
02196 "movq %%mm4, (%3) \n\t"
02197 "movq %%mm5, (%3,%4) \n\t"
02198
02199 "movq (%1,%4,2), %%mm0 \n\t"
02200 "movq (%2,%4,2), %%mm1 \n\t"
02201 "movq (%1,%5), %%mm2 \n\t"
02202 "movq (%2,%5), %%mm3 \n\t"
02203 "lea (%1,%4,4), %1 \n\t"
02204 "lea (%2,%4,4), %2 \n\t"
02205 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
02206 "movq %%mm4, (%3,%4,2) \n\t"
02207 "movq %%mm5, (%3,%5) \n\t"
02208 "lea (%3,%4,4), %3 \n\t"
02209 "subl $4, %0 \n\t"
02210 "jnz 1b \n\t"
02211 :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
02212 :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
02213 :"memory");
02214
02215 }
02216 static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
02217 {
02218 put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
02219 put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
02220 }
02221
02222 #if CONFIG_DIRAC_DECODER
02223 #define DIRAC_PIXOP(OPNAME, EXT)\
02224 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
02225 {\
02226 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
02227 }\
02228 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
02229 {\
02230 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
02231 }\
02232 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
02233 {\
02234 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
02235 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
02236 }
02237
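/* DIRAC_PIXOP(put, mmx), for instance, emits ff_put_dirac_pixels8_mmx() and
 * friends, which forward to the plain hpel copies (only src[0] is used). */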
02238 DIRAC_PIXOP(put, mmx)
02239 DIRAC_PIXOP(avg, mmx)
02240 DIRAC_PIXOP(avg, mmx2)
02241
02242 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02243 {
02244 put_pixels16_sse2(dst, src[0], stride, h);
02245 }
02246 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02247 {
02248 avg_pixels16_sse2(dst, src[0], stride, h);
02249 }
02250 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02251 {
02252 put_pixels16_sse2(dst , src[0] , stride, h);
02253 put_pixels16_sse2(dst+16, src[0]+16, stride, h);
02254 }
02255 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
02256 {
02257 avg_pixels16_sse2(dst , src[0] , stride, h);
02258 avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
02259 }
02260 #endif /* CONFIG_DIRAC_DECODER */
02261
02262
02263
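/* Wrappers binding the GPL libmpeg2 MMX/MMXEXT IDCTs to the generic
 * idct_put/idct_add interface. */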
02264 #if CONFIG_GPL
02265 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
02266 DCTELEM *block)
02267 {
02268 ff_mmx_idct(block);
02269 ff_put_pixels_clamped_mmx(block, dest, line_size);
02270 }
02271
02272 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
02273 DCTELEM *block)
02274 {
02275 ff_mmx_idct(block);
02276 ff_add_pixels_clamped_mmx(block, dest, line_size);
02277 }
02278
02279 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
02280 DCTELEM *block)
02281 {
02282 ff_mmxext_idct(block);
02283 ff_put_pixels_clamped_mmx(block, dest, line_size);
02284 }
02285
02286 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
02287 DCTELEM *block)
02288 {
02289 ff_mmxext_idct(block);
02290 ff_add_pixels_clamped_mmx(block, dest, line_size);
02291 }
02292 #endif /* CONFIG_GPL */
02293
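/* Vorbis square-polar coupling: rebuilds the two channels from
 * magnitude/angle pairs, replacing the branches of the C reference with
 * sign masks (pfcmpge produces an all-ones mask per element). */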
02294 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
02295 {
02296 int i;
02297 __asm__ volatile ("pxor %%mm7, %%mm7":);
02298 for (i = 0; i < blocksize; i += 2) {
02299 __asm__ volatile (
02300 "movq %0, %%mm0 \n\t"
02301 "movq %1, %%mm1 \n\t"
02302 "movq %%mm0, %%mm2 \n\t"
02303 "movq %%mm1, %%mm3 \n\t"
02304 "pfcmpge %%mm7, %%mm2 \n\t"
02305 "pfcmpge %%mm7, %%mm3 \n\t"
02306 "pslld $31, %%mm2 \n\t"
02307 "pxor %%mm2, %%mm1 \n\t"
02308 "movq %%mm3, %%mm4 \n\t"
02309 "pand %%mm1, %%mm3 \n\t"
02310 "pandn %%mm1, %%mm4 \n\t"
02311 "pfadd %%mm0, %%mm3 \n\t"
02312 "pfsub %%mm4, %%mm0 \n\t"
02313 "movq %%mm3, %1 \n\t"
02314 "movq %%mm0, %0 \n\t"
02315 : "+m"(mag[i]), "+m"(ang[i])
02316 :: "memory"
02317 );
02318 }
02319 __asm__ volatile ("femms");
02320 }
02321
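/* The same coupling, four floats per iteration with SSE. */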
02322 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02323 {
02324 int i;
02325
02326 __asm__ volatile (
02327 "movaps %0, %%xmm5 \n\t"
02328 :: "m"(ff_pdw_80000000[0])
02329 );
02330 for (i = 0; i < blocksize; i += 4) {
02331 __asm__ volatile (
02332 "movaps %0, %%xmm0 \n\t"
02333 "movaps %1, %%xmm1 \n\t"
02334 "xorps %%xmm2, %%xmm2 \n\t"
02335 "xorps %%xmm3, %%xmm3 \n\t"
02336 "cmpleps %%xmm0, %%xmm2 \n\t"
02337 "cmpleps %%xmm1, %%xmm3 \n\t"
02338 "andps %%xmm5, %%xmm2 \n\t"
02339 "xorps %%xmm2, %%xmm1 \n\t"
02340 "movaps %%xmm3, %%xmm4 \n\t"
02341 "andps %%xmm1, %%xmm3 \n\t"
02342 "andnps %%xmm1, %%xmm4 \n\t"
02343 "addps %%xmm0, %%xmm3 \n\t"
02344 "subps %%xmm4, %%xmm0 \n\t"
02345 "movaps %%xmm3, %1 \n\t"
02346 "movaps %%xmm0, %0 \n\t"
02347 : "+m"(mag[i]), "+m"(ang[i])
02348 :: "memory"
02349 );
02350 }
02351 }
02352
02353 #if HAVE_6REGS
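/* Overlap-add windowing. A scalar sketch of what the two versions below
 * compute (mirroring the generic C reference; len is the half-window
 * length):
 *
 *     dst += len; win += len; src0 += len;
 *     for (i = -len, j = len - 1; i < 0; i++, j--) {
 *         dst[i] = src0[i] * win[j] - src1[j] * win[i];
 *         dst[j] = src0[i] * win[i] + src1[j] * win[j];
 *     }
 */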
02354 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
02355 const float *src1, const float *win,
02356 int len)
02357 {
02358 x86_reg i = -len * 4;
02359 x86_reg j = len * 4 - 8;
02360 __asm__ volatile (
02361 "1: \n"
02362 "pswapd (%5, %1), %%mm1 \n"
02363 "movq (%5, %0), %%mm0 \n"
02364 "pswapd (%4, %1), %%mm5 \n"
02365 "movq (%3, %0), %%mm4 \n"
02366 "movq %%mm0, %%mm2 \n"
02367 "movq %%mm1, %%mm3 \n"
02368 "pfmul %%mm4, %%mm2 \n"
02369 "pfmul %%mm5, %%mm3 \n"
02370 "pfmul %%mm4, %%mm1 \n"
02371 "pfmul %%mm5, %%mm0 \n"
02372 "pfadd %%mm3, %%mm2 \n"
02373 "pfsub %%mm0, %%mm1 \n"
02374 "pswapd %%mm2, %%mm2 \n"
02375 "movq %%mm1, (%2, %0) \n"
02376 "movq %%mm2, (%2, %1) \n"
02377 "sub $8, %1 \n"
02378 "add $8, %0 \n"
02379 "jl 1b \n"
02380 "femms \n"
02381 : "+r"(i), "+r"(j)
02382 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
02383 );
02384 }
02385
02386 static void vector_fmul_window_sse(float *dst, const float *src0,
02387 const float *src1, const float *win, int len)
02388 {
02389 x86_reg i = -len * 4;
02390 x86_reg j = len * 4 - 16;
02391 __asm__ volatile (
02392 "1: \n"
02393 "movaps (%5, %1), %%xmm1 \n"
02394 "movaps (%5, %0), %%xmm0 \n"
02395 "movaps (%4, %1), %%xmm5 \n"
02396 "movaps (%3, %0), %%xmm4 \n"
02397 "shufps $0x1b, %%xmm1, %%xmm1 \n"
02398 "shufps $0x1b, %%xmm5, %%xmm5 \n"
02399 "movaps %%xmm0, %%xmm2 \n"
02400 "movaps %%xmm1, %%xmm3 \n"
02401 "mulps %%xmm4, %%xmm2 \n"
02402 "mulps %%xmm5, %%xmm3 \n"
02403 "mulps %%xmm4, %%xmm1 \n"
02404 "mulps %%xmm5, %%xmm0 \n"
02405 "addps %%xmm3, %%xmm2 \n"
02406 "subps %%xmm0, %%xmm1 \n"
02407 "shufps $0x1b, %%xmm2, %%xmm2 \n"
02408 "movaps %%xmm1, (%2, %0) \n"
02409 "movaps %%xmm2, (%2, %1) \n"
02410 "sub $16, %1 \n"
02411 "add $16, %0 \n"
02412 "jl 1b \n"
02413 : "+r"(i), "+r"(j)
02414 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
02415 );
02416 }
02417 #endif /* HAVE_6REGS */
02418
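/* Clamp every float in src to [min, max], 16 floats per iteration; assumes
 * len is a multiple of 16 and both buffers are 16-byte aligned (movaps). */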
02419 static void vector_clipf_sse(float *dst, const float *src,
02420 float min, float max, int len)
02421 {
02422 x86_reg i = (len - 16) * 4;
02423 __asm__ volatile (
02424 "movss %3, %%xmm4 \n\t"
02425 "movss %4, %%xmm5 \n\t"
02426 "shufps $0, %%xmm4, %%xmm4 \n\t"
02427 "shufps $0, %%xmm5, %%xmm5 \n\t"
02428 "1: \n\t"
02429 "movaps (%2, %0), %%xmm0 \n\t"
02430 "movaps 16(%2, %0), %%xmm1 \n\t"
02431 "movaps 32(%2, %0), %%xmm2 \n\t"
02432 "movaps 48(%2, %0), %%xmm3 \n\t"
02433 "maxps %%xmm4, %%xmm0 \n\t"
02434 "maxps %%xmm4, %%xmm1 \n\t"
02435 "maxps %%xmm4, %%xmm2 \n\t"
02436 "maxps %%xmm4, %%xmm3 \n\t"
02437 "minps %%xmm5, %%xmm0 \n\t"
02438 "minps %%xmm5, %%xmm1 \n\t"
02439 "minps %%xmm5, %%xmm2 \n\t"
02440 "minps %%xmm5, %%xmm3 \n\t"
02441 "movaps %%xmm0, (%1, %0) \n\t"
02442 "movaps %%xmm1, 16(%1, %0) \n\t"
02443 "movaps %%xmm2, 32(%1, %0) \n\t"
02444 "movaps %%xmm3, 48(%1, %0) \n\t"
02445 "sub $64, %0 \n\t"
02446 "jge 1b \n\t"
02447 : "+&r"(i)
02448 : "r"(dst), "r"(src), "m"(min), "m"(max)
02449 : "memory"
02450 );
02451 }
02452
02453 #endif /* HAVE_INLINE_ASM */
02454
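/* Prototypes for the integer and float vector routines implemented in
 * yasm. */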
02455 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
02456 int order);
02457 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
02458 int order);
02459 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
02460 const int16_t *v3,
02461 int order, int mul);
02462 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
02463 const int16_t *v3,
02464 int order, int mul);
02465 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
02466 const int16_t *v3,
02467 int order, int mul);
02468
02469 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
02470 const int16_t *window, unsigned int len);
02471 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
02472 const int16_t *window, unsigned int len);
02473 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
02474 const int16_t *window, unsigned int len);
02475 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
02476 const int16_t *window, unsigned int len);
02477 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
02478 const int16_t *window, unsigned int len);
02479 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
02480 const int16_t *window, unsigned int len);
02481
02482 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
02483 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
02484
02485 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
02486 const uint8_t *diff, int w,
02487 int *left, int *left_top);
02488 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
02489 int w, int left);
02490 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
02491 int w, int left);
02492
02493 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
02494
02495 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
02496 const float *src1, int len);
02497 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
02498 const float *src1, int len);
02499
02500 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
02501 const float *src2, int len);
02502 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
02503 const float *src2, int len);
02504
02505 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
02506 int32_t min, int32_t max, unsigned int len);
02507 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
02508 int32_t min, int32_t max, unsigned int len);
02509 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
02510 int32_t min, int32_t max, unsigned int len);
02511 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
02512 int32_t min, int32_t max, unsigned int len);
02513
02514 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
02515 const float *src1, int len);
02516 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
02517 const float *src1, int len);
02518
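/* Fill one 16-entry qpel function table. For example,
 * SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ) assigns
 * c->put_qpel_pixels_tab[0][1] = put_qpel16_mc10_mmx2 and so on for all 16
 * (x, y) subpel positions. */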
02519 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
02520 do { \
02521 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
02522 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
02523 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
02524 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
02525 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
02526 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
02527 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
02528 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
02529 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
02530 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
02531 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
02532 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
02533 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
02534 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
02535 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
02536 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
02537 } while (0)
02538
02539 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
02540 do { \
02541 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
02542 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
02543 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
02544 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
02545 } while (0)
02546
02547 #define H264_QPEL_FUNCS(x, y, CPU) \
02548 do { \
02549 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
02550 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
02551 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
02552 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
02553 } while (0)
02554
02555 #define H264_QPEL_FUNCS_10(x, y, CPU) \
02556 do { \
02557 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
02558 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
02559 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
02560 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
02561 } while (0)
02562
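/* Per-feature init helpers: each installs only the pointers for the CPU
 * feature it is named after. ff_dsputil_init_mmx() at the bottom calls them
 * in increasing order of capability, so later sets override earlier ones. */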
02563 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02564 {
02565 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02566
02567 #if HAVE_INLINE_ASM
02568 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
02569 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
02570 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
02571
02572 if (!high_bit_depth) {
02573 c->clear_block = clear_block_mmx;
02574 c->clear_blocks = clear_blocks_mmx;
02575 c->draw_edges = draw_edges_mmx;
02576
02577 SET_HPEL_FUNCS(put, 0, 16, mmx);
02578 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
02579 SET_HPEL_FUNCS(avg, 0, 16, mmx);
02580 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
02581 SET_HPEL_FUNCS(put, 1, 8, mmx);
02582 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
02583 SET_HPEL_FUNCS(avg, 1, 8, mmx);
02584 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
02585 }
02586
02587 #if ARCH_X86_32 || !HAVE_YASM
02588 c->gmc = gmc_mmx;
02589 #endif
02590
02591 c->add_bytes = add_bytes_mmx;
02592
02593 c->put_no_rnd_pixels_l2[0] = put_vp_no_rnd_pixels16_l2_mmx;
02594 c->put_no_rnd_pixels_l2[1] = put_vp_no_rnd_pixels8_l2_mmx;
02595
02596 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
02597 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
02598 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
02599 }
02600 #endif /* HAVE_INLINE_ASM */
02601
02602 #if HAVE_YASM
02603 #if ARCH_X86_32
02604 if (!high_bit_depth)
02605 c->emulated_edge_mc = emulated_edge_mc_mmx;
02606 #endif
02607
02608 if (!high_bit_depth && CONFIG_H264CHROMA) {
02609 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
02610 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
02611 }
02612
02613 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
02614 #endif /* HAVE_YASM */
02615
02616 }
02617
02618 static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
02619 int mm_flags)
02620 {
02621 const int bit_depth = avctx->bits_per_raw_sample;
02622 const int high_bit_depth = bit_depth > 8;
02623
02624 #if HAVE_INLINE_ASM
02625 c->prefetch = prefetch_mmx2;
02626
02627 if (!high_bit_depth) {
02628 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
02629 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
02630
02631 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
02632 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
02633 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
02634
02635 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
02636 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
02637
02638 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
02639 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
02640 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
02641 }
02642
02643 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
02644 if (!high_bit_depth) {
02645 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
02646 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
02647 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
02648 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
02649
02650 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
02651 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
02652 }
02653 }
02654
02655 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
02656 avctx->codec_id == AV_CODEC_ID_THEORA)) {
02657 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
02658 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
02659 }
02660 #endif /* HAVE_INLINE_ASM */
02661
02662 if (CONFIG_H264QPEL) {
02663 #if HAVE_INLINE_ASM
02664 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
02665 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
02666 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
02667 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
02668 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
02669 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
02670 #endif
02671
02672 if (!high_bit_depth) {
02673 #if HAVE_INLINE_ASM
02674 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
02675 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
02676 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
02677 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
02678 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
02679 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
02680 #endif
02681 } else if (bit_depth == 10) {
02682 #if HAVE_YASM
02683 #if !ARCH_X86_64
02684 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
02685 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
02686 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
02687 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
02688 #endif /* !ARCH_X86_64 */
02689 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
02690 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
02691 #endif /* HAVE_YASM */
02692 }
02693
02694 #if HAVE_INLINE_ASM
02695 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
02696 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
02697 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
02698 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
02699 #endif
02700 }
02701
02702 #if HAVE_YASM
02703 if (!high_bit_depth && CONFIG_H264CHROMA) {
02704 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
02705 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
02706 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
02707 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
02708 }
02709 if (bit_depth == 10 && CONFIG_H264CHROMA) {
02710 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
02711 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
02712 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
02713 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
02714 }
02715
02716
02717 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
02718 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
02719
02720 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
02721 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
02722
02723 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02724 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
02725 } else {
02726 c->apply_window_int16 = ff_apply_window_int16_mmxext;
02727 }
02728 #endif /* HAVE_YASM */
02729 }
02730
02731 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
02732 int mm_flags)
02733 {
02734 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02735
02736 #if HAVE_INLINE_ASM
02737 c->prefetch = prefetch_3dnow;
02738
02739 if (!high_bit_depth) {
02740 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02741 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02742
02743 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02744 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02745 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02746
02747 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02748 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02749
02750 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02751 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02752 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02753
02754 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
02755 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02756 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02757 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02758 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02759
02760 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02761 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02762 }
02763 }
02764
02765 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
02766 avctx->codec_id == AV_CODEC_ID_THEORA)) {
02767 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
02768 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
02769 }
02770
02771 if (CONFIG_H264QPEL) {
02772 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
02773 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
02774 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
02775 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
02776 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
02777 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
02778
02779 if (!high_bit_depth) {
02780 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
02781 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
02782 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
02783 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
02784 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
02785 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
02786 }
02787
02788 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
02789 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
02790 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
02791 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
02792 }
02793
02794 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
02795 #endif /* HAVE_INLINE_ASM */
02796
02797 #if HAVE_YASM
02798 if (!high_bit_depth && CONFIG_H264CHROMA) {
02799 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
02800 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
02801 }
02802 #endif
02803 }
02804
02805 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
02806 int mm_flags)
02807 {
02808 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
02809 c->vector_fmul_window = vector_fmul_window_3dnowext;
02810 #endif
02811 }
02812
02813 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02814 {
02815 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02816
02817 #if HAVE_INLINE_ASM
02818 if (!high_bit_depth) {
02819 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
02820
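/* XvMC blocks are allocated externally and may not be 16-byte aligned, so
 * keep the MMX clear routines in that case. */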
02821 c->clear_block = clear_block_sse;
02822 c->clear_blocks = clear_blocks_sse;
02823 }
02824 }
02825
02826 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
02827
02828 #if HAVE_6REGS
02829 c->vector_fmul_window = vector_fmul_window_sse;
02830 #endif
02831
02832 c->vector_clipf = vector_clipf_sse;
02833 #endif /* HAVE_INLINE_ASM */
02834
02835 #if HAVE_YASM
02836 c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
02837 c->vector_fmul_add = ff_vector_fmul_add_sse;
02838
02839 c->scalarproduct_float = ff_scalarproduct_float_sse;
02840 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
02841
02842 if (!high_bit_depth)
02843 c->emulated_edge_mc = emulated_edge_mc_sse;
02844 #if HAVE_INLINE_ASM
02845 c->gmc = gmc_sse;
02846 #endif /* HAVE_INLINE_ASM */
02847 #endif /* HAVE_YASM */
02848 }
02849
02850 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
02851 int mm_flags)
02852 {
02853 const int bit_depth = avctx->bits_per_raw_sample;
02854
02855 #if HAVE_INLINE_ASM
02856 const int high_bit_depth = bit_depth > 8;
02857
02858 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02859
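/* On SSE2SLOW CPUs (e.g. first-generation Athlon 64) SSE2 instructions are
 * executed in 64-bit halves, so the MMX(EXT) versions stay faster there. */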
02860 if (!high_bit_depth) {
02861 c->put_pixels_tab[0][0] = put_pixels16_sse2;
02862 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
02863 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
02864 if (CONFIG_H264QPEL)
02865 H264_QPEL_FUNCS(0, 0, sse2);
02866 }
02867 }
02868
02869 if (!high_bit_depth && CONFIG_H264QPEL) {
02870 H264_QPEL_FUNCS(0, 1, sse2);
02871 H264_QPEL_FUNCS(0, 2, sse2);
02872 H264_QPEL_FUNCS(0, 3, sse2);
02873 H264_QPEL_FUNCS(1, 1, sse2);
02874 H264_QPEL_FUNCS(1, 2, sse2);
02875 H264_QPEL_FUNCS(1, 3, sse2);
02876 H264_QPEL_FUNCS(2, 1, sse2);
02877 H264_QPEL_FUNCS(2, 2, sse2);
02878 H264_QPEL_FUNCS(2, 3, sse2);
02879 H264_QPEL_FUNCS(3, 1, sse2);
02880 H264_QPEL_FUNCS(3, 2, sse2);
02881 H264_QPEL_FUNCS(3, 3, sse2);
02882 }
02883 #endif /* HAVE_INLINE_ASM */
02884
02885 #if HAVE_YASM
02886 if (bit_depth == 10) {
02887 if (CONFIG_H264QPEL) {
02888 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
02889 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
02890 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
02891 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
02892 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
02893 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
02894 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
02895 }
02896 if (CONFIG_H264CHROMA) {
02897 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
02898 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
02899 }
02900 }
02901
02902 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
02903 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
02904 if (mm_flags & AV_CPU_FLAG_ATOM) {
02905 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
02906 } else {
02907 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
02908 }
02909 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02910 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
02911 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02912 c->apply_window_int16 = ff_apply_window_int16_sse2;
02913 }
02914 c->bswap_buf = ff_bswap32_buf_sse2;
02915 #endif /* HAVE_YASM */
02916 }
02917
02918 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
02919 int mm_flags)
02920 {
02921 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02922 const int bit_depth = avctx->bits_per_raw_sample;
02923
02924 #if HAVE_SSSE3_INLINE
02925 if (!high_bit_depth && CONFIG_H264QPEL) {
02926 H264_QPEL_FUNCS(1, 0, ssse3);
02927 H264_QPEL_FUNCS(1, 1, ssse3);
02928 H264_QPEL_FUNCS(1, 2, ssse3);
02929 H264_QPEL_FUNCS(1, 3, ssse3);
02930 H264_QPEL_FUNCS(2, 0, ssse3);
02931 H264_QPEL_FUNCS(2, 1, ssse3);
02932 H264_QPEL_FUNCS(2, 2, ssse3);
02933 H264_QPEL_FUNCS(2, 3, ssse3);
02934 H264_QPEL_FUNCS(3, 0, ssse3);
02935 H264_QPEL_FUNCS(3, 1, ssse3);
02936 H264_QPEL_FUNCS(3, 2, ssse3);
02937 H264_QPEL_FUNCS(3, 3, ssse3);
02938 }
02939 #endif /* HAVE_SSSE3_INLINE */
02940
02941 #if HAVE_SSSE3_EXTERNAL
02942 if (bit_depth == 10 && CONFIG_H264QPEL) {
02943 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
02944 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
02945 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
02946 }
02947 if (!high_bit_depth && CONFIG_H264CHROMA) {
02948 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
02949 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
02950 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
02951 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
02952 }
02953 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
02954 if (mm_flags & AV_CPU_FLAG_SSE4)
02955 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
02956
02957 if (mm_flags & AV_CPU_FLAG_ATOM)
02958 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
02959 else
02960 c->apply_window_int16 = ff_apply_window_int16_ssse3;
02961 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)))
02962 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
02963 c->bswap_buf = ff_bswap32_buf_ssse3;
02964 #endif /* HAVE_SSSE3_EXTERNAL */
02965 }
02966
02967 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
02968 int mm_flags)
02969 {
02970 #if HAVE_SSE4_EXTERNAL
02971 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
02972 #endif
02973 }
02974
02975 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02976 {
02977 #if HAVE_AVX_EXTERNAL
02978 const int bit_depth = avctx->bits_per_raw_sample;
02979
02980 if (bit_depth == 10) {
02981
02982
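/* No dedicated AVX qpel yet; with AVX present the cache64 workarounds are
 * unnecessary, so reinstall the plain sse2 versions. */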
02983 if (CONFIG_H264QPEL) {
02984 H264_QPEL_FUNCS_10(1, 0, sse2);
02985 H264_QPEL_FUNCS_10(2, 0, sse2);
02986 H264_QPEL_FUNCS_10(3, 0, sse2);
02987 }
02988
02989 if (CONFIG_H264CHROMA) {
02990 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
02991 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
02992 }
02993 }
02994 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
02995 c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
02996 c->vector_fmul_add = ff_vector_fmul_add_avx;
02997 #endif /* HAVE_AVX_EXTERNAL */
02998 }
02999
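/* Entry point, invoked from the generic dsputil init: selects the IDCT and
 * then layers the per-feature initialisations according to the CPU flags. */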
03000 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
03001 {
03002 int mm_flags = av_get_cpu_flags();
03003
03004 #if HAVE_7REGS && HAVE_INLINE_ASM
03005 if (mm_flags & AV_CPU_FLAG_CMOV)
03006 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
03007 #endif
03008
03009 if (mm_flags & AV_CPU_FLAG_MMX) {
03010 #if HAVE_INLINE_ASM
03011 const int idct_algo = avctx->idct_algo;
03012
03013 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
03014 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
03015 c->idct_put = ff_simple_idct_put_mmx;
03016 c->idct_add = ff_simple_idct_add_mmx;
03017 c->idct = ff_simple_idct_mmx;
03018 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
03019 #if CONFIG_GPL
03020 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
03021 if (mm_flags & AV_CPU_FLAG_MMXEXT) {
03022 c->idct_put = ff_libmpeg2mmx2_idct_put;
03023 c->idct_add = ff_libmpeg2mmx2_idct_add;
03024 c->idct = ff_mmxext_idct;
03025 } else {
03026 c->idct_put = ff_libmpeg2mmx_idct_put;
03027 c->idct_add = ff_libmpeg2mmx_idct_add;
03028 c->idct = ff_mmx_idct;
03029 }
03030 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
03031 #endif /* CONFIG_GPL */
03032 } else if (idct_algo == FF_IDCT_XVIDMMX) {
03033 if (mm_flags & AV_CPU_FLAG_SSE2) {
03034 c->idct_put = ff_idct_xvid_sse2_put;
03035 c->idct_add = ff_idct_xvid_sse2_add;
03036 c->idct = ff_idct_xvid_sse2;
03037 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
03038 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
03039 c->idct_put = ff_idct_xvid_mmx2_put;
03040 c->idct_add = ff_idct_xvid_mmx2_add;
03041 c->idct = ff_idct_xvid_mmx2;
03042 } else {
03043 c->idct_put = ff_idct_xvid_mmx_put;
03044 c->idct_add = ff_idct_xvid_mmx_add;
03045 c->idct = ff_idct_xvid_mmx;
03046 }
03047 }
03048 }
03049 #endif /* HAVE_INLINE_ASM */
03050
03051 dsputil_init_mmx(c, avctx, mm_flags);
03052 }
03053
03054 if (mm_flags & AV_CPU_FLAG_MMXEXT)
03055 dsputil_init_mmx2(c, avctx, mm_flags);
03056
03057 if (mm_flags & AV_CPU_FLAG_3DNOW)
03058 dsputil_init_3dnow(c, avctx, mm_flags);
03059
03060 if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
03061 dsputil_init_3dnowext(c, avctx, mm_flags);
03062
03063 if (mm_flags & AV_CPU_FLAG_SSE)
03064 dsputil_init_sse(c, avctx, mm_flags);
03065
03066 if (mm_flags & AV_CPU_FLAG_SSE2)
03067 dsputil_init_sse2(c, avctx, mm_flags);
03068
03069 if (mm_flags & AV_CPU_FLAG_SSSE3)
03070 dsputil_init_ssse3(c, avctx, mm_flags);
03071
03072 if (mm_flags & AV_CPU_FLAG_SSE4)
03073 dsputil_init_sse4(c, avctx, mm_flags);
03074
03075 if (mm_flags & AV_CPU_FLAG_AVX)
03076 dsputil_init_avx(c, avctx, mm_flags);
03077
03078 if (CONFIG_ENCODERS)
03079 ff_dsputilenc_init_mmx(c, avctx);
03080 }