me_cmp_init.c
/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

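/* Prototypes for routines implemented in external (yasm) assembly; they are
 * wired up in ff_me_cmp_init_x86() below under the EXTERNAL_*()/HAVE_YASM
 * checks. */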
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                int line_size, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 int line_size, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);

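/* Declare the 8x8 and 16-pixel-wide Hadamard-transform difference metrics for
 * one instruction-set suffix; the implementations live in external assembly. */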
#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, int stride, int h);          \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
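/* NSSE ("noise preserving" SSE): the plain sum of squared errors plus the
 * absolute difference in high-frequency noise between the two blocks,
 * weighted by avctx->nsse_weight (8 is used when no context is available). */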
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1 + 8, line_size, h)
           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2 + 8, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
                 ff_hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM

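/* Intra "vertical SAD": sum of absolute differences between each row of the
 * 16-pixel-wide block and the row below it. Plain MMX has no byte-SAD
 * instruction, so |a - b| is built from two saturating subtractions (psubusb)
 * OR'd together, then widened with punpck{l,h}bw and accumulated as 16-bit
 * words in %mm6. */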
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq 8(%0), %%mm3\n"               \
    "add %2,%0\n"                       \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

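/* Same metric as above, but MMXEXT provides psadbw, which performs the
 * byte-wise absolute difference and horizontal sum in a single instruction. */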
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), " #out0 "\n"            \
    "movq 8(%0), " #out1 "\n"           \
    "add %2, %0\n"                      \
    "psadbw " #out0 ", " #in0 "\n"      \
    "psadbw " #out1 ", " #in1 "\n"      \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

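/* Inter vertical SAD: the same row-to-row measure applied to the difference
 * signal pix1 - pix2. %mm7 is filled with 0x80 bytes so the wrapped signed
 * difference produced by psubb can be converted, via pxor, into a biased
 * unsigned value that the psubusb/por absolute-difference trick can handle. */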
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq (%1), " #out0 "\n"            \
    "movq 8(%0), %%mm3\n"               \
    "movq 8(%1), " #out1 "\n"           \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb " #out0 ", %%mm2\n"          \
    "psubb " #out1 ", %%mm3\n"          \
    "pxor %%mm7, %%mm2\n"               \
    "pxor %%mm7, %%mm3\n"               \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

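/* MMXEXT variant of the above: the byte-bias trick is kept, but the per-row
 * absolute-difference-and-sum is done with psadbw. */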
static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), " #out0 "\n"            \
    "movq (%1), %%mm2\n"                \
    "movq 8(%0), " #out1 "\n"           \
    "movq 8(%1), %%mm3\n"               \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb %%mm2, " #out0 "\n"          \
    "psubb %%mm3, " #out1 "\n"          \
    "pxor %%mm7, " #out0 "\n"           \
    "pxor %%mm7, " #out1 "\n"           \
    "psadbw " #out0 ", " #in0 "\n"      \
    "psadbw " #out1 ", " #in1 "\n"      \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM


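/* Rounding constants for the half-pel SAD routines: round_tab[1] is used for
 * the 2-tap (x2/y2) averages, round_tab[2] for the 4-tap (xy2) case. "bone" is
 * a byte-wise 1, subtracted in sad8_4_mmxext() to compensate for the upward
 * rounding bias that accumulates when pavgb is applied twice. */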
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

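/* Plain-MMX 8-pixel-wide SAD core. The PIX_SAD() wrappers below zero %mm7 and
 * the %mm6 accumulator before calling it; two rows are processed per loop
 * iteration, again using the psubusb/por absolute-difference construction. */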
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}

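/* MMXEXT core: psadbw sums |blk1[i] - blk2[i]| over all eight bytes at once,
 * two rows per iteration, accumulating into %mm6. */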
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

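/* Self-contained 16-pixel-wide SAD for SSE2: unaligned loads for blk1, psadbw
 * with a memory operand for blk2 (which therefore must be 16-byte aligned),
 * two rows per iteration; movhlps folds the upper partial sum into the lower. */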
static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "movd %%xmm2, %3                \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}

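/* The x2a/y2a/4 variants below compare one block against the other
 * interpolated at the horizontal, vertical and diagonal half-pel positions,
 * using pavgb for the averaging. */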
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

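/* Diagonal (xy2) half-pel SAD: rows are first averaged horizontally with
 * pavgb 1(%1), then vertically with a second pavgb. Since pavgb rounds up both
 * times, the byte-wise 1 ("bone") is subtracted in between to keep the result
 * close to the correctly rounded 4-tap average. */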
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride)
          NAMED_CONSTRAINTS_ADD(bone));
}

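/* Plain-MMX helper for the x2/y2 cases: blk1a and blk1b are the two source
 * blocks to average (offset by one pixel for x2, by one row for y2); %mm5 must
 * hold round_tab[1] and %mm7 zero, as set up by the PIX_SAD() wrappers. */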
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "movq %%mm3, %%mm1              \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride), "m" (round_tab[2]));
}

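/* Fold the four 16-bit partial sums kept in %mm6 into a single result. The
 * MMXEXT accumulator already holds a scalar (psadbw output), so it is read
 * back directly. */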
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

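/* Instantiate the exported SAD entry points (full-pel plus x2/y2/xy2 half-pel,
 * in 8- and 16-pixel-wide flavours) for one instruction-set suffix. Each
 * wrapper clears %mm7 and the %mm6 accumulator, loads the rounding constant
 * into %mm5 where needed, calls the core routine(s) and folds the result. */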
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

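/* Hook the routines above into the MECmpContext function-pointer table,
 * depending on the CPU features available at runtime. */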
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmxext;

        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        c->pix_abs[0][1] = sad16_x2_mmxext;
        c->pix_abs[0][2] = sad16_y2_mmxext;
        c->pix_abs[1][1] = sad8_x2_mmxext;
        c->pix_abs[1][2] = sad8_y2_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;

            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        c->sad[0] = sad16_sse2;
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]            = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}