FFmpeg
rgb2rgb.c
Go to the documentation of this file.
1 /*
2  * software RGB to RGB converter
3  * software PAL8 to RGB converter
4  * software YUV to YUV converter
5  * software YUV to RGB converter
6  * Written by Nick Kurshev.
7  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8  *
9  * This file is part of FFmpeg.
10  *
11  * FFmpeg is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public
13  * License as published by the Free Software Foundation; either
14  * version 2.1 of the License, or (at your option) any later version.
15  *
16  * FFmpeg is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with FFmpeg; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24  */
25 
26 #include <stdint.h>
27 
28 #include "config.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/x86/cpu.h"
31 #include "libavutil/cpu.h"
32 #include "libavutil/bswap.h"
33 #include "libavutil/mem_internal.h"
34 
35 #include "libswscale/rgb2rgb.h"
36 #include "libswscale/swscale.h"
38 
39 #if HAVE_INLINE_ASM
40 #include "libavutil/x86/asm.h"
41 
42 DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
43 DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
44 DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
45 DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
46 DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
47 DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
48 DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
49 DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
50 DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
51 DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
52 DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
53 DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
54 DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
55 DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
56 DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
57 DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
58 DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
59 DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
60 #define mask16b mask15b
61 DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
62 DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
63 #define red_16mask mask3215g
64 DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
65 DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
66 DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
67 DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
68 #define blue_15mask blue_16mask
69 DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
70 DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
71 DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
72 
73 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
74 #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
75 #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
76 #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
77 #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
78 #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
79 #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
80 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
81 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
82 
83 // MMXEXT versions
84 #define PREFETCH "prefetchnta"
85 #define PAVGB "pavgb"
86 #define MOVNTQ "movntq"
87 #define SFENCE "sfence"
88 
89 #define EMMS "emms"
90 
91 static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
92 {
93  uint8_t *dest = dst;
94  const uint8_t *s = src;
95  const uint8_t *end;
96  const uint8_t *mm_end;
97  end = s + src_size;
98  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
99  mm_end = end - 23;
100  __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
101  while (s < mm_end) {
102  __asm__ volatile(
103  PREFETCH" 32(%1) \n\t"
104  "movd (%1), %%mm0 \n\t"
105  "punpckldq 3(%1), %%mm0 \n\t"
106  "movd 6(%1), %%mm1 \n\t"
107  "punpckldq 9(%1), %%mm1 \n\t"
108  "movd 12(%1), %%mm2 \n\t"
109  "punpckldq 15(%1), %%mm2 \n\t"
110  "movd 18(%1), %%mm3 \n\t"
111  "punpckldq 21(%1), %%mm3 \n\t"
112  "por %%mm7, %%mm0 \n\t"
113  "por %%mm7, %%mm1 \n\t"
114  "por %%mm7, %%mm2 \n\t"
115  "por %%mm7, %%mm3 \n\t"
116  MOVNTQ" %%mm0, (%0) \n\t"
117  MOVNTQ" %%mm1, 8(%0) \n\t"
118  MOVNTQ" %%mm2, 16(%0) \n\t"
119  MOVNTQ" %%mm3, 24(%0)"
120  :: "r"(dest), "r"(s)
121  :"memory");
122  dest += 32;
123  s += 24;
124  }
125  __asm__ volatile(SFENCE:::"memory");
126  __asm__ volatile(EMMS:::"memory");
127  while (s < end) {
128  *dest++ = *s++;
129  *dest++ = *s++;
130  *dest++ = *s++;
131  *dest++ = 255;
132  }
133 }
134 
135 #define STORE_BGR24_MMX \
136  "psrlq $8, %%mm2 \n\t" \
137  "psrlq $8, %%mm3 \n\t" \
138  "psrlq $8, %%mm6 \n\t" \
139  "psrlq $8, %%mm7 \n\t" \
140  "pand "MANGLE(mask24l)", %%mm0\n\t" \
141  "pand "MANGLE(mask24l)", %%mm1\n\t" \
142  "pand "MANGLE(mask24l)", %%mm4\n\t" \
143  "pand "MANGLE(mask24l)", %%mm5\n\t" \
144  "pand "MANGLE(mask24h)", %%mm2\n\t" \
145  "pand "MANGLE(mask24h)", %%mm3\n\t" \
146  "pand "MANGLE(mask24h)", %%mm6\n\t" \
147  "pand "MANGLE(mask24h)", %%mm7\n\t" \
148  "por %%mm2, %%mm0 \n\t" \
149  "por %%mm3, %%mm1 \n\t" \
150  "por %%mm6, %%mm4 \n\t" \
151  "por %%mm7, %%mm5 \n\t" \
152  \
153  "movq %%mm1, %%mm2 \n\t" \
154  "movq %%mm4, %%mm3 \n\t" \
155  "psllq $48, %%mm2 \n\t" \
156  "psllq $32, %%mm3 \n\t" \
157  "por %%mm2, %%mm0 \n\t" \
158  "psrlq $16, %%mm1 \n\t" \
159  "psrlq $32, %%mm4 \n\t" \
160  "psllq $16, %%mm5 \n\t" \
161  "por %%mm3, %%mm1 \n\t" \
162  "por %%mm5, %%mm4 \n\t" \
163  \
164  MOVNTQ" %%mm0, (%0) \n\t" \
165  MOVNTQ" %%mm1, 8(%0) \n\t" \
166  MOVNTQ" %%mm4, 16(%0)"
167 
168 
169 static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
170 {
171  uint8_t *dest = dst;
172  const uint8_t *s = src;
173  const uint8_t *end;
174  const uint8_t *mm_end;
175  end = s + src_size;
176  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
177  mm_end = end - 31;
178  while (s < mm_end) {
179  __asm__ volatile(
180  PREFETCH" 32(%1) \n\t"
181  "movq (%1), %%mm0 \n\t"
182  "movq 8(%1), %%mm1 \n\t"
183  "movq 16(%1), %%mm4 \n\t"
184  "movq 24(%1), %%mm5 \n\t"
185  "movq %%mm0, %%mm2 \n\t"
186  "movq %%mm1, %%mm3 \n\t"
187  "movq %%mm4, %%mm6 \n\t"
188  "movq %%mm5, %%mm7 \n\t"
189  STORE_BGR24_MMX
190  :: "r"(dest), "r"(s)
191  NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
192  :"memory");
193  dest += 24;
194  s += 32;
195  }
196  __asm__ volatile(SFENCE:::"memory");
197  __asm__ volatile(EMMS:::"memory");
198  while (s < end) {
199  *dest++ = *s++;
200  *dest++ = *s++;
201  *dest++ = *s++;
202  s++;
203  }
204 }
205 
206 /*
207  original by Strepto/Astral
208  ported to gcc & bugfixed: A'rpi
209  MMXEXT, 3DNOW optimization by Nick Kurshev
210  32-bit C version, and and&add trick by Michael Niedermayer
211 */
212 static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
213 {
214  register const uint8_t* s=src;
215  register uint8_t* d=dst;
216  register const uint8_t *end;
217  const uint8_t *mm_end;
218  end = s + src_size;
219  __asm__ volatile(PREFETCH" %0"::"m"(*s));
220  __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
221  mm_end = end - 15;
222  while (s<mm_end) {
223  __asm__ volatile(
224  PREFETCH" 32(%1) \n\t"
225  "movq (%1), %%mm0 \n\t"
226  "movq 8(%1), %%mm2 \n\t"
227  "movq %%mm0, %%mm1 \n\t"
228  "movq %%mm2, %%mm3 \n\t"
229  "pand %%mm4, %%mm0 \n\t"
230  "pand %%mm4, %%mm2 \n\t"
231  "paddw %%mm1, %%mm0 \n\t"
232  "paddw %%mm3, %%mm2 \n\t"
233  MOVNTQ" %%mm0, (%0) \n\t"
234  MOVNTQ" %%mm2, 8(%0)"
235  :: "r"(d), "r"(s)
236  );
237  d+=16;
238  s+=16;
239  }
240  __asm__ volatile(SFENCE:::"memory");
241  __asm__ volatile(EMMS:::"memory");
242  mm_end = end - 3;
243  while (s < mm_end) {
244  register unsigned x= *((const uint32_t *)s);
245  *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
246  d+=4;
247  s+=4;
248  }
249  if (s < end) {
250  register unsigned short x= *((const uint16_t *)s);
251  *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
252  }
253 }
254 
255 static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
256 {
257  register const uint8_t* s=src;
258  register uint8_t* d=dst;
259  register const uint8_t *end;
260  const uint8_t *mm_end;
261  end = s + src_size;
262  __asm__ volatile(PREFETCH" %0"::"m"(*s));
263  __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
264  __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
265  mm_end = end - 15;
266  while (s<mm_end) {
267  __asm__ volatile(
268  PREFETCH" 32(%1) \n\t"
269  "movq (%1), %%mm0 \n\t"
270  "movq 8(%1), %%mm2 \n\t"
271  "movq %%mm0, %%mm1 \n\t"
272  "movq %%mm2, %%mm3 \n\t"
273  "psrlq $1, %%mm0 \n\t"
274  "psrlq $1, %%mm2 \n\t"
275  "pand %%mm7, %%mm0 \n\t"
276  "pand %%mm7, %%mm2 \n\t"
277  "pand %%mm6, %%mm1 \n\t"
278  "pand %%mm6, %%mm3 \n\t"
279  "por %%mm1, %%mm0 \n\t"
280  "por %%mm3, %%mm2 \n\t"
281  MOVNTQ" %%mm0, (%0) \n\t"
282  MOVNTQ" %%mm2, 8(%0)"
283  :: "r"(d), "r"(s)
284  );
285  d+=16;
286  s+=16;
287  }
288  __asm__ volatile(SFENCE:::"memory");
289  __asm__ volatile(EMMS:::"memory");
290  mm_end = end - 3;
291  while (s < mm_end) {
292  register uint32_t x= *((const uint32_t*)s);
293  *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
294  s+=4;
295  d+=4;
296  }
297  if (s < end) {
298  register uint16_t x= *((const uint16_t*)s);
299  *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300  }
301 }
302 
303 static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
304 {
305  const uint8_t *s = src;
306  const uint8_t *end;
307  const uint8_t *mm_end;
308  uint16_t *d = (uint16_t *)dst;
309  end = s + src_size;
310  mm_end = end - 15;
311  __asm__ volatile(
312  "movq %3, %%mm5 \n\t"
313  "movq %4, %%mm6 \n\t"
314  "movq %5, %%mm7 \n\t"
315  "jmp 2f \n\t"
316  ".p2align 4 \n\t"
317  "1: \n\t"
318  PREFETCH" 32(%1) \n\t"
319  "movd (%1), %%mm0 \n\t"
320  "movd 4(%1), %%mm3 \n\t"
321  "punpckldq 8(%1), %%mm0 \n\t"
322  "punpckldq 12(%1), %%mm3 \n\t"
323  "movq %%mm0, %%mm1 \n\t"
324  "movq %%mm3, %%mm4 \n\t"
325  "pand %%mm6, %%mm0 \n\t"
326  "pand %%mm6, %%mm3 \n\t"
327  "pmaddwd %%mm7, %%mm0 \n\t"
328  "pmaddwd %%mm7, %%mm3 \n\t"
329  "pand %%mm5, %%mm1 \n\t"
330  "pand %%mm5, %%mm4 \n\t"
331  "por %%mm1, %%mm0 \n\t"
332  "por %%mm4, %%mm3 \n\t"
333  "psrld $5, %%mm0 \n\t"
334  "pslld $11, %%mm3 \n\t"
335  "por %%mm3, %%mm0 \n\t"
336  MOVNTQ" %%mm0, (%0) \n\t"
337  "add $16, %1 \n\t"
338  "add $8, %0 \n\t"
339  "2: \n\t"
340  "cmp %2, %1 \n\t"
341  " jb 1b \n\t"
342  : "+r" (d), "+r"(s)
343  : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
344  );
345  __asm__ volatile(SFENCE:::"memory");
346  __asm__ volatile(EMMS:::"memory");
347  while (s < end) {
348  register int rgb = *(const uint32_t*)s; s += 4;
349  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
350  }
351 }
352 
353 static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
354 {
355  const uint8_t *s = src;
356  const uint8_t *end;
357  const uint8_t *mm_end;
358  uint16_t *d = (uint16_t *)dst;
359  end = s + src_size;
360  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
361  __asm__ volatile(
362  "movq %0, %%mm7 \n\t"
363  "movq %1, %%mm6 \n\t"
364  ::"m"(red_16mask),"m"(green_16mask));
365  mm_end = end - 15;
366  while (s < mm_end) {
367  __asm__ volatile(
368  PREFETCH" 32(%1) \n\t"
369  "movd (%1), %%mm0 \n\t"
370  "movd 4(%1), %%mm3 \n\t"
371  "punpckldq 8(%1), %%mm0 \n\t"
372  "punpckldq 12(%1), %%mm3 \n\t"
373  "movq %%mm0, %%mm1 \n\t"
374  "movq %%mm0, %%mm2 \n\t"
375  "movq %%mm3, %%mm4 \n\t"
376  "movq %%mm3, %%mm5 \n\t"
377  "psllq $8, %%mm0 \n\t"
378  "psllq $8, %%mm3 \n\t"
379  "pand %%mm7, %%mm0 \n\t"
380  "pand %%mm7, %%mm3 \n\t"
381  "psrlq $5, %%mm1 \n\t"
382  "psrlq $5, %%mm4 \n\t"
383  "pand %%mm6, %%mm1 \n\t"
384  "pand %%mm6, %%mm4 \n\t"
385  "psrlq $19, %%mm2 \n\t"
386  "psrlq $19, %%mm5 \n\t"
387  "pand %2, %%mm2 \n\t"
388  "pand %2, %%mm5 \n\t"
389  "por %%mm1, %%mm0 \n\t"
390  "por %%mm4, %%mm3 \n\t"
391  "por %%mm2, %%mm0 \n\t"
392  "por %%mm5, %%mm3 \n\t"
393  "psllq $16, %%mm3 \n\t"
394  "por %%mm3, %%mm0 \n\t"
395  MOVNTQ" %%mm0, (%0) \n\t"
396  :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
397  d += 4;
398  s += 16;
399  }
400  __asm__ volatile(SFENCE:::"memory");
401  __asm__ volatile(EMMS:::"memory");
402  while (s < end) {
403  register int rgb = *(const uint32_t*)s; s += 4;
404  *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
405  }
406 }
407 
408 static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
409 {
410  const uint8_t *s = src;
411  const uint8_t *end;
412  const uint8_t *mm_end;
413  uint16_t *d = (uint16_t *)dst;
414  end = s + src_size;
415  mm_end = end - 15;
416  __asm__ volatile(
417  "movq %3, %%mm5 \n\t"
418  "movq %4, %%mm6 \n\t"
419  "movq %5, %%mm7 \n\t"
420  "jmp 2f \n\t"
421  ".p2align 4 \n\t"
422  "1: \n\t"
423  PREFETCH" 32(%1) \n\t"
424  "movd (%1), %%mm0 \n\t"
425  "movd 4(%1), %%mm3 \n\t"
426  "punpckldq 8(%1), %%mm0 \n\t"
427  "punpckldq 12(%1), %%mm3 \n\t"
428  "movq %%mm0, %%mm1 \n\t"
429  "movq %%mm3, %%mm4 \n\t"
430  "pand %%mm6, %%mm0 \n\t"
431  "pand %%mm6, %%mm3 \n\t"
432  "pmaddwd %%mm7, %%mm0 \n\t"
433  "pmaddwd %%mm7, %%mm3 \n\t"
434  "pand %%mm5, %%mm1 \n\t"
435  "pand %%mm5, %%mm4 \n\t"
436  "por %%mm1, %%mm0 \n\t"
437  "por %%mm4, %%mm3 \n\t"
438  "psrld $6, %%mm0 \n\t"
439  "pslld $10, %%mm3 \n\t"
440  "por %%mm3, %%mm0 \n\t"
441  MOVNTQ" %%mm0, (%0) \n\t"
442  "add $16, %1 \n\t"
443  "add $8, %0 \n\t"
444  "2: \n\t"
445  "cmp %2, %1 \n\t"
446  " jb 1b \n\t"
447  : "+r" (d), "+r"(s)
448  : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
449  );
450  __asm__ volatile(SFENCE:::"memory");
451  __asm__ volatile(EMMS:::"memory");
452  while (s < end) {
453  register int rgb = *(const uint32_t*)s; s += 4;
454  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
455  }
456 }
457 
458 static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
459 {
460  const uint8_t *s = src;
461  const uint8_t *end;
462  const uint8_t *mm_end;
463  uint16_t *d = (uint16_t *)dst;
464  end = s + src_size;
465  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
466  __asm__ volatile(
467  "movq %0, %%mm7 \n\t"
468  "movq %1, %%mm6 \n\t"
469  ::"m"(red_15mask),"m"(green_15mask));
470  mm_end = end - 15;
471  while (s < mm_end) {
472  __asm__ volatile(
473  PREFETCH" 32(%1) \n\t"
474  "movd (%1), %%mm0 \n\t"
475  "movd 4(%1), %%mm3 \n\t"
476  "punpckldq 8(%1), %%mm0 \n\t"
477  "punpckldq 12(%1), %%mm3 \n\t"
478  "movq %%mm0, %%mm1 \n\t"
479  "movq %%mm0, %%mm2 \n\t"
480  "movq %%mm3, %%mm4 \n\t"
481  "movq %%mm3, %%mm5 \n\t"
482  "psllq $7, %%mm0 \n\t"
483  "psllq $7, %%mm3 \n\t"
484  "pand %%mm7, %%mm0 \n\t"
485  "pand %%mm7, %%mm3 \n\t"
486  "psrlq $6, %%mm1 \n\t"
487  "psrlq $6, %%mm4 \n\t"
488  "pand %%mm6, %%mm1 \n\t"
489  "pand %%mm6, %%mm4 \n\t"
490  "psrlq $19, %%mm2 \n\t"
491  "psrlq $19, %%mm5 \n\t"
492  "pand %2, %%mm2 \n\t"
493  "pand %2, %%mm5 \n\t"
494  "por %%mm1, %%mm0 \n\t"
495  "por %%mm4, %%mm3 \n\t"
496  "por %%mm2, %%mm0 \n\t"
497  "por %%mm5, %%mm3 \n\t"
498  "psllq $16, %%mm3 \n\t"
499  "por %%mm3, %%mm0 \n\t"
500  MOVNTQ" %%mm0, (%0) \n\t"
501  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
502  d += 4;
503  s += 16;
504  }
505  __asm__ volatile(SFENCE:::"memory");
506  __asm__ volatile(EMMS:::"memory");
507  while (s < end) {
508  register int rgb = *(const uint32_t*)s; s += 4;
509  *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
510  }
511 }
512 
513 static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
514 {
515  const uint8_t *s = src;
516  const uint8_t *end;
517  const uint8_t *mm_end;
518  uint16_t *d = (uint16_t *)dst;
519  end = s + src_size;
520  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
521  __asm__ volatile(
522  "movq %0, %%mm7 \n\t"
523  "movq %1, %%mm6 \n\t"
524  ::"m"(red_16mask),"m"(green_16mask));
525  mm_end = end - 11;
526  while (s < mm_end) {
527  __asm__ volatile(
528  PREFETCH" 32(%1) \n\t"
529  "movd (%1), %%mm0 \n\t"
530  "movd 3(%1), %%mm3 \n\t"
531  "punpckldq 6(%1), %%mm0 \n\t"
532  "punpckldq 9(%1), %%mm3 \n\t"
533  "movq %%mm0, %%mm1 \n\t"
534  "movq %%mm0, %%mm2 \n\t"
535  "movq %%mm3, %%mm4 \n\t"
536  "movq %%mm3, %%mm5 \n\t"
537  "psrlq $3, %%mm0 \n\t"
538  "psrlq $3, %%mm3 \n\t"
539  "pand %2, %%mm0 \n\t"
540  "pand %2, %%mm3 \n\t"
541  "psrlq $5, %%mm1 \n\t"
542  "psrlq $5, %%mm4 \n\t"
543  "pand %%mm6, %%mm1 \n\t"
544  "pand %%mm6, %%mm4 \n\t"
545  "psrlq $8, %%mm2 \n\t"
546  "psrlq $8, %%mm5 \n\t"
547  "pand %%mm7, %%mm2 \n\t"
548  "pand %%mm7, %%mm5 \n\t"
549  "por %%mm1, %%mm0 \n\t"
550  "por %%mm4, %%mm3 \n\t"
551  "por %%mm2, %%mm0 \n\t"
552  "por %%mm5, %%mm3 \n\t"
553  "psllq $16, %%mm3 \n\t"
554  "por %%mm3, %%mm0 \n\t"
555  MOVNTQ" %%mm0, (%0) \n\t"
556  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
557  d += 4;
558  s += 12;
559  }
560  __asm__ volatile(SFENCE:::"memory");
561  __asm__ volatile(EMMS:::"memory");
562  while (s < end) {
563  const int b = *s++;
564  const int g = *s++;
565  const int r = *s++;
566  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
567  }
568 }
569 
570 static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
571 {
572  const uint8_t *s = src;
573  const uint8_t *end;
574  const uint8_t *mm_end;
575  uint16_t *d = (uint16_t *)dst;
576  end = s + src_size;
577  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
578  __asm__ volatile(
579  "movq %0, %%mm7 \n\t"
580  "movq %1, %%mm6 \n\t"
581  ::"m"(red_16mask),"m"(green_16mask));
582  mm_end = end - 15;
583  while (s < mm_end) {
584  __asm__ volatile(
585  PREFETCH" 32(%1) \n\t"
586  "movd (%1), %%mm0 \n\t"
587  "movd 3(%1), %%mm3 \n\t"
588  "punpckldq 6(%1), %%mm0 \n\t"
589  "punpckldq 9(%1), %%mm3 \n\t"
590  "movq %%mm0, %%mm1 \n\t"
591  "movq %%mm0, %%mm2 \n\t"
592  "movq %%mm3, %%mm4 \n\t"
593  "movq %%mm3, %%mm5 \n\t"
594  "psllq $8, %%mm0 \n\t"
595  "psllq $8, %%mm3 \n\t"
596  "pand %%mm7, %%mm0 \n\t"
597  "pand %%mm7, %%mm3 \n\t"
598  "psrlq $5, %%mm1 \n\t"
599  "psrlq $5, %%mm4 \n\t"
600  "pand %%mm6, %%mm1 \n\t"
601  "pand %%mm6, %%mm4 \n\t"
602  "psrlq $19, %%mm2 \n\t"
603  "psrlq $19, %%mm5 \n\t"
604  "pand %2, %%mm2 \n\t"
605  "pand %2, %%mm5 \n\t"
606  "por %%mm1, %%mm0 \n\t"
607  "por %%mm4, %%mm3 \n\t"
608  "por %%mm2, %%mm0 \n\t"
609  "por %%mm5, %%mm3 \n\t"
610  "psllq $16, %%mm3 \n\t"
611  "por %%mm3, %%mm0 \n\t"
612  MOVNTQ" %%mm0, (%0) \n\t"
613  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
614  d += 4;
615  s += 12;
616  }
617  __asm__ volatile(SFENCE:::"memory");
618  __asm__ volatile(EMMS:::"memory");
619  while (s < end) {
620  const int r = *s++;
621  const int g = *s++;
622  const int b = *s++;
623  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
624  }
625 }
626 
627 static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
628 {
629  const uint8_t *s = src;
630  const uint8_t *end;
631  const uint8_t *mm_end;
632  uint16_t *d = (uint16_t *)dst;
633  end = s + src_size;
634  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
635  __asm__ volatile(
636  "movq %0, %%mm7 \n\t"
637  "movq %1, %%mm6 \n\t"
638  ::"m"(red_15mask),"m"(green_15mask));
639  mm_end = end - 11;
640  while (s < mm_end) {
641  __asm__ volatile(
642  PREFETCH" 32(%1) \n\t"
643  "movd (%1), %%mm0 \n\t"
644  "movd 3(%1), %%mm3 \n\t"
645  "punpckldq 6(%1), %%mm0 \n\t"
646  "punpckldq 9(%1), %%mm3 \n\t"
647  "movq %%mm0, %%mm1 \n\t"
648  "movq %%mm0, %%mm2 \n\t"
649  "movq %%mm3, %%mm4 \n\t"
650  "movq %%mm3, %%mm5 \n\t"
651  "psrlq $3, %%mm0 \n\t"
652  "psrlq $3, %%mm3 \n\t"
653  "pand %2, %%mm0 \n\t"
654  "pand %2, %%mm3 \n\t"
655  "psrlq $6, %%mm1 \n\t"
656  "psrlq $6, %%mm4 \n\t"
657  "pand %%mm6, %%mm1 \n\t"
658  "pand %%mm6, %%mm4 \n\t"
659  "psrlq $9, %%mm2 \n\t"
660  "psrlq $9, %%mm5 \n\t"
661  "pand %%mm7, %%mm2 \n\t"
662  "pand %%mm7, %%mm5 \n\t"
663  "por %%mm1, %%mm0 \n\t"
664  "por %%mm4, %%mm3 \n\t"
665  "por %%mm2, %%mm0 \n\t"
666  "por %%mm5, %%mm3 \n\t"
667  "psllq $16, %%mm3 \n\t"
668  "por %%mm3, %%mm0 \n\t"
669  MOVNTQ" %%mm0, (%0) \n\t"
670  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
671  d += 4;
672  s += 12;
673  }
674  __asm__ volatile(SFENCE:::"memory");
675  __asm__ volatile(EMMS:::"memory");
676  while (s < end) {
677  const int b = *s++;
678  const int g = *s++;
679  const int r = *s++;
680  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
681  }
682 }
683 
684 static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
685 {
686  const uint8_t *s = src;
687  const uint8_t *end;
688  const uint8_t *mm_end;
689  uint16_t *d = (uint16_t *)dst;
690  end = s + src_size;
691  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
692  __asm__ volatile(
693  "movq %0, %%mm7 \n\t"
694  "movq %1, %%mm6 \n\t"
695  ::"m"(red_15mask),"m"(green_15mask));
696  mm_end = end - 15;
697  while (s < mm_end) {
698  __asm__ volatile(
699  PREFETCH" 32(%1) \n\t"
700  "movd (%1), %%mm0 \n\t"
701  "movd 3(%1), %%mm3 \n\t"
702  "punpckldq 6(%1), %%mm0 \n\t"
703  "punpckldq 9(%1), %%mm3 \n\t"
704  "movq %%mm0, %%mm1 \n\t"
705  "movq %%mm0, %%mm2 \n\t"
706  "movq %%mm3, %%mm4 \n\t"
707  "movq %%mm3, %%mm5 \n\t"
708  "psllq $7, %%mm0 \n\t"
709  "psllq $7, %%mm3 \n\t"
710  "pand %%mm7, %%mm0 \n\t"
711  "pand %%mm7, %%mm3 \n\t"
712  "psrlq $6, %%mm1 \n\t"
713  "psrlq $6, %%mm4 \n\t"
714  "pand %%mm6, %%mm1 \n\t"
715  "pand %%mm6, %%mm4 \n\t"
716  "psrlq $19, %%mm2 \n\t"
717  "psrlq $19, %%mm5 \n\t"
718  "pand %2, %%mm2 \n\t"
719  "pand %2, %%mm5 \n\t"
720  "por %%mm1, %%mm0 \n\t"
721  "por %%mm4, %%mm3 \n\t"
722  "por %%mm2, %%mm0 \n\t"
723  "por %%mm5, %%mm3 \n\t"
724  "psllq $16, %%mm3 \n\t"
725  "por %%mm3, %%mm0 \n\t"
726  MOVNTQ" %%mm0, (%0) \n\t"
727  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
728  d += 4;
729  s += 12;
730  }
731  __asm__ volatile(SFENCE:::"memory");
732  __asm__ volatile(EMMS:::"memory");
733  while (s < end) {
734  const int r = *s++;
735  const int g = *s++;
736  const int b = *s++;
737  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
738  }
739 }
740 
741 static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
742 {
743  const uint16_t *end;
744  const uint16_t *mm_end;
745  uint8_t *d = dst;
746  const uint16_t *s = (const uint16_t*)src;
747  end = s + src_size/2;
748  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
749  mm_end = end - 7;
750  while (s < mm_end) {
751  __asm__ volatile(
752  PREFETCH" 32(%1) \n\t"
753  "movq (%1), %%mm0 \n\t"
754  "movq (%1), %%mm1 \n\t"
755  "movq (%1), %%mm2 \n\t"
756  "pand %2, %%mm0 \n\t"
757  "pand %3, %%mm1 \n\t"
758  "pand %4, %%mm2 \n\t"
759  "psllq $5, %%mm0 \n\t"
760  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
761  "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
762  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
763  "movq %%mm0, %%mm3 \n\t"
764  "movq %%mm1, %%mm4 \n\t"
765  "movq %%mm2, %%mm5 \n\t"
766  "punpcklwd %5, %%mm0 \n\t"
767  "punpcklwd %5, %%mm1 \n\t"
768  "punpcklwd %5, %%mm2 \n\t"
769  "punpckhwd %5, %%mm3 \n\t"
770  "punpckhwd %5, %%mm4 \n\t"
771  "punpckhwd %5, %%mm5 \n\t"
772  "psllq $8, %%mm1 \n\t"
773  "psllq $16, %%mm2 \n\t"
774  "por %%mm1, %%mm0 \n\t"
775  "por %%mm2, %%mm0 \n\t"
776  "psllq $8, %%mm4 \n\t"
777  "psllq $16, %%mm5 \n\t"
778  "por %%mm4, %%mm3 \n\t"
779  "por %%mm5, %%mm3 \n\t"
780 
781  "movq %%mm0, %%mm6 \n\t"
782  "movq %%mm3, %%mm7 \n\t"
783 
784  "movq 8(%1), %%mm0 \n\t"
785  "movq 8(%1), %%mm1 \n\t"
786  "movq 8(%1), %%mm2 \n\t"
787  "pand %2, %%mm0 \n\t"
788  "pand %3, %%mm1 \n\t"
789  "pand %4, %%mm2 \n\t"
790  "psllq $5, %%mm0 \n\t"
791  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
792  "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
793  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
794  "movq %%mm0, %%mm3 \n\t"
795  "movq %%mm1, %%mm4 \n\t"
796  "movq %%mm2, %%mm5 \n\t"
797  "punpcklwd %5, %%mm0 \n\t"
798  "punpcklwd %5, %%mm1 \n\t"
799  "punpcklwd %5, %%mm2 \n\t"
800  "punpckhwd %5, %%mm3 \n\t"
801  "punpckhwd %5, %%mm4 \n\t"
802  "punpckhwd %5, %%mm5 \n\t"
803  "psllq $8, %%mm1 \n\t"
804  "psllq $16, %%mm2 \n\t"
805  "por %%mm1, %%mm0 \n\t"
806  "por %%mm2, %%mm0 \n\t"
807  "psllq $8, %%mm4 \n\t"
808  "psllq $16, %%mm5 \n\t"
809  "por %%mm4, %%mm3 \n\t"
810  "por %%mm5, %%mm3 \n\t"
811 
812  :"=m"(*d)
813  :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
814  NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
815  :"memory");
816  /* borrowed 32 to 24 */
817  __asm__ volatile(
818  "movq %%mm0, %%mm4 \n\t"
819  "movq %%mm3, %%mm5 \n\t"
820  "movq %%mm6, %%mm0 \n\t"
821  "movq %%mm7, %%mm1 \n\t"
822 
823  "movq %%mm4, %%mm6 \n\t"
824  "movq %%mm5, %%mm7 \n\t"
825  "movq %%mm0, %%mm2 \n\t"
826  "movq %%mm1, %%mm3 \n\t"
827 
828  STORE_BGR24_MMX
829 
830  :: "r"(d), "m"(*s)
831  NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
832  :"memory");
833  d += 24;
834  s += 8;
835  }
836  __asm__ volatile(SFENCE:::"memory");
837  __asm__ volatile(EMMS:::"memory");
838  while (s < end) {
839  register uint16_t bgr;
840  bgr = *s++;
841  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
842  *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
843  *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
844  }
845 }
846 
847 static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
848 {
849  const uint16_t *end;
850  const uint16_t *mm_end;
851  uint8_t *d = (uint8_t *)dst;
852  const uint16_t *s = (const uint16_t *)src;
853  end = s + src_size/2;
854  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
855  mm_end = end - 7;
856  while (s < mm_end) {
857  __asm__ volatile(
858  PREFETCH" 32(%1) \n\t"
859  "movq (%1), %%mm0 \n\t"
860  "movq (%1), %%mm1 \n\t"
861  "movq (%1), %%mm2 \n\t"
862  "pand %2, %%mm0 \n\t"
863  "pand %3, %%mm1 \n\t"
864  "pand %4, %%mm2 \n\t"
865  "psllq $5, %%mm0 \n\t"
866  "psrlq $1, %%mm2 \n\t"
867  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
868  "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
869  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
870  "movq %%mm0, %%mm3 \n\t"
871  "movq %%mm1, %%mm4 \n\t"
872  "movq %%mm2, %%mm5 \n\t"
873  "punpcklwd %5, %%mm0 \n\t"
874  "punpcklwd %5, %%mm1 \n\t"
875  "punpcklwd %5, %%mm2 \n\t"
876  "punpckhwd %5, %%mm3 \n\t"
877  "punpckhwd %5, %%mm4 \n\t"
878  "punpckhwd %5, %%mm5 \n\t"
879  "psllq $8, %%mm1 \n\t"
880  "psllq $16, %%mm2 \n\t"
881  "por %%mm1, %%mm0 \n\t"
882  "por %%mm2, %%mm0 \n\t"
883  "psllq $8, %%mm4 \n\t"
884  "psllq $16, %%mm5 \n\t"
885  "por %%mm4, %%mm3 \n\t"
886  "por %%mm5, %%mm3 \n\t"
887 
888  "movq %%mm0, %%mm6 \n\t"
889  "movq %%mm3, %%mm7 \n\t"
890 
891  "movq 8(%1), %%mm0 \n\t"
892  "movq 8(%1), %%mm1 \n\t"
893  "movq 8(%1), %%mm2 \n\t"
894  "pand %2, %%mm0 \n\t"
895  "pand %3, %%mm1 \n\t"
896  "pand %4, %%mm2 \n\t"
897  "psllq $5, %%mm0 \n\t"
898  "psrlq $1, %%mm2 \n\t"
899  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
900  "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
901  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
902  "movq %%mm0, %%mm3 \n\t"
903  "movq %%mm1, %%mm4 \n\t"
904  "movq %%mm2, %%mm5 \n\t"
905  "punpcklwd %5, %%mm0 \n\t"
906  "punpcklwd %5, %%mm1 \n\t"
907  "punpcklwd %5, %%mm2 \n\t"
908  "punpckhwd %5, %%mm3 \n\t"
909  "punpckhwd %5, %%mm4 \n\t"
910  "punpckhwd %5, %%mm5 \n\t"
911  "psllq $8, %%mm1 \n\t"
912  "psllq $16, %%mm2 \n\t"
913  "por %%mm1, %%mm0 \n\t"
914  "por %%mm2, %%mm0 \n\t"
915  "psllq $8, %%mm4 \n\t"
916  "psllq $16, %%mm5 \n\t"
917  "por %%mm4, %%mm3 \n\t"
918  "por %%mm5, %%mm3 \n\t"
919  :"=m"(*d)
920  :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
921  NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
922  :"memory");
923  /* borrowed 32 to 24 */
924  __asm__ volatile(
925  "movq %%mm0, %%mm4 \n\t"
926  "movq %%mm3, %%mm5 \n\t"
927  "movq %%mm6, %%mm0 \n\t"
928  "movq %%mm7, %%mm1 \n\t"
929 
930  "movq %%mm4, %%mm6 \n\t"
931  "movq %%mm5, %%mm7 \n\t"
932  "movq %%mm0, %%mm2 \n\t"
933  "movq %%mm1, %%mm3 \n\t"
934 
935  STORE_BGR24_MMX
936 
937  :: "r"(d), "m"(*s)
938  NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
939  :"memory");
940  d += 24;
941  s += 8;
942  }
943  __asm__ volatile(SFENCE:::"memory");
944  __asm__ volatile(EMMS:::"memory");
945  while (s < end) {
946  register uint16_t bgr;
947  bgr = *s++;
948  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
949  *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
950  *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
951  }
952 }
953 
954 /*
955  * mm0 = 00 B3 00 B2 00 B1 00 B0
956  * mm1 = 00 G3 00 G2 00 G1 00 G0
957  * mm2 = 00 R3 00 R2 00 R1 00 R0
958  * mm6 = FF FF FF FF FF FF FF FF
959  * mm7 = 00 00 00 00 00 00 00 00
960  */
961 #define PACK_RGB32 \
962  "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
963  "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
964  "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
965  "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
966  "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
967  "movq %%mm0, %%mm3 \n\t" \
968  "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
969  "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
970  MOVNTQ" %%mm0, (%0) \n\t" \
971  MOVNTQ" %%mm3, 8(%0) \n\t" \
972 
973 static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
974 {
975  const uint16_t *end;
976  const uint16_t *mm_end;
977  uint8_t *d = dst;
978  const uint16_t *s = (const uint16_t *)src;
979  end = s + src_size/2;
980  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
981  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
982  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
983  mm_end = end - 3;
984  while (s < mm_end) {
985  __asm__ volatile(
986  PREFETCH" 32(%1) \n\t"
987  "movq (%1), %%mm0 \n\t"
988  "movq (%1), %%mm1 \n\t"
989  "movq (%1), %%mm2 \n\t"
990  "pand %2, %%mm0 \n\t"
991  "pand %3, %%mm1 \n\t"
992  "pand %4, %%mm2 \n\t"
993  "psllq $5, %%mm0 \n\t"
994  "pmulhw %5, %%mm0 \n\t"
995  "pmulhw %5, %%mm1 \n\t"
996  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
997  PACK_RGB32
998  ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
999  NAMED_CONSTRAINTS_ADD(mul15_hi)
1000  :"memory");
1001  d += 16;
1002  s += 4;
1003  }
1004  __asm__ volatile(SFENCE:::"memory");
1005  __asm__ volatile(EMMS:::"memory");
1006  while (s < end) {
1007  register uint16_t bgr;
1008  bgr = *s++;
1009  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1010  *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
1011  *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
1012  *d++ = 255;
1013  }
1014 }
1015 
/**
 * Convert packed RGB565 (16 bpp) pixels to 32 bpp with the alpha byte
 * forced to 255.
 *
 * Same structure as rgb15to32_mmxext, but the green field is 6 bits
 * wide, so it is scaled with mul16_mid and the red field is pre-shifted
 * right by one before the mul15_hi multiply.  The scalar tail expands
 * 5/6/5-bit fields by replicating their top bits.
 *
 * @param src      input buffer of RGB565 pixels
 * @param dst      output buffer, 4 bytes per pixel
 * @param src_size input size in bytes
 */
static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");    // mm7 = 0, consumed by PACK_RGB32
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); // mm6 = all ones -> alpha bytes = 0xFF
    mm_end = end - 3; // leave the last <4 pixels to the scalar tail
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"  // blue field
            "pand %3, %%mm1 \n\t"  // green field (6 bits)
            "pand %4, %%mm2 \n\t"  // red field
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t" // align red for the 5-bit scale constant
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
            NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory"); // flush the non-temporal MOVNTQ stores
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
        *d++ = 255;
    }
}
1059 
/**
 * Swap the R and B channels of packed 24 bpp pixels (RGB24 <-> BGR24).
 *
 * The asm loop processes 24 bytes (8 pixels) per iteration using a
 * negative index register that counts up to zero: mmx_size starts at
 * 23 - src_size and the base pointers are biased by -mmx_size, so
 * (%1, FF_REG_a) walks the buffer while "js" loops until the counter
 * becomes non-negative.  The remaining (< 24) bytes are swapped in C.
 *
 * @param src      input 24 bpp buffer
 * @param dst      output 24 bpp buffer with R/B exchanged
 * @param src_size input size in bytes
 */
static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg mmx_size= 23 - src_size; // negative loop counter (biased by the 24-byte step minus 1)
    __asm__ volatile (
        "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
        "jns 2f \n\t" // nothing for the asm loop to do (src_size < 24)
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
        "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
        "add $24, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size) // bias so (base, counter) addresses the buffer
        NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
    );

    __asm__ volatile(SFENCE:::"memory"); // flush non-temporal stores
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    // Recompute the tail region the asm loop did not cover and swap it in C.
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (unsigned i = 0; i < src_size; i +=3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1126 
/**
 * Interleave planar Y/U/V data into packed YUYV (YUY2).
 *
 * Each iteration of the asm loop consumes 16 luma and 8 chroma samples
 * and writes 32 output bytes with non-temporal stores.
 * vertLumPerChroma is the number of luma lines sharing one chroma line
 * (2 for 4:2:0 input, 1 for 4:2:2) and must be a power of two, since
 * the advance test uses (y & (vertLumPerChroma-1)).
 * NOTE(review): width is assumed to be a multiple of 16 (see the
 * documented constraint on the yv12toyuy2 wrappers) — the loop has no
 * scalar tail.
 */
static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                          int width, int height,
                                          int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    const x86_reg chromWidth= width>>1;
    for (int y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        // Advance chroma pointers only after the last luma line of the group.
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1178 
1179 /**
1180  * Height should be a multiple of 2 and width should be a multiple of 16.
1181  * (If this is a problem for anyone then tell me, and I will fix it.)
1182  */
1183 static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1184  int width, int height,
1185  int lumStride, int chromStride, int dstStride)
1186 {
1187  //FIXME interpolate chroma
1188  yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1189 }
1190 
/**
 * Interleave planar Y/U/V data into packed UYVY.
 *
 * Mirror image of yuvPlanartoyuy2_mmxext: the chroma words end up in
 * the low bytes of each output pair (U Y V Y ordering).  Each asm
 * iteration consumes 16 luma and 8 chroma samples and writes 32 bytes
 * with non-temporal stores.  vertLumPerChroma (power of two) is the
 * number of luma lines sharing one chroma line: 2 for 4:2:0, 1 for
 * 4:2:2.  NOTE(review): width is assumed to be a multiple of 16 — no
 * scalar tail exists.
 */
static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                          int width, int height,
                                          int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    const x86_reg chromWidth= width>>1;
    for (int y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        // Advance chroma pointers only after the last luma line of the group.
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
1242 
1243 /**
1244  * Height should be a multiple of 2 and width should be a multiple of 16
1245  * (If this is a problem for anyone then tell me, and I will fix it.)
1246  */
1247 static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1248  int width, int height,
1249  int lumStride, int chromStride, int dstStride)
1250 {
1251  //FIXME interpolate chroma
1252  yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1253 }
1254 
1255 /**
1256  * Width should be a multiple of 16.
1257  */
1258 static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1259  int width, int height,
1260  int lumStride, int chromStride, int dstStride)
1261 {
1262  yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1263 }
1264 
1265 /**
1266  * Width should be a multiple of 16.
1267  */
1268 static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1269  int width, int height,
1270  int lumStride, int chromStride, int dstStride)
1271 {
1272  yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1273 }
1274 
/**
 * Upscale one plane by a factor of 2 in both dimensions using bilinear
 * 3:1 / 1:3 weighting.
 *
 * The first and last output rows are produced by horizontal-only
 * interpolation of the first/last source rows.  Interior row pairs are
 * produced from two adjacent source rows; when srcWidth is at least 16
 * the bulk is done by the PAVGB-based asm loop (which approximates the
 * 3:1 blend by averaging twice) and the C loop below it finishes the
 * remaining columns, otherwise the C loop does everything from x = 0.
 */
static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    dst[0]= src[0];

    // first line: horizontal interpolation only
    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (int y = 1; y < srcHeight; y++) {
        x86_reg mmxSize= srcWidth&~15; // columns handled by the asm loop (multiple of 16)

        if (mmxSize) {
            __asm__ volatile(
                "mov %4, %%"FF_REG_a" \n\t" // negative column counter
                "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
                // Prime mm4/mm5 with the previous-pixel bytes for column 0
                // (the left edge replicates the first pixel).
                "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq %%mm4, %%mm2 \n\t"
                "psllq $8, %%mm4 \n\t"
                "pand %%mm0, %%mm2 \n\t"
                "por %%mm2, %%mm4 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
                "movq %%mm5, %%mm3 \n\t"
                "psllq $8, %%mm5 \n\t"
                "pand %%mm0, %%mm3 \n\t"
                "por %%mm3, %%mm5 \n\t"
                "1: \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
                "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t"
                "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t"
                // Double PAVGB approximates the (3*a + b) >> 2 blend.
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                "movq %%mm5, %%mm7 \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "punpcklbw %%mm3, %%mm5 \n\t"
                "punpckhbw %%mm3, %%mm7 \n\t"
                "punpcklbw %%mm2, %%mm4 \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t"
                MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
                " js 1b \n\t"
                :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                   "g" (-mmxSize)
                   NAMED_CONSTRAINTS_ADD(mmx_ff)
                : "%"FF_REG_a
            );
        } else {
            // Narrow plane: handle column 0 here and let the C loop
            // below start from x = 0 (mmxSize - 1).
            mmxSize = 1;
            dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
        }

        // Finish remaining columns with exact 3:1 bilinear weights.
        for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
            dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
            dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
        }
        // right edge: vertical interpolation of the last source column
        dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line: horizontal interpolation only
    dst[0]= src[0];

    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
1370 
1371 /**
1372  * Height should be a multiple of 2 and width should be a multiple of 2.
1373  * (If this is a problem for anyone then tell me, and I will fix it.)
1374  * Chrominance data is only taken from every second line,
1375  * others are ignored in the C version.
1376  * FIXME: Write HQ version.
1377  */
1378 #if ARCH_X86_32 && HAVE_7REGS
DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset)  = 0x1010101010101010ULL; // +16 per byte: limited-range luma offset
DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL; // +128 per byte: chroma bias
DECLARE_ASM_CONST(8, uint64_t, w1111)        = 0x0001000100010001ULL; // four words of 1: horizontal add via pmaddwd
1382 
/**
 * Convert packed 24 bpp RGB data to planar YV12 using the coefficient
 * table in rgb2yuv.
 *
 * The BGR2{Y,U,V}_IDX strings are byte-offset expressions pasted into
 * the asm addressing mode; they select the Y/U/V coefficient rows
 * inside the rgb2yuv table (NOTE(review): the layout must match
 * swscale's input table — confirm against the table definition).
 *
 * Luma is computed per line (4 pixels per pmaddwd group, 8 per store);
 * chroma is computed once per line pair, averaging the two lines and
 * horizontally adjacent pixels with PAVGB before applying the U/V
 * coefficients.  The first two lines (when height > 2) and any
 * remaining lines at the bottom are delegated to the C fallback
 * ff_rgb24toyv12_c.
 */
static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride,
                                      const int32_t *rgb2yuv)
{
// byte offsets of the coefficient rows inside rgb2yuv (string-pasted into asm)
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    int y;
    const x86_reg chromWidth= width>>1;

    // Handle the first two lines in C (NOTE(review): presumably to keep
    // the asm loop's addressing within bounds — confirm upstream rationale).
    if (height > 2) {
        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
        src += 2*srcStride;
        ydst += 2*lumStride;
        udst += chromStride;
        vdst += chromStride;
        height -= 2;
    }

    for (y = 0; y < height - 2; y += 2) {
        // Luma pass: two consecutive lines, 8 Y samples per iteration.
        for (int i = 0; i < 2; i++) {
            __asm__ volatile(
                "mov %2, %%"FF_REG_a"\n\t"        // negative pixel counter
                "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" // Y coefficients
                "movq "MANGLE(w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" // byte index = 3*pixels
                ".p2align 4 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
                "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
                "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t" // horizontal add of partial dot products
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
                "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"FF_REG_d"\n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" // +16 luma offset

                MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset)
                : "%"FF_REG_a, "%"FF_REG_d
            );
            ydst += lumStride;
            src += srcStride;
        }
        src -= srcStride*2; // rewind: the chroma pass re-reads both lines
        // Chroma pass: average the two lines and adjacent pixels, then
        // apply the U (mm6) and V (mm1/mm3) coefficient rows.
        __asm__ volatile(
            "mov %4, %%"FF_REG_a"\n\t"         // negative chroma-pixel counter
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "BGR2U_IDX"(%5), %%mm6 \n\t" // U coefficients
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
            "add %%"FF_REG_d", %%"FF_REG_d"\n\t" // byte index = 6*chroma pixels
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
            PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
            "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
            "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t" // average the two lines
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t" // shift to the neighbouring pixel
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t" // average horizontally adjacent pixels
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t" // V coefficients
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

            "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"FF_REG_d"\n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t" // gather the U samples
            "punpckhdq %%mm4, %%mm1 \n\t" // gather the V samples
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" // +128 chroma bias
            "movd %%mm0, (%2, %%"FF_REG_a") \n\t" // store 4 U
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"FF_REG_a") \n\t" // store 4 V
            "add $4, %%"FF_REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
            NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset)
            : "%"FF_REG_a, "%"FF_REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    // Remaining bottom lines are handled by the C reference implementation.
    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
1577 #endif /* HAVE_7REGS */
1578 
/**
 * Copy every even-indexed byte of src to dst (dst[i] = src[2*i]),
 * e.g. the Y samples of a YUYV line.
 *
 * Pointers are advanced to the buffer ends and count negated so the asm
 * loop can run a negative index up to zero; count is biased by +15 so
 * the 16-samples-per-iteration loop stops before overrunning, and the
 * C loop finishes the remainder.
 */
static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15; // bias: asm handles 16 samples per step, leave tail to C
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // mm7 = 0x00FF per word (even-byte mask)
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15; // undo the bias for the scalar tail
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
1615 
/**
 * Copy every odd-indexed byte of src to dst (dst[i] = src[2*i+1]),
 * e.g. the Y samples of a UYVY line.
 *
 * Implemented as extract_even on src+1; note the different guard
 * (count < -16, bias +16) because the pre-incremented source would
 * otherwise be read one byte past the start of the first group.
 */
static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    src ++; // shift to odd bytes, then extract "even" of the shifted stream
    dst += count;
    src += 2*count;
    count= - count;

    if(count < -16) {
        count += 16;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // mm7 = 0x00FF per word
            "1: \n\t"
            "movq -32(%1, %0, 2), %%mm0 \n\t"
            "movq -24(%1, %0, 2), %%mm1 \n\t"
            "movq -16(%1, %0, 2), %%mm2 \n\t"
            "movq -8(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-16(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 16; // undo the bias for the scalar tail
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
1653 
1654 #if ARCH_X86_32
/**
 * De-interleave the even-indexed sample pairs of a 4-byte-grouped
 * stream: dst0[i] = src[4*i], dst1[i] = src[4*i+2] (e.g. U and V of a
 * UYVY line).  Negative-index loop, 8 output samples per iteration,
 * count biased by +7 for the asm part; remainder done in C.
 */
static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7; // bias: asm does 8 samples per step
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // mm7 = 0x00FF per word
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t" // keep even bytes of each pair
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" // odd of the packed pairs -> dst1 stream
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t" // even of the packed pairs -> dst0 stream
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7; // undo the bias for the scalar tail
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
1700 #endif /* ARCH_X86_32 */
1701 
/**
 * Like extract_even2, but averaging two input lines first:
 * dst0[i] = avg(src0[4*i],   src1[4*i]),
 * dst1[i] = avg(src0[4*i+2], src1[4*i+2]).
 * Used to downsample UYVY chroma vertically for 4:2:0 output.
 * The asm path (PAVGB rounds up; the C tail truncates —
 * NOTE(review): the two paths differ by at most 1 LSB, as upstream)
 * is only compiled when PAVGB is available.
 */
static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7; // bias: asm does 8 samples per step
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // mm7 = 0x00FF per word
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" // average with the second line
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7; // undo the bias for the scalar tail
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
1754 
/**
 * De-interleave the odd-indexed sample pairs of a 4-byte-grouped
 * stream: dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] (e.g. U and V of
 * a YUYV line).  The asm selects the odd bytes via psrlw $8 instead of
 * a mask; src is incremented only afterwards, for the scalar tail.
 */
static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7; // bias: asm does 8 samples per step
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // mm7 = 0x00FF per word
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" // keep odd bytes of each pair
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7; // undo the bias
    }
    src++; // shift to odd bytes for the scalar tail
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
1801 
/**
 * Like extract_odd2, but averaging two input lines first:
 * dst0[i] = avg(src0[4*i+1], src1[4*i+1]),
 * dst1[i] = avg(src0[4*i+3], src1[4*i+3]).
 * Used to downsample YUYV chroma vertically for 4:2:0 output.
 * Asm path (PAVGB, rounds up) only when PAVGB is available; the C tail
 * truncates (NOTE(review): at most 1 LSB difference, as upstream).
 */
static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7; // bias: asm does 8 samples per step
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // mm7 = 0x00FF per word
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t" // average with the second line
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t" // keep odd bytes of each pair
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7; // undo the bias
    }
#endif
    src0++; // shift to odd bytes for the scalar tail
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
1856 
1857 static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
1858  int width, int height,
1859  int lumStride, int chromStride, int srcStride)
1860 {
1861  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
1862 
1863  for (int y = 0; y < height; y++) {
1864  extract_even_mmxext(src, ydst, width);
1865  if(y&1) {
1866  extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
1867  udst+= chromStride;
1868  vdst+= chromStride;
1869  }
1870 
1871  src += srcStride;
1872  ydst+= lumStride;
1873  }
1874  __asm__(
1875  EMMS" \n\t"
1876  SFENCE" \n\t"
1877  ::: "memory"
1878  );
1879 }
1880 
1881 static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
1882  int width, int height,
1883  int lumStride, int chromStride, int srcStride)
1884 {
1885  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
1886 
1887  for (int y = 0; y < height; y++) {
1888  extract_even_mmxext(src, ydst, width);
1889  extract_odd2_mmxext(src, udst, vdst, chromWidth);
1890 
1891  src += srcStride;
1892  ydst+= lumStride;
1893  udst+= chromStride;
1894  vdst+= chromStride;
1895  }
1896  __asm__(
1897  EMMS" \n\t"
1898  SFENCE" \n\t"
1899  ::: "memory"
1900  );
1901 }
1902 
1903 static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
1904  int width, int height,
1905  int lumStride, int chromStride, int srcStride)
1906 {
1907  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
1908 
1909  for (int y = 0; y < height; y++) {
1910  extract_odd_mmxext(src, ydst, width);
1911  if(y&1) {
1912  extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
1913  udst+= chromStride;
1914  vdst+= chromStride;
1915  }
1916 
1917  src += srcStride;
1918  ydst+= lumStride;
1919  }
1920  __asm__(
1921  EMMS" \n\t"
1922  SFENCE" \n\t"
1923  ::: "memory"
1924  );
1925 }
1926 
1927 #if ARCH_X86_32
1928 static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
1929  int width, int height,
1930  int lumStride, int chromStride, int srcStride)
1931 {
1932  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
1933 
1934  for (int y = 0; y < height; y++) {
1935  extract_odd_mmxext(src, ydst, width);
1936  extract_even2_mmxext(src, udst, vdst, chromWidth);
1937 
1938  src += srcStride;
1939  ydst+= lumStride;
1940  udst+= chromStride;
1941  vdst+= chromStride;
1942  }
1943  __asm__(
1944  EMMS" \n\t"
1945  SFENCE" \n\t"
1946  ::: "memory"
1947  );
1948 }
1949 #endif /* ARCH_X86_32 */
1950 
1951 static av_cold void rgb2rgb_init_mmxext(void)
1952 {
1953  rgb15to16 = rgb15to16_mmxext;
1954  rgb15tobgr24 = rgb15tobgr24_mmxext;
1955  rgb15to32 = rgb15to32_mmxext;
1956  rgb16tobgr24 = rgb16tobgr24_mmxext;
1957  rgb16to32 = rgb16to32_mmxext;
1958  rgb16to15 = rgb16to15_mmxext;
1959  rgb24tobgr16 = rgb24tobgr16_mmxext;
1960  rgb24tobgr15 = rgb24tobgr15_mmxext;
1961  rgb24tobgr32 = rgb24tobgr32_mmxext;
1962  rgb32to16 = rgb32to16_mmxext;
1963  rgb32to15 = rgb32to15_mmxext;
1964  rgb32tobgr24 = rgb32tobgr24_mmxext;
1965  rgb24to15 = rgb24to15_mmxext;
1966  rgb24to16 = rgb24to16_mmxext;
1967  rgb24tobgr24 = rgb24tobgr24_mmxext;
1968  rgb32tobgr16 = rgb32tobgr16_mmxext;
1969  rgb32tobgr15 = rgb32tobgr15_mmxext;
1970  yv12toyuy2 = yv12toyuy2_mmxext;
1971  yv12touyvy = yv12touyvy_mmxext;
1972  yuv422ptoyuy2 = yuv422ptoyuy2_mmxext;
1973  yuv422ptouyvy = yuv422ptouyvy_mmxext;
1974 #if ARCH_X86_32
1975  uyvytoyuv422 = uyvytoyuv422_mmxext;
1976 #endif
1977  yuyvtoyuv422 = yuyvtoyuv422_mmxext;
1978 
1979  planar2x = planar2x_mmxext;
1980 #if ARCH_X86_32 && HAVE_7REGS
1981  ff_rgb24toyv12 = rgb24toyv12_mmxext;
1982 #endif /* ARCH_X86_32 && HAVE_7REGS */
1983 
1984  yuyvtoyuv420 = yuyvtoyuv420_mmxext;
1985  uyvytoyuv420 = uyvytoyuv420_mmxext;
1986 }
1987 
1988 //SSE2 versions
//SSE2 versions
/**
 * Interleave two byte planes into one:
 * dest[2*w] = src1[w], dest[2*w+1] = src2[w] for every line.
 *
 * When all three pointers are 16-byte aligned the SSE2 path processes
 * 16 bytes of each source per iteration with aligned loads and
 * non-temporal stores; otherwise an MMX path handles 16 bytes per
 * iteration.  The scalar loop finishes the last width % 16 columns
 * (and does everything when width < 16).
 */
static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                  int width, int height, int src1Stride,
                                  int src2Stride, int dstStride)
{
    for (int h = 0; h < height; h++) {
        if (width >= 16) {
            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
                __asm__(
                    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
                    PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
                    // src1 data is loaded twice: both halves are needed
                    // for the low/high unpacks against src2.
                    "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
                    "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
                    "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
                    "punpcklbw %%xmm2, %%xmm0 \n\t"
                    "punpckhbw %%xmm2, %%xmm1 \n\t"
                    "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
                    "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
                    "add $16, %%"FF_REG_a" \n\t"
                    "cmp %3, %%"FF_REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
                );
            } else
                // Unaligned fallback: MMX, still 16 bytes per iteration.
                __asm__(
                    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
                    PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
                    "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
                    "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
                    "movq %%mm0, %%mm1 \n\t"
                    "movq %%mm2, %%mm3 \n\t"
                    "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
                    "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
                    "punpcklbw %%mm4, %%mm0 \n\t"
                    "punpckhbw %%mm4, %%mm1 \n\t"
                    "punpcklbw %%mm5, %%mm2 \n\t"
                    "punpckhbw %%mm5, %%mm3 \n\t"
                    MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
                    MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
                    "add $16, %%"FF_REG_a" \n\t"
                    "cmp %3, %%"FF_REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", "%"FF_REG_a
                );

        }
        // Scalar tail for the last width % 16 columns.
        for (int w = (width & (~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
2056 
2057 /*
2058  RGB15->RGB16 original by Strepto/Astral
2059  ported to gcc & bugfixed : A'rpi
2060  MMXEXT, 3DNOW optimization by Nick Kurshev
2061  32-bit C version, and and&add trick by Michael Niedermayer
2062 */
2063 
2064 #endif /* HAVE_INLINE_ASM */
2065 
2066 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2067 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2068 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2069 void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2070 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2071 void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2072 void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2073 void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2074 void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2075 
2076 #if ARCH_X86_64
2077 void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2078 void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2079 void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2080 void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2081 void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2082 void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2083 void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2084 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2085 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2086 
2087 void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2088 void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2089 void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2090 void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2091 void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2092 void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2093 void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2094 void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2095 void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2096 
2097 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2098  const uint8_t *src, int width, int height,
2099  int lumStride, int chromStride, int srcStride);
2100 void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2101  const uint8_t *src, int width, int height,
2102  int lumStride, int chromStride, int srcStride);
2103 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2104  const uint8_t *src, int width, int height,
2105  int lumStride, int chromStride, int srcStride);
2106 void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2107  const uint8_t *src, int width, int height,
2108  int lumStride, int chromStride, int srcStride);
2109 #endif
2110 
/**
 * Declare the external asm helper ff_nv12ToUV_<cpuext>() and define a
 * static deinterleave_bytes_<cpuext>() wrapper around it.
 *
 * The wrapper splits an interleaved source (dst1[i] = src[2*i],
 * dst2[i] = src[2*i+1]) row by row: the asm helper covers the first
 * width & ~15 pixels (it is only safe for >= 16), and a scalar loop
 * handles the tail.  Strides are in bytes and applied per row.
 */
#define DEINTERLEAVE_BYTES(cpuext) \
void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \
                            const uint8_t *unused, \
                            const uint8_t *src1, \
                            const uint8_t *src2, \
                            int w, \
                            uint32_t *unused2, \
                            void *opq); \
static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
                                          int width, int height, int srcStride, \
                                          int dst1Stride, int dst2Stride) \
{ \
    for (int y = 0; y < height; y++) { \
        /* SIMD path for the first width & ~15 pixels of the row */ \
        if (width >= 16) \
            ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \
        /* scalar tail (whole row when width < 16) */ \
        for (int x = width & ~15; x < width; x++) { \
            dst1[x] = src[2*x+0]; \
            dst2[x] = src[2*x+1]; \
        } \
        src  += srcStride; \
        dst1 += dst1Stride; \
        dst2 += dst2Stride; \
    } \
}
2135 
2136 #if HAVE_SSE2_EXTERNAL
2137 DEINTERLEAVE_BYTES(sse2)
2138 #endif
2139 #if HAVE_AVX_EXTERNAL
2140 DEINTERLEAVE_BYTES(avx)
2141 #endif
2142 
2144 {
2145  int cpu_flags = av_get_cpu_flags();
2146 
2147 #if HAVE_INLINE_ASM
2148  if (INLINE_MMXEXT(cpu_flags))
2149  rgb2rgb_init_mmxext();
2150  if (INLINE_SSE2(cpu_flags))
2151  interleaveBytes = interleave_bytes_sse2;
2152 #endif /* HAVE_INLINE_ASM */
2153 
2154 #if HAVE_SSE2_EXTERNAL
2155  if (EXTERNAL_SSE2(cpu_flags)) {
2156 #if ARCH_X86_64
2157  uyvytoyuv422 = ff_uyvytoyuv422_sse2;
2158 #endif
2159  deinterleaveBytes = deinterleave_bytes_sse2;
2160  }
2161 #endif
2162  if (EXTERNAL_SSSE3(cpu_flags)) {
2172  }
2173 #if HAVE_AVX_EXTERNAL
2174  if (EXTERNAL_AVX(cpu_flags)) {
2175  deinterleaveBytes = deinterleave_bytes_avx;
2176 #if ARCH_X86_64
2177  uyvytoyuv422 = ff_uyvytoyuv422_avx;
2178  }
2180  shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
2181  shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
2182  shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
2183  shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
2184  shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
2185  shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2;
2186  shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2;
2187  shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
2188  shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
2189  }
2191  shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
2192  shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
2193  shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
2194  shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
2195  shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
2196  shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
2197  shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
2198  shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
2199  shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
2200  }
2202  uyvytoyuv422 = ff_uyvytoyuv422_avx2;
2203  }
2205  uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
2206 #endif
2207  }
2208 #endif
2209 }
rgb32tobgr24
void(* rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:35
shuffle_bytes_3012
void(* shuffle_bytes_3012)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:57
cpu.h
r
const char * r
Definition: vf_curves.c:127
mem_internal.h
yv12toyuy2
void(* yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Definition: rgb2rgb.c:65
ff_shuffle_bytes_3210_ssse3
void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
src1
const pixel * src1
Definition: h264pred_template.c:420
DEINTERLEAVE_BYTES
#define DEINTERLEAVE_BYTES(cpuext)
Definition: rgb2rgb.c:2111
x86_reg
int x86_reg
Definition: asm.h:71
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:73
b
#define b
Definition: input.c:42
shuffle_bytes_3210
void(* shuffle_bytes_3210)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:58
rgb2yuv
static const char rgb2yuv[]
Definition: vf_scale_vulkan.c:86
rgb32tobgr16
void(* rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:36
yuyvtoyuv422
void(* yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:103
rgb24tobgr16
void(* rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:40
rgb15to32
void(* rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:52
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
yv12touyvy
void(* yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Definition: rgb2rgb.c:69
rgb32to16
void(* rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:45
rgb
Definition: rpzaenc.c:60
shuffle_bytes_2130
void(* shuffle_bytes_2130)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:61
MANGLE
#define MANGLE(a)
Definition: asm.h:126
ff_shuffle_bytes_0321_ssse3
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
ff_shuffle_bytes_2013_ssse3
void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
av_cold
#define av_cold
Definition: attributes.h:111
rgb16tobgr24
void(* rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:42
ff_shuffle_bytes_3012_ssse3
void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:144
s
#define s(width, name)
Definition: cbs_vp9.c:198
AV_CEIL_RSHIFT
#define AV_CEIL_RSHIFT(a, b)
Definition: common.h:60
INLINE_SSE2
#define INLINE_SSE2(flags)
Definition: cpu.h:83
g
const char * g
Definition: vf_curves.c:128
shuffle_bytes_1230
void(* shuffle_bytes_1230)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:56
rgb15tobgr24
void(* rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:43
yuv422ptoyuy2
void(* yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
Definition: rgb2rgb.c:73
shuffle_bytes_2103
void(* shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:55
rgb32tobgr15
void(* rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:37
interleaveBytes
void(* interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, int width, int height, int src1Stride, int src2Stride, int dstStride)
Definition: rgb2rgb.c:88
shuffle_bytes_3102
void(* shuffle_bytes_3102)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:59
rgb16to15
void(* rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:50
asm.h
yuyvtoyuv420
void(* yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:100
rgb24tobgr32
void(* rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:38
height
#define height
Definition: dsp.h:89
ff_shuffle_bytes_3102_ssse3
void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
cpu.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
rgb2rgb_init_x86
av_cold void rgb2rgb_init_x86(void)
Definition: rgb2rgb.c:2143
shuffle_bytes_0321
void(* shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:54
attributes.h
rgb24to16
void(* rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:47
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:53
uyvytoyuv422
void(* uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:97
ff_rgb24toyv12
void(* ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, const int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
Definition: rgb2rgb.c:81
ff_shuffle_bytes_2130_ssse3
void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
rgb24to15
void(* rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:48
src2
const pixel * src2
Definition: h264pred_template.c:421
DECLARE_ASM_CONST
DECLARE_ASM_CONST(16, double, pd_1)[2]
rgb32to15
void(* rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:46
swscale_internal.h
PREFETCH
#define PREFETCH
Definition: hscale_fast_bilinear_simd.c:28
ff_shuffle_bytes_1203_ssse3
void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
bswap.h
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:64
deinterleaveBytes
void(* deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, int dst1Stride, int dst2Stride)
Definition: rgb2rgb.c:91
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:97
uyvytoyuv420
void(* uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:94
EXTERNAL_AVX512ICL
#define EXTERNAL_AVX512ICL(flags)
Definition: cpu.h:78
rgb16to32
void(* rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:49
rgb24tobgr15
void(* rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:41
shuffle_bytes_2013
void(* shuffle_bytes_2013)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:60
MOVNTQ
#define MOVNTQ(a, b)
Definition: swscale_template.c:31
rgb15to16
void(* rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:51
shuffle_bytes_1203
void(* shuffle_bytes_1203)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:62
yuv422ptouyvy
void(* yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
Definition: rgb2rgb.c:77
INLINE_MMXEXT
#define INLINE_MMXEXT(flags)
Definition: cpu.h:81
rgb24tobgr24
void(* rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:39
src0
const pixel *const src0
Definition: h264pred_template.c:419
ff_shuffle_bytes_1230_ssse3
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
w
uint8_t w
Definition: llvidencdsp.c:39
ff_shuffle_bytes_2103_ssse3
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
int32_t
int32_t
Definition: audioconvert.c:56
ff_rgb24toyv12_c
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, const int32_t *rgb2yuv)
width should be a multiple of 2.
Definition: rgb2rgb_template.c:580
planar2x
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
Definition: rgb2rgb.c:86
h
h
Definition: vp9dsp_template.c:2070
width
#define width
Definition: dsp.h:89
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:59
rgb2rgb.h
src
#define src
Definition: vp8dsp.c:248
swscale.h