FFmpeg
rgb2rgb_template.c
Go to the documentation of this file.
1 /*
2  * software RGB to RGB converter
3  * pluralize by software PAL8 to RGB converter
4  * software YUV to YUV converter
5  * software YUV to RGB converter
6  * Written by Nick Kurshev.
7  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8  * lot of big-endian byte order fixes by Alex Beregszaszi
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 #include "libavutil/attributes.h"
31 #include "libavutil/x86/asm.h"
32 
/*
 * Instruction-name macros used inside the inline-asm strings of this file.
 * They are #undef'd first, presumably because this template can be included
 * more than once with different instruction sets -- confirm against the
 * including file.
 */
33 #undef PREFETCH
34 #undef MOVNTQ
35 #undef EMMS
36 #undef SFENCE
37 #undef PAVGB
38 
/* Each macro is a string literal that is concatenated into an asm template. */
39 #define PREFETCH "prefetchnta"
40 #define PAVGB "pavgb"
41 #define MOVNTQ "movntq"
42 #define SFENCE "sfence"
43 
44 #define EMMS "emms"
45 
46 #if !COMPILE_TEMPLATE_SSE2
47 
/*
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * The three colour bytes of every pixel are copied in their original order
 * and a fourth byte is appended: 255 in the scalar tail, and whatever
 * mask32a ORs in for the MMX path (presumably 0xFF in the top byte of each
 * dword -- confirm against the definition of mask32a).
 *
 * src       packed 3-byte source pixels
 * dst       destination, receives 4 bytes per source pixel
 * src_size  size of the source in bytes
 */
48 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
49 {
50  uint8_t *dest = dst;
51  const uint8_t *s = src;
52  const uint8_t *end;
53  const uint8_t *mm_end;
54  end = s + src_size;
55  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
 /* The MMX loop runs only while at least 24 source bytes (8 pixels) remain. */
56  mm_end = end - 23;
57  __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
58  while (s < mm_end) {
 /*
  * Each movd/punpckldq pair gathers two pixels into one MMX register by
  * loading 4 bytes at a 3-byte stride; the surplus byte of each load is
  * overwritten by the OR with mm7.
  * NOTE(review): the last load (offset 21) is 4 bytes wide and therefore
  * touches s[24], one byte beyond the 24 guaranteed by mm_end -- callers
  * presumably provide padding; confirm.
  */
59  __asm__ volatile(
60  PREFETCH" 32(%1) \n\t"
61  "movd (%1), %%mm0 \n\t"
62  "punpckldq 3(%1), %%mm0 \n\t"
63  "movd 6(%1), %%mm1 \n\t"
64  "punpckldq 9(%1), %%mm1 \n\t"
65  "movd 12(%1), %%mm2 \n\t"
66  "punpckldq 15(%1), %%mm2 \n\t"
67  "movd 18(%1), %%mm3 \n\t"
68  "punpckldq 21(%1), %%mm3 \n\t"
69  "por %%mm7, %%mm0 \n\t"
70  "por %%mm7, %%mm1 \n\t"
71  "por %%mm7, %%mm2 \n\t"
72  "por %%mm7, %%mm3 \n\t"
73  MOVNTQ" %%mm0, (%0) \n\t"
74  MOVNTQ" %%mm1, 8(%0) \n\t"
75  MOVNTQ" %%mm2, 16(%0) \n\t"
76  MOVNTQ" %%mm3, 24(%0)"
77  :: "r"(dest), "r"(s)
78  :"memory");
 /* 24 source bytes consumed, 32 destination bytes produced. */
79  dest += 32;
80  s += 24;
81  }
 /* Order the MOVNTQ stores and leave MMX state before running scalar code. */
82  __asm__ volatile(SFENCE:::"memory");
83  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
84  while (s < end) {
85  *dest++ = *s++;
86  *dest++ = *s++;
87  *dest++ = *s++;
88  *dest++ = 255;
89  }
90 }
91 
/*
 * Asm fragment that compacts four MMX registers into 24 bytes and stores
 * them at the address held in asm operand %0.
 *
 * Register usage, as read from the instructions below:
 *   - mm0, mm1, mm4, mm5 are ANDed with mask24l;
 *   - mm2, mm3, mm6, mm7 are shifted right by 8 bits and ANDed with mask24h;
 *     NOTE(review): these look like they are expected to be copies of
 *     mm0, mm1, mm4, mm5 respectively -- confirm at the use sites;
 *   - each pair is ORed together into mm0, mm1, mm4, mm5.
 * The second half then packs the results back to back:
 *   mm0 |= mm1 << 48;  mm1 = (mm1 >> 16) | (mm4 << 32);
 *   mm4 = (mm4 >> 32) | (mm5 << 16);
 * which is consistent with every register carrying 6 valid bytes.
 * mm0, mm1 and mm4 are finally written with MOVNTQ to %0, %0+8 and %0+16.
 *
 * The fragment has no trailing "\n\t", so it has to be the last piece of
 * the asm template it is pasted into.
 */
92 #define STORE_BGR24_MMX \
93  "psrlq $8, %%mm2 \n\t" \
94  "psrlq $8, %%mm3 \n\t" \
95  "psrlq $8, %%mm6 \n\t" \
96  "psrlq $8, %%mm7 \n\t" \
97  "pand "MANGLE(mask24l)", %%mm0\n\t" \
98  "pand "MANGLE(mask24l)", %%mm1\n\t" \
99  "pand "MANGLE(mask24l)", %%mm4\n\t" \
100  "pand "MANGLE(mask24l)", %%mm5\n\t" \
101  "pand "MANGLE(mask24h)", %%mm2\n\t" \
102  "pand "MANGLE(mask24h)", %%mm3\n\t" \
103  "pand "MANGLE(mask24h)", %%mm6\n\t" \
104  "pand "MANGLE(mask24h)", %%mm7\n\t" \
105  "por %%mm2, %%mm0 \n\t" \
106  "por %%mm3, %%mm1 \n\t" \
107  "por %%mm6, %%mm4 \n\t" \
108  "por %%mm7, %%mm5 \n\t" \
109  \
110  "movq %%mm1, %%mm2 \n\t" \
111  "movq %%mm4, %%mm3 \n\t" \
112  "psllq $48, %%mm2 \n\t" \
113  "psllq $32, %%mm3 \n\t" \
114  "por %%mm2, %%mm0 \n\t" \
115  "psrlq $16, %%mm1 \n\t" \
116  "psrlq $32, %%mm4 \n\t" \
117  "psllq $16, %%mm5 \n\t" \
118  "por %%mm3, %%mm1 \n\t" \
119  "por %%mm5, %%mm4 \n\t" \
120  \
121  MOVNTQ" %%mm0, (%0) \n\t" \
122  MOVNTQ" %%mm1, 8(%0) \n\t" \
123  MOVNTQ" %%mm4, 16(%0)"
124 
125 
/*
 * Reduce packed 4-byte pixels to 3-byte pixels by dropping the fourth byte
 * of every pixel; the remaining three bytes keep their order.
 *
 * src       packed 4-byte source pixels
 * dst       destination, receives 3 bytes per source pixel
 * src_size  size of the source in bytes
 *
 * The MMX loop converts 32 source bytes (8 pixels) into 24 destination
 * bytes per iteration; the scalar loop handles whatever is left.
 */
126 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
127 {
128  uint8_t *dest = dst;
129  const uint8_t *s = src;
130  const uint8_t *end;
131  const uint8_t *mm_end;
132  end = s + src_size;
133  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
 /* The MMX loop needs a full 32 bytes of input per iteration. */
134  mm_end = end - 31;
135  while (s < mm_end) {
 /*
  * Load four quadwords into mm0/mm1/mm4/mm5 and duplicate them into
  * mm2/mm3/mm6/mm7, then let STORE_BGR24_MMX compact them and write the
  * 24 result bytes to dest (operand %0). Without the store fragment the
  * loop would advance dest without ever writing to it.
  */
136  __asm__ volatile(
137  PREFETCH" 32(%1) \n\t"
138  "movq (%1), %%mm0 \n\t"
139  "movq 8(%1), %%mm1 \n\t"
140  "movq 16(%1), %%mm4 \n\t"
141  "movq 24(%1), %%mm5 \n\t"
142  "movq %%mm0, %%mm2 \n\t"
143  "movq %%mm1, %%mm3 \n\t"
144  "movq %%mm4, %%mm6 \n\t"
145  "movq %%mm5, %%mm7 \n\t"
146  STORE_BGR24_MMX
147  :: "r"(dest), "r"(s)
148  NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
149  :"memory");
150  dest += 24;
151  s += 32;
152  }
 /* Order the MOVNTQ stores and leave MMX state before running scalar code. */
153  __asm__ volatile(SFENCE:::"memory");
154  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: copy three bytes, skip the fourth. */
155  while (s < end) {
156  *dest++ = *s++;
157  *dest++ = *s++;
158  *dest++ = *s++;
159  s++;
160  }
161 }
162 
163 /*
164  original by Strepto/Astral
165  ported to gcc & bugfixed: A'rpi
166  MMXEXT, 3DNOW optimization by Nick Kurshev
167  32-bit C version, and and&add trick by Michael Niedermayer
168 */
/*
 * Convert 16-bit words by moving bits 5..14 up one position while keeping
 * bits 0..4 in place (see the scalar expression below: adding the masked
 * upper field to the value doubles that field). This matches a 5-5-5 to
 * 5-6-5 layout change in which the new low bit of the middle field is 0.
 *
 * src       source words
 * dst       destination words (same byte count as the source)
 * src_size  size of the source in bytes
 */
169 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
170 {
171  register const uint8_t* s=src;
172  register uint8_t* d=dst;
173  register const uint8_t *end;
174  const uint8_t *mm_end;
175  end = s + src_size;
176  __asm__ volatile(PREFETCH" %0"::"m"(*s));
 /* mm4 holds the mask applied before the add; presumably it selects
  * bits 5..15 of each word, like 0x7FE0 in the scalar code -- confirm. */
177  __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
 /* MMX loop: 16 bytes (8 words) per iteration. */
178  mm_end = end - 15;
179  while (s<mm_end) {
180  __asm__ volatile(
181  PREFETCH" 32(%1) \n\t"
182  "movq (%1), %%mm0 \n\t"
183  "movq 8(%1), %%mm2 \n\t"
184  "movq %%mm0, %%mm1 \n\t"
185  "movq %%mm2, %%mm3 \n\t"
186  "pand %%mm4, %%mm0 \n\t"
187  "pand %%mm4, %%mm2 \n\t"
188  "paddw %%mm1, %%mm0 \n\t"
189  "paddw %%mm3, %%mm2 \n\t"
190  MOVNTQ" %%mm0, (%0) \n\t"
191  MOVNTQ" %%mm2, 8(%0)"
192  :: "r"(d), "r"(s)
193  );
194  d+=16;
195  s+=16;
196  }
197  __asm__ volatile(SFENCE:::"memory");
198  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail, two words at a time: x + (x & upper field) per 16-bit half. */
199  mm_end = end - 3;
200  while (s < mm_end) {
201  register unsigned x= *((const uint32_t *)s);
202  *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
203  d+=4;
204  s+=4;
205  }
 /* At most one word is left. */
206  if (s < end) {
207  register unsigned short x= *((const uint16_t *)s);
208  *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
209  }
210 }
211 
/*
 * Convert 16-bit words by shifting bits 6..15 down one position (into
 * bits 5..14) while keeping bits 0..4 unchanged, as spelled out by the
 * scalar expression ((x>>1)&0x7FE0) | (x&0x001F). This matches a 5-6-5 to
 * 5-5-5 layout change that drops the low bit of the middle field.
 *
 * src       source words
 * dst       destination words (same byte count as the source)
 * src_size  size of the source in bytes
 */
212 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
213 {
214  register const uint8_t* s=src;
215  register uint8_t* d=dst;
216  register const uint8_t *end;
217  const uint8_t *mm_end;
218  end = s + src_size;
219  __asm__ volatile(PREFETCH" %0"::"m"(*s));
 /* mm7 masks the shifted copy, mm6 masks the unshifted copy. */
220  __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
221  __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
 /* MMX loop: 16 bytes (8 words) per iteration. */
222  mm_end = end - 15;
223  while (s<mm_end) {
224  __asm__ volatile(
225  PREFETCH" 32(%1) \n\t"
226  "movq (%1), %%mm0 \n\t"
227  "movq 8(%1), %%mm2 \n\t"
228  "movq %%mm0, %%mm1 \n\t"
229  "movq %%mm2, %%mm3 \n\t"
230  "psrlq $1, %%mm0 \n\t"
231  "psrlq $1, %%mm2 \n\t"
232  "pand %%mm7, %%mm0 \n\t"
233  "pand %%mm7, %%mm2 \n\t"
234  "pand %%mm6, %%mm1 \n\t"
235  "pand %%mm6, %%mm3 \n\t"
236  "por %%mm1, %%mm0 \n\t"
237  "por %%mm3, %%mm2 \n\t"
238  MOVNTQ" %%mm0, (%0) \n\t"
239  MOVNTQ" %%mm2, 8(%0)"
240  :: "r"(d), "r"(s)
241  );
242  d+=16;
243  s+=16;
244  }
245  __asm__ volatile(SFENCE:::"memory");
246  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail, two words at a time. */
247  mm_end = end - 3;
248  while (s < mm_end) {
249  register uint32_t x= *((const uint32_t*)s);
250  *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
251  s+=4;
252  d+=4;
253  }
 /* At most one word is left. */
254  if (s < end) {
255  register uint16_t x= *((const uint16_t*)s);
256  *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
257  }
258 }
259 
/*
 * Convert 4-byte pixels to 16-bit pixels.
 *
 * Per the scalar tail, each 32-bit source value contributes:
 *   bits  0..7  (top 5 bits kept) -> output bits  0..4
 *   bits 10..15 (6 bits)          -> output bits  5..10
 *   bits 19..23 (5 bits)          -> output bits 11..15
 * Bits 24..31 of the source are ignored.
 *
 * src       packed 4-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
260 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
261 {
262  const uint8_t *s = src;
263  const uint8_t *end;
264  const uint8_t *mm_end;
265  uint16_t *d = (uint16_t *)dst;
266  end = s + src_size;
267  mm_end = end - 15;
 /*
  * The whole MMX loop lives inside one asm statement: it first jumps to
  * label 2 to test s against mm_end, so the body may run zero times.
  * Each pass reads 16 source bytes (4 pixels) and stores 8 bytes; s and d
  * are read-write operands and are advanced by the asm itself.
  * mm5/mm6/mm7 are loaded from mask3216g, mask3216br and mul3216.
  * NOTE(review): this asm statement declares no "memory" clobber.
  */
268  __asm__ volatile(
269  "movq %3, %%mm5 \n\t"
270  "movq %4, %%mm6 \n\t"
271  "movq %5, %%mm7 \n\t"
272  "jmp 2f \n\t"
273  ".p2align 4 \n\t"
274  "1: \n\t"
275  PREFETCH" 32(%1) \n\t"
276  "movd (%1), %%mm0 \n\t"
277  "movd 4(%1), %%mm3 \n\t"
278  "punpckldq 8(%1), %%mm0 \n\t"
279  "punpckldq 12(%1), %%mm3 \n\t"
280  "movq %%mm0, %%mm1 \n\t"
281  "movq %%mm3, %%mm4 \n\t"
282  "pand %%mm6, %%mm0 \n\t"
283  "pand %%mm6, %%mm3 \n\t"
284  "pmaddwd %%mm7, %%mm0 \n\t"
285  "pmaddwd %%mm7, %%mm3 \n\t"
286  "pand %%mm5, %%mm1 \n\t"
287  "pand %%mm5, %%mm4 \n\t"
288  "por %%mm1, %%mm0 \n\t"
289  "por %%mm4, %%mm3 \n\t"
290  "psrld $5, %%mm0 \n\t"
291  "pslld $11, %%mm3 \n\t"
292  "por %%mm3, %%mm0 \n\t"
293  MOVNTQ" %%mm0, (%0) \n\t"
294  "add $16, %1 \n\t"
295  "add $8, %0 \n\t"
296  "2: \n\t"
297  "cmp %2, %1 \n\t"
298  " jb 1b \n\t"
299  : "+r" (d), "+r"(s)
300  : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
301  );
302  __asm__ volatile(SFENCE:::"memory");
303  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
304  while (s < end) {
305  register int rgb = *(const uint32_t*)s; s += 4;
306  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
307  }
308 }
309 
/*
 * Convert 4-byte pixels to 16-bit pixels with the first and third colour
 * bytes swapped relative to rgb32to16.
 *
 * Per the scalar tail, each 32-bit source value contributes:
 *   bits  3..7  (5 bits) -> output bits 11..15
 *   bits 10..15 (6 bits) -> output bits  5..10
 *   bits 19..23 (5 bits) -> output bits  0..4
 * Bits 24..31 of the source are ignored.
 *
 * src       packed 4-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
310 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
311 {
312  const uint8_t *s = src;
313  const uint8_t *end;
314  const uint8_t *mm_end;
315  uint16_t *d = (uint16_t *)dst;
316  end = s + src_size;
317  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
 /* mm7 and mm6 stay loaded with the two masks for the whole loop; the
  * third mask (blue_16mask) is applied from memory as operand %2. */
318  __asm__ volatile(
319  "movq %0, %%mm7 \n\t"
320  "movq %1, %%mm6 \n\t"
321  ::"m"(red_16mask),"m"(green_16mask));
 /* MMX loop: 16 source bytes (4 pixels) -> 8 destination bytes. */
322  mm_end = end - 15;
323  while (s < mm_end) {
324  __asm__ volatile(
325  PREFETCH" 32(%1) \n\t"
326  "movd (%1), %%mm0 \n\t"
327  "movd 4(%1), %%mm3 \n\t"
328  "punpckldq 8(%1), %%mm0 \n\t"
329  "punpckldq 12(%1), %%mm3 \n\t"
330  "movq %%mm0, %%mm1 \n\t"
331  "movq %%mm0, %%mm2 \n\t"
332  "movq %%mm3, %%mm4 \n\t"
333  "movq %%mm3, %%mm5 \n\t"
334  "psllq $8, %%mm0 \n\t"
335  "psllq $8, %%mm3 \n\t"
336  "pand %%mm7, %%mm0 \n\t"
337  "pand %%mm7, %%mm3 \n\t"
338  "psrlq $5, %%mm1 \n\t"
339  "psrlq $5, %%mm4 \n\t"
340  "pand %%mm6, %%mm1 \n\t"
341  "pand %%mm6, %%mm4 \n\t"
342  "psrlq $19, %%mm2 \n\t"
343  "psrlq $19, %%mm5 \n\t"
344  "pand %2, %%mm2 \n\t"
345  "pand %2, %%mm5 \n\t"
346  "por %%mm1, %%mm0 \n\t"
347  "por %%mm4, %%mm3 \n\t"
348  "por %%mm2, %%mm0 \n\t"
349  "por %%mm5, %%mm3 \n\t"
350  "psllq $16, %%mm3 \n\t"
351  "por %%mm3, %%mm0 \n\t"
352  MOVNTQ" %%mm0, (%0) \n\t"
353  :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
 /* d counts uint16_t elements: 4 elements == 8 bytes. */
354  d += 4;
355  s += 16;
356  }
357  __asm__ volatile(SFENCE:::"memory");
358  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
359  while (s < end) {
360  register int rgb = *(const uint32_t*)s; s += 4;
361  *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
362  }
363 }
364 
/*
 * Convert 4-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * Per the scalar tail, each 32-bit source value contributes:
 *   bits  0..7  (top 5 bits kept) -> output bits  0..4
 *   bits 11..15 (5 bits)          -> output bits  5..9
 *   bits 19..23 (5 bits)          -> output bits 10..14
 * Bits 24..31 of the source are ignored; output bit 15 is 0.
 *
 * src       packed 4-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
365 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
366 {
367  const uint8_t *s = src;
368  const uint8_t *end;
369  const uint8_t *mm_end;
370  uint16_t *d = (uint16_t *)dst;
371  end = s + src_size;
372  mm_end = end - 15;
 /*
  * Same loop structure as rgb32to16: the asm jumps to label 2 first, so
  * the body may run zero times, and it advances s (16 bytes) and d
  * (8 bytes) itself. The constants differ: mask3215g and mul3215 replace
  * their 3216 counterparts, mask3216br is shared, and the final shifts
  * are 6 and 10 instead of 5 and 11.
  * NOTE(review): this asm statement declares no "memory" clobber.
  */
373  __asm__ volatile(
374  "movq %3, %%mm5 \n\t"
375  "movq %4, %%mm6 \n\t"
376  "movq %5, %%mm7 \n\t"
377  "jmp 2f \n\t"
378  ".p2align 4 \n\t"
379  "1: \n\t"
380  PREFETCH" 32(%1) \n\t"
381  "movd (%1), %%mm0 \n\t"
382  "movd 4(%1), %%mm3 \n\t"
383  "punpckldq 8(%1), %%mm0 \n\t"
384  "punpckldq 12(%1), %%mm3 \n\t"
385  "movq %%mm0, %%mm1 \n\t"
386  "movq %%mm3, %%mm4 \n\t"
387  "pand %%mm6, %%mm0 \n\t"
388  "pand %%mm6, %%mm3 \n\t"
389  "pmaddwd %%mm7, %%mm0 \n\t"
390  "pmaddwd %%mm7, %%mm3 \n\t"
391  "pand %%mm5, %%mm1 \n\t"
392  "pand %%mm5, %%mm4 \n\t"
393  "por %%mm1, %%mm0 \n\t"
394  "por %%mm4, %%mm3 \n\t"
395  "psrld $6, %%mm0 \n\t"
396  "pslld $10, %%mm3 \n\t"
397  "por %%mm3, %%mm0 \n\t"
398  MOVNTQ" %%mm0, (%0) \n\t"
399  "add $16, %1 \n\t"
400  "add $8, %0 \n\t"
401  "2: \n\t"
402  "cmp %2, %1 \n\t"
403  " jb 1b \n\t"
404  : "+r" (d), "+r"(s)
405  : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
406  );
407  __asm__ volatile(SFENCE:::"memory");
408  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
409  while (s < end) {
410  register int rgb = *(const uint32_t*)s; s += 4;
411  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
412  }
413 }
414 
/*
 * Convert 4-byte pixels to 15-bit pixels with the first and third colour
 * bytes swapped relative to rgb32to15.
 *
 * Per the scalar tail, each 32-bit source value contributes:
 *   bits  3..7  (5 bits) -> output bits 10..14
 *   bits 11..15 (5 bits) -> output bits  5..9
 *   bits 19..23 (5 bits) -> output bits  0..4
 * Bits 24..31 of the source are ignored; output bit 15 is 0.
 *
 * src       packed 4-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
415 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
416 {
417  const uint8_t *s = src;
418  const uint8_t *end;
419  const uint8_t *mm_end;
420  uint16_t *d = (uint16_t *)dst;
421  end = s + src_size;
422  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
 /* mm7 and mm6 stay loaded with the two masks for the whole loop; the
  * third mask (blue_15mask) is applied from memory as operand %2. */
423  __asm__ volatile(
424  "movq %0, %%mm7 \n\t"
425  "movq %1, %%mm6 \n\t"
426  ::"m"(red_15mask),"m"(green_15mask));
 /* MMX loop: 16 source bytes (4 pixels) -> 8 destination bytes. */
427  mm_end = end - 15;
428  while (s < mm_end) {
429  __asm__ volatile(
430  PREFETCH" 32(%1) \n\t"
431  "movd (%1), %%mm0 \n\t"
432  "movd 4(%1), %%mm3 \n\t"
433  "punpckldq 8(%1), %%mm0 \n\t"
434  "punpckldq 12(%1), %%mm3 \n\t"
435  "movq %%mm0, %%mm1 \n\t"
436  "movq %%mm0, %%mm2 \n\t"
437  "movq %%mm3, %%mm4 \n\t"
438  "movq %%mm3, %%mm5 \n\t"
439  "psllq $7, %%mm0 \n\t"
440  "psllq $7, %%mm3 \n\t"
441  "pand %%mm7, %%mm0 \n\t"
442  "pand %%mm7, %%mm3 \n\t"
443  "psrlq $6, %%mm1 \n\t"
444  "psrlq $6, %%mm4 \n\t"
445  "pand %%mm6, %%mm1 \n\t"
446  "pand %%mm6, %%mm4 \n\t"
447  "psrlq $19, %%mm2 \n\t"
448  "psrlq $19, %%mm5 \n\t"
449  "pand %2, %%mm2 \n\t"
450  "pand %2, %%mm5 \n\t"
451  "por %%mm1, %%mm0 \n\t"
452  "por %%mm4, %%mm3 \n\t"
453  "por %%mm2, %%mm0 \n\t"
454  "por %%mm5, %%mm3 \n\t"
455  "psllq $16, %%mm3 \n\t"
456  "por %%mm3, %%mm0 \n\t"
457  MOVNTQ" %%mm0, (%0) \n\t"
458  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
 /* d counts uint16_t elements: 4 elements == 8 bytes. */
459  d += 4;
460  s += 16;
461  }
462  __asm__ volatile(SFENCE:::"memory");
463  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
464  while (s < end) {
465  register int rgb = *(const uint32_t*)s; s += 4;
466  *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
467  }
468 }
469 
/*
 * Convert 3-byte pixels to 16-bit pixels.
 *
 * Per the scalar tail, for source bytes s[0], s[1], s[2] of a pixel:
 *   s[0] top 5 bits -> output bits  0..4
 *   s[1] top 6 bits -> output bits  5..10
 *   s[2] top 5 bits -> output bits 11..15
 *
 * src       packed 3-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
470 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
471 {
472  const uint8_t *s = src;
473  const uint8_t *end;
474  const uint8_t *mm_end;
475  uint16_t *d = (uint16_t *)dst;
476  end = s + src_size;
477  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
478  __asm__ volatile(
479  "movq %0, %%mm7 \n\t"
480  "movq %1, %%mm6 \n\t"
481  ::"m"(red_16mask),"m"(green_16mask));
 /*
  * MMX loop: 12 source bytes (4 pixels) -> 8 destination bytes.
  * NOTE(review): the loop runs with as few as 12 bytes left, but the
  * 4-byte load at offset 9 touches s[12] -- callers presumably provide
  * padding; confirm.
  */
482  mm_end = end - 11;
483  while (s < mm_end) {
484  __asm__ volatile(
485  PREFETCH" 32(%1) \n\t"
486  "movd (%1), %%mm0 \n\t"
487  "movd 3(%1), %%mm3 \n\t"
488  "punpckldq 6(%1), %%mm0 \n\t"
489  "punpckldq 9(%1), %%mm3 \n\t"
490  "movq %%mm0, %%mm1 \n\t"
491  "movq %%mm0, %%mm2 \n\t"
492  "movq %%mm3, %%mm4 \n\t"
493  "movq %%mm3, %%mm5 \n\t"
494  "psrlq $3, %%mm0 \n\t"
495  "psrlq $3, %%mm3 \n\t"
496  "pand %2, %%mm0 \n\t"
497  "pand %2, %%mm3 \n\t"
498  "psrlq $5, %%mm1 \n\t"
499  "psrlq $5, %%mm4 \n\t"
500  "pand %%mm6, %%mm1 \n\t"
501  "pand %%mm6, %%mm4 \n\t"
502  "psrlq $8, %%mm2 \n\t"
503  "psrlq $8, %%mm5 \n\t"
504  "pand %%mm7, %%mm2 \n\t"
505  "pand %%mm7, %%mm5 \n\t"
506  "por %%mm1, %%mm0 \n\t"
507  "por %%mm4, %%mm3 \n\t"
508  "por %%mm2, %%mm0 \n\t"
509  "por %%mm5, %%mm3 \n\t"
510  "psllq $16, %%mm3 \n\t"
511  "por %%mm3, %%mm0 \n\t"
512  MOVNTQ" %%mm0, (%0) \n\t"
513  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
514  d += 4;
515  s += 12;
516  }
517  __asm__ volatile(SFENCE:::"memory");
518  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
519  while (s < end) {
520  const int b = *s++;
521  const int g = *s++;
522  const int r = *s++;
523  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
524  }
525 }
526 
/*
 * Convert 3-byte pixels to 16-bit pixels, with the first and third source
 * bytes swapped relative to rgb24tobgr16.
 *
 * Per the scalar tail, for source bytes s[0], s[1], s[2] of a pixel:
 *   s[0] top 5 bits -> output bits 11..15
 *   s[1] top 6 bits -> output bits  5..10
 *   s[2] top 5 bits -> output bits  0..4
 *
 * src       packed 3-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
527 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
528 {
529  const uint8_t *s = src;
530  const uint8_t *end;
531  const uint8_t *mm_end;
532  uint16_t *d = (uint16_t *)dst;
533  end = s + src_size;
534  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
535  __asm__ volatile(
536  "movq %0, %%mm7 \n\t"
537  "movq %1, %%mm6 \n\t"
538  ::"m"(red_16mask),"m"(green_16mask));
 /* The MMX loop consumes 12 bytes per pass but only runs while at least
  * 16 remain, which covers the 4-byte load at offset 9 (s[9..12]). */
539  mm_end = end - 15;
540  while (s < mm_end) {
541  __asm__ volatile(
542  PREFETCH" 32(%1) \n\t"
543  "movd (%1), %%mm0 \n\t"
544  "movd 3(%1), %%mm3 \n\t"
545  "punpckldq 6(%1), %%mm0 \n\t"
546  "punpckldq 9(%1), %%mm3 \n\t"
547  "movq %%mm0, %%mm1 \n\t"
548  "movq %%mm0, %%mm2 \n\t"
549  "movq %%mm3, %%mm4 \n\t"
550  "movq %%mm3, %%mm5 \n\t"
551  "psllq $8, %%mm0 \n\t"
552  "psllq $8, %%mm3 \n\t"
553  "pand %%mm7, %%mm0 \n\t"
554  "pand %%mm7, %%mm3 \n\t"
555  "psrlq $5, %%mm1 \n\t"
556  "psrlq $5, %%mm4 \n\t"
557  "pand %%mm6, %%mm1 \n\t"
558  "pand %%mm6, %%mm4 \n\t"
559  "psrlq $19, %%mm2 \n\t"
560  "psrlq $19, %%mm5 \n\t"
561  "pand %2, %%mm2 \n\t"
562  "pand %2, %%mm5 \n\t"
563  "por %%mm1, %%mm0 \n\t"
564  "por %%mm4, %%mm3 \n\t"
565  "por %%mm2, %%mm0 \n\t"
566  "por %%mm5, %%mm3 \n\t"
567  "psllq $16, %%mm3 \n\t"
568  "por %%mm3, %%mm0 \n\t"
569  MOVNTQ" %%mm0, (%0) \n\t"
570  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
571  d += 4;
572  s += 12;
573  }
574  __asm__ volatile(SFENCE:::"memory");
575  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
576  while (s < end) {
577  const int r = *s++;
578  const int g = *s++;
579  const int b = *s++;
580  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
581  }
582 }
583 
/*
 * Convert 3-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * Per the scalar tail, for source bytes s[0], s[1], s[2] of a pixel:
 *   s[0] top 5 bits -> output bits  0..4
 *   s[1] top 5 bits -> output bits  5..9
 *   s[2] top 5 bits -> output bits 10..14
 * Output bit 15 is 0.
 *
 * src       packed 3-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
584 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
585 {
586  const uint8_t *s = src;
587  const uint8_t *end;
588  const uint8_t *mm_end;
589  uint16_t *d = (uint16_t *)dst;
590  end = s + src_size;
591  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
592  __asm__ volatile(
593  "movq %0, %%mm7 \n\t"
594  "movq %1, %%mm6 \n\t"
595  ::"m"(red_15mask),"m"(green_15mask));
 /*
  * MMX loop: 12 source bytes (4 pixels) -> 8 destination bytes.
  * NOTE(review): the loop runs with as few as 12 bytes left, but the
  * 4-byte load at offset 9 touches s[12] -- callers presumably provide
  * padding; confirm.
  */
596  mm_end = end - 11;
597  while (s < mm_end) {
598  __asm__ volatile(
599  PREFETCH" 32(%1) \n\t"
600  "movd (%1), %%mm0 \n\t"
601  "movd 3(%1), %%mm3 \n\t"
602  "punpckldq 6(%1), %%mm0 \n\t"
603  "punpckldq 9(%1), %%mm3 \n\t"
604  "movq %%mm0, %%mm1 \n\t"
605  "movq %%mm0, %%mm2 \n\t"
606  "movq %%mm3, %%mm4 \n\t"
607  "movq %%mm3, %%mm5 \n\t"
608  "psrlq $3, %%mm0 \n\t"
609  "psrlq $3, %%mm3 \n\t"
610  "pand %2, %%mm0 \n\t"
611  "pand %2, %%mm3 \n\t"
612  "psrlq $6, %%mm1 \n\t"
613  "psrlq $6, %%mm4 \n\t"
614  "pand %%mm6, %%mm1 \n\t"
615  "pand %%mm6, %%mm4 \n\t"
616  "psrlq $9, %%mm2 \n\t"
617  "psrlq $9, %%mm5 \n\t"
618  "pand %%mm7, %%mm2 \n\t"
619  "pand %%mm7, %%mm5 \n\t"
620  "por %%mm1, %%mm0 \n\t"
621  "por %%mm4, %%mm3 \n\t"
622  "por %%mm2, %%mm0 \n\t"
623  "por %%mm5, %%mm3 \n\t"
624  "psllq $16, %%mm3 \n\t"
625  "por %%mm3, %%mm0 \n\t"
626  MOVNTQ" %%mm0, (%0) \n\t"
627  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
628  d += 4;
629  s += 12;
630  }
631  __asm__ volatile(SFENCE:::"memory");
632  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
633  while (s < end) {
634  const int b = *s++;
635  const int g = *s++;
636  const int r = *s++;
637  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
638  }
639 }
640 
/*
 * Convert 3-byte pixels to 15-bit pixels, with the first and third source
 * bytes swapped relative to rgb24tobgr15.
 *
 * Per the scalar tail, for source bytes s[0], s[1], s[2] of a pixel:
 *   s[0] top 5 bits -> output bits 10..14
 *   s[1] top 5 bits -> output bits  5..9
 *   s[2] top 5 bits -> output bits  0..4
 * Output bit 15 is 0.
 *
 * src       packed 3-byte source pixels
 * dst       destination, receives 2 bytes per source pixel
 * src_size  size of the source in bytes
 */
641 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
642 {
643  const uint8_t *s = src;
644  const uint8_t *end;
645  const uint8_t *mm_end;
646  uint16_t *d = (uint16_t *)dst;
647  end = s + src_size;
648  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
649  __asm__ volatile(
650  "movq %0, %%mm7 \n\t"
651  "movq %1, %%mm6 \n\t"
652  ::"m"(red_15mask),"m"(green_15mask));
 /* The MMX loop consumes 12 bytes per pass but only runs while at least
  * 16 remain, which covers the 4-byte load at offset 9 (s[9..12]). */
653  mm_end = end - 15;
654  while (s < mm_end) {
655  __asm__ volatile(
656  PREFETCH" 32(%1) \n\t"
657  "movd (%1), %%mm0 \n\t"
658  "movd 3(%1), %%mm3 \n\t"
659  "punpckldq 6(%1), %%mm0 \n\t"
660  "punpckldq 9(%1), %%mm3 \n\t"
661  "movq %%mm0, %%mm1 \n\t"
662  "movq %%mm0, %%mm2 \n\t"
663  "movq %%mm3, %%mm4 \n\t"
664  "movq %%mm3, %%mm5 \n\t"
665  "psllq $7, %%mm0 \n\t"
666  "psllq $7, %%mm3 \n\t"
667  "pand %%mm7, %%mm0 \n\t"
668  "pand %%mm7, %%mm3 \n\t"
669  "psrlq $6, %%mm1 \n\t"
670  "psrlq $6, %%mm4 \n\t"
671  "pand %%mm6, %%mm1 \n\t"
672  "pand %%mm6, %%mm4 \n\t"
673  "psrlq $19, %%mm2 \n\t"
674  "psrlq $19, %%mm5 \n\t"
675  "pand %2, %%mm2 \n\t"
676  "pand %2, %%mm5 \n\t"
677  "por %%mm1, %%mm0 \n\t"
678  "por %%mm4, %%mm3 \n\t"
679  "por %%mm2, %%mm0 \n\t"
680  "por %%mm5, %%mm3 \n\t"
681  "psllq $16, %%mm3 \n\t"
682  "por %%mm3, %%mm0 \n\t"
683  MOVNTQ" %%mm0, (%0) \n\t"
684  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
685  d += 4;
686  s += 12;
687  }
688  __asm__ volatile(SFENCE:::"memory");
689  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
690  while (s < end) {
691  const int r = *s++;
692  const int g = *s++;
693  const int b = *s++;
694  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
695  }
696 }
697 
/*
 * Expand 15-bit pixels (one per 16-bit word) to 3-byte pixels.
 *
 * Per the scalar tail, each 5-bit field is widened to 8 bits by shifting
 * it to the top of the byte and replicating its upper bits into the low
 * bits:
 *   bits  0..4  -> output byte 0
 *   bits  5..9  -> output byte 1
 *   bits 10..14 -> output byte 2
 *
 * src       source pixels, read as uint16_t
 * dst       destination, receives 3 bytes per source pixel
 * src_size  size of the source in bytes
 *
 * The MMX loop converts 8 pixels (16 source bytes) into 24 destination
 * bytes per iteration; the scalar loop handles whatever is left.
 */
698 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
699 {
700  const uint16_t *end;
701  const uint16_t *mm_end;
702  uint8_t *d = dst;
703  const uint16_t *s = (const uint16_t*)src;
 /* s and end are uint16_t pointers, so all pointer steps below are in pixels. */
704  end = s + src_size/2;
705  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
706  mm_end = end - 7;
707  while (s < mm_end) {
 /*
  * Stage 1: unpack two groups of four pixels into 4-byte-per-pixel form.
  * The first group ends up in mm6/mm7, the second stays in mm0/mm3.
  * Nothing is stored yet; the registers are handed to the next asm.
  */
708  __asm__ volatile(
709  PREFETCH" 32(%1) \n\t"
710  "movq (%1), %%mm0 \n\t"
711  "movq (%1), %%mm1 \n\t"
712  "movq (%1), %%mm2 \n\t"
713  "pand %2, %%mm0 \n\t"
714  "pand %3, %%mm1 \n\t"
715  "pand %4, %%mm2 \n\t"
716  "psllq $5, %%mm0 \n\t"
717  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
718  "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
719  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
720  "movq %%mm0, %%mm3 \n\t"
721  "movq %%mm1, %%mm4 \n\t"
722  "movq %%mm2, %%mm5 \n\t"
723  "punpcklwd %5, %%mm0 \n\t"
724  "punpcklwd %5, %%mm1 \n\t"
725  "punpcklwd %5, %%mm2 \n\t"
726  "punpckhwd %5, %%mm3 \n\t"
727  "punpckhwd %5, %%mm4 \n\t"
728  "punpckhwd %5, %%mm5 \n\t"
729  "psllq $8, %%mm1 \n\t"
730  "psllq $16, %%mm2 \n\t"
731  "por %%mm1, %%mm0 \n\t"
732  "por %%mm2, %%mm0 \n\t"
733  "psllq $8, %%mm4 \n\t"
734  "psllq $16, %%mm5 \n\t"
735  "por %%mm4, %%mm3 \n\t"
736  "por %%mm5, %%mm3 \n\t"
737 
738  "movq %%mm0, %%mm6 \n\t"
739  "movq %%mm3, %%mm7 \n\t"
740 
741  "movq 8(%1), %%mm0 \n\t"
742  "movq 8(%1), %%mm1 \n\t"
743  "movq 8(%1), %%mm2 \n\t"
744  "pand %2, %%mm0 \n\t"
745  "pand %3, %%mm1 \n\t"
746  "pand %4, %%mm2 \n\t"
747  "psllq $5, %%mm0 \n\t"
748  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
749  "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
750  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
751  "movq %%mm0, %%mm3 \n\t"
752  "movq %%mm1, %%mm4 \n\t"
753  "movq %%mm2, %%mm5 \n\t"
754  "punpcklwd %5, %%mm0 \n\t"
755  "punpcklwd %5, %%mm1 \n\t"
756  "punpcklwd %5, %%mm2 \n\t"
757  "punpckhwd %5, %%mm3 \n\t"
758  "punpckhwd %5, %%mm4 \n\t"
759  "punpckhwd %5, %%mm5 \n\t"
760  "psllq $8, %%mm1 \n\t"
761  "psllq $16, %%mm2 \n\t"
762  "por %%mm1, %%mm0 \n\t"
763  "por %%mm2, %%mm0 \n\t"
764  "psllq $8, %%mm4 \n\t"
765  "psllq $16, %%mm5 \n\t"
766  "por %%mm4, %%mm3 \n\t"
767  "por %%mm5, %%mm3 \n\t"
768 
769  :"=m"(*d)
770  :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
771  NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
772  :"memory");
773  /* borrowed 32 to 24 */
 /*
  * Stage 2: arrange the four registers as mm0/mm1/mm4/mm5 with copies in
  * mm2/mm3/mm6/mm7, then let STORE_BGR24_MMX compact them and write the
  * 24 result bytes to d (operand %0). Without the store fragment the
  * loop would advance d without ever writing to it.
  */
774  __asm__ volatile(
775  "movq %%mm0, %%mm4 \n\t"
776  "movq %%mm3, %%mm5 \n\t"
777  "movq %%mm6, %%mm0 \n\t"
778  "movq %%mm7, %%mm1 \n\t"
779 
780  "movq %%mm4, %%mm6 \n\t"
781  "movq %%mm5, %%mm7 \n\t"
782  "movq %%mm0, %%mm2 \n\t"
783  "movq %%mm1, %%mm3 \n\t"
784 
785  STORE_BGR24_MMX
786 
787  :: "r"(d), "m"(*s)
788  NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
789  :"memory");
790  d += 24;
791  s += 8;
792  }
 /* Order the MOVNTQ stores and leave MMX state before running scalar code. */
793  __asm__ volatile(SFENCE:::"memory");
794  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
795  while (s < end) {
796  register uint16_t bgr;
797  bgr = *s++;
798  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
799  *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
800  *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
801  }
802 }
803 
/*
 * Expand 16-bit pixels to 3-byte pixels.
 *
 * Per the scalar tail, each field is widened to 8 bits by shifting it to
 * the top of the byte and replicating its upper bits into the low bits:
 *   bits  0..4  (5 bits) -> output byte 0
 *   bits  5..10 (6 bits) -> output byte 1
 *   bits 11..15 (5 bits) -> output byte 2
 *
 * src       source pixels, read as uint16_t
 * dst       destination, receives 3 bytes per source pixel
 * src_size  size of the source in bytes
 *
 * The MMX loop converts 8 pixels (16 source bytes) into 24 destination
 * bytes per iteration; the scalar loop handles whatever is left.
 */
804 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
805 {
806  const uint16_t *end;
807  const uint16_t *mm_end;
808  uint8_t *d = (uint8_t *)dst;
809  const uint16_t *s = (const uint16_t *)src;
 /* s and end are uint16_t pointers, so all pointer steps below are in pixels. */
810  end = s + src_size/2;
811  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
812  mm_end = end - 7;
813  while (s < mm_end) {
 /*
  * Stage 1: unpack two groups of four pixels into 4-byte-per-pixel form.
  * The first group ends up in mm6/mm7, the second stays in mm0/mm3.
  * Nothing is stored yet; the registers are handed to the next asm.
  */
814  __asm__ volatile(
815  PREFETCH" 32(%1) \n\t"
816  "movq (%1), %%mm0 \n\t"
817  "movq (%1), %%mm1 \n\t"
818  "movq (%1), %%mm2 \n\t"
819  "pand %2, %%mm0 \n\t"
820  "pand %3, %%mm1 \n\t"
821  "pand %4, %%mm2 \n\t"
822  "psllq $5, %%mm0 \n\t"
823  "psrlq $1, %%mm2 \n\t"
824  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
825  "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
826  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
827  "movq %%mm0, %%mm3 \n\t"
828  "movq %%mm1, %%mm4 \n\t"
829  "movq %%mm2, %%mm5 \n\t"
830  "punpcklwd %5, %%mm0 \n\t"
831  "punpcklwd %5, %%mm1 \n\t"
832  "punpcklwd %5, %%mm2 \n\t"
833  "punpckhwd %5, %%mm3 \n\t"
834  "punpckhwd %5, %%mm4 \n\t"
835  "punpckhwd %5, %%mm5 \n\t"
836  "psllq $8, %%mm1 \n\t"
837  "psllq $16, %%mm2 \n\t"
838  "por %%mm1, %%mm0 \n\t"
839  "por %%mm2, %%mm0 \n\t"
840  "psllq $8, %%mm4 \n\t"
841  "psllq $16, %%mm5 \n\t"
842  "por %%mm4, %%mm3 \n\t"
843  "por %%mm5, %%mm3 \n\t"
844 
845  "movq %%mm0, %%mm6 \n\t"
846  "movq %%mm3, %%mm7 \n\t"
847 
848  "movq 8(%1), %%mm0 \n\t"
849  "movq 8(%1), %%mm1 \n\t"
850  "movq 8(%1), %%mm2 \n\t"
851  "pand %2, %%mm0 \n\t"
852  "pand %3, %%mm1 \n\t"
853  "pand %4, %%mm2 \n\t"
854  "psllq $5, %%mm0 \n\t"
855  "psrlq $1, %%mm2 \n\t"
856  "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
857  "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
858  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
859  "movq %%mm0, %%mm3 \n\t"
860  "movq %%mm1, %%mm4 \n\t"
861  "movq %%mm2, %%mm5 \n\t"
862  "punpcklwd %5, %%mm0 \n\t"
863  "punpcklwd %5, %%mm1 \n\t"
864  "punpcklwd %5, %%mm2 \n\t"
865  "punpckhwd %5, %%mm3 \n\t"
866  "punpckhwd %5, %%mm4 \n\t"
867  "punpckhwd %5, %%mm5 \n\t"
868  "psllq $8, %%mm1 \n\t"
869  "psllq $16, %%mm2 \n\t"
870  "por %%mm1, %%mm0 \n\t"
871  "por %%mm2, %%mm0 \n\t"
872  "psllq $8, %%mm4 \n\t"
873  "psllq $16, %%mm5 \n\t"
874  "por %%mm4, %%mm3 \n\t"
875  "por %%mm5, %%mm3 \n\t"
876  :"=m"(*d)
877  :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
878  NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
879  :"memory");
880  /* borrowed 32 to 24 */
 /*
  * Stage 2: arrange the four registers as mm0/mm1/mm4/mm5 with copies in
  * mm2/mm3/mm6/mm7, then let STORE_BGR24_MMX compact them and write the
  * 24 result bytes to d (operand %0). Without the store fragment the
  * loop would advance d without ever writing to it.
  */
881  __asm__ volatile(
882  "movq %%mm0, %%mm4 \n\t"
883  "movq %%mm3, %%mm5 \n\t"
884  "movq %%mm6, %%mm0 \n\t"
885  "movq %%mm7, %%mm1 \n\t"
886 
887  "movq %%mm4, %%mm6 \n\t"
888  "movq %%mm5, %%mm7 \n\t"
889  "movq %%mm0, %%mm2 \n\t"
890  "movq %%mm1, %%mm3 \n\t"
891 
892  STORE_BGR24_MMX
893 
894  :: "r"(d), "m"(*s)
895  NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
896  :"memory");
897  d += 24;
898  s += 8;
899  }
 /* Order the MOVNTQ stores and leave MMX state before running scalar code. */
900  __asm__ volatile(SFENCE:::"memory");
901  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
902  while (s < end) {
903  register uint16_t bgr;
904  bgr = *s++;
905  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
906  *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
907  *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
908  }
909 }
910 
/*
 * PACK_RGB32: asm fragment that interleaves three registers of widened
 * colour components with a constant fourth byte and stores four 4-byte
 * pixels (16 bytes) at the address in asm operand %0 using two MOVNTQ
 * stores. It modifies mm0, mm1, mm2 and mm3; mm6 and mm7 are only read.
 * Expected register contents on entry:
 */
911 /*
912  * mm0 = 00 B3 00 B2 00 B1 00 B0
913  * mm1 = 00 G3 00 G2 00 G1 00 G0
914  * mm2 = 00 R3 00 R2 00 R1 00 R0
915  * mm6 = FF FF FF FF FF FF FF FF
916  * mm7 = 00 00 00 00 00 00 00 00
917  */
918 #define PACK_RGB32 \
919  "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
920  "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
921  "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
922  "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
923  "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
924  "movq %%mm0, %%mm3 \n\t" /* keep a copy for the upper two pixels */ \
925  "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
926  "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
927  MOVNTQ" %%mm0, (%0) \n\t" \
928  MOVNTQ" %%mm3, 8(%0) \n\t" \
929 
/*
 * Expand 15-bit pixels (one per 16-bit word) to 4-byte pixels.
 *
 * Per the scalar tail, each 5-bit field is widened to 8 bits by shifting
 * it to the top of the byte and replicating its upper bits into the low
 * bits, and the fourth byte is set to 255:
 *   bits  0..4  -> output byte 0
 *   bits  5..9  -> output byte 1
 *   bits 10..14 -> output byte 2
 *
 * src       source pixels, read as uint16_t
 * dst       destination, receives 4 bytes per source pixel
 * src_size  size of the source in bytes
 */
930 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
931 {
932  const uint16_t *end;
933  const uint16_t *mm_end;
934  uint8_t *d = dst;
935  const uint16_t *s = (const uint16_t *)src;
 /* s and end are uint16_t pointers, so pointer steps on s are in pixels. */
936  end = s + src_size/2;
937  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
 /* Constants PACK_RGB32 relies on: mm7 = all zero bits, mm6 = all one bits. */
938  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
939  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
 /* MMX loop: 4 pixels (8 source bytes) -> 16 destination bytes. */
940  mm_end = end - 3;
941  while (s < mm_end) {
942  __asm__ volatile(
943  PREFETCH" 32(%1) \n\t"
944  "movq (%1), %%mm0 \n\t"
945  "movq (%1), %%mm1 \n\t"
946  "movq (%1), %%mm2 \n\t"
947  "pand %2, %%mm0 \n\t"
948  "pand %3, %%mm1 \n\t"
949  "pand %4, %%mm2 \n\t"
950  "psllq $5, %%mm0 \n\t"
951  "pmulhw %5, %%mm0 \n\t"
952  "pmulhw %5, %%mm1 \n\t"
953  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
954  PACK_RGB32
955  ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
956  NAMED_CONSTRAINTS_ADD(mul15_hi)
957  :"memory");
958  d += 16;
959  s += 4;
960  }
961  __asm__ volatile(SFENCE:::"memory");
962  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
963  while (s < end) {
964  register uint16_t bgr;
965  bgr = *s++;
966  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
967  *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
968  *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
969  *d++ = 255;
970  }
971 }
972 
/*
 * Expand 16-bit pixels to 4-byte pixels.
 *
 * Per the scalar tail, each field is widened to 8 bits by shifting it to
 * the top of the byte and replicating its upper bits into the low bits,
 * and the fourth byte is set to 255:
 *   bits  0..4  (5 bits) -> output byte 0
 *   bits  5..10 (6 bits) -> output byte 1
 *   bits 11..15 (5 bits) -> output byte 2
 *
 * src       source pixels, read as uint16_t
 * dst       destination, receives 4 bytes per source pixel
 * src_size  size of the source in bytes
 */
973 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
974 {
975  const uint16_t *end;
976  const uint16_t *mm_end;
977  uint8_t *d = dst;
978  const uint16_t *s = (const uint16_t*)src;
 /* s and end are uint16_t pointers, so pointer steps on s are in pixels. */
979  end = s + src_size/2;
980  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
 /* Constants PACK_RGB32 relies on: mm7 = all zero bits, mm6 = all one bits. */
981  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
982  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
 /* MMX loop: 4 pixels (8 source bytes) -> 16 destination bytes. */
983  mm_end = end - 3;
984  while (s < mm_end) {
985  __asm__ volatile(
986  PREFETCH" 32(%1) \n\t"
987  "movq (%1), %%mm0 \n\t"
988  "movq (%1), %%mm1 \n\t"
989  "movq (%1), %%mm2 \n\t"
990  "pand %2, %%mm0 \n\t"
991  "pand %3, %%mm1 \n\t"
992  "pand %4, %%mm2 \n\t"
993  "psllq $5, %%mm0 \n\t"
994  "psrlq $1, %%mm2 \n\t"
995  "pmulhw %5, %%mm0 \n\t"
996  "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
997  "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
998  PACK_RGB32
999  ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
1000  NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
1001  :"memory");
1002  d += 16;
1003  s += 4;
1004  }
1005  __asm__ volatile(SFENCE:::"memory");
1006  __asm__ volatile(EMMS:::"memory");
 /* Scalar tail: one pixel per iteration. */
1007  while (s < end) {
1008  register uint16_t bgr;
1009  bgr = *s++;
1010  *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1011  *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1012  *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1013  *d++ = 255;
1014  }
1015 }
1016 
/*
 * Swap the first and third byte of every 3-byte pixel; the middle byte is
 * copied unchanged (see the scalar tail).
 *
 * src       packed 3-byte source pixels
 * dst       destination, same size as the source
 * src_size  size of the source in bytes
 *
 * The MMX loop handles 24 bytes (8 pixels) per iteration using a negative
 * index that counts up towards zero; the scalar loop converts the rest.
 * NOTE(review): the last load of an iteration is 8 bytes at offset 18, so
 * it can touch up to 2 bytes beyond the 24 being converted -- callers
 * presumably provide padding; confirm.
 */
1017 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1018 {
1019  unsigned i;
 /* Negative exactly when at least 24 bytes are available. It is used as
  * the loop index in the accumulator register ("+a" operand). */
1020  x86_reg mmx_size= 23 - src_size;
 /* Operands %1/%2 are src/dst biased by -mmx_size, so that base+index
  * initially addresses src[0]/dst[0]. "jns 2f" skips the loop entirely
  * when fewer than 24 bytes are available. */
1021  __asm__ volatile (
1022  "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
1023  "jns 2f \n\t"
1024  "movq "MANGLE(mask24r)", %%mm5 \n\t"
1025  "movq "MANGLE(mask24g)", %%mm6 \n\t"
1026  "movq "MANGLE(mask24b)", %%mm7 \n\t"
1027  ".p2align 4 \n\t"
1028  "1: \n\t"
1029  PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
1030  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
1031  "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
1032  "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
1033  "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1034  "pand %%mm5, %%mm0 \n\t"
1035  "pand %%mm6, %%mm1 \n\t"
1036  "pand %%mm7, %%mm2 \n\t"
1037  "por %%mm0, %%mm1 \n\t"
1038  "por %%mm2, %%mm1 \n\t"
1039  "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
1040  MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG
1041  "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
1042  "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
1043  "pand %%mm7, %%mm0 \n\t"
1044  "pand %%mm5, %%mm1 \n\t"
1045  "pand %%mm6, %%mm2 \n\t"
1046  "por %%mm0, %%mm1 \n\t"
1047  "por %%mm2, %%mm1 \n\t"
1048  "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
1049  MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
1050  "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
1051  "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
1052  "pand %%mm6, %%mm0 \n\t"
1053  "pand %%mm7, %%mm1 \n\t"
1054  "pand %%mm5, %%mm2 \n\t"
1055  "por %%mm0, %%mm1 \n\t"
1056  "por %%mm2, %%mm1 \n\t"
1057  MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
1058  "add $24, %%"FF_REG_a" \n\t"
1059  " js 1b \n\t"
1060  "2: \n\t"
1061  : "+a" (mmx_size)
1062  : "r" (src-mmx_size), "r"(dst-mmx_size)
1063  NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
1064  );
1065 
1066  __asm__ volatile(SFENCE:::"memory");
1067  __asm__ volatile(EMMS:::"memory");
1068 
 /* mmx_size now equals 23 - (bytes not yet converted). */
1069  if (mmx_size==23) return; // nothing left: src_size was a multiple of 24 (or 0)
1070 
 /* Reposition src/dst so that index 0 is the first unconverted byte and
  * src_size is the number of bytes still to convert. */
1071  src+= src_size;
1072  dst+= src_size;
1073  src_size= 23-mmx_size;
1074  src-= src_size;
1075  dst-= src_size;
 /* Scalar tail: one pixel per iteration; x buffers src[i+2] so the swap
  * also works when src and dst are the same buffer. */
1076  for (i=0; i<src_size; i+=3) {
1077  register uint8_t x;
1078  x = src[i + 2];
1079  dst[i + 1] = src[i + 1];
1080  dst[i + 2] = src[i + 0];
1081  dst[i + 0] = x;
1082  }
1083 }
1084 
/**
 * Interleave planar Y, U and V samples into packed output in the byte order
 * Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
 *
 * For every output line the assembly loop consumes 16 luma bytes and 8 + 8
 * chroma bytes per pass and stores 32 output bytes. The loop body is
 * executed before the end test, so it always runs at least once per line,
 * and it repeats while the chroma index is below width/2; the amount
 * processed per line is therefore rounded up to a whole 16-pixel group.
 *
 * @param ysrc             luma plane
 * @param usrc             first chroma plane (interleaved before vsrc)
 * @param vsrc             second chroma plane
 * @param dst              packed destination
 * @param width            width in luma samples
 * @param height           number of output lines
 * @param lumStride        byte offset between luma lines
 * @param chromStride      byte offset between chroma lines
 * @param dstStride        byte offset between destination lines
 * @param vertLumPerChroma the chroma pointers advance after each line y with
 *                         (y & (vertLumPerChroma-1)) == vertLumPerChroma-1,
 *                         i.e. after every N-th line for power-of-two N
 *
 * NOTE(review): the per-line asm statement stores through %0 but lists only
 * the index register as clobbered (no "memory") -- confirm this is intended.
 */
1085 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1086  int width, int height,
1087  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1088 {
1089  int y;
1090  const x86_reg chromWidth= width>>1;
1091  for (y=0; y<height; y++) {
1092  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1093  __asm__ volatile(
1094  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // chroma index = 0
1095  ".p2align 4 \n\t"
1096  "1: \n\t"
1097  PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
1098  PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
1099  PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
1100  "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
1101  "movq %%mm0, %%mm2 \n\t" // U(0)
1102  "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
1103  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1104  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1105 
1106  "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
1107  "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
1108  "movq %%mm3, %%mm4 \n\t" // Y(0)
1109  "movq %%mm5, %%mm6 \n\t" // Y(8)
1110  "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1111  "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1112  "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1113  "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1114 
1115  MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
1116  MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
1117  MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
1118  MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"
1119 
1120  "add $8, %%"FF_REG_a" \n\t" // 8 chroma samples == 16 pixels per pass
1121  "cmp %4, %%"FF_REG_a" \n\t"
1122  " jb 1b \n\t" // unsigned compare against chromWidth
1123  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1124  : "%"FF_REG_a
1125  );
      // Step to the next chroma line only after the last luma line sharing it.
1126  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1127  usrc += chromStride;
1128  vsrc += chromStride;
1129  }
1130  ysrc += lumStride;
1131  dst += dstStride;
1132  }
1133  __asm__(EMMS" \n\t"
1134  SFENCE" \n\t"
1135  :::"memory");
1136 }
1137 
1138 /**
 * Convert planar YUV whose chroma lines are each shared by two luma lines
 * into packed YUYV by calling yuvPlanartoyuy2 with vertLumPerChroma = 2.
 * The chroma line is simply reused for both luma lines (no interpolation).
 *
1139  * Height should be a multiple of 2 and width should be a multiple of 16.
1140  * (If this is a problem for anyone then tell me, and I will fix it.)
1141  */
1142 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1143  int width, int height,
1144  int lumStride, int chromStride, int dstStride)
1145 {
1146  //FIXME interpolate chroma
1147  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1148 }
1149 
/**
 * Interleave planar Y, U and V samples into packed output in the byte order
 * U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
 *
 * Same loop structure as the YUYV variant: per pass 16 luma bytes and 8 + 8
 * chroma bytes are consumed and 32 output bytes are stored; the loop body
 * runs at least once per line and repeats while the chroma index is below
 * width/2.
 *
 * @param ysrc             luma plane
 * @param usrc             first chroma plane (interleaved before vsrc)
 * @param vsrc             second chroma plane
 * @param dst              packed destination
 * @param width            width in luma samples
 * @param height           number of output lines
 * @param lumStride        byte offset between luma lines
 * @param chromStride      byte offset between chroma lines
 * @param dstStride        byte offset between destination lines
 * @param vertLumPerChroma the chroma pointers advance after each line y with
 *                         (y & (vertLumPerChroma-1)) == vertLumPerChroma-1,
 *                         i.e. after every N-th line for power-of-two N
 *
 * NOTE(review): the per-line asm statement stores through %0 but lists only
 * the index register as clobbered (no "memory") -- confirm this is intended.
 */
1150 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1151  int width, int height,
1152  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1153 {
1154  int y;
1155  const x86_reg chromWidth= width>>1;
1156  for (y=0; y<height; y++) {
1157  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1158  __asm__ volatile(
1159  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // chroma index = 0
1160  ".p2align 4 \n\t"
1161  "1: \n\t"
1162  PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
1163  PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
1164  PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
1165  "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
1166  "movq %%mm0, %%mm2 \n\t" // U(0)
1167  "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
1168  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1169  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1170 
1171  "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
1172  "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
1173  "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1174  "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1175  "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1176  "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1177  "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1178  "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1179 
1180  MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
1181  MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
1182  MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
1183  MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"
1184 
1185  "add $8, %%"FF_REG_a" \n\t" // 8 chroma samples == 16 pixels per pass
1186  "cmp %4, %%"FF_REG_a" \n\t"
1187  " jb 1b \n\t" // unsigned compare against chromWidth
1188  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1189  : "%"FF_REG_a
1190  );
      // Step to the next chroma line only after the last luma line sharing it.
1191  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1192  usrc += chromStride;
1193  vsrc += chromStride;
1194  }
1195  ysrc += lumStride;
1196  dst += dstStride;
1197  }
1198  __asm__(EMMS" \n\t"
1199  SFENCE" \n\t"
1200  :::"memory");
1201 }
1202 
1203 /**
 * Convert planar YUV whose chroma lines are each shared by two luma lines
 * into packed UYVY by calling yuvPlanartouyvy with vertLumPerChroma = 2.
 * The chroma line is simply reused for both luma lines (no interpolation).
 *
1204  * Height should be a multiple of 2 and width should be a multiple of 16.
1205  * (If this is a problem for anyone then tell me, and I will fix it.)
1206  */
1207 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1208  int width, int height,
1209  int lumStride, int chromStride, int dstStride)
1210 {
1211  //FIXME interpolate chroma
1212  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1213 }
1214 
1215 /**
 * Convert planar YUV with one chroma line per luma line into packed UYVY by
 * calling yuvPlanartouyvy with vertLumPerChroma = 1 (the chroma pointers
 * advance after every line).
 *
1216  * Width should be a multiple of 16.
1217  */
1218 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1219  int width, int height,
1220  int lumStride, int chromStride, int dstStride)
1221 {
1222  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1223 }
1224 
1225 /**
 * Convert planar YUV with one chroma line per luma line into packed YUYV by
 * calling yuvPlanartoyuy2 with vertLumPerChroma = 1 (the chroma pointers
 * advance after every line).
 *
1226  * Width should be a multiple of 16.
1227  */
1228 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1229  int width, int height,
1230  int lumStride, int chromStride, int dstStride)
1231 {
1232  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1233 }
1234 
1235 /**
 * Split packed YUYV into separate Y, U and V planes, producing one chroma
 * line for every two source lines.
 *
 * The source is processed two lines at a time: the first asm statement
 * extracts luma and both chroma components from the first line, the second
 * one extracts luma only from the second line (its chroma bytes are masked
 * away and never stored).
 *
 * @param src         packed source
 * @param ydst        luma destination plane
 * @param udst        destination for the chroma byte at offset 1 of each
 *                    4-byte group
 * @param vdst        destination for the chroma byte at offset 3 of each
 *                    4-byte group
 * @param width       width in luma samples
 * @param height      number of source lines
 * @param lumStride   byte offset between luma destination lines
 * @param chromStride byte offset between chroma destination lines
 * @param srcStride   byte offset between source lines
 *
1236  * Height should be a multiple of 2 and width should be a multiple of 16.
1237  * (If this is a problem for anyone then tell me, and I will fix it.)
1238  */
1239 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1240  int width, int height,
1241  int lumStride, int chromStride, int srcStride)
1242 {
1243  int y;
1244  const x86_reg chromWidth= width>>1;
1245  for (y=0; y<height; y+=2) {
      // First line of the pair: write 16 Y, 8 U and 8 V bytes per pass.
1246  __asm__ volatile(
1247  "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
1248  "pcmpeqw %%mm7, %%mm7 \n\t"
1249  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1250  ".p2align 4 \n\t"
1251  "1: \n\t"
1252  PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
1253  "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1254  "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1255  "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1256  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1257  "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1258  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1259  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1260  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1261  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1262  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1263 
1264  MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t"
1265 
1266  "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1267  "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1268  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1269  "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1270  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1271  "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1272  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1273  "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1274  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1275  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1276 
1277  MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
1278 
1279  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1280  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1281  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1282  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1283  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1284  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1285  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1286  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1287 
1288  MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t"
1289  MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t"
1290 
1291  "add $8, %%"FF_REG_a" \n\t" // 8 chroma samples == 16 pixels per pass
1292  "cmp %4, %%"FF_REG_a" \n\t"
1293  " jb 1b \n\t"
1294  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1295  : "memory", "%"FF_REG_a
1296  );
1297 
1298  ydst += lumStride;
1299  src += srcStride;
1300 
      // Second line of the pair: luma only. mm7 is not reloaded here; this
      // statement relies on the mask built by the statement above.
1301  __asm__ volatile(
1302  "xor %%"FF_REG_a", %%"FF_REG_a"\n\t"
1303  ".p2align 4 \n\t"
1304  "1: \n\t"
1305  PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
1306  "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1307  "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1308  "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1309  "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1310  "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1311  "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1312  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1313  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1314  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1315  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1316 
1317  MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t"
1318  MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
1319 
1320  "add $8, %%"FF_REG_a"\n\t"
1321  "cmp %4, %%"FF_REG_a"\n\t"
1322  " jb 1b \n\t"
1323 
1324  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1325  : "memory", "%"FF_REG_a
1326  );
1327  udst += chromStride;
1328  vdst += chromStride;
1329  ydst += lumStride;
1330  src += srcStride;
1331  }
1332  __asm__ volatile(EMMS" \n\t"
1333  SFENCE" \n\t"
1334  :::"memory");
1335 }
1336 
/**
 * Upscale one 8-bit plane by a factor of two in both directions.
 *
 * The first and the last output line are built from the first / last source
 * line alone, using horizontal 3:1 weighting of neighbouring samples. Every
 * pair of adjacent source lines in between yields two output lines whose
 * samples are 3:1 blends of a sample from one line and a diagonal neighbour
 * from the other line. In total 2 * srcHeight lines of 2 * srcWidth bytes
 * are written.
 *
 * The leading (srcWidth & ~15) samples of each line pair are processed by
 * the MMX loop, which approximates (3*a + b) / 4 by applying PAVGB twice
 * (avg(avg(b, a), a)); because PAVGB rounds, results may differ slightly
 * from the scalar formula with its truncating shift. Remaining samples use
 * the scalar loop.
 *
 * @param src       source plane
 * @param dst       destination plane
 * @param srcWidth  source width in samples
 * @param srcHeight source height in lines
 * @param srcStride byte offset between source lines
 * @param dstStride byte offset between destination lines
 *
 * NOTE(review): the MMX statement stores through %2/%3 but lists only the
 * index register as clobbered (no "memory") -- confirm this is intended.
 */
1337 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1338 {
1339  int x,y;
1340 
1341  dst[0]= src[0];
1342 
1343  // first line
1344  for (x=0; x<srcWidth-1; x++) {
1345  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1346  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1347  }
1348  dst[2*srcWidth-1]= src[srcWidth-1];
1349 
1350  dst+= dstStride;
1351 
      // Each pass blends source lines y-1 (src) and y (src + srcStride) into
      // two output lines (dst and dst + dstStride).
1352  for (y=1; y<srcHeight; y++) {
1353  x86_reg mmxSize= srcWidth&~15;
1354 
1355  if (mmxSize) {
1356  __asm__ volatile(
1357  "mov %4, %%"FF_REG_a" \n\t" // index = -mmxSize, counts up to 0 in steps of 8
      // Build the "left neighbour" vectors for the first pass: the 8 bytes
      // shifted up by one byte position, with the lowest byte refilled from
      // the original (presumably mmx_ff selects only the lowest byte --
      // its definition is outside this block).
1358  "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1359  "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
1360  "movq %%mm4, %%mm2 \n\t"
1361  "psllq $8, %%mm4 \n\t"
1362  "pand %%mm0, %%mm2 \n\t"
1363  "por %%mm2, %%mm4 \n\t"
1364  "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
1365  "movq %%mm5, %%mm3 \n\t"
1366  "psllq $8, %%mm5 \n\t"
1367  "pand %%mm0, %%mm3 \n\t"
1368  "por %%mm3, %%mm5 \n\t"
1369  "1: \n\t"
1370  "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" // upper line, x
1371  "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // lower line, x
1372  "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t" // upper line, x+1
1373  "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t" // lower line, x+1
1374  PAVGB" %%mm0, %%mm5 \n\t"
1375  PAVGB" %%mm0, %%mm3 \n\t"
1376  PAVGB" %%mm0, %%mm5 \n\t" // mm5 ~= (3*upper[x] + lower[x-1]) / 4
1377  PAVGB" %%mm0, %%mm3 \n\t" // mm3 ~= (3*upper[x] + lower[x+1]) / 4
1378  PAVGB" %%mm1, %%mm4 \n\t"
1379  PAVGB" %%mm1, %%mm2 \n\t"
1380  PAVGB" %%mm1, %%mm4 \n\t" // mm4 ~= (3*lower[x] + upper[x-1]) / 4
1381  PAVGB" %%mm1, %%mm2 \n\t" // mm2 ~= (3*lower[x] + upper[x+1]) / 4
1382  "movq %%mm5, %%mm7 \n\t"
1383  "movq %%mm4, %%mm6 \n\t"
1384  "punpcklbw %%mm3, %%mm5 \n\t" // interleave into even/odd output columns
1385  "punpckhbw %%mm3, %%mm7 \n\t"
1386  "punpcklbw %%mm2, %%mm4 \n\t"
1387  "punpckhbw %%mm2, %%mm6 \n\t"
1388  MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
1389  MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
1390  MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
1391  MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
1392  "add $8, %%"FF_REG_a" \n\t"
1393  "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t" // left neighbours for the next pass
1394  "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
1395  " js 1b \n\t" // movq leaves the flags from "add" intact
1396  :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1397  "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1398  "g" (-mmxSize)
1399  NAMED_CONSTRAINTS_ADD(mmx_ff)
1400  : "%"FF_REG_a
1401  );
1402  } else {
      // No MMX pass: produce the leftmost output column here and make the
      // scalar loop below start at x = 0.
1403  mmxSize = 1;
1404  dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
1405  dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
1406  }
1407 
      // Scalar remainder, starting one sample before the end of the MMX range.
1408  for (x=mmxSize-1; x<srcWidth-1; x++) {
1409  dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1410  dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1411  dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1412  dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1413  }
      // Rightmost output column: vertical blend only.
1414  dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1415  dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1416 
1417  dst+=dstStride*2;
1418  src+=srcStride;
1419  }
1420 
1421  // last line
1422  dst[0]= src[0];
1423 
1424  for (x=0; x<srcWidth-1; x++) {
1425  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1426  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1427  }
1428  dst[2*srcWidth-1]= src[srcWidth-1];
1429 
1430  __asm__ volatile(EMMS" \n\t"
1431  SFENCE" \n\t"
1432  :::"memory");
1433 }
1434 
1435 /**
 * Split packed UYVY into separate Y, U and V planes, producing one chroma
 * line for every two source lines.
 *
 * The source is processed two lines at a time: the first asm statement
 * extracts luma and both chroma components from the first line, the second
 * one extracts luma only from the second line.
 *
 * @param src         packed source
 * @param ydst        luma destination plane
 * @param udst        destination for the chroma byte at offset 0 of each
 *                    4-byte group
 * @param vdst        destination for the chroma byte at offset 2 of each
 *                    4-byte group
 * @param width       width in luma samples
 * @param height      number of source lines
 * @param lumStride   byte offset between luma destination lines
 * @param chromStride byte offset between chroma destination lines
 * @param srcStride   byte offset between source lines
 *
1436  * Height should be a multiple of 2 and width should be a multiple of 16.
1437  * (If this is a problem for anyone then tell me, and I will fix it.)
1438  * Chrominance data is only taken from every second line, others are ignored.
1439  * FIXME: Write HQ version.
1440  */
1441 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1442  int width, int height,
1443  int lumStride, int chromStride, int srcStride)
1444 {
1445  int y;
1446  const x86_reg chromWidth= width>>1;
1447  for (y=0; y<height; y+=2) {
      // First line of the pair: write 16 Y, 8 U and 8 V bytes per pass.
1448  __asm__ volatile(
1449  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
1450  "pcmpeqw %%mm7, %%mm7 \n\t"
1451  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1452  ".p2align 4 \n\t"
1453  "1: \n\t"
1454  PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
1455  "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1456  "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1457  "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1458  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1459  "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1460  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1461  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1462  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1463  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1464  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1465 
1466  MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t"
1467 
1468  "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1469  "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1470  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1471  "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1472  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1473  "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1474  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1475  "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1476  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1477  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1478 
1479  MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
1480 
1481  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1482  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1483  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1484  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1485  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1486  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1487  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1488  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1489 
1490  MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t"
1491  MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t"
1492 
1493  "add $8, %%"FF_REG_a" \n\t" // 8 chroma samples == 16 pixels per pass
1494  "cmp %4, %%"FF_REG_a" \n\t"
1495  " jb 1b \n\t"
1496  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1497  : "memory", "%"FF_REG_a
1498  );
1499 
1500  ydst += lumStride;
1501  src += srcStride;
1502 
      // Second line of the pair: luma only (high byte of every 16-bit word).
1503  __asm__ volatile(
1504  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
1505  ".p2align 4 \n\t"
1506  "1: \n\t"
1507  PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
1508  "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1509  "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1510  "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
1511  "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
1512  "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1513  "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1514  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1515  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1516  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1517  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1518 
1519  MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t"
1520  MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
1521 
1522  "add $8, %%"FF_REG_a" \n\t"
1523  "cmp %4, %%"FF_REG_a" \n\t"
1524  " jb 1b \n\t"
1525 
1526  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1527  : "memory", "%"FF_REG_a
1528  );
1529  udst += chromStride;
1530  vdst += chromStride;
1531  ydst += lumStride;
1532  src += srcStride;
1533  }
1534  __asm__ volatile(EMMS" \n\t"
1535  SFENCE" \n\t"
1536  :::"memory");
1537 }
1538 
1539 /**
 * Convert packed 24-bit pixels into planar Y, U and V with one chroma sample
 * per 2x2 pixel block.
 *
 * Work is split three ways: when height > 2 the first two lines go through
 * ff_rgb24toyv12_c, the following line pairs (while y < height - 2) go
 * through the MMX code below, and whatever is left (height - y lines) goes
 * through ff_rgb24toyv12_c again.
 *
 * In the MMX part, luma is computed for 8 pixels per pass. For chroma the
 * two lines of a pair are averaged with PAVGB and horizontally neighbouring
 * pixels are averaged as well (psrlq $24 + PAVGB) before the coefficients
 * are applied; 4 U and 4 V samples are stored per pass.
 *
 * BGR2Y_IDX / BGR2U_IDX / BGR2V_IDX are byte displacements applied to the
 * rgb2yuv pointer to fetch the 8-byte coefficient vectors.
 * NOTE(review): the layout of the rgb2yuv table is defined outside this
 * block -- confirm the displacements against it.
 * NOTE(review): the luma loop stores 8 bytes per pass while its index is
 * negative starting from -width, and the chroma loop stores 4 bytes per pass
 * starting from -chromWidth, so the MMX part appears to assume a width that
 * is a multiple of 8 rather than 2 -- confirm against callers.
 *
1540  * Height should be a multiple of 2 and width should be a multiple of 2.
1541  * (If this is a problem for anyone then tell me, and I will fix it.)
1542  * Chrominance data is only taken from every second line,
1543  * others are ignored in the C version.
1544  * FIXME: Write HQ version.
1545  */
1546 #if HAVE_7REGS
1547 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1548  int width, int height,
1549  int lumStride, int chromStride, int srcStride,
1550  int32_t *rgb2yuv)
1551 {
1552 #define BGR2Y_IDX "16*4+16*32"
1553 #define BGR2U_IDX "16*4+16*33"
1554 #define BGR2V_IDX "16*4+16*34"
1555  int y;
1556  const x86_reg chromWidth= width>>1;
1557 
      // Leading two lines are converted by the C implementation.
1558  if (height > 2) {
1559  ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
1560  src += 2*srcStride;
1561  ydst += 2*lumStride;
1562  udst += chromStride;
1563  vdst += chromStride;
1564  height -= 2;
1565  }
1566 
1567  for (y=0; y<height-2; y+=2) {
1568  int i;
      // Luma for both lines of the pair.
1569  for (i=0; i<2; i++) {
1570  __asm__ volatile(
1571  "mov %2, %%"FF_REG_a"\n\t" // pixel index = -width
1572  "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
1573  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1574  "pxor %%mm7, %%mm7 \n\t"
1575  "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" // byte index = 3 * pixel index
1576  ".p2align 4 \n\t"
1577  "1: \n\t"
1578  PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
1579  "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
1580  "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
1581  "punpcklbw %%mm7, %%mm0 \n\t"
1582  "punpcklbw %%mm7, %%mm1 \n\t"
1583  "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
1584  "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
1585  "punpcklbw %%mm7, %%mm2 \n\t"
1586  "punpcklbw %%mm7, %%mm3 \n\t"
1587  "pmaddwd %%mm6, %%mm0 \n\t"
1588  "pmaddwd %%mm6, %%mm1 \n\t"
1589  "pmaddwd %%mm6, %%mm2 \n\t"
1590  "pmaddwd %%mm6, %%mm3 \n\t"
1591  "psrad $8, %%mm0 \n\t"
1592  "psrad $8, %%mm1 \n\t"
1593  "psrad $8, %%mm2 \n\t"
1594  "psrad $8, %%mm3 \n\t"
1595  "packssdw %%mm1, %%mm0 \n\t"
1596  "packssdw %%mm3, %%mm2 \n\t"
1597  "pmaddwd %%mm5, %%mm0 \n\t"
1598  "pmaddwd %%mm5, %%mm2 \n\t"
1599  "packssdw %%mm2, %%mm0 \n\t"
1600  "psraw $7, %%mm0 \n\t"
1601 
1602  "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
1603  "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
1604  "punpcklbw %%mm7, %%mm4 \n\t"
1605  "punpcklbw %%mm7, %%mm1 \n\t"
1606  "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
1607  "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
1608  "punpcklbw %%mm7, %%mm2 \n\t"
1609  "punpcklbw %%mm7, %%mm3 \n\t"
1610  "pmaddwd %%mm6, %%mm4 \n\t"
1611  "pmaddwd %%mm6, %%mm1 \n\t"
1612  "pmaddwd %%mm6, %%mm2 \n\t"
1613  "pmaddwd %%mm6, %%mm3 \n\t"
1614  "psrad $8, %%mm4 \n\t"
1615  "psrad $8, %%mm1 \n\t"
1616  "psrad $8, %%mm2 \n\t"
1617  "psrad $8, %%mm3 \n\t"
1618  "packssdw %%mm1, %%mm4 \n\t"
1619  "packssdw %%mm3, %%mm2 \n\t"
1620  "pmaddwd %%mm5, %%mm4 \n\t"
1621  "pmaddwd %%mm5, %%mm2 \n\t"
1622  "add $24, %%"FF_REG_d"\n\t" // 8 pixels * 3 bytes consumed
1623  "packssdw %%mm2, %%mm4 \n\t"
1624  "psraw $7, %%mm4 \n\t"
1625 
1626  "packuswb %%mm4, %%mm0 \n\t"
1627  "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1628 
1629  MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
1630  "add $8, %%"FF_REG_a" \n\t" // 8 luma samples stored
1631  " js 1b \n\t"
1632  : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
1633  NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
1634  : "%"FF_REG_a, "%"FF_REG_d
1635  );
1636  ydst += lumStride;
1637  src += srcStride;
1638  }
      // Rewind to the first line of the pair for the chroma pass.
1639  src -= srcStride*2;
1640  __asm__ volatile(
1641  "mov %4, %%"FF_REG_a"\n\t" // chroma index = -chromWidth
1642  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1643  "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
1644  "pxor %%mm7, %%mm7 \n\t"
1645  "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
1646  "add %%"FF_REG_d", %%"FF_REG_d"\n\t" // byte index = 6 * chroma index
1647  ".p2align 4 \n\t"
1648  "1: \n\t"
1649  PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
1650  PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
1651  "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
1652  "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
1653  "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
1654  "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
1655  PAVGB" %%mm1, %%mm0 \n\t" // average the two lines
1656  PAVGB" %%mm3, %%mm2 \n\t"
1657  "movq %%mm0, %%mm1 \n\t"
1658  "movq %%mm2, %%mm3 \n\t"
1659  "psrlq $24, %%mm0 \n\t" // bring the next pixel (3 bytes) down
1660  "psrlq $24, %%mm2 \n\t"
1661  PAVGB" %%mm1, %%mm0 \n\t" // average horizontally adjacent pixels
1662  PAVGB" %%mm3, %%mm2 \n\t"
1663  "punpcklbw %%mm7, %%mm0 \n\t"
1664  "punpcklbw %%mm7, %%mm2 \n\t"
1665  "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
1666  "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
1667 
1668  "pmaddwd %%mm0, %%mm1 \n\t"
1669  "pmaddwd %%mm2, %%mm3 \n\t"
1670  "pmaddwd %%mm6, %%mm0 \n\t"
1671  "pmaddwd %%mm6, %%mm2 \n\t"
1672  "psrad $8, %%mm0 \n\t"
1673  "psrad $8, %%mm1 \n\t"
1674  "psrad $8, %%mm2 \n\t"
1675  "psrad $8, %%mm3 \n\t"
1676  "packssdw %%mm2, %%mm0 \n\t"
1677  "packssdw %%mm3, %%mm1 \n\t"
1678  "pmaddwd %%mm5, %%mm0 \n\t"
1679  "pmaddwd %%mm5, %%mm1 \n\t"
1680  "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1681  "psraw $7, %%mm0 \n\t"
1682 
1683  "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
1684  "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
1685  "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
1686  "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
1687  PAVGB" %%mm1, %%mm4 \n\t"
1688  PAVGB" %%mm3, %%mm2 \n\t"
1689  "movq %%mm4, %%mm1 \n\t"
1690  "movq %%mm2, %%mm3 \n\t"
1691  "psrlq $24, %%mm4 \n\t"
1692  "psrlq $24, %%mm2 \n\t"
1693  PAVGB" %%mm1, %%mm4 \n\t"
1694  PAVGB" %%mm3, %%mm2 \n\t"
1695  "punpcklbw %%mm7, %%mm4 \n\t"
1696  "punpcklbw %%mm7, %%mm2 \n\t"
1697  "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
1698  "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
1699 
1700  "pmaddwd %%mm4, %%mm1 \n\t"
1701  "pmaddwd %%mm2, %%mm3 \n\t"
1702  "pmaddwd %%mm6, %%mm4 \n\t"
1703  "pmaddwd %%mm6, %%mm2 \n\t"
1704  "psrad $8, %%mm4 \n\t"
1705  "psrad $8, %%mm1 \n\t"
1706  "psrad $8, %%mm2 \n\t"
1707  "psrad $8, %%mm3 \n\t"
1708  "packssdw %%mm2, %%mm4 \n\t"
1709  "packssdw %%mm3, %%mm1 \n\t"
1710  "pmaddwd %%mm5, %%mm4 \n\t"
1711  "pmaddwd %%mm5, %%mm1 \n\t"
1712  "add $24, %%"FF_REG_d"\n\t" // 8 pixels * 3 bytes consumed per line
1713  "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1714  "psraw $7, %%mm4 \n\t"
1715 
1716  "movq %%mm0, %%mm1 \n\t"
1717  "punpckldq %%mm4, %%mm0 \n\t"
1718  "punpckhdq %%mm4, %%mm1 \n\t"
1719  "packsswb %%mm1, %%mm0 \n\t"
1720  "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1721  "movd %%mm0, (%2, %%"FF_REG_a") \n\t" // low 4 bytes to udst
1722  "punpckhdq %%mm0, %%mm0 \n\t"
1723  "movd %%mm0, (%3, %%"FF_REG_a") \n\t" // high 4 bytes to vdst
1724  "add $4, %%"FF_REG_a" \n\t" // 4 chroma samples stored per plane
1725  " js 1b \n\t"
1726  : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
1727  NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
1728  : "%"FF_REG_a, "%"FF_REG_d
1729  );
1730 
1731  udst += chromStride;
1732  vdst += chromStride;
1733  src += srcStride*2;
1734  }
1735 
1736  __asm__ volatile(EMMS" \n\t"
1737  SFENCE" \n\t"
1738  :::"memory");
1739 
      // Remaining height - y lines are converted by the C implementation.
1740  ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
1741 }
1742 #endif /* HAVE_7REGS */
1743 #endif /* !COMPILE_TEMPLATE_SSE2 */
1744 
1745 #if !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2
/**
 * Interleave two byte planes line by line:
 * dest[2*w] = src1[w] and dest[2*w + 1] = src2[w].
 *
 * For lines of at least 16 bytes the leading (width & ~15) bytes are handled
 * in assembly, 16 source bytes from each input per pass. The SSE2 variant
 * (aligned loads, non-temporal stores) is used when src1, src2 and dest are
 * all 16-byte aligned for the current line, otherwise the MMX variant is
 * used; the check is repeated for every line. The remaining bytes of each
 * line are handled by the scalar loop.
 *
 * @param src1       first input plane (goes to even output bytes)
 * @param src2       second input plane (goes to odd output bytes)
 * @param dest       output plane, 2 * width bytes per line
 * @param width      bytes per input line
 * @param height     number of lines
 * @param src1Stride byte offset between lines of src1
 * @param src2Stride byte offset between lines of src2
 * @param dstStride  byte offset between lines of dest
 */
1746 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1747  int width, int height, int src1Stride,
1748  int src2Stride, int dstStride)
1749 {
1750  int h;
1751 
1752  for (h=0; h < height; h++) {
1753  int w;
1754 
1755  if (width >= 16) {
      // All three pointers 16-byte aligned: SSE2 path.
1756  if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
1757  __asm__(
1758  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
1759  "1: \n\t"
1760  PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
1761  PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
1762  "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
1763  "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
1764  "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
1765  "punpcklbw %%xmm2, %%xmm0 \n\t"
1766  "punpckhbw %%xmm2, %%xmm1 \n\t"
1767  "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
1768  "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
1769  "add $16, %%"FF_REG_a" \n\t"
1770  "cmp %3, %%"FF_REG_a" \n\t"
1771  " jb 1b \n\t" // continue while index < width - 15
1772  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1773  : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
1774  );
1775  } else
      // Unaligned case: MMX path, two 8-byte halves per pass.
1776  __asm__(
1777  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
1778  "1: \n\t"
1779  PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
1780  PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
1781  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
1782  "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
1783  "movq %%mm0, %%mm1 \n\t"
1784  "movq %%mm2, %%mm3 \n\t"
1785  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
1786  "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
1787  "punpcklbw %%mm4, %%mm0 \n\t"
1788  "punpckhbw %%mm4, %%mm1 \n\t"
1789  "punpcklbw %%mm5, %%mm2 \n\t"
1790  "punpckhbw %%mm5, %%mm3 \n\t"
1791  MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
1792  MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
1793  MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
1794  MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
1795  "add $16, %%"FF_REG_a" \n\t"
1796  "cmp %3, %%"FF_REG_a" \n\t"
1797  " jb 1b \n\t" // continue while index < width - 15
1798  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1799  : "memory", "%"FF_REG_a
1800  );
1801 
1802  }
      // Scalar tail for the last width % 16 bytes (or the whole line when
      // width < 16).
1803  for (w= (width&(~15)); w < width; w++) {
1804  dest[2*w+0] = src1[w];
1805  dest[2*w+1] = src2[w];
1806  }
1807  dest += dstStride;
1808  src1 += src1Stride;
1809  src2 += src2Stride;
1810  }
1811  __asm__(
1812  EMMS" \n\t"
1813  SFENCE" \n\t"
1814  ::: "memory"
1815  );
1816 }
1817 #endif /* !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 */
1818 
1819 #if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
1820 #if COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM
/*
 * Prototype of the external row routine used below; its definition is not
 * part of this file section (the declaration is compiled only when
 * HAVE_X86ASM is set).
 */
1821 void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822  const uint8_t *unused,
1823  const uint8_t *src1,
1824  const uint8_t *src2,
1825  int w,
1826  uint32_t *unused2);
/**
 * Apply ff_nv12ToUV to every line of src, passing dst1/dst2 as its two
 * outputs, src as its first input and NULL for the remaining pointer
 * arguments, then issue SFENCE.
 *
 * NOTE(review): presumably ff_nv12ToUV splits the interleaved bytes of a
 * line into two planes (the inverse of interleaveBytes) -- confirm against
 * its definition.
 *
 * @param src        input plane
 * @param dst1       first output plane
 * @param dst2       second output plane
 * @param width      value passed as the w argument for every line
 * @param height     number of lines
 * @param srcStride  byte offset between lines of src
 * @param dst1Stride byte offset between lines of dst1
 * @param dst2Stride byte offset between lines of dst2
 */
1827 static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
1828  int width, int height, int srcStride,
1829  int dst1Stride, int dst2Stride)
1830 {
1831  int h;
1832 
1833  for (h = 0; h < height; h++) {
1834  RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
1835  src += srcStride;
1836  dst1 += dst1Stride;
1837  dst2 += dst2Stride;
1838  }
1839  __asm__(
1840  SFENCE" \n\t"
1841  ::: "memory"
1842  );
1843 }
1844 #endif /* COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM */
1845 #endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
1846 
1847 #if !COMPILE_TEMPLATE_SSE2
/**
 * Enlarge two planes by a factor of two in both directions using sample
 * repetition: src1 is expanded into dst1 and src2 into dst2.
 *
 * With w = width/2 and h = height/2, every output line y (0 <= y < h) is
 * built from source line y >> 1, and every source byte x (0 <= x < w) is
 * written to output bytes 2*x and 2*x + 1. The MMX loop handles 32 source
 * bytes (64 output bytes) per pass; the scalar loop handles the rest.
 *
 * @param src1       first source plane
 * @param src2       second source plane
 * @param dst1       destination for src1
 * @param dst2       destination for src2
 * @param width      twice the number of source bytes read per line
 * @param height     twice the number of output lines written
 * @param srcStride1 byte offset between lines of src1
 * @param srcStride2 byte offset between lines of src2
 * @param dstStride1 byte offset between lines of dst1
 * @param dstStride2 byte offset between lines of dst2
 */
1848 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1849  uint8_t *dst1, uint8_t *dst2,
1850  int width, int height,
1851  int srcStride1, int srcStride2,
1852  int dstStride1, int dstStride2)
1853 {
1854  x86_reg x, y;
1855  int w,h;
1856  w=width/2; h=height/2;
1857  __asm__ volatile(
1858  PREFETCH" %0 \n\t"
1859  PREFETCH" %1 \n\t"
1860  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
      // First plane: src1 -> dst1.
1861  for (y=0;y<h;y++) {
1862  const uint8_t* s1=src1+srcStride1*(y>>1);
1863  uint8_t* d=dst1+dstStride1*y;
1864  x=0;
1865  for (;x<w-31;x+=32) {
1866  __asm__ volatile(
1867  PREFETCH" 32(%1,%2) \n\t"
1868  "movq (%1,%2), %%mm0 \n\t"
1869  "movq 8(%1,%2), %%mm2 \n\t"
1870  "movq 16(%1,%2), %%mm4 \n\t"
1871  "movq 24(%1,%2), %%mm6 \n\t"
1872  "movq %%mm0, %%mm1 \n\t"
1873  "movq %%mm2, %%mm3 \n\t"
1874  "movq %%mm4, %%mm5 \n\t"
1875  "movq %%mm6, %%mm7 \n\t"
1876  "punpcklbw %%mm0, %%mm0 \n\t" // unpacking a register with itself doubles each byte
1877  "punpckhbw %%mm1, %%mm1 \n\t"
1878  "punpcklbw %%mm2, %%mm2 \n\t"
1879  "punpckhbw %%mm3, %%mm3 \n\t"
1880  "punpcklbw %%mm4, %%mm4 \n\t"
1881  "punpckhbw %%mm5, %%mm5 \n\t"
1882  "punpcklbw %%mm6, %%mm6 \n\t"
1883  "punpckhbw %%mm7, %%mm7 \n\t"
1884  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1885  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1886  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1887  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1888  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1889  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1890  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1891  MOVNTQ" %%mm7, 56(%0,%2,2)"
1892  :: "r"(d), "r"(s1), "r"(x)
1893  :"memory");
1894  }
1895  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1896  }
      // Second plane: src2 -> dst2, same procedure.
1897  for (y=0;y<h;y++) {
1898  const uint8_t* s2=src2+srcStride2*(y>>1);
1899  uint8_t* d=dst2+dstStride2*y;
1900  x=0;
1901  for (;x<w-31;x+=32) {
1902  __asm__ volatile(
1903  PREFETCH" 32(%1,%2) \n\t"
1904  "movq (%1,%2), %%mm0 \n\t"
1905  "movq 8(%1,%2), %%mm2 \n\t"
1906  "movq 16(%1,%2), %%mm4 \n\t"
1907  "movq 24(%1,%2), %%mm6 \n\t"
1908  "movq %%mm0, %%mm1 \n\t"
1909  "movq %%mm2, %%mm3 \n\t"
1910  "movq %%mm4, %%mm5 \n\t"
1911  "movq %%mm6, %%mm7 \n\t"
1912  "punpcklbw %%mm0, %%mm0 \n\t"
1913  "punpckhbw %%mm1, %%mm1 \n\t"
1914  "punpcklbw %%mm2, %%mm2 \n\t"
1915  "punpckhbw %%mm3, %%mm3 \n\t"
1916  "punpcklbw %%mm4, %%mm4 \n\t"
1917  "punpckhbw %%mm5, %%mm5 \n\t"
1918  "punpcklbw %%mm6, %%mm6 \n\t"
1919  "punpckhbw %%mm7, %%mm7 \n\t"
1920  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1921  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1922  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1923  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1924  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1925  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1926  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1927  MOVNTQ" %%mm7, 56(%0,%2,2)"
1928  :: "r"(d), "r"(s2), "r"(x)
1929  :"memory");
1930  }
1931  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
1932  }
1933  __asm__(
1934  EMMS" \n\t"
1935  SFENCE" \n\t"
1936  ::: "memory"
1937  );
1938 }
1939 
/**
 * Pack a luma plane (src1) and two chroma planes (src2, src3) into
 * Y U Y V byte order, where one chroma sample pair is shared by 4 luma
 * samples horizontally and one chroma line (index y >> 2) by 4 output lines.
 *
 * For chroma index x the 8 output bytes at d[8*x] are
 * Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x].
 * The MMX loop handles 8 chroma indices (32 luma bytes, 64 output bytes)
 * per pass; the scalar loop handles the rest.
 *
 * @param src1       luma plane
 * @param src2       chroma plane written to output byte offsets 1 and 5
 * @param src3       chroma plane written to output byte offsets 3 and 7
 * @param dst        packed destination
 * @param width      the chroma index runs from 0 to width/2 - 1
 * @param height     number of output lines
 * @param srcStride1 byte offset between lines of src1
 * @param srcStride2 byte offset between lines of src2
 * @param srcStride3 byte offset between lines of src3
 * @param dstStride  byte offset between lines of dst
 *
 * NOTE(review): with w = width/2 chroma indices and 4 luma bytes per index,
 * 2 * width luma bytes are read per line -- confirm which unit callers pass
 * as width.
 */
1940 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
1941  uint8_t *dst,
1942  int width, int height,
1943  int srcStride1, int srcStride2,
1944  int srcStride3, int dstStride)
1945 {
1946  x86_reg x;
1947  int y,w,h;
1948  w=width/2; h=height;
1949  for (y=0;y<h;y++) {
1950  const uint8_t* yp=src1+srcStride1*y;
1951  const uint8_t* up=src2+srcStride2*(y>>2);
1952  const uint8_t* vp=src3+srcStride3*(y>>2);
1953  uint8_t* d=dst+dstStride*y;
1954  x=0;
1955  for (;x<w-7;x+=8) {
1956  __asm__ volatile(
1957  PREFETCH" 32(%1, %0) \n\t"
1958  PREFETCH" 32(%2, %0) \n\t"
1959  PREFETCH" 32(%3, %0) \n\t"
1960  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1961  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
1962  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
1963  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1964  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
1965  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
1966  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
1967  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
1968  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
1969  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
1970 
1971  "movq %%mm1, %%mm6 \n\t"
1972  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
1973  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
1974  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
1975  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
1976  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
1977 
1978  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
1979  "movq 8(%1, %0, 4), %%mm0 \n\t"
1980  "movq %%mm0, %%mm3 \n\t"
1981  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
1982  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
1983  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
1984  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
1985 
1986  "movq %%mm4, %%mm6 \n\t"
1987  "movq 16(%1, %0, 4), %%mm0 \n\t"
1988  "movq %%mm0, %%mm3 \n\t"
1989  "punpcklbw %%mm5, %%mm4 \n\t" /* U4V4 U4V4 U5V5 U5V5*/
1990  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
1991  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
1992  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
1993  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
1994 
1995  "punpckhbw %%mm5, %%mm6 \n\t" /* U6V6 U6V6 U7V7 U7V7*/
1996  "movq 24(%1, %0, 4), %%mm0 \n\t"
1997  "movq %%mm0, %%mm3 \n\t"
1998  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
1999  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2000  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2001  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2002 
2003  : "+r" (x)
2004  : "r"(yp), "r" (up), "r"(vp), "r"(d)
2005  :"memory");
2006  }
      // Scalar tail: one chroma index (4 luma samples, 8 output bytes) per pass.
2007  for (; x<w; x++) {
2008  const int x2 = x<<2;
2009  d[8*x+0] = yp[x2];
2010  d[8*x+1] = up[x];
2011  d[8*x+2] = yp[x2+1];
2012  d[8*x+3] = vp[x];
2013  d[8*x+4] = yp[x2+2];
2014  d[8*x+5] = up[x];
2015  d[8*x+6] = yp[x2+3];
2016  d[8*x+7] = vp[x];
2017  }
2018  }
2019  __asm__(
2020  EMMS" \n\t"
2021  SFENCE" \n\t"
2022  ::: "memory"
2023  );
2024 }
2025 
2026 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2027 {
2028  dst += count;
2029  src += 2*count;
2030  count= - count;
2031 
2032  if(count <= -16) {
2033  count += 15;
2034  __asm__ volatile(
2035  "pcmpeqw %%mm7, %%mm7 \n\t"
2036  "psrlw $8, %%mm7 \n\t"
2037  "1: \n\t"
2038  "movq -30(%1, %0, 2), %%mm0 \n\t"
2039  "movq -22(%1, %0, 2), %%mm1 \n\t"
2040  "movq -14(%1, %0, 2), %%mm2 \n\t"
2041  "movq -6(%1, %0, 2), %%mm3 \n\t"
2042  "pand %%mm7, %%mm0 \n\t"
2043  "pand %%mm7, %%mm1 \n\t"
2044  "pand %%mm7, %%mm2 \n\t"
2045  "pand %%mm7, %%mm3 \n\t"
2046  "packuswb %%mm1, %%mm0 \n\t"
2047  "packuswb %%mm3, %%mm2 \n\t"
2048  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2049  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2050  "add $16, %0 \n\t"
2051  " js 1b \n\t"
2052  : "+r"(count)
2053  : "r"(src), "r"(dst)
2054  );
2055  count -= 15;
2056  }
2057  while(count<0) {
2058  dst[count]= src[2*count];
2059  count++;
2060  }
2061 }
2062 
2063 static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
2064 {
2065  src ++;
2066  dst += count;
2067  src += 2*count;
2068  count= - count;
2069 
2070  if(count < -16) {
2071  count += 16;
2072  __asm__ volatile(
2073  "pcmpeqw %%mm7, %%mm7 \n\t"
2074  "psrlw $8, %%mm7 \n\t"
2075  "1: \n\t"
2076  "movq -32(%1, %0, 2), %%mm0 \n\t"
2077  "movq -24(%1, %0, 2), %%mm1 \n\t"
2078  "movq -16(%1, %0, 2), %%mm2 \n\t"
2079  "movq -8(%1, %0, 2), %%mm3 \n\t"
2080  "pand %%mm7, %%mm0 \n\t"
2081  "pand %%mm7, %%mm1 \n\t"
2082  "pand %%mm7, %%mm2 \n\t"
2083  "pand %%mm7, %%mm3 \n\t"
2084  "packuswb %%mm1, %%mm0 \n\t"
2085  "packuswb %%mm3, %%mm2 \n\t"
2086  MOVNTQ" %%mm0,-16(%2, %0) \n\t"
2087  MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
2088  "add $16, %0 \n\t"
2089  " js 1b \n\t"
2090  : "+r"(count)
2091  : "r"(src), "r"(dst)
2092  );
2093  count -= 16;
2094  }
2095  while(count<0) {
2096  dst[count]= src[2*count];
2097  count++;
2098  }
2099 }
2100 
2101 #if ARCH_X86_32
2102 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2103 {
2104  dst0+= count;
2105  dst1+= count;
2106  src += 4*count;
2107  count= - count;
2108  if(count <= -8) {
2109  count += 7;
2110  __asm__ volatile(
2111  "pcmpeqw %%mm7, %%mm7 \n\t"
2112  "psrlw $8, %%mm7 \n\t"
2113  "1: \n\t"
2114  "movq -28(%1, %0, 4), %%mm0 \n\t"
2115  "movq -20(%1, %0, 4), %%mm1 \n\t"
2116  "movq -12(%1, %0, 4), %%mm2 \n\t"
2117  "movq -4(%1, %0, 4), %%mm3 \n\t"
2118  "pand %%mm7, %%mm0 \n\t"
2119  "pand %%mm7, %%mm1 \n\t"
2120  "pand %%mm7, %%mm2 \n\t"
2121  "pand %%mm7, %%mm3 \n\t"
2122  "packuswb %%mm1, %%mm0 \n\t"
2123  "packuswb %%mm3, %%mm2 \n\t"
2124  "movq %%mm0, %%mm1 \n\t"
2125  "movq %%mm2, %%mm3 \n\t"
2126  "psrlw $8, %%mm0 \n\t"
2127  "psrlw $8, %%mm2 \n\t"
2128  "pand %%mm7, %%mm1 \n\t"
2129  "pand %%mm7, %%mm3 \n\t"
2130  "packuswb %%mm2, %%mm0 \n\t"
2131  "packuswb %%mm3, %%mm1 \n\t"
2132  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2133  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2134  "add $8, %0 \n\t"
2135  " js 1b \n\t"
2136  : "+r"(count)
2137  : "r"(src), "r"(dst0), "r"(dst1)
2138  );
2139  count -= 7;
2140  }
2141  while(count<0) {
2142  dst0[count]= src[4*count+0];
2143  dst1[count]= src[4*count+2];
2144  count++;
2145  }
2146 }
2147 #endif /* ARCH_X86_32 */
2148 
2149 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2150 {
2151  dst0 += count;
2152  dst1 += count;
2153  src0 += 4*count;
2154  src1 += 4*count;
2155  count= - count;
2156 #ifdef PAVGB
2157  if(count <= -8) {
2158  count += 7;
2159  __asm__ volatile(
2160  "pcmpeqw %%mm7, %%mm7 \n\t"
2161  "psrlw $8, %%mm7 \n\t"
2162  "1: \n\t"
2163  "movq -28(%1, %0, 4), %%mm0 \n\t"
2164  "movq -20(%1, %0, 4), %%mm1 \n\t"
2165  "movq -12(%1, %0, 4), %%mm2 \n\t"
2166  "movq -4(%1, %0, 4), %%mm3 \n\t"
2167  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2168  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2169  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2170  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2171  "pand %%mm7, %%mm0 \n\t"
2172  "pand %%mm7, %%mm1 \n\t"
2173  "pand %%mm7, %%mm2 \n\t"
2174  "pand %%mm7, %%mm3 \n\t"
2175  "packuswb %%mm1, %%mm0 \n\t"
2176  "packuswb %%mm3, %%mm2 \n\t"
2177  "movq %%mm0, %%mm1 \n\t"
2178  "movq %%mm2, %%mm3 \n\t"
2179  "psrlw $8, %%mm0 \n\t"
2180  "psrlw $8, %%mm2 \n\t"
2181  "pand %%mm7, %%mm1 \n\t"
2182  "pand %%mm7, %%mm3 \n\t"
2183  "packuswb %%mm2, %%mm0 \n\t"
2184  "packuswb %%mm3, %%mm1 \n\t"
2185  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2186  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2187  "add $8, %0 \n\t"
2188  " js 1b \n\t"
2189  : "+r"(count)
2190  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2191  );
2192  count -= 7;
2193  }
2194 #endif
2195  while(count<0) {
2196  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2197  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2198  count++;
2199  }
2200 }
2201 
2202 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2203 {
2204  dst0+= count;
2205  dst1+= count;
2206  src += 4*count;
2207  count= - count;
2208  if(count <= -8) {
2209  count += 7;
2210  __asm__ volatile(
2211  "pcmpeqw %%mm7, %%mm7 \n\t"
2212  "psrlw $8, %%mm7 \n\t"
2213  "1: \n\t"
2214  "movq -28(%1, %0, 4), %%mm0 \n\t"
2215  "movq -20(%1, %0, 4), %%mm1 \n\t"
2216  "movq -12(%1, %0, 4), %%mm2 \n\t"
2217  "movq -4(%1, %0, 4), %%mm3 \n\t"
2218  "psrlw $8, %%mm0 \n\t"
2219  "psrlw $8, %%mm1 \n\t"
2220  "psrlw $8, %%mm2 \n\t"
2221  "psrlw $8, %%mm3 \n\t"
2222  "packuswb %%mm1, %%mm0 \n\t"
2223  "packuswb %%mm3, %%mm2 \n\t"
2224  "movq %%mm0, %%mm1 \n\t"
2225  "movq %%mm2, %%mm3 \n\t"
2226  "psrlw $8, %%mm0 \n\t"
2227  "psrlw $8, %%mm2 \n\t"
2228  "pand %%mm7, %%mm1 \n\t"
2229  "pand %%mm7, %%mm3 \n\t"
2230  "packuswb %%mm2, %%mm0 \n\t"
2231  "packuswb %%mm3, %%mm1 \n\t"
2232  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2233  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2234  "add $8, %0 \n\t"
2235  " js 1b \n\t"
2236  : "+r"(count)
2237  : "r"(src), "r"(dst0), "r"(dst1)
2238  );
2239  count -= 7;
2240  }
2241  src++;
2242  while(count<0) {
2243  dst0[count]= src[4*count+0];
2244  dst1[count]= src[4*count+2];
2245  count++;
2246  }
2247 }
2248 
2249 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2250 {
2251  dst0 += count;
2252  dst1 += count;
2253  src0 += 4*count;
2254  src1 += 4*count;
2255  count= - count;
2256 #ifdef PAVGB
2257  if(count <= -8) {
2258  count += 7;
2259  __asm__ volatile(
2260  "pcmpeqw %%mm7, %%mm7 \n\t"
2261  "psrlw $8, %%mm7 \n\t"
2262  "1: \n\t"
2263  "movq -28(%1, %0, 4), %%mm0 \n\t"
2264  "movq -20(%1, %0, 4), %%mm1 \n\t"
2265  "movq -12(%1, %0, 4), %%mm2 \n\t"
2266  "movq -4(%1, %0, 4), %%mm3 \n\t"
2267  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2268  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2269  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2270  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2271  "psrlw $8, %%mm0 \n\t"
2272  "psrlw $8, %%mm1 \n\t"
2273  "psrlw $8, %%mm2 \n\t"
2274  "psrlw $8, %%mm3 \n\t"
2275  "packuswb %%mm1, %%mm0 \n\t"
2276  "packuswb %%mm3, %%mm2 \n\t"
2277  "movq %%mm0, %%mm1 \n\t"
2278  "movq %%mm2, %%mm3 \n\t"
2279  "psrlw $8, %%mm0 \n\t"
2280  "psrlw $8, %%mm2 \n\t"
2281  "pand %%mm7, %%mm1 \n\t"
2282  "pand %%mm7, %%mm3 \n\t"
2283  "packuswb %%mm2, %%mm0 \n\t"
2284  "packuswb %%mm3, %%mm1 \n\t"
2285  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2286  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2287  "add $8, %0 \n\t"
2288  " js 1b \n\t"
2289  : "+r"(count)
2290  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2291  );
2292  count -= 7;
2293  }
2294 #endif
2295  src0++;
2296  src1++;
2297  while(count<0) {
2298  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2299  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2300  count++;
2301  }
2302 }
2303 
2304 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2305  int width, int height,
2306  int lumStride, int chromStride, int srcStride)
2307 {
2308  int y;
2309  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2310 
2311  for (y=0; y<height; y++) {
2312  RENAME(extract_even)(src, ydst, width);
2313  if(y&1) {
2314  RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2315  udst+= chromStride;
2316  vdst+= chromStride;
2317  }
2318 
2319  src += srcStride;
2320  ydst+= lumStride;
2321  }
2322  __asm__(
2323  EMMS" \n\t"
2324  SFENCE" \n\t"
2325  ::: "memory"
2326  );
2327 }
2328 
2329 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2330  int width, int height,
2331  int lumStride, int chromStride, int srcStride)
2332 {
2333  int y;
2334  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2335 
2336  for (y=0; y<height; y++) {
2337  RENAME(extract_even)(src, ydst, width);
2338  RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2339 
2340  src += srcStride;
2341  ydst+= lumStride;
2342  udst+= chromStride;
2343  vdst+= chromStride;
2344  }
2345  __asm__(
2346  EMMS" \n\t"
2347  SFENCE" \n\t"
2348  ::: "memory"
2349  );
2350 }
2351 
2352 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2353  int width, int height,
2354  int lumStride, int chromStride, int srcStride)
2355 {
2356  int y;
2357  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2358 
2359  for (y=0; y<height; y++) {
2360  RENAME(extract_odd)(src, ydst, width);
2361  if(y&1) {
2362  RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2363  udst+= chromStride;
2364  vdst+= chromStride;
2365  }
2366 
2367  src += srcStride;
2368  ydst+= lumStride;
2369  }
2370  __asm__(
2371  EMMS" \n\t"
2372  SFENCE" \n\t"
2373  ::: "memory"
2374  );
2375 }
2376 
2377 #if ARCH_X86_32
2378 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2379  int width, int height,
2380  int lumStride, int chromStride, int srcStride)
2381 {
2382  int y;
2383  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2384 
2385  for (y=0; y<height; y++) {
2386  RENAME(extract_odd)(src, ydst, width);
2387  RENAME(extract_even2)(src, udst, vdst, chromWidth);
2388 
2389  src += srcStride;
2390  ydst+= lumStride;
2391  udst+= chromStride;
2392  vdst+= chromStride;
2393  }
2394  __asm__(
2395  EMMS" \n\t"
2396  SFENCE" \n\t"
2397  ::: "memory"
2398  );
2399 }
2400 #endif /* ARCH_X86_32 */
2401 #endif /* !COMPILE_TEMPLATE_SSE2 */
2402 
/*
 * Per-instantiation init hook of this template.
 *
 * In the lines shown here the only statement is the assignment of this
 * instantiation's rgb24toyv12 to ff_rgb24toyv12, which is compiled only when
 * COMPILE_TEMPLATE_SSE2 is 0 and HAVE_7REGS is set. The remaining
 * preprocessor guards enclose no statements in this listing.
 */
2403 static av_cold void RENAME(rgb2rgb_init)(void)
2404 {
2405 #if !COMPILE_TEMPLATE_SSE2
2430 #if ARCH_X86_32
2432 #endif
2434 
2436 #if HAVE_7REGS
2437  ff_rgb24toyv12 = RENAME(rgb24toyv12);
2438 #endif /* HAVE_7REGS */
2439 
2442 #endif /* !COMPILE_TEMPLATE_SSE2 */
2443 
2444 #if !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2
2446 #endif /* !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 */
2447 #if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
2448 #if COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM
2450 #endif
2451 #endif
2452 }
STORE_BGR24_MMX
#define STORE_BGR24_MMX
Definition: rgb2rgb_template.c:92
rgb32tobgr24
void(* rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:35
r
const char * r
Definition: vf_curves.c:116
PACK_RGB32
#define PACK_RGB32
Definition: rgb2rgb_template.c:918
yv12toyuy2
void(* yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Definition: rgb2rgb.c:61
PAVGB
#define PAVGB
Definition: rgb2rgb_template.c:40
src1
const pixel * src1
Definition: h264pred_template.c:421
w
uint8_t w
Definition: llviddspenc.c:38
yuy2toyv12
void(* yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Definition: rgb2rgb.c:77
b
#define b
Definition: input.c:41
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
rgb2yuv
static const char rgb2yuv[]
Definition: vf_scale_vulkan.c:68
SFENCE
#define SFENCE
Definition: rgb2rgb_template.c:42
rgb32tobgr16
void(* rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:36
yuyvtoyuv422
void(* yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:113
rgb24tobgr16
void(* rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:40
rgb15to32
void(* rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:52
yv12touyvy
void(* yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Definition: rgb2rgb.c:65
rgb32to16
void(* rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:45
rgb
Definition: rpzaenc.c:59
ff_rgb24toyv12
void(* ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
Definition: rgb2rgb.c:81
av_cold
#define av_cold
Definition: attributes.h:90
rgb16tobgr24
void(* rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:42
width
#define width
s
#define s(width, name)
Definition: cbs_vp9.c:256
AV_CEIL_RSHIFT
#define AV_CEIL_RSHIFT(a, b)
Definition: common.h:50
g
const char * g
Definition: vf_curves.c:117
s1
#define s1
Definition: regdef.h:38
rgb15tobgr24
void(* rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:43
yuv422ptoyuy2
void(* yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
Definition: rgb2rgb.c:69
rgb32tobgr15
void(* rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:37
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
interleaveBytes
void(* interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst, int width, int height, int src1Stride, int src2Stride, int dstStride)
Definition: rgb2rgb.c:88
yvu9_to_yuy2
void(* yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
Definition: rgb2rgb.c:99
NULL
#define NULL
Definition: coverity.c:32
rgb16to15
void(* rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:50
yuyvtoyuv420
void(* yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:110
rgb24tobgr32
void(* rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:38
s2
#define s2
Definition: regdef.h:39
asm.h
height
#define height
ff_rgb24toyv12_c
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
Definition: rgb2rgb_template.c:649
attributes.h
rgb24to16
void(* rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:47
EMMS
#define EMMS
Definition: rgb2rgb_template.c:44
uyvytoyuv422
void(* uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:107
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
rgb24to15
void(* rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:48
src2
const pixel * src2
Definition: h264pred_template.c:422
PREFETCH
#define PREFETCH
Definition: rgb2rgb_template.c:39
RENAME
#define RENAME(name)
Definition: ffv1.h:192
rgb32to15
void(* rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:46
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
deinterleaveBytes
void(* deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, int dst1Stride, int dst2Stride)
Definition: rgb2rgb.c:91
uyvytoyuv420
void(* uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
Definition: rgb2rgb.c:104
rgb16to32
void(* rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:49
rgb24tobgr15
void(* rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:41
rgb15to16
void(* rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:51
MOVNTQ
#define MOVNTQ
Definition: rgb2rgb_template.c:41
yuv422ptouyvy
void(* yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
Definition: rgb2rgb.c:73
vu9_to_vu12
void(* vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
Definition: rgb2rgb.c:94
rgb24tobgr24
void(* rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
Definition: rgb2rgb.c:39
src0
const pixel *const src0
Definition: h264pred_template.c:420
MANGLE
#define MANGLE(a)
Definition: asm.h:127
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
x86_reg
int x86_reg
Definition: asm.h:72
d
d
Definition: ffmpeg_filter.c:156
int32_t
int32_t
Definition: audioconvert.c:56
planar2x
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
Definition: rgb2rgb.c:86
h
h
Definition: vp9dsp_template.c:2038