FFmpeg
yuv2rgb_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /*
24  * Convert I420 YV12 to RGB in various formats,
25  * it rejects images that are not in 420 formats,
26  * it rejects images that don't have widths of multiples of 16,
27  * it rejects images that don't have heights of multiples of 2.
28  * Reject defers to C simulation code.
29  *
30  * Lots of optimizations to be done here.
31  *
32  * 1. Need to fix saturation code. I just couldn't get it to fly with packs
33  * and adds, so we currently use max/min to clip.
34  *
35  * 2. The inefficient use of chroma loading needs a bit of brushing up.
36  *
37  * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38  * pipeline stalls.
39  *
40  *
41  * MODIFIED to calculate coeffs from currently selected color space.
42  * MODIFIED core to be a macro where you specify the output format.
43  * ADDED UYVY conversion which is never called due to some thing in swscale.
 44  * CORRECTED algorithm selection to be strict on input formats.
45  * ADDED runtime detection of AltiVec.
46  *
47  * ADDED altivec_yuv2packedX vertical scl + RGB converter
48  *
49  * March 27,2004
50  * PERFORMANCE ANALYSIS
51  *
52  * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53  * used as test.
54  * The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55  * same sequence.
56  *
57  * 720 * 480 * 30 ~10MPS
58  *
59  * so we have roughly 10 clocks per pixel. This is too high, something has
60  * to be wrong.
61  *
62  * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63  * need for vec_min.
64  *
65  * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
66  * have the input video frame, it was just decompressed so it probably resides
67  * in L1 caches. However, we are creating the output video stream. This needs
68  * to use the DSTST instruction to optimize for the cache. We couple this with
69  * the fact that we are not going to be visiting the input buffer again so we
70  * mark it Least Recently Used. This shaves 25% of the processor cycles off.
71  *
72  * Now memcpy is the largest mips consumer in the system, probably due
73  * to the inefficient X11 stuff.
74  *
75  * GL libraries seem to be very slow on this machine 1.33Ghz PB running
76  * Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77  * a versioning issue, however I have libGL.1.2.dylib for both
78  * machines. (We need to figure this out now.)
79  *
80  * GL2 libraries work now with patch for RGB32.
81  *
82  * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83  *
84  * Integrated luma prescaling adjustment for saturation/contrast/brightness
85  * adjustment.
86  */
87 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"
102 
103 #if HAVE_ALTIVEC
104 
105 #undef PROFILE_THE_BEAST
106 #undef INC_SCALING
107 
108 typedef unsigned char ubyte;
109 typedef signed char sbyte;
110 
111 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
112  * homogeneous vector registers x0,x1,x2 are interleaved with the
113  * following technique:
114  *
115  * o0 = vec_mergeh(x0, x1);
116  * o1 = vec_perm(o0, x2, perm_rgb_0);
117  * o2 = vec_perm(o0, x2, perm_rgb_1);
118  * o3 = vec_mergel(x0, x1);
119  * o4 = vec_perm(o3, o2, perm_rgb_2);
120  * o5 = vec_perm(o3, o2, perm_rgb_3);
121  *
122  * perm_rgb_0: o0(RG).h v1(B) --> o1*
123  * 0 1 2 3 4
124  * rgbr|gbrg|brgb|rgbr
125  * 0010 0100 1001 0010
126  * 0102 3145 2673 894A
127  *
128  * perm_rgb_1: o0(RG).h v1(B) --> o2
129  * 0 1 2 3 4
130  * gbrg|brgb|bbbb|bbbb
131  * 0100 1001 1111 1111
132  * B5CD 6EF7 89AB CDEF
133  *
134  * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
135  * 0 1 2 3 4
136  * gbrg|brgb|rgbr|gbrg
137  * 1111 1111 0010 0100
138  * 89AB CDEF 0182 3945
139  *
140  * perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
141  * 0 1 2 3 4
142  * brgb|rgbr|gbrg|brgb
143  * 1001 0010 0100 1001
144  * a67b 89cA BdCD eEFf
145  */
146 static const vector unsigned char
147  perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
148  0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
149  perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
150  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
151  perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
152  0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
153  perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
154  0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
155 
/* Interleave three 16-byte planar channel vectors (x0=R, x1=G, x2=B for RGB
 * order) into three packed output vectors y0..y2 holding 16 interleaved
 * pixels, using the perm_rgb_* tables above. */
#define vec_merge3(x2, x1, x0, y0, y1, y2)      \
    do {                                        \
        __typeof__(x0) o0, o2, o3;              \
        o0 = vec_mergeh(x0, x1);                \
        y0 = vec_perm(o0, x2, perm_rgb_0);      \
        o2 = vec_perm(o0, x2, perm_rgb_1);      \
        o3 = vec_mergel(x0, x1);                \
        y1 = vec_perm(o3, o2, perm_rgb_2);      \
        y2 = vec_perm(o3, o2, perm_rgb_3);      \
    } while (0)
166 
/* Store 16 pixels as packed BGR24 (48 bytes) at *ptr, advancing ptr by
 * three 16-byte vectors. */
#define vec_mstbgr24(x0, x1, x2, ptr)           \
    do {                                        \
        __typeof__(x0) _0, _1, _2;              \
        vec_merge3(x0, x1, x2, _0, _1, _2);     \
        vec_st(_0, 0, ptr++);                   \
        vec_st(_1, 0, ptr++);                   \
        vec_st(_2, 0, ptr++);                   \
    } while (0)
175 
/* Store 16 pixels as packed RGB24 (48 bytes) at *ptr; same as vec_mstbgr24
 * but with the channel arguments swapped into vec_merge3. */
#define vec_mstrgb24(x0, x1, x2, ptr)           \
    do {                                        \
        __typeof__(x0) _0, _1, _2;              \
        vec_merge3(x2, x1, x0, _0, _1, _2);     \
        vec_st(_0, 0, ptr++);                   \
        vec_st(_1, 0, ptr++);                   \
        vec_st(_2, 0, ptr++);                   \
    } while (0)
184 
/* pack the pixels in rgb0 format
 * msb R
 * lsb 0
 *
 * Interleaves four channel vectors x0..x3 (one of them typically the
 * constant alpha { 255 }) into 16 packed 32-bit pixels and stores four
 * 16-byte vectors at ptr, advancing ptr by 4 vectors (64 bytes). */
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr)                    \
    do {                                                        \
        T _0, _1, _2, _3;                                       \
        _0 = vec_mergeh(x0, x1);                                \
        _1 = vec_mergeh(x2, x3);                                \
        _2 = (T) vec_mergeh((vector unsigned short) _0,         \
                            (vector unsigned short) _1);        \
        _3 = (T) vec_mergel((vector unsigned short) _0,         \
                            (vector unsigned short) _1);        \
        vec_st(_2, 0 * 16, (T *) ptr);                          \
        vec_st(_3, 1 * 16, (T *) ptr);                          \
        _0 = vec_mergel(x0, x1);                                \
        _1 = vec_mergel(x2, x3);                                \
        _2 = (T) vec_mergeh((vector unsigned short) _0,         \
                            (vector unsigned short) _1);        \
        _3 = (T) vec_mergel((vector unsigned short) _0,         \
                            (vector unsigned short) _1);        \
        vec_st(_2, 2 * 16, (T *) ptr);                          \
        vec_st(_3, 3 * 16, (T *) ptr);                          \
        ptr += 4;                                               \
    } while (0)
210 
/*
 *          | 1     0       1.4021   | | Y |
 *          | 1    -0.3441 -0.7142   |x| Cb|
 *          | 1     1.7718  0        | | Cr|
 *
 *
 * Y:      [-128 127]
 * Cb/Cr : [-128 127]
 *
 * typical YUV conversion works on Y: 0-255 this version has been
 * optimized for JPEG decoding.
 */

/* vec_unh/vec_unl: zero-extend the high/low 8 unsigned bytes of x into a
 * vector of 8 signed shorts.  The big-endian variant uses vec_perm so the
 * zero byte lands in the high half of each short; the little-endian
 * variant gets the same layout from vec_mergeh/vec_mergel. */
#if HAVE_BIGENDIAN
#define vec_unh(x)                                                      \
    (vector signed short)                                               \
        vec_perm(x, (__typeof__(x)) { 0 },                              \
                 ((vector unsigned char) {                              \
                     0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03,    \
                     0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))

#define vec_unl(x)                                                      \
    (vector signed short)                                               \
        vec_perm(x, (__typeof__(x)) { 0 },                              \
                 ((vector unsigned char) {                              \
                     0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B,    \
                     0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#else
#define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 })
#define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 })
#endif
242 
/* Clip 8 signed shorts to the legal studio-range luma interval [16, 235]. */
#define vec_clip_s16(x)                                                 \
    vec_max(vec_min(x, ((vector signed short) {                         \
                            235, 235, 235, 235, 235, 235, 235, 235 })), \
            ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
247 
/* Clamp two signed-short vectors to [0, 255] and pack them into one vector
 * of 16 unsigned bytes (vec_packs saturates the high end, vec_max handles
 * the negative side first). */
#define vec_packclp(x, y)                                       \
    (vector unsigned char)                                      \
        vec_packs((vector unsigned short)                       \
                      vec_max(x, ((vector signed short) { 0 })),\
                  (vector unsigned short)                       \
                      vec_max(y, ((vector signed short) { 0 })))
254 
255 static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
256  vector signed short U, vector signed short V,
257  vector signed short *R, vector signed short *G,
258  vector signed short *B)
259 {
260  vector signed short vx, ux, uvx;
261 
262  Y = vec_mradds(Y, c->CY, c->OY);
263  U = vec_sub(U, (vector signed short)
264  vec_splat((vector signed short) { 128 }, 0));
265  V = vec_sub(V, (vector signed short)
266  vec_splat((vector signed short) { 128 }, 0));
267 
268  // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
269  ux = vec_sl(U, c->CSHIFT);
270  *B = vec_mradds(ux, c->CBU, Y);
271 
272  // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
273  vx = vec_sl(V, c->CSHIFT);
274  *R = vec_mradds(vx, c->CRV, Y);
275 
276  // uvx = ((CGU * u) + (CGV * v)) >> 15;
277  uvx = vec_mradds(U, c->CGU, Y);
278  *G = vec_mradds(V, c->CGV, uvx);
279 }
280 
281 /*
282  * ------------------------------------------------------------------------------
283  * CS converters
284  * ------------------------------------------------------------------------------
285  */
286 
/* Generate a planar-4:2:0 -> packed-RGB slice converter named
 * altivec_<name>, emitting pixels with the given out_pixels macro.
 * Processes two output rows per iteration (even rows via oute, odd rows
 * via outo) and 16 pixels per inner iteration; the chroma of each 2x2
 * block is shared between both rows. */
#define DEFCSP420_CVT(name, out_pixels)                                       \
static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
                            int *instrides, int srcSliceY, int srcSliceH,     \
                            unsigned char **oplanes, int *outstrides)         \
{                                                                             \
    int w = c->srcW;                                                          \
    int h = srcSliceH;                                                        \
    int i, j;                                                                 \
    int instrides_scl[3];                                                     \
    vector unsigned char y0, y1;                                              \
                                                                              \
    vector signed char u, v;                                                  \
                                                                              \
    vector signed short Y0, Y1, Y2, Y3;                                       \
    vector signed short U, V;                                                 \
    vector signed short vx, ux, uvx;                                          \
    vector signed short vx0, ux0, uvx0;                                       \
    vector signed short vx1, ux1, uvx1;                                       \
    vector signed short R0, G0, B0;                                           \
    vector signed short R1, G1, B1;                                           \
    vector unsigned char R, G, B;                                             \
                                                                              \
    /* Hoist the conversion coefficients out of the context. */               \
    vector signed short lCY       = c->CY;                                    \
    vector signed short lOY       = c->OY;                                    \
    vector signed short lCRV      = c->CRV;                                   \
    vector signed short lCBU      = c->CBU;                                   \
    vector signed short lCGU      = c->CGU;                                   \
    vector signed short lCGV      = c->CGV;                                   \
    vector unsigned short lCSHIFT = c->CSHIFT;                                \
                                                                              \
    const ubyte *y1i = in[0];                                                 \
    const ubyte *y2i = in[0] + instrides[0];                                  \
    const ubyte *ui  = in[1];                                                 \
    const ubyte *vi  = in[2];                                                 \
                                                                              \
    vector unsigned char *oute, *outo;                                        \
                                                                              \
    /* loop moves y{1, 2}i by w */                                            \
    instrides_scl[0] = instrides[0] * 2 - w;                                  \
    /* loop moves ui by w / 2 */                                              \
    instrides_scl[1] = instrides[1] - w / 2;                                  \
    /* loop moves vi by w / 2 */                                              \
    instrides_scl[2] = instrides[2] - w / 2;                                  \
                                                                              \
    for (i = 0; i < h / 2; i++) {                                             \
        oute = (vector unsigned char *)(oplanes[0] + outstrides[0] *          \
                                        (srcSliceY + i * 2));                 \
        outo = oute + (outstrides[0] >> 4);                                   \
        /* Prime the data-stream-touch-for-store engine for both rows. */     \
        vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0);       \
        vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1);       \
                                                                              \
        for (j = 0; j < w / 16; j++) {                                        \
            y0 = vec_xl(0, y1i);                                              \
                                                                              \
            y1 = vec_xl(0, y2i);                                              \
                                                                              \
            u = (vector signed char) vec_xl(0, ui);                           \
                                                                              \
            v = (vector signed char) vec_xl(0, vi);                           \
                                                                              \
            /* Re-center chroma around zero before widening. */               \
            u = (vector signed char)                                          \
                    vec_sub(u,                                                \
                            (vector signed char)                              \
                                vec_splat((vector signed char) { 128 }, 0));  \
            v = (vector signed char)                                          \
                    vec_sub(v,                                                \
                            (vector signed char)                              \
                                vec_splat((vector signed char) { 128 }, 0));  \
                                                                              \
            U = vec_unpackh(u);                                               \
            V = vec_unpackh(v);                                               \
                                                                              \
            Y0 = vec_unh(y0);                                                 \
            Y1 = vec_unl(y0);                                                 \
            Y2 = vec_unh(y1);                                                 \
            Y3 = vec_unl(y1);                                                 \
                                                                              \
            Y0 = vec_mradds(Y0, lCY, lOY);                                    \
            Y1 = vec_mradds(Y1, lCY, lOY);                                    \
            Y2 = vec_mradds(Y2, lCY, lOY);                                    \
            Y3 = vec_mradds(Y3, lCY, lOY);                                    \
                                                                              \
            /* ux  = (CBU * (u << CSHIFT) + 0x4000) >> 15 */                  \
            ux  = vec_sl(U, lCSHIFT);                                         \
            ux  = vec_mradds(ux, lCBU, (vector signed short) { 0 });          \
            ux0 = vec_mergeh(ux, ux);                                         \
            ux1 = vec_mergel(ux, ux);                                         \
                                                                              \
            /* vx  = (CRV * (v << CSHIFT) + 0x4000) >> 15; */                 \
            vx  = vec_sl(V, lCSHIFT);                                         \
            vx  = vec_mradds(vx, lCRV, (vector signed short) { 0 });          \
            vx0 = vec_mergeh(vx, vx);                                         \
            vx1 = vec_mergel(vx, vx);                                         \
                                                                              \
            /* uvx = ((CGU * u) + (CGV * v)) >> 15 */                         \
            uvx  = vec_mradds(U, lCGU, (vector signed short) { 0 });          \
            uvx  = vec_mradds(V, lCGV, uvx);                                  \
            uvx0 = vec_mergeh(uvx, uvx);                                      \
            uvx1 = vec_mergel(uvx, uvx);                                      \
                                                                              \
            R0 = vec_add(Y0, vx0);                                            \
            G0 = vec_add(Y0, uvx0);                                           \
            B0 = vec_add(Y0, ux0);                                            \
            R1 = vec_add(Y1, vx1);                                            \
            G1 = vec_add(Y1, uvx1);                                           \
            B1 = vec_add(Y1, ux1);                                            \
                                                                              \
            R = vec_packclp(R0, R1);                                          \
            G = vec_packclp(G0, G1);                                          \
            B = vec_packclp(B0, B1);                                          \
                                                                              \
            out_pixels(R, G, B, oute);                                        \
                                                                              \
            /* Second (odd) row reuses the same chroma contributions. */      \
            R0 = vec_add(Y2, vx0);                                            \
            G0 = vec_add(Y2, uvx0);                                           \
            B0 = vec_add(Y2, ux0);                                            \
            R1 = vec_add(Y3, vx1);                                            \
            G1 = vec_add(Y3, uvx1);                                           \
            B1 = vec_add(Y3, ux1);                                            \
            R  = vec_packclp(R0, R1);                                         \
            G  = vec_packclp(G0, G1);                                         \
            B  = vec_packclp(B0, B1);                                         \
                                                                              \
            out_pixels(R, G, B, outo);                                        \
                                                                              \
            y1i += 16;                                                        \
            y2i += 16;                                                        \
            ui  += 8;                                                         \
            vi  += 8;                                                         \
        }                                                                     \
                                                                              \
        ui  += instrides_scl[1];                                              \
        vi  += instrides_scl[2];                                              \
        y1i += instrides_scl[0];                                              \
        y2i += instrides_scl[0];                                              \
    }                                                                         \
    return srcSliceH;                                                         \
}
426 
427 #define out_abgr(a, b, c, ptr) \
428  vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
429 #define out_bgra(a, b, c, ptr) \
430  vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
431 #define out_rgba(a, b, c, ptr) \
432  vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
433 #define out_argb(a, b, c, ptr) \
434  vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
435 #define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
436 #define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
437 
438 DEFCSP420_CVT(yuv2_abgr, out_abgr)
439 DEFCSP420_CVT(yuv2_bgra, out_bgra)
440 DEFCSP420_CVT(yuv2_rgba, out_rgba)
441 DEFCSP420_CVT(yuv2_argb, out_argb)
442 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
443 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
444 
445 // uyvy|uyvy|uyvy|uyvy
446 // 0123 4567 89ab cdef
447 static const vector unsigned char
448  demux_u = { 0x10, 0x00, 0x10, 0x00,
449  0x10, 0x04, 0x10, 0x04,
450  0x10, 0x08, 0x10, 0x08,
451  0x10, 0x0c, 0x10, 0x0c },
452  demux_v = { 0x10, 0x02, 0x10, 0x02,
453  0x10, 0x06, 0x10, 0x06,
454  0x10, 0x0A, 0x10, 0x0A,
455  0x10, 0x0E, 0x10, 0x0E },
456  demux_y = { 0x10, 0x01, 0x10, 0x03,
457  0x10, 0x05, 0x10, 0x07,
458  0x10, 0x09, 0x10, 0x0B,
459  0x10, 0x0D, 0x10, 0x0F };
460 
461 /*
462  * this is so I can play live CCIR raw video
463  */
464 static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
465  int *instrides, int srcSliceY, int srcSliceH,
466  unsigned char **oplanes, int *outstrides)
467 {
468  int w = c->srcW;
469  int h = srcSliceH;
470  int i, j;
471  vector unsigned char uyvy;
472  vector signed short Y, U, V;
473  vector signed short R0, G0, B0, R1, G1, B1;
474  vector unsigned char R, G, B;
475  vector unsigned char *out;
476  const ubyte *img;
477 
478  img = in[0];
479  out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
480 
481  for (i = 0; i < h; i++)
482  for (j = 0; j < w / 16; j++) {
483  uyvy = vec_ld(0, img);
484 
485  U = (vector signed short)
486  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
487  V = (vector signed short)
488  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
489  Y = (vector signed short)
490  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
491 
492  cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
493 
494  uyvy = vec_ld(16, img);
495 
496  U = (vector signed short)
497  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
498  V = (vector signed short)
499  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
500  Y = (vector signed short)
501  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
502 
503  cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
504 
505  R = vec_packclp(R0, R1);
506  G = vec_packclp(G0, G1);
507  B = vec_packclp(B0, B1);
508 
509  // vec_mstbgr24 (R,G,B, out);
510  out_rgba(R, G, B, out);
511 
512  img += 32;
513  }
514  return srcSliceH;
515 }
516 
517 #endif /* HAVE_ALTIVEC */
518 
519 /* Ok currently the acceleration routine only supports
520  * inputs of widths a multiple of 16
521  * and heights a multiple 2
522  *
523  * So we just fall back to the C codes for this.
524  */
526 {
527 #if HAVE_ALTIVEC
529  return NULL;
530 
531  /*
532  * and this seems not to matter too much I tried a bunch of
533  * videos with abnormal widths and MPlayer crashes elsewhere.
534  * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
535  * boom with X11 bad match.
536  *
537  */
538  if ((c->srcW & 0xf) != 0)
539  return NULL;
540 
541  switch (c->srcFormat) {
542  case AV_PIX_FMT_YUV410P:
543  case AV_PIX_FMT_YUV420P:
544  /*case IMGFMT_CLPL: ??? */
545  case AV_PIX_FMT_GRAY8:
546  case AV_PIX_FMT_NV12:
547  case AV_PIX_FMT_NV21:
548  if ((c->srcH & 0x1) != 0)
549  return NULL;
550 
551  switch (c->dstFormat) {
552  case AV_PIX_FMT_RGB24:
553  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
554  return altivec_yuv2_rgb24;
555  case AV_PIX_FMT_BGR24:
556  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
557  return altivec_yuv2_bgr24;
558  case AV_PIX_FMT_ARGB:
559  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
560  return altivec_yuv2_argb;
561  case AV_PIX_FMT_ABGR:
562  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
563  return altivec_yuv2_abgr;
564  case AV_PIX_FMT_RGBA:
565  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
566  return altivec_yuv2_rgba;
567  case AV_PIX_FMT_BGRA:
568  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
569  return altivec_yuv2_bgra;
570  default: return NULL;
571  }
572  break;
573 
574  case AV_PIX_FMT_UYVY422:
575  switch (c->dstFormat) {
576  case AV_PIX_FMT_BGR32:
577  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
578  return altivec_uyvy_rgb32;
579  default: return NULL;
580  }
581  break;
582  }
583 #endif /* HAVE_ALTIVEC */
584 
585  return NULL;
586 }
587 
589  const int inv_table[4],
590  int brightness,
591  int contrast,
592  int saturation)
593 {
594 #if HAVE_ALTIVEC
595  union {
596  DECLARE_ALIGNED(16, signed short, tmp)[8];
597  vector signed short vec;
598  } buf;
599 
601  return;
602 
603  buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
604  buf.tmp[1] = -256 * brightness; // oy
605  buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
606  buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
607  buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
608  buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
609 
610  c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
611  c->CY = vec_splat((vector signed short) buf.vec, 0);
612  c->OY = vec_splat((vector signed short) buf.vec, 1);
613  c->CRV = vec_splat((vector signed short) buf.vec, 2);
614  c->CBU = vec_splat((vector signed short) buf.vec, 3);
615  c->CGU = vec_splat((vector signed short) buf.vec, 4);
616  c->CGV = vec_splat((vector signed short) buf.vec, 5);
617  return;
618 #endif /* HAVE_ALTIVEC */
619 }
620 
621 #if HAVE_ALTIVEC
622 
623 static av_always_inline void yuv2packedX_altivec(SwsContext *c,
624  const int16_t *lumFilter,
625  const int16_t **lumSrc,
626  int lumFilterSize,
627  const int16_t *chrFilter,
628  const int16_t **chrUSrc,
629  const int16_t **chrVSrc,
630  int chrFilterSize,
631  const int16_t **alpSrc,
632  uint8_t *dest,
633  int dstW, int dstY,
634  enum AVPixelFormat target)
635 {
636  int i, j;
637  vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
638  vector signed short R0, G0, B0, R1, G1, B1;
639 
640  vector unsigned char R, G, B;
641  vector unsigned char *out, *nout;
642 
643  vector signed short RND = vec_splat_s16(1 << 3);
644  vector unsigned short SCL = vec_splat_u16(4);
645  DECLARE_ALIGNED(16, unsigned int, scratch)[16];
646 
647  vector signed short *YCoeffs, *CCoeffs;
648 
649  YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
650  CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
651 
652  out = (vector unsigned char *) dest;
653 
654  for (i = 0; i < dstW; i += 16) {
655  Y0 = RND;
656  Y1 = RND;
657  /* extract 16 coeffs from lumSrc */
658  for (j = 0; j < lumFilterSize; j++) {
659  X0 = vec_ld(0, &lumSrc[j][i]);
660  X1 = vec_ld(16, &lumSrc[j][i]);
661  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
662  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
663  }
664 
665  U = RND;
666  V = RND;
667  /* extract 8 coeffs from U,V */
668  for (j = 0; j < chrFilterSize; j++) {
669  X = vec_ld(0, &chrUSrc[j][i / 2]);
670  U = vec_mradds(X, CCoeffs[j], U);
671  X = vec_ld(0, &chrVSrc[j][i / 2]);
672  V = vec_mradds(X, CCoeffs[j], V);
673  }
674 
675  /* scale and clip signals */
676  Y0 = vec_sra(Y0, SCL);
677  Y1 = vec_sra(Y1, SCL);
678  U = vec_sra(U, SCL);
679  V = vec_sra(V, SCL);
680 
681  Y0 = vec_clip_s16(Y0);
682  Y1 = vec_clip_s16(Y1);
683  U = vec_clip_s16(U);
684  V = vec_clip_s16(V);
685 
686  /* now we have
687  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
688  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
689  *
690  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
691  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
692  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
693  */
694 
695  U0 = vec_mergeh(U, U);
696  V0 = vec_mergeh(V, V);
697 
698  U1 = vec_mergel(U, U);
699  V1 = vec_mergel(V, V);
700 
701  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
702  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
703 
704  R = vec_packclp(R0, R1);
705  G = vec_packclp(G0, G1);
706  B = vec_packclp(B0, B1);
707 
708  switch (target) {
709  case AV_PIX_FMT_ABGR:
710  out_abgr(R, G, B, out);
711  break;
712  case AV_PIX_FMT_BGRA:
713  out_bgra(R, G, B, out);
714  break;
715  case AV_PIX_FMT_RGBA:
716  out_rgba(R, G, B, out);
717  break;
718  case AV_PIX_FMT_ARGB:
719  out_argb(R, G, B, out);
720  break;
721  case AV_PIX_FMT_RGB24:
722  out_rgb24(R, G, B, out);
723  break;
724  case AV_PIX_FMT_BGR24:
725  out_bgr24(R, G, B, out);
726  break;
727  default:
728  {
729  /* If this is reached, the caller should have called yuv2packedXinC
730  * instead. */
731  static int printed_error_message;
732  if (!printed_error_message) {
733  av_log(c, AV_LOG_ERROR,
734  "altivec_yuv2packedX doesn't support %s output\n",
736  printed_error_message = 1;
737  }
738  return;
739  }
740  }
741  }
742 
743  if (i < dstW) {
744  i -= 16;
745 
746  Y0 = RND;
747  Y1 = RND;
748  /* extract 16 coeffs from lumSrc */
749  for (j = 0; j < lumFilterSize; j++) {
750  X0 = vec_ld(0, &lumSrc[j][i]);
751  X1 = vec_ld(16, &lumSrc[j][i]);
752  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
753  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
754  }
755 
756  U = RND;
757  V = RND;
758  /* extract 8 coeffs from U,V */
759  for (j = 0; j < chrFilterSize; j++) {
760  X = vec_ld(0, &chrUSrc[j][i / 2]);
761  U = vec_mradds(X, CCoeffs[j], U);
762  X = vec_ld(0, &chrVSrc[j][i / 2]);
763  V = vec_mradds(X, CCoeffs[j], V);
764  }
765 
766  /* scale and clip signals */
767  Y0 = vec_sra(Y0, SCL);
768  Y1 = vec_sra(Y1, SCL);
769  U = vec_sra(U, SCL);
770  V = vec_sra(V, SCL);
771 
772  Y0 = vec_clip_s16(Y0);
773  Y1 = vec_clip_s16(Y1);
774  U = vec_clip_s16(U);
775  V = vec_clip_s16(V);
776 
777  /* now we have
778  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
779  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
780  *
781  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
782  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
783  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
784  */
785 
786  U0 = vec_mergeh(U, U);
787  V0 = vec_mergeh(V, V);
788 
789  U1 = vec_mergel(U, U);
790  V1 = vec_mergel(V, V);
791 
792  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
793  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
794 
795  R = vec_packclp(R0, R1);
796  G = vec_packclp(G0, G1);
797  B = vec_packclp(B0, B1);
798 
799  nout = (vector unsigned char *) scratch;
800  switch (target) {
801  case AV_PIX_FMT_ABGR:
802  out_abgr(R, G, B, nout);
803  break;
804  case AV_PIX_FMT_BGRA:
805  out_bgra(R, G, B, nout);
806  break;
807  case AV_PIX_FMT_RGBA:
808  out_rgba(R, G, B, nout);
809  break;
810  case AV_PIX_FMT_ARGB:
811  out_argb(R, G, B, nout);
812  break;
813  case AV_PIX_FMT_RGB24:
814  out_rgb24(R, G, B, nout);
815  break;
816  case AV_PIX_FMT_BGR24:
817  out_bgr24(R, G, B, nout);
818  break;
819  default:
820  /* Unreachable, I think. */
821  av_log(c, AV_LOG_ERROR,
822  "altivec_yuv2packedX doesn't support %s output\n",
824  return;
825  }
826 
827  memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
828  }
829 }
830 
831 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
832 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
833  const int16_t *lumFilter, \
834  const int16_t **lumSrc, \
835  int lumFilterSize, \
836  const int16_t *chrFilter, \
837  const int16_t **chrUSrc, \
838  const int16_t **chrVSrc, \
839  int chrFilterSize, \
840  const int16_t **alpSrc, \
841  uint8_t *dest, int dstW, int dstY) \
842 { \
843  yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
844  chrFilter, chrUSrc, chrVSrc, \
845  chrFilterSize, alpSrc, \
846  dest, dstW, dstY, pixfmt); \
847 }
848 
849 YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
850 YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
851 YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
852 YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
853 YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
854 YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
855 
856 #endif /* HAVE_ALTIVEC */
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:81
#define NULL
Definition: coverity.c:32
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:60
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:200
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
return srcSliceH
Macro definitions for various function/variable attributes.
int srcH
Height of source luma/alpha planes.
#define B1
Definition: faandct.c:41
#define img
uint8_t
#define av_cold
Definition: attributes.h:88
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:94
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:117
external API header
enum AVPixelFormat dstFormat
Destination pixel format.
#define av_log(a,...)
#define U(x)
Definition: vp56_arith.h:37
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:194
#define R
Definition: huffyuvdsp.h:34
#define B
Definition: huffyuvdsp.h:32
#define R0(v, w, x, y, z, i)
Definition: sha.c:56
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:95
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:89
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:92
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:93
as above, but U and V bytes are swapped
Definition: pixfmt.h:90
#define Y
Definition: boxblur.h:38
uint8_t w
Definition: llviddspenc.c:39
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
int(* SwsFunc)(struct SwsContext *context, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:374
#define R1
Definition: simple_idct.c:172
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31))))#define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac){}void ff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, 
int apply_map){AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);return NULL;}return ac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;}int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){int use_generic=1;int len=in->nb_samples;int p;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:72
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:95
Definition: vf_addroi.c:26
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:66
Y , 8bpp.
Definition: pixfmt.h:74
#define G
Definition: huffyuvdsp.h:33
enum AVPixelFormat srcFormat
Source pixel format.
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
#define xf(width, name, var, range_min, range_max, subs,...)
Definition: cbs_av1.c:664
FILE * out
Definition: movenc.c:54
#define av_always_inline
Definition: attributes.h:45
const char * av_get_pix_fmt_name(enum AVPixelFormat pix_fmt)
Return the short name for a pixel format, NULL in case pix_fmt is unknown.
Definition: pixdesc.c:2489
int srcW
Width of source luma/alpha planes.
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
#define B0
Definition: faandct.c:40
int i
Definition: input.c:407
#define V
Definition: avdct.c:30
static uint8_t tmp[11]
Definition: aes_ctr.c:27