FFmpeg
yuv2rgb_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /*
24  * Convert I420 YV12 to RGB in various formats,
25  * it rejects images that are not in 420 formats,
26  * it rejects images that don't have widths of multiples of 16,
27  * it rejects images that don't have heights of multiples of 2.
28  * Reject defers to C simulation code.
29  *
30  * Lots of optimizations to be done here.
31  *
32  * 1. Need to fix saturation code. I just couldn't get it to fly with packs
33  * and adds, so we currently use max/min to clip.
34  *
35  * 2. The inefficient use of chroma loading needs a bit of brushing up.
36  *
37  * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38  * pipeline stalls.
39  *
40  *
41  * MODIFIED to calculate coeffs from currently selected color space.
42  * MODIFIED core to be a macro where you specify the output format.
43  * ADDED UYVY conversion which is never called due to something in swscale.
44  * CORRECTED algorithm selection to be strict on input formats.
45  * ADDED runtime detection of AltiVec.
46  *
47  * ADDED altivec_yuv2packedX vertical scl + RGB converter
48  *
49  * March 27,2004
50  * PERFORMANCE ANALYSIS
51  *
52  * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53  * used as test.
54  * The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55  * same sequence.
56  *
57  * 720 * 480 * 30 ~10MPS
58  *
59  * so we have roughly 10 clocks per pixel. This is too high, something has
60  * to be wrong.
61  *
62  * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63  * need for vec_min.
64  *
65  * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
66  * have the input video frame, it was just decompressed so it probably resides
67  * in L1 caches. However, we are creating the output video stream. This needs
68  * to use the DSTST instruction to optimize for the cache. We couple this with
69  * the fact that we are not going to be visiting the input buffer again so we
70  * mark it Least Recently Used. This shaves 25% of the processor cycles off.
71  *
72  * Now memcpy is the largest mips consumer in the system, probably due
73  * to the inefficient X11 stuff.
74  *
75  * GL libraries seem to be very slow on this machine 1.33Ghz PB running
76  * Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77  * a versioning issue, however I have libGL.1.2.dylib for both
78  * machines. (We need to figure this out now.)
79  *
80  * GL2 libraries work now with patch for RGB32.
81  *
82  * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83  *
84  * Integrated luma prescaling adjustment for saturation/contrast/brightness
85  * adjustment.
86  */
87 
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/attributes.h"
98 #include "libavutil/cpu.h"
99 #include "libavutil/pixdesc.h"
100 #include "yuv2rgb_altivec.h"
101 
102 #if HAVE_ALTIVEC
103 
104 #undef PROFILE_THE_BEAST
105 #undef INC_SCALING
106 
107 typedef unsigned char ubyte;
108 typedef signed char sbyte;
109 
110 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
111  * homogeneous vector registers x0,x1,x2 are interleaved with the
112  * following technique:
113  *
114  * o0 = vec_mergeh(x0, x1);
115  * o1 = vec_perm(o0, x2, perm_rgb_0);
116  * o2 = vec_perm(o0, x2, perm_rgb_1);
117  * o3 = vec_mergel(x0, x1);
118  * o4 = vec_perm(o3, o2, perm_rgb_2);
119  * o5 = vec_perm(o3, o2, perm_rgb_3);
120  *
121  * perm_rgb_0: o0(RG).h v1(B) --> o1*
122  * 0 1 2 3 4
123  * rgbr|gbrg|brgb|rgbr
124  * 0010 0100 1001 0010
125  * 0102 3145 2673 894A
126  *
127  * perm_rgb_1: o0(RG).h v1(B) --> o2
128  * 0 1 2 3 4
129  * gbrg|brgb|bbbb|bbbb
130  * 0100 1001 1111 1111
131  * B5CD 6EF7 89AB CDEF
132  *
133  * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
134  * 0 1 2 3 4
135  * gbrg|brgb|rgbr|gbrg
136  * 1111 1111 0010 0100
137  * 89AB CDEF 0182 3945
138  *
139  * perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
140  * 0 1 2 3 4
141  * brgb|rgbr|gbrg|brgb
142  * 1001 0010 0100 1001
143  * a67b 89cA BdCD eEFf
144  */
145 static const vector unsigned char
146  perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
147  0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
148  perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
149  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
150  perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
151  0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
152  perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
153  0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
154 
/* Interleave three planar 16-byte vectors into three packed 16-byte vectors
 * y0, y1, y2 (48 bytes = 16 three-byte pixels).
 *
 * Mind the parameter order: the inputs are declared as (x2, x1, x0). Each
 * output triplet is built as x0[k], x1[k], x2[k], i.e. the THIRD macro
 * argument supplies the first byte of every pixel.
 *
 * Arguments are evaluated more than once; pass plain variables only. */
#define vec_merge3(x2, x1, x0, y0, y1, y2)             \
    do {                                               \
        __typeof__(x0) m3_hi, m3_mid, m3_lo;           \
        /* pairs x0[k],x1[k] for the high half */      \
        m3_hi  = vec_mergeh(x0, x1);                   \
        y0     = vec_perm(m3_hi, x2, perm_rgb_0);      \
        /* left-over pairs plus remaining x2 bytes */  \
        m3_mid = vec_perm(m3_hi, x2, perm_rgb_1);      \
        /* pairs x0[k],x1[k] for the low half */       \
        m3_lo  = vec_mergel(x0, x1);                   \
        y1     = vec_perm(m3_lo, m3_mid, perm_rgb_2);  \
        y2     = vec_perm(m3_lo, m3_mid, perm_rgb_3);  \
    } while (0)
165 
/* Pack x0/x1/x2 with vec_merge3() (arguments forwarded in the order given)
 * and store the three resulting vectors through ptr.
 *
 * ptr must be a modifiable pointer to a vector; it is post-incremented after
 * every store and therefore ends up advanced by three vectors. */
#define vec_mstbgr24(x0, x1, x2, ptr)           \
    do {                                        \
        __typeof__(x0) pk0, pk1, pk2;           \
        vec_merge3(x0, x1, x2, pk0, pk1, pk2);  \
        vec_st(pk0, 0, ptr++);                  \
        vec_st(pk1, 0, ptr++);                  \
        vec_st(pk2, 0, ptr++);                  \
    } while (0)
174 
/* Same as vec_mstbgr24() but the three inputs are forwarded to vec_merge3()
 * in reverse order (x2, x1, x0), which swaps the first and third byte of
 * every packed pixel.
 *
 * ptr must be a modifiable pointer to a vector; it is post-incremented after
 * every store and therefore ends up advanced by three vectors. */
#define vec_mstrgb24(x0, x1, x2, ptr)           \
    do {                                        \
        __typeof__(x0) pk0, pk1, pk2;           \
        vec_merge3(x2, x1, x0, pk0, pk1, pk2);  \
        vec_st(pk0, 0, ptr++);                  \
        vec_st(pk1, 0, ptr++);                  \
        vec_st(pk2, 0, ptr++);                  \
    } while (0)
183 
/* Interleave four planar byte vectors (x0..x3, each of vector type T) into
 * 16 four-byte pixels x0[k] x1[k] x2[k] x3[k] and store them as four
 * consecutive vectors at ptr (byte offsets 0, 16, 32 and 48).
 * Afterwards ptr is advanced by 4 (vectors, when ptr is a vector pointer).
 *
 * x0..x3 are evaluated twice each; ptr is evaluated several times. */
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr)                                  \
    do {                                                                      \
        T ch01, ch23, quad_a, quad_b;                                         \
        /* high halves: byte pairs, then 16-bit pairs -> 4-byte pixels */     \
        ch01   = vec_mergeh(x0, x1);                                          \
        ch23   = vec_mergeh(x2, x3);                                          \
        quad_a = (T) vec_mergeh((vector unsigned short) ch01,                 \
                                (vector unsigned short) ch23);                \
        quad_b = (T) vec_mergel((vector unsigned short) ch01,                 \
                                (vector unsigned short) ch23);                \
        vec_st(quad_a, 0 * 16, (T *) ptr);                                    \
        vec_st(quad_b, 1 * 16, (T *) ptr);                                    \
        /* low halves, same procedure */                                      \
        ch01   = vec_mergel(x0, x1);                                          \
        ch23   = vec_mergel(x2, x3);                                          \
        quad_a = (T) vec_mergeh((vector unsigned short) ch01,                 \
                                (vector unsigned short) ch23);                \
        quad_b = (T) vec_mergel((vector unsigned short) ch01,                 \
                                (vector unsigned short) ch23);                \
        vec_st(quad_a, 2 * 16, (T *) ptr);                                    \
        vec_st(quad_b, 3 * 16, (T *) ptr);                                    \
        ptr += 4;                                                             \
    } while (0)
209 
210 /*
211  * 1 0 1.4021 | | Y |
212  * 1 -0.3441 -0.7142 |x| Cb|
213  * 1 1.7718 0 | | Cr|
214  *
215  *
216  * Y: [-128 127]
217  * Cb/Cr : [-128 127]
218  *
219  * typical YUV conversion works on Y: 0-255 this version has been
220  * optimized for JPEG decoding.
221  */
222 
/* vec_unh(x) / vec_unl(x): widen the first / last eight bytes of the byte
 * vector x to eight 16-bit lanes by pairing every byte with a zero byte, and
 * return the result as vector signed short. */
#if HAVE_BIGENDIAN
/* Big endian: index 0x10 selects byte 0 of the all-zero second operand, so
 * each output halfword is { 0x00, x[k] }. */
#define vec_unh(x)                                                      \
    (vector signed short)                                               \
        vec_perm(x, (__typeof__(x)) { 0 },                              \
                 ((vector unsigned char) {                              \
                     0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03,    \
                     0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))

#define vec_unl(x)                                                      \
    (vector signed short)                                               \
        vec_perm(x, (__typeof__(x)) { 0 },                              \
                 ((vector unsigned char) {                              \
                     0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B,    \
                     0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#else
/* Little endian: merging with a zero vector puts the zero byte after the
 * data byte in memory order. */
#define vec_unh(x)                                                      \
    (vector signed short) vec_mergeh(x, (__typeof__(x)) { 0 })
#define vec_unl(x)                                                      \
    (vector signed short) vec_mergel(x, (__typeof__(x)) { 0 })
#endif
241 
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
 * first limit from above with vec_min, then from below with vec_max. */
#define vec_clip_s16(x)                                                 \
    vec_max(vec_min(x,                                                  \
                    ((vector signed short) {                            \
                        235, 235, 235, 235, 235, 235, 235, 235 })),     \
            ((vector signed short) {                                    \
                16, 16, 16, 16, 16, 16, 16, 16 }))
246 
/* Pack two vectors of eight signed 16-bit lanes (x, then y) into one vector
 * of sixteen unsigned bytes. Negative lanes are first raised to 0 with
 * vec_max; the upper bound is left to the saturating pack (vec_packs), so no
 * vec_min is needed. */
#define vec_packclp(x, y)                                               \
    (vector unsigned char)                                              \
        vec_packs((vector unsigned short)                               \
                      vec_max(x, ((vector signed short) { 0 })),        \
                  (vector unsigned short)                               \
                      vec_max(y, ((vector signed short) { 0 })))
253 
254 static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
255  vector signed short U, vector signed short V,
256  vector signed short *R, vector signed short *G,
257  vector signed short *B)
258 {
259  vector signed short vx, ux, uvx;
260 
261  Y = vec_mradds(Y, c->CY, c->OY);
262  U = vec_sub(U, (vector signed short)
263  vec_splat((vector signed short) { 128 }, 0));
264  V = vec_sub(V, (vector signed short)
265  vec_splat((vector signed short) { 128 }, 0));
266 
267  // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
268  ux = vec_sl(U, c->CSHIFT);
269  *B = vec_mradds(ux, c->CBU, Y);
270 
271  // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
272  vx = vec_sl(V, c->CSHIFT);
273  *R = vec_mradds(vx, c->CRV, Y);
274 
275  // uvx = ((CGU * u) + (CGV * v)) >> 15;
276  uvx = vec_mradds(U, c->CGU, Y);
277  *G = vec_mradds(V, c->CGV, uvx);
278 }
279 
280 /*
281  * ------------------------------------------------------------------------------
282  * CS converters
283  * ------------------------------------------------------------------------------
284  */
285 
/* Generate the function altivec_<name>(): a converter from planar input with
 * 2x2-subsampled chroma (planes in[0] = Y, in[1] = U, in[2] = V) to a packed
 * output format.
 *
 * Two luma rows are processed per outer iteration and share one chroma row;
 * the inner loop consumes 16 luma and 8 chroma samples per row at a time.
 * out_pixels(R, G, B, ptr) must store 16 pixels and advance ptr.
 * The generated function returns srcSliceH. */
#define DEFCSP420_CVT(name, out_pixels)                                       \
static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
                            int *instrides, int srcSliceY, int srcSliceH,     \
                            unsigned char **oplanes, int *outstrides)         \
{                                                                             \
    const int w = c->srcW;                                                    \
    const int h = srcSliceH;                                                  \
    int row, blk;                                                             \
                                                                              \
    /* conversion coefficients copied out of the context */                   \
    vector signed short   lCY     = c->CY;                                    \
    vector signed short   lOY     = c->OY;                                    \
    vector signed short   lCRV    = c->CRV;                                   \
    vector signed short   lCBU    = c->CBU;                                   \
    vector signed short   lCGU    = c->CGU;                                   \
    vector signed short   lCGV    = c->CGV;                                   \
    vector unsigned short lCSHIFT = c->CSHIFT;                                \
                                                                              \
    /* loop-invariant constants */                                            \
    vector signed short zero16 = (vector signed short) { 0 };                 \
    vector signed char  bias8  = (vector signed char)                         \
        vec_splat((vector signed char) { 128 }, 0);                           \
    /* control word for the data-stream-touch-for-store hints */              \
    const int dst_ctl = (0x02000002 | (((w * 3 + 32) / 32) << 16));           \
                                                                              \
    /* input cursors: two luma rows and one row of each chroma plane */       \
    const ubyte *y1i = in[0];                                                 \
    const ubyte *y2i = in[0] + instrides[0];                                  \
    const ubyte *ui  = in[1];                                                 \
    const ubyte *vi  = in[2];                                                 \
                                                                              \
    /* the inner loop moves y{1, 2}i by w and ui/vi by w / 2; these are */    \
    /* the remaining steps to the next pair of luma rows / chroma row   */    \
    const int y_skip = instrides[0] * 2 - w;                                  \
    const int u_skip = instrides[1] - w / 2;                                  \
    const int v_skip = instrides[2] - w / 2;                                  \
                                                                              \
    vector unsigned char *oute, *outo;                                        \
    vector unsigned char luma_e, luma_o;                                      \
    vector signed char   cb8, cr8;                                            \
    vector signed short  Ye_h, Ye_l, Yo_h, Yo_l;                              \
    vector signed short  Cb, Cr;                                              \
    vector signed short  bx, rx, gx;                                          \
    vector signed short  bx_h, rx_h, gx_h;                                    \
    vector signed short  bx_l, rx_l, gx_l;                                    \
    vector signed short  r_h, g_h, b_h;                                       \
    vector signed short  r_l, g_l, b_l;                                       \
    vector unsigned char r8, g8, b8;                                          \
                                                                              \
    for (row = 0; row < h / 2; row++) {                                       \
        /* even output row and the odd row right below it */                  \
        oute = (vector unsigned char *)(oplanes[0] + outstrides[0] *          \
                                        (srcSliceY + row * 2));               \
        outo = oute + (outstrides[0] >> 4);                                   \
        vec_dstst(outo, dst_ctl, 0);                                          \
        vec_dstst(oute, dst_ctl, 1);                                          \
                                                                              \
        for (blk = 0; blk < w / 16; blk++) {                                  \
            luma_e = vec_xl(0, y1i);                                          \
            luma_o = vec_xl(0, y2i);                                          \
            cb8    = (vector signed char) vec_xl(0, ui);                      \
            cr8    = (vector signed char) vec_xl(0, vi);                      \
                                                                              \
            /* remove the chroma bias while still in 8 bits */                \
            cb8 = (vector signed char) vec_sub(cb8, bias8);                   \
            cr8 = (vector signed char) vec_sub(cr8, bias8);                   \
                                                                              \
            /* only the first 8 chroma samples are used per iteration */      \
            Cb = vec_unpackh(cb8);                                            \
            Cr = vec_unpackh(cr8);                                            \
                                                                              \
            Ye_h = vec_unh(luma_e);                                           \
            Ye_l = vec_unl(luma_e);                                           \
            Yo_h = vec_unh(luma_o);                                           \
            Yo_l = vec_unl(luma_o);                                           \
                                                                              \
            Ye_h = vec_mradds(Ye_h, lCY, lOY);                                \
            Ye_l = vec_mradds(Ye_l, lCY, lOY);                                \
            Yo_h = vec_mradds(Yo_h, lCY, lOY);                                \
            Yo_l = vec_mradds(Yo_l, lCY, lOY);                                \
                                                                              \
            /* bx = (CBU * (u << CSHIFT) + 0x4000) >> 15 */                   \
            bx   = vec_sl(Cb, lCSHIFT);                                       \
            bx   = vec_mradds(bx, lCBU, zero16);                              \
            bx_h = vec_mergeh(bx, bx);                                        \
            bx_l = vec_mergel(bx, bx);                                        \
                                                                              \
            /* rx = (CRV * (v << CSHIFT) + 0x4000) >> 15 */                   \
            rx   = vec_sl(Cr, lCSHIFT);                                       \
            rx   = vec_mradds(rx, lCRV, zero16);                              \
            rx_h = vec_mergeh(rx, rx);                                        \
            rx_l = vec_mergel(rx, rx);                                        \
                                                                              \
            /* gx = ((CGU * u) + (CGV * v)) >> 15 */                          \
            gx   = vec_mradds(Cb, lCGU, zero16);                              \
            gx   = vec_mradds(Cr, lCGV, gx);                                  \
            gx_h = vec_mergeh(gx, gx);                                        \
            gx_l = vec_mergel(gx, gx);                                        \
                                                                              \
            /* even row */                                                    \
            r_h = vec_add(Ye_h, rx_h);                                        \
            g_h = vec_add(Ye_h, gx_h);                                        \
            b_h = vec_add(Ye_h, bx_h);                                        \
            r_l = vec_add(Ye_l, rx_l);                                        \
            g_l = vec_add(Ye_l, gx_l);                                        \
            b_l = vec_add(Ye_l, bx_l);                                        \
                                                                              \
            r8 = vec_packclp(r_h, r_l);                                       \
            g8 = vec_packclp(g_h, g_l);                                       \
            b8 = vec_packclp(b_h, b_l);                                       \
                                                                              \
            out_pixels(r8, g8, b8, oute);                                     \
                                                                              \
            /* odd row, same chroma contribution */                           \
            r_h = vec_add(Yo_h, rx_h);                                        \
            g_h = vec_add(Yo_h, gx_h);                                        \
            b_h = vec_add(Yo_h, bx_h);                                        \
            r_l = vec_add(Yo_l, rx_l);                                        \
            g_l = vec_add(Yo_l, gx_l);                                        \
            b_l = vec_add(Yo_l, bx_l);                                        \
                                                                              \
            r8 = vec_packclp(r_h, r_l);                                       \
            g8 = vec_packclp(g_h, g_l);                                       \
            b8 = vec_packclp(b_h, b_l);                                       \
                                                                              \
            out_pixels(r8, g8, b8, outo);                                     \
                                                                              \
            y1i += 16;                                                        \
            y2i += 16;                                                        \
            ui  += 8;                                                         \
            vi  += 8;                                                         \
        }                                                                     \
                                                                              \
        ui  += u_skip;                                                        \
        vi  += v_skip;                                                        \
        y1i += y_skip;                                                        \
        y2i += y_skip;                                                        \
    }                                                                         \
    return srcSliceH;                                                         \
}
425 
426 #define out_abgr(a, b, c, ptr) \
427  vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
428 #define out_bgra(a, b, c, ptr) \
429  vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
430 #define out_rgba(a, b, c, ptr) \
431  vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
432 #define out_argb(a, b, c, ptr) \
433  vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
434 #define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
435 #define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
436 
437 DEFCSP420_CVT(yuv2_abgr, out_abgr)
438 DEFCSP420_CVT(yuv2_bgra, out_bgra)
439 DEFCSP420_CVT(yuv2_rgba, out_rgba)
440 DEFCSP420_CVT(yuv2_argb, out_argb)
441 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
442 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
443 
444 // uyvy|uyvy|uyvy|uyvy
445 // 0123 4567 89ab cdef
446 static const vector unsigned char
447  demux_u = { 0x10, 0x00, 0x10, 0x00,
448  0x10, 0x04, 0x10, 0x04,
449  0x10, 0x08, 0x10, 0x08,
450  0x10, 0x0c, 0x10, 0x0c },
451  demux_v = { 0x10, 0x02, 0x10, 0x02,
452  0x10, 0x06, 0x10, 0x06,
453  0x10, 0x0A, 0x10, 0x0A,
454  0x10, 0x0E, 0x10, 0x0E },
455  demux_y = { 0x10, 0x01, 0x10, 0x03,
456  0x10, 0x05, 0x10, 0x07,
457  0x10, 0x09, 0x10, 0x0B,
458  0x10, 0x0D, 0x10, 0x0F };
459 
460 /*
461  * this is so I can play live CCIR raw video
462  */
463 static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
464  int *instrides, int srcSliceY, int srcSliceH,
465  unsigned char **oplanes, int *outstrides)
466 {
467  int w = c->srcW;
468  int h = srcSliceH;
469  int i, j;
470  vector unsigned char uyvy;
471  vector signed short Y, U, V;
472  vector signed short R0, G0, B0, R1, G1, B1;
473  vector unsigned char R, G, B;
474  vector unsigned char *out;
475  const ubyte *img;
476 
477  img = in[0];
478  out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
479 
480  for (i = 0; i < h; i++)
481  for (j = 0; j < w / 16; j++) {
482  uyvy = vec_ld(0, img);
483 
484  U = (vector signed short)
485  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
486  V = (vector signed short)
487  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
488  Y = (vector signed short)
489  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
490 
491  cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
492 
493  uyvy = vec_ld(16, img);
494 
495  U = (vector signed short)
496  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
497  V = (vector signed short)
498  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
499  Y = (vector signed short)
500  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
501 
502  cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
503 
504  R = vec_packclp(R0, R1);
505  G = vec_packclp(G0, G1);
506  B = vec_packclp(B0, B1);
507 
508  // vec_mstbgr24 (R,G,B, out);
509  out_rgba(R, G, B, out);
510 
511  img += 32;
512  }
513  return srcSliceH;
514 }
515 
516 #endif /* HAVE_ALTIVEC */
517 
518 /* Ok currently the acceleration routine only supports
519  * inputs of widths a multiple of 16
520  * and heights a multiple 2
521  *
522  * So we just fall back to the C codes for this.
523  */
525 {
526 #if HAVE_ALTIVEC
528  return NULL;
529 
530  /*
531  * and this seems not to matter too much I tried a bunch of
532  * videos with abnormal widths and MPlayer crashes elsewhere.
533  * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
534  * boom with X11 bad match.
535  *
536  */
537  if ((c->srcW & 0xf) != 0)
538  return NULL;
539 
540  switch (c->srcFormat) {
541  case AV_PIX_FMT_YUV410P:
542  case AV_PIX_FMT_YUV420P:
543  /*case IMGFMT_CLPL: ??? */
544  case AV_PIX_FMT_GRAY8:
545  case AV_PIX_FMT_NV12:
546  case AV_PIX_FMT_NV21:
547  if ((c->srcH & 0x1) != 0)
548  return NULL;
549 
550  switch (c->dstFormat) {
551  case AV_PIX_FMT_RGB24:
552  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
553  return altivec_yuv2_rgb24;
554  case AV_PIX_FMT_BGR24:
555  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
556  return altivec_yuv2_bgr24;
557  case AV_PIX_FMT_ARGB:
558  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
559  return altivec_yuv2_argb;
560  case AV_PIX_FMT_ABGR:
561  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
562  return altivec_yuv2_abgr;
563  case AV_PIX_FMT_RGBA:
564  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
565  return altivec_yuv2_rgba;
566  case AV_PIX_FMT_BGRA:
567  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
568  return altivec_yuv2_bgra;
569  default: return NULL;
570  }
571  break;
572 
573  case AV_PIX_FMT_UYVY422:
574  switch (c->dstFormat) {
575  case AV_PIX_FMT_BGR32:
576  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
577  return altivec_uyvy_rgb32;
578  default: return NULL;
579  }
580  break;
581  }
582 #endif /* HAVE_ALTIVEC */
583 
584  return NULL;
585 }
586 
588  const int inv_table[4],
589  int brightness,
590  int contrast,
591  int saturation)
592 {
593 #if HAVE_ALTIVEC
594  union {
595  DECLARE_ALIGNED(16, signed short, tmp)[8];
596  vector signed short vec;
597  } buf;
598 
600  return;
601 
602  buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
603  buf.tmp[1] = -256 * brightness; // oy
604  buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
605  buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
606  buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
607  buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
608 
609  c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
610  c->CY = vec_splat((vector signed short) buf.vec, 0);
611  c->OY = vec_splat((vector signed short) buf.vec, 1);
612  c->CRV = vec_splat((vector signed short) buf.vec, 2);
613  c->CBU = vec_splat((vector signed short) buf.vec, 3);
614  c->CGU = vec_splat((vector signed short) buf.vec, 4);
615  c->CGV = vec_splat((vector signed short) buf.vec, 5);
616  return;
617 #endif /* HAVE_ALTIVEC */
618 }
619 
620 #if HAVE_ALTIVEC
621 
622 static av_always_inline void yuv2packedX_altivec(SwsContext *c,
623  const int16_t *lumFilter,
624  const int16_t **lumSrc,
625  int lumFilterSize,
626  const int16_t *chrFilter,
627  const int16_t **chrUSrc,
628  const int16_t **chrVSrc,
629  int chrFilterSize,
630  const int16_t **alpSrc,
631  uint8_t *dest,
632  int dstW, int dstY,
633  enum AVPixelFormat target)
634 {
635  int i, j;
636  vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
637  vector signed short R0, G0, B0, R1, G1, B1;
638 
639  vector unsigned char R, G, B;
640  vector unsigned char *out, *nout;
641 
642  vector signed short RND = vec_splat_s16(1 << 3);
643  vector unsigned short SCL = vec_splat_u16(4);
644  DECLARE_ALIGNED(16, unsigned int, scratch)[16];
645 
646  vector signed short *YCoeffs, *CCoeffs;
647 
648  YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
649  CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
650 
651  out = (vector unsigned char *) dest;
652 
653  for (i = 0; i < dstW; i += 16) {
654  Y0 = RND;
655  Y1 = RND;
656  /* extract 16 coeffs from lumSrc */
657  for (j = 0; j < lumFilterSize; j++) {
658  X0 = vec_ld(0, &lumSrc[j][i]);
659  X1 = vec_ld(16, &lumSrc[j][i]);
660  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
661  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
662  }
663 
664  U = RND;
665  V = RND;
666  /* extract 8 coeffs from U,V */
667  for (j = 0; j < chrFilterSize; j++) {
668  X = vec_ld(0, &chrUSrc[j][i / 2]);
669  U = vec_mradds(X, CCoeffs[j], U);
670  X = vec_ld(0, &chrVSrc[j][i / 2]);
671  V = vec_mradds(X, CCoeffs[j], V);
672  }
673 
674  /* scale and clip signals */
675  Y0 = vec_sra(Y0, SCL);
676  Y1 = vec_sra(Y1, SCL);
677  U = vec_sra(U, SCL);
678  V = vec_sra(V, SCL);
679 
680  Y0 = vec_clip_s16(Y0);
681  Y1 = vec_clip_s16(Y1);
682  U = vec_clip_s16(U);
683  V = vec_clip_s16(V);
684 
685  /* now we have
686  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
687  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
688  *
689  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
690  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
691  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
692  */
693 
694  U0 = vec_mergeh(U, U);
695  V0 = vec_mergeh(V, V);
696 
697  U1 = vec_mergel(U, U);
698  V1 = vec_mergel(V, V);
699 
700  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
701  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
702 
703  R = vec_packclp(R0, R1);
704  G = vec_packclp(G0, G1);
705  B = vec_packclp(B0, B1);
706 
707  switch (target) {
708  case AV_PIX_FMT_ABGR:
709  out_abgr(R, G, B, out);
710  break;
711  case AV_PIX_FMT_BGRA:
712  out_bgra(R, G, B, out);
713  break;
714  case AV_PIX_FMT_RGBA:
715  out_rgba(R, G, B, out);
716  break;
717  case AV_PIX_FMT_ARGB:
718  out_argb(R, G, B, out);
719  break;
720  case AV_PIX_FMT_RGB24:
721  out_rgb24(R, G, B, out);
722  break;
723  case AV_PIX_FMT_BGR24:
724  out_bgr24(R, G, B, out);
725  break;
726  default:
727  {
728  /* If this is reached, the caller should have called yuv2packedXinC
729  * instead. */
730  static int printed_error_message;
731  if (!printed_error_message) {
733  "altivec_yuv2packedX doesn't support %s output\n",
734  av_get_pix_fmt_name(c->dstFormat));
735  printed_error_message = 1;
736  }
737  return;
738  }
739  }
740  }
741 
742  if (i < dstW) {
743  i -= 16;
744 
745  Y0 = RND;
746  Y1 = RND;
747  /* extract 16 coeffs from lumSrc */
748  for (j = 0; j < lumFilterSize; j++) {
749  X0 = vec_ld(0, &lumSrc[j][i]);
750  X1 = vec_ld(16, &lumSrc[j][i]);
751  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
752  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
753  }
754 
755  U = RND;
756  V = RND;
757  /* extract 8 coeffs from U,V */
758  for (j = 0; j < chrFilterSize; j++) {
759  X = vec_ld(0, &chrUSrc[j][i / 2]);
760  U = vec_mradds(X, CCoeffs[j], U);
761  X = vec_ld(0, &chrVSrc[j][i / 2]);
762  V = vec_mradds(X, CCoeffs[j], V);
763  }
764 
765  /* scale and clip signals */
766  Y0 = vec_sra(Y0, SCL);
767  Y1 = vec_sra(Y1, SCL);
768  U = vec_sra(U, SCL);
769  V = vec_sra(V, SCL);
770 
771  Y0 = vec_clip_s16(Y0);
772  Y1 = vec_clip_s16(Y1);
773  U = vec_clip_s16(U);
774  V = vec_clip_s16(V);
775 
776  /* now we have
777  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
778  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
779  *
780  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
781  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
782  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
783  */
784 
785  U0 = vec_mergeh(U, U);
786  V0 = vec_mergeh(V, V);
787 
788  U1 = vec_mergel(U, U);
789  V1 = vec_mergel(V, V);
790 
791  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
792  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
793 
794  R = vec_packclp(R0, R1);
795  G = vec_packclp(G0, G1);
796  B = vec_packclp(B0, B1);
797 
798  nout = (vector unsigned char *) scratch;
799  switch (target) {
800  case AV_PIX_FMT_ABGR:
801  out_abgr(R, G, B, nout);
802  break;
803  case AV_PIX_FMT_BGRA:
804  out_bgra(R, G, B, nout);
805  break;
806  case AV_PIX_FMT_RGBA:
807  out_rgba(R, G, B, nout);
808  break;
809  case AV_PIX_FMT_ARGB:
810  out_argb(R, G, B, nout);
811  break;
812  case AV_PIX_FMT_RGB24:
813  out_rgb24(R, G, B, nout);
814  break;
815  case AV_PIX_FMT_BGR24:
816  out_bgr24(R, G, B, nout);
817  break;
818  default:
819  /* Unreachable, I think. */
821  "altivec_yuv2packedX doesn't support %s output\n",
822  av_get_pix_fmt_name(c->dstFormat));
823  return;
824  }
825 
826  memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
827  }
828 }
829 
830 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
831 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
832  const int16_t *lumFilter, \
833  const int16_t **lumSrc, \
834  int lumFilterSize, \
835  const int16_t *chrFilter, \
836  const int16_t **chrUSrc, \
837  const int16_t **chrVSrc, \
838  int chrFilterSize, \
839  const int16_t **alpSrc, \
840  uint8_t *dest, int dstW, int dstY) \
841 { \
842  yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
843  chrFilter, chrUSrc, chrVSrc, \
844  chrFilterSize, alpSrc, \
845  dest, dstW, dstY, pixfmt); \
846 }
847 
848 YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
849 YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
850 YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
851 YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
852 YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
853 YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
854 
855 #endif /* HAVE_ALTIVEC */
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:182
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
B1
#define B1
Definition: faandct.c:41
out
FILE * out
Definition: movenc.c:54
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:372
R0
#define R0(v, w, x, y, z, i)
Definition: sha.c:56
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
pixdesc.h
w
uint8_t w
Definition: llviddspenc.c:38
R
#define R
Definition: huffyuvdsp.h:34
B0
#define B0
Definition: faandct.c:40
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:95
R1
#define R1
Definition: simple_idct.c:172
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
SwsFunc
int(* SwsFunc)(struct SwsContext *context, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
Definition: swscale_internal.h:82
U
#define U(x)
Definition: vp56_arith.h:37
X
@ X
Definition: vf_addroi.c:26
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
av_cold
#define av_cold
Definition: attributes.h:90
AV_PIX_FMT_YUV420P
@ AV_PIX_FMT_YUV420P
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:66
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:93
NULL
#define NULL
Definition: coverity.c:32
V
#define V
Definition: avdct.c:30
AV_PIX_FMT_GRAY8
@ AV_PIX_FMT_GRAY8
Y , 8bpp.
Definition: pixfmt.h:74
AV_CPU_FLAG_ALTIVEC
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:60
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:94
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
cpu.h
srcSliceH
return srcSliceH
Definition: yuv2rgb_template.c:87
img
#define img
Definition: vf_colormatrix.c:116
yuv2rgb_altivec.h
attributes.h
ff_yuv2rgb_init_ppc
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
Definition: yuv2rgb_altivec.c:524
Y
#define Y
Definition: boxblur.h:38
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:92
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:112
in
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! 
*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
Definition: audio_convert.c:326
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
xf
#define xf(width, name, var, range_min, range_max, subs,...)
Definition: cbs_av1.c:668
av_always_inline
#define av_always_inline
Definition: attributes.h:49
swscale_internal.h
uint8_t
uint8_t
Definition: audio_convert.c:194
AV_PIX_FMT_NV21
@ AV_PIX_FMT_NV21
as above, but U and V bytes are swapped
Definition: pixfmt.h:90
G
#define G
Definition: huffyuvdsp.h:33
AV_PIX_FMT_NV12
@ AV_PIX_FMT_NV12
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:89
AV_PIX_FMT_UYVY422
@ AV_PIX_FMT_UYVY422
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:81
B
#define B
Definition: huffyuvdsp.h:32
ff_yuv2rgb_init_tables_ppc
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
Definition: yuv2rgb_altivec.c:587
AV_PIX_FMT_YUV410P
@ AV_PIX_FMT_YUV410P
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:72
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:28
h
h
Definition: vp9dsp_template.c:2038
SwsContext
Definition: swscale_internal.h:280
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
rgb2rgb.h
swscale.h
av_get_pix_fmt_name
const char * av_get_pix_fmt_name(enum AVPixelFormat pix_fmt)
Return the short name for a pixel format, NULL in case pix_fmt is unknown.
Definition: pixdesc.c:2465