FFmpeg
yuv2rgb_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /*
24  * Convert I420 YV12 to RGB in various formats,
25  * it rejects images that are not in 420 formats,
26  * it rejects images that don't have widths of multiples of 16,
27  * it rejects images that don't have heights of multiples of 2.
28  * Reject defers to C simulation code.
29  *
30  * Lots of optimizations to be done here.
31  *
32  * 1. Need to fix saturation code. I just couldn't get it to fly with packs
33  * and adds, so we currently use max/min to clip.
34  *
35  * 2. The inefficient use of chroma loading needs a bit of brushing up.
36  *
37  * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38  * pipeline stalls.
39  *
40  *
41  * MODIFIED to calculate coeffs from currently selected color space.
42  * MODIFIED core to be a macro where you specify the output format.
43  * ADDED UYVY conversion which is never called due to something in swscale.
44  * CORRECTED algorithm selection to be strict on input formats.
45  * ADDED runtime detection of AltiVec.
46  *
47  * ADDED altivec_yuv2packedX vertical scl + RGB converter
48  *
49  * March 27,2004
50  * PERFORMANCE ANALYSIS
51  *
52  * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53  * used as test.
54  * The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55  * same sequence.
56  *
57  * 720 * 480 * 30 ~10MPS
58  *
59  * so we have roughly 10 clocks per pixel. This is too high, something has
60  * to be wrong.
61  *
62  * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63  * need for vec_min.
64  *
65  * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
66  * have the input video frame, it was just decompressed so it probably resides
67  * in L1 caches. However, we are creating the output video stream. This needs
68  * to use the DSTST instruction to optimize for the cache. We couple this with
69  * the fact that we are not going to be visiting the input buffer again so we
70  * mark it Least Recently Used. This shaves 25% of the processor cycles off.
71  *
72  * Now memcpy is the largest mips consumer in the system, probably due
73  * to the inefficient X11 stuff.
74  *
75  * GL libraries seem to be very slow on this machine 1.33Ghz PB running
76  * Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77  * a versioning issue, however I have libGL.1.2.dylib for both
78  * machines. (We need to figure this out now.)
79  *
80  * GL2 libraries work now with patch for RGB32.
81  *
82  * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83  *
84  * Integrated luma prescaling adjustment for saturation/contrast/brightness
85  * adjustment.
86  */
87 
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
97 #include "libavutil/attributes.h"
98 #include "libavutil/cpu.h"
99 #include "libavutil/mem_internal.h"
100 #include "libavutil/pixdesc.h"
101 #include "yuv2rgb_altivec.h"
102 
103 #if HAVE_ALTIVEC
104 
105 #undef PROFILE_THE_BEAST
106 #undef INC_SCALING
107 
108 typedef unsigned char ubyte;
109 typedef signed char sbyte;
110 
111 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
112  * homogeneous vector registers x0,x1,x2 are interleaved with the
113  * following technique:
114  *
115  * o0 = vec_mergeh(x0, x1);
116  * o1 = vec_perm(o0, x2, perm_rgb_0);
117  * o2 = vec_perm(o0, x2, perm_rgb_1);
118  * o3 = vec_mergel(x0, x1);
119  * o4 = vec_perm(o3, o2, perm_rgb_2);
120  * o5 = vec_perm(o3, o2, perm_rgb_3);
121  *
122  * perm_rgb_0: o0(RG).h v1(B) --> o1*
123  * 0 1 2 3 4
124  * rgbr|gbrg|brgb|rgbr
125  * 0010 0100 1001 0010
126  * 0102 3145 2673 894A
127  *
128  * perm_rgb_1: o0(RG).h v1(B) --> o2
129  * 0 1 2 3 4
130  * gbrg|brgb|bbbb|bbbb
131  * 0100 1001 1111 1111
132  * B5CD 6EF7 89AB CDEF
133  *
134  * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
135  * 0 1 2 3 4
136  * gbrg|brgb|rgbr|gbrg
137  * 1111 1111 0010 0100
138  * 89AB CDEF 0182 3945
139  *
140  * perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
141  * 0 1 2 3 4
142  * brgb|rgbr|gbrg|brgb
143  * 1001 0010 0100 1001
144  * a67b 89cA BdCD eEFf
145  */
146 static const vector unsigned char
147  perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
148  0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
149  perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
150  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
151  perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
152  0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
153  perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
154  0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
155 
156 #define vec_merge3(x2, x1, x0, y0, y1, y2) \
157  do { \
158  __typeof__(x0) o0, o2, o3; \
159  o0 = vec_mergeh(x0, x1); \
160  y0 = vec_perm(o0, x2, perm_rgb_0); \
161  o2 = vec_perm(o0, x2, perm_rgb_1); \
162  o3 = vec_mergel(x0, x1); \
163  y1 = vec_perm(o3, o2, perm_rgb_2); \
164  y2 = vec_perm(o3, o2, perm_rgb_3); \
165  } while (0)
166 
167 #define vec_mstbgr24(x0, x1, x2, ptr) \
168  do { \
169  __typeof__(x0) _0, _1, _2; \
170  vec_merge3(x0, x1, x2, _0, _1, _2); \
171  vec_st(_0, 0, ptr++); \
172  vec_st(_1, 0, ptr++); \
173  vec_st(_2, 0, ptr++); \
174  } while (0)
175 
176 #define vec_mstrgb24(x0, x1, x2, ptr) \
177  do { \
178  __typeof__(x0) _0, _1, _2; \
179  vec_merge3(x2, x1, x0, _0, _1, _2); \
180  vec_st(_0, 0, ptr++); \
181  vec_st(_1, 0, ptr++); \
182  vec_st(_2, 0, ptr++); \
183  } while (0)
184 
185 /* pack the pixels in rgb0 format
186  * msb R
187  * lsb 0
188  */
189 #define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
190  do { \
191  T _0, _1, _2, _3; \
192  _0 = vec_mergeh(x0, x1); \
193  _1 = vec_mergeh(x2, x3); \
194  _2 = (T) vec_mergeh((vector unsigned short) _0, \
195  (vector unsigned short) _1); \
196  _3 = (T) vec_mergel((vector unsigned short) _0, \
197  (vector unsigned short) _1); \
198  vec_st(_2, 0 * 16, (T *) ptr); \
199  vec_st(_3, 1 * 16, (T *) ptr); \
200  _0 = vec_mergel(x0, x1); \
201  _1 = vec_mergel(x2, x3); \
202  _2 = (T) vec_mergeh((vector unsigned short) _0, \
203  (vector unsigned short) _1); \
204  _3 = (T) vec_mergel((vector unsigned short) _0, \
205  (vector unsigned short) _1); \
206  vec_st(_2, 2 * 16, (T *) ptr); \
207  vec_st(_3, 3 * 16, (T *) ptr); \
208  ptr += 4; \
209  } while (0)
210 
211 /*
212  * 1 0 1.4021 | | Y |
213  * 1 -0.3441 -0.7142 |x| Cb|
214  * 1 1.7718 0 | | Cr|
215  *
216  *
217  * Y: [-128 127]
218  * Cb/Cr : [-128 127]
219  *
220  * typical YUV conversion works on Y: 0-255 this version has been
221  * optimized for JPEG decoding.
222  */
223 
224 #if HAVE_BIGENDIAN
225 #define vec_unh(x) \
226  (vector signed short) \
227  vec_perm(x, (__typeof__(x)) { 0 }, \
228  ((vector unsigned char) { \
229  0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
230  0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
231 
232 #define vec_unl(x) \
233  (vector signed short) \
234  vec_perm(x, (__typeof__(x)) { 0 }, \
235  ((vector unsigned char) { \
236  0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
237  0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
238 #else
239 #define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 })
240 #define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 })
241 #endif
242 
243 #define vec_clip_s16(x) \
244  vec_max(vec_min(x, ((vector signed short) { \
245  235, 235, 235, 235, 235, 235, 235, 235 })), \
246  ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
247 
248 #define vec_packclp(x, y) \
249  (vector unsigned char) \
250  vec_packs((vector unsigned short) \
251  vec_max(x, ((vector signed short) { 0 })), \
252  (vector unsigned short) \
253  vec_max(y, ((vector signed short) { 0 })))
254 
255 static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
256  vector signed short U, vector signed short V,
257  vector signed short *R, vector signed short *G,
258  vector signed short *B)
259 {
260  vector signed short vx, ux, uvx;
261 
262  Y = vec_mradds(Y, c->CY, c->OY);
263  U = vec_sub(U, (vector signed short)
264  vec_splat((vector signed short) { 128 }, 0));
265  V = vec_sub(V, (vector signed short)
266  vec_splat((vector signed short) { 128 }, 0));
267 
268  // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
269  ux = vec_sl(U, c->CSHIFT);
270  *B = vec_mradds(ux, c->CBU, Y);
271 
272  // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
273  vx = vec_sl(V, c->CSHIFT);
274  *R = vec_mradds(vx, c->CRV, Y);
275 
276  // uvx = ((CGU * u) + (CGV * v)) >> 15;
277  uvx = vec_mradds(U, c->CGU, Y);
278  *G = vec_mradds(V, c->CGV, uvx);
279 }
280 
281 /*
282  * ------------------------------------------------------------------------------
283  * CS converters
284  * ------------------------------------------------------------------------------
285  */
286 
287 #if !HAVE_VSX
288 static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr)
289 {
290  const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset);
291  vector unsigned char align_perm = vec_lvsl(offset, addr);
292 
293  return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
294 }
295 #endif /* !HAVE_VSX */
296 
297 #define DEFCSP420_CVT(name, out_pixels) \
298 static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
299  int *instrides, int srcSliceY, int srcSliceH, \
300  unsigned char **oplanes, int *outstrides) \
301 { \
302  int w = c->srcW; \
303  int h = srcSliceH; \
304  int i, j; \
305  int instrides_scl[3]; \
306  vector unsigned char y0, y1; \
307  \
308  vector signed char u, v; \
309  \
310  vector signed short Y0, Y1, Y2, Y3; \
311  vector signed short U, V; \
312  vector signed short vx, ux, uvx; \
313  vector signed short vx0, ux0, uvx0; \
314  vector signed short vx1, ux1, uvx1; \
315  vector signed short R0, G0, B0; \
316  vector signed short R1, G1, B1; \
317  vector unsigned char R, G, B; \
318  \
319  vector signed short lCY = c->CY; \
320  vector signed short lOY = c->OY; \
321  vector signed short lCRV = c->CRV; \
322  vector signed short lCBU = c->CBU; \
323  vector signed short lCGU = c->CGU; \
324  vector signed short lCGV = c->CGV; \
325  vector unsigned short lCSHIFT = c->CSHIFT; \
326  \
327  const ubyte *y1i = in[0]; \
328  const ubyte *y2i = in[0] + instrides[0]; \
329  const ubyte *ui = in[1]; \
330  const ubyte *vi = in[2]; \
331  \
332  vector unsigned char *oute, *outo; \
333  \
334  /* loop moves y{1, 2}i by w */ \
335  instrides_scl[0] = instrides[0] * 2 - w; \
336  /* loop moves ui by w / 2 */ \
337  instrides_scl[1] = instrides[1] - w / 2; \
338  /* loop moves vi by w / 2 */ \
339  instrides_scl[2] = instrides[2] - w / 2; \
340  \
341  for (i = 0; i < h / 2; i++) { \
342  oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
343  (srcSliceY + i * 2)); \
344  outo = oute + (outstrides[0] >> 4); \
345  vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
346  vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
347  \
348  for (j = 0; j < w / 16; j++) { \
349  y0 = vec_xl(0, y1i); \
350  \
351  y1 = vec_xl(0, y2i); \
352  \
353  u = (vector signed char) vec_xl(0, ui); \
354  \
355  v = (vector signed char) vec_xl(0, vi); \
356  \
357  u = (vector signed char) \
358  vec_sub(u, \
359  (vector signed char) \
360  vec_splat((vector signed char) { 128 }, 0)); \
361  v = (vector signed char) \
362  vec_sub(v, \
363  (vector signed char) \
364  vec_splat((vector signed char) { 128 }, 0)); \
365  \
366  U = vec_unpackh(u); \
367  V = vec_unpackh(v); \
368  \
369  Y0 = vec_unh(y0); \
370  Y1 = vec_unl(y0); \
371  Y2 = vec_unh(y1); \
372  Y3 = vec_unl(y1); \
373  \
374  Y0 = vec_mradds(Y0, lCY, lOY); \
375  Y1 = vec_mradds(Y1, lCY, lOY); \
376  Y2 = vec_mradds(Y2, lCY, lOY); \
377  Y3 = vec_mradds(Y3, lCY, lOY); \
378  \
379  /* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
380  ux = vec_sl(U, lCSHIFT); \
381  ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
382  ux0 = vec_mergeh(ux, ux); \
383  ux1 = vec_mergel(ux, ux); \
384  \
385  /* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
386  vx = vec_sl(V, lCSHIFT); \
387  vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
388  vx0 = vec_mergeh(vx, vx); \
389  vx1 = vec_mergel(vx, vx); \
390  \
391  /* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
392  uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
393  uvx = vec_mradds(V, lCGV, uvx); \
394  uvx0 = vec_mergeh(uvx, uvx); \
395  uvx1 = vec_mergel(uvx, uvx); \
396  \
397  R0 = vec_add(Y0, vx0); \
398  G0 = vec_add(Y0, uvx0); \
399  B0 = vec_add(Y0, ux0); \
400  R1 = vec_add(Y1, vx1); \
401  G1 = vec_add(Y1, uvx1); \
402  B1 = vec_add(Y1, ux1); \
403  \
404  R = vec_packclp(R0, R1); \
405  G = vec_packclp(G0, G1); \
406  B = vec_packclp(B0, B1); \
407  \
408  out_pixels(R, G, B, oute); \
409  \
410  R0 = vec_add(Y2, vx0); \
411  G0 = vec_add(Y2, uvx0); \
412  B0 = vec_add(Y2, ux0); \
413  R1 = vec_add(Y3, vx1); \
414  G1 = vec_add(Y3, uvx1); \
415  B1 = vec_add(Y3, ux1); \
416  R = vec_packclp(R0, R1); \
417  G = vec_packclp(G0, G1); \
418  B = vec_packclp(B0, B1); \
419  \
420  \
421  out_pixels(R, G, B, outo); \
422  \
423  y1i += 16; \
424  y2i += 16; \
425  ui += 8; \
426  vi += 8; \
427  } \
428  \
429  ui += instrides_scl[1]; \
430  vi += instrides_scl[2]; \
431  y1i += instrides_scl[0]; \
432  y2i += instrides_scl[0]; \
433  } \
434  return srcSliceH; \
435 }
436 
437 #define out_abgr(a, b, c, ptr) \
438  vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), c, b, a, ptr)
439 #define out_bgra(a, b, c, ptr) \
440  vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr)
441 #define out_rgba(a, b, c, ptr) \
442  vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr)
443 #define out_argb(a, b, c, ptr) \
444  vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), a, b, c, ptr)
445 #define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
446 #define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
447 
448 DEFCSP420_CVT(yuv2_abgr, out_abgr)
449 DEFCSP420_CVT(yuv2_bgra, out_bgra)
450 DEFCSP420_CVT(yuv2_rgba, out_rgba)
451 DEFCSP420_CVT(yuv2_argb, out_argb)
452 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
453 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
454 
455 // uyvy|uyvy|uyvy|uyvy
456 // 0123 4567 89ab cdef
457 static const vector unsigned char
458  demux_u = { 0x10, 0x00, 0x10, 0x00,
459  0x10, 0x04, 0x10, 0x04,
460  0x10, 0x08, 0x10, 0x08,
461  0x10, 0x0c, 0x10, 0x0c },
462  demux_v = { 0x10, 0x02, 0x10, 0x02,
463  0x10, 0x06, 0x10, 0x06,
464  0x10, 0x0A, 0x10, 0x0A,
465  0x10, 0x0E, 0x10, 0x0E },
466  demux_y = { 0x10, 0x01, 0x10, 0x03,
467  0x10, 0x05, 0x10, 0x07,
468  0x10, 0x09, 0x10, 0x0B,
469  0x10, 0x0D, 0x10, 0x0F };
470 
471 /*
472  * this is so I can play live CCIR raw video
473  */
474 static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
475  int *instrides, int srcSliceY, int srcSliceH,
476  unsigned char **oplanes, int *outstrides)
477 {
478  int w = c->srcW;
479  int h = srcSliceH;
480  int i, j;
481  vector unsigned char uyvy;
482  vector signed short Y, U, V;
483  vector signed short R0, G0, B0, R1, G1, B1;
484  vector unsigned char R, G, B;
485  vector unsigned char *out;
486  const ubyte *img;
487 
488  img = in[0];
489  out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
490 
491  for (i = 0; i < h; i++)
492  for (j = 0; j < w / 16; j++) {
493  uyvy = vec_ld(0, img);
494 
495  U = (vector signed short)
496  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
497  V = (vector signed short)
498  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
499  Y = (vector signed short)
500  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
501 
502  cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
503 
504  uyvy = vec_ld(16, img);
505 
506  U = (vector signed short)
507  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
508  V = (vector signed short)
509  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
510  Y = (vector signed short)
511  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
512 
513  cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
514 
515  R = vec_packclp(R0, R1);
516  G = vec_packclp(G0, G1);
517  B = vec_packclp(B0, B1);
518 
519  // vec_mstbgr24 (R,G,B, out);
520  out_rgba(R, G, B, out);
521 
522  img += 32;
523  }
524  return srcSliceH;
525 }
526 
527 #endif /* HAVE_ALTIVEC */
528 
529 /* Ok currently the acceleration routine only supports
530  * inputs of widths a multiple of 16
531  * and heights a multiple of 2
532  *
533  * So we just fall back to the C codes for this.
534  */
536 {
537 #if HAVE_ALTIVEC
539  return NULL;
540 
541  /*
542  * and this seems not to matter too much I tried a bunch of
543  * videos with abnormal widths and MPlayer crashes elsewhere.
544  * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
545  * boom with X11 bad match.
546  *
547  */
548  if ((c->srcW & 0xf) != 0)
549  return NULL;
550 
551  switch (c->srcFormat) {
552  case AV_PIX_FMT_YUV410P:
553  case AV_PIX_FMT_YUV420P:
554  /*case IMGFMT_CLPL: ??? */
555  case AV_PIX_FMT_GRAY8:
556  case AV_PIX_FMT_NV12:
557  case AV_PIX_FMT_NV21:
558  if ((c->srcH & 0x1) != 0)
559  return NULL;
560 
561  switch (c->dstFormat) {
562  case AV_PIX_FMT_RGB24:
563  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
564  return altivec_yuv2_rgb24;
565  case AV_PIX_FMT_BGR24:
566  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
567  return altivec_yuv2_bgr24;
568  case AV_PIX_FMT_ARGB:
569  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
570  return altivec_yuv2_argb;
571  case AV_PIX_FMT_ABGR:
572  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
573  return altivec_yuv2_abgr;
574  case AV_PIX_FMT_RGBA:
575  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
576  return altivec_yuv2_rgba;
577  case AV_PIX_FMT_BGRA:
578  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
579  return altivec_yuv2_bgra;
580  default: return NULL;
581  }
582  break;
583 
584  case AV_PIX_FMT_UYVY422:
585  switch (c->dstFormat) {
586  case AV_PIX_FMT_BGR32:
587  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
588  return altivec_uyvy_rgb32;
589  default: return NULL;
590  }
591  break;
592  }
593 #endif /* HAVE_ALTIVEC */
594 
595  return NULL;
596 }
597 
599  const int inv_table[4],
600  int brightness,
601  int contrast,
602  int saturation)
603 {
604 #if HAVE_ALTIVEC
605  union {
606  DECLARE_ALIGNED(16, signed short, tmp)[8];
607  vector signed short vec;
608  } buf;
609 
611  return;
612 
613  buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
614  buf.tmp[1] = -256 * brightness; // oy
615  buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
616  buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
617  buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
618  buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
619 
620  c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
621  c->CY = vec_splat((vector signed short) buf.vec, 0);
622  c->OY = vec_splat((vector signed short) buf.vec, 1);
623  c->CRV = vec_splat((vector signed short) buf.vec, 2);
624  c->CBU = vec_splat((vector signed short) buf.vec, 3);
625  c->CGU = vec_splat((vector signed short) buf.vec, 4);
626  c->CGV = vec_splat((vector signed short) buf.vec, 5);
627  return;
628 #endif /* HAVE_ALTIVEC */
629 }
630 
631 #if HAVE_ALTIVEC
632 
633 static av_always_inline void yuv2packedX_altivec(SwsContext *c,
634  const int16_t *lumFilter,
635  const int16_t **lumSrc,
636  int lumFilterSize,
637  const int16_t *chrFilter,
638  const int16_t **chrUSrc,
639  const int16_t **chrVSrc,
640  int chrFilterSize,
641  const int16_t **alpSrc,
642  uint8_t *dest,
643  int dstW, int dstY,
644  enum AVPixelFormat target)
645 {
646  int i, j;
647  vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
648  vector signed short R0, G0, B0, R1, G1, B1;
649 
650  vector unsigned char R, G, B;
651  vector unsigned char *out, *nout;
652 
653  vector signed short RND = vec_splat_s16(1 << 3);
654  vector unsigned short SCL = vec_splat_u16(4);
655  DECLARE_ALIGNED(16, unsigned int, scratch)[16];
656 
657  vector signed short *YCoeffs, *CCoeffs;
658 
659  YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
660  CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
661 
662  out = (vector unsigned char *) dest;
663 
664  for (i = 0; i < dstW; i += 16) {
665  Y0 = RND;
666  Y1 = RND;
667  /* extract 16 coeffs from lumSrc */
668  for (j = 0; j < lumFilterSize; j++) {
669  X0 = vec_ld(0, &lumSrc[j][i]);
670  X1 = vec_ld(16, &lumSrc[j][i]);
671  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
672  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
673  }
674 
675  U = RND;
676  V = RND;
677  /* extract 8 coeffs from U,V */
678  for (j = 0; j < chrFilterSize; j++) {
679  X = vec_ld(0, &chrUSrc[j][i / 2]);
680  U = vec_mradds(X, CCoeffs[j], U);
681  X = vec_ld(0, &chrVSrc[j][i / 2]);
682  V = vec_mradds(X, CCoeffs[j], V);
683  }
684 
685  /* scale and clip signals */
686  Y0 = vec_sra(Y0, SCL);
687  Y1 = vec_sra(Y1, SCL);
688  U = vec_sra(U, SCL);
689  V = vec_sra(V, SCL);
690 
691  Y0 = vec_clip_s16(Y0);
692  Y1 = vec_clip_s16(Y1);
693  U = vec_clip_s16(U);
694  V = vec_clip_s16(V);
695 
696  /* now we have
697  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
698  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
699  *
700  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
701  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
702  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
703  */
704 
705  U0 = vec_mergeh(U, U);
706  V0 = vec_mergeh(V, V);
707 
708  U1 = vec_mergel(U, U);
709  V1 = vec_mergel(V, V);
710 
711  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
712  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
713 
714  R = vec_packclp(R0, R1);
715  G = vec_packclp(G0, G1);
716  B = vec_packclp(B0, B1);
717 
718  switch (target) {
719  case AV_PIX_FMT_ABGR:
720  out_abgr(R, G, B, out);
721  break;
722  case AV_PIX_FMT_BGRA:
723  out_bgra(R, G, B, out);
724  break;
725  case AV_PIX_FMT_RGBA:
726  out_rgba(R, G, B, out);
727  break;
728  case AV_PIX_FMT_ARGB:
729  out_argb(R, G, B, out);
730  break;
731  case AV_PIX_FMT_RGB24:
732  out_rgb24(R, G, B, out);
733  break;
734  case AV_PIX_FMT_BGR24:
735  out_bgr24(R, G, B, out);
736  break;
737  default:
738  {
739  /* If this is reached, the caller should have called yuv2packedXinC
740  * instead. */
741  static int printed_error_message;
742  if (!printed_error_message) {
744  "altivec_yuv2packedX doesn't support %s output\n",
745  av_get_pix_fmt_name(c->dstFormat));
746  printed_error_message = 1;
747  }
748  return;
749  }
750  }
751  }
752 
753  if (i < dstW) {
754  i -= 16;
755 
756  Y0 = RND;
757  Y1 = RND;
758  /* extract 16 coeffs from lumSrc */
759  for (j = 0; j < lumFilterSize; j++) {
760  X0 = vec_ld(0, &lumSrc[j][i]);
761  X1 = vec_ld(16, &lumSrc[j][i]);
762  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
763  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
764  }
765 
766  U = RND;
767  V = RND;
768  /* extract 8 coeffs from U,V */
769  for (j = 0; j < chrFilterSize; j++) {
770  X = vec_ld(0, &chrUSrc[j][i / 2]);
771  U = vec_mradds(X, CCoeffs[j], U);
772  X = vec_ld(0, &chrVSrc[j][i / 2]);
773  V = vec_mradds(X, CCoeffs[j], V);
774  }
775 
776  /* scale and clip signals */
777  Y0 = vec_sra(Y0, SCL);
778  Y1 = vec_sra(Y1, SCL);
779  U = vec_sra(U, SCL);
780  V = vec_sra(V, SCL);
781 
782  Y0 = vec_clip_s16(Y0);
783  Y1 = vec_clip_s16(Y1);
784  U = vec_clip_s16(U);
785  V = vec_clip_s16(V);
786 
787  /* now we have
788  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
789  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
790  *
791  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
792  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
793  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
794  */
795 
796  U0 = vec_mergeh(U, U);
797  V0 = vec_mergeh(V, V);
798 
799  U1 = vec_mergel(U, U);
800  V1 = vec_mergel(V, V);
801 
802  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
803  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
804 
805  R = vec_packclp(R0, R1);
806  G = vec_packclp(G0, G1);
807  B = vec_packclp(B0, B1);
808 
809  nout = (vector unsigned char *) scratch;
810  switch (target) {
811  case AV_PIX_FMT_ABGR:
812  out_abgr(R, G, B, nout);
813  break;
814  case AV_PIX_FMT_BGRA:
815  out_bgra(R, G, B, nout);
816  break;
817  case AV_PIX_FMT_RGBA:
818  out_rgba(R, G, B, nout);
819  break;
820  case AV_PIX_FMT_ARGB:
821  out_argb(R, G, B, nout);
822  break;
823  case AV_PIX_FMT_RGB24:
824  out_rgb24(R, G, B, nout);
825  break;
826  case AV_PIX_FMT_BGR24:
827  out_bgr24(R, G, B, nout);
828  break;
829  default:
830  /* Unreachable, I think. */
832  "altivec_yuv2packedX doesn't support %s output\n",
833  av_get_pix_fmt_name(c->dstFormat));
834  return;
835  }
836 
837  memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
838  }
839 }
840 
841 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
842 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
843  const int16_t *lumFilter, \
844  const int16_t **lumSrc, \
845  int lumFilterSize, \
846  const int16_t *chrFilter, \
847  const int16_t **chrUSrc, \
848  const int16_t **chrVSrc, \
849  int chrFilterSize, \
850  const int16_t **alpSrc, \
851  uint8_t *dest, int dstW, int dstY) \
852 { \
853  yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
854  chrFilter, chrUSrc, chrVSrc, \
855  chrFilterSize, alpSrc, \
856  dest, dstW, dstY, pixfmt); \
857 }
858 
859 YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
860 YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
861 YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
862 YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
863 YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
864 YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
865 
866 #endif /* HAVE_ALTIVEC */
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:186
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
mem_internal.h
B1
#define B1
Definition: faandct.c:41
out
FILE * out
Definition: movenc.c:54
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:381
R0
#define R0(v, w, x, y, z, i)
Definition: sha.c:57
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
pixdesc.h
w
uint8_t w
Definition: llviddspenc.c:38
R
#define R
Definition: huffyuvdsp.h:34
B0
#define B0
Definition: faandct.c:40
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:95
R1
#define R1
Definition: simple_idct.c:171
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:101
SwsFunc
int(* SwsFunc)(struct SwsContext *context, const uint8_t *src[], int srcStride[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[])
Definition: swscale_internal.h:98
U
#define U(x)
Definition: vp56_arith.h:37
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:180
av_cold
#define av_cold
Definition: attributes.h:90
AV_PIX_FMT_YUV420P
@ AV_PIX_FMT_YUV420P
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:66
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:93
NULL
#define NULL
Definition: coverity.c:32
V
#define V
Definition: avdct.c:30
AV_PIX_FMT_GRAY8
@ AV_PIX_FMT_GRAY8
Y , 8bpp.
Definition: pixfmt.h:74
AV_CPU_FLAG_ALTIVEC
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:60
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:94
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
cpu.h
srcSliceH
return srcSliceH
Definition: yuv2rgb_template.c:87
img
#define img
Definition: vf_colormatrix.c:116
yuv2rgb_altivec.h
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
attributes.h
ff_yuv2rgb_init_ppc
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
Definition: yuv2rgb_altivec.c:535
Y
#define Y
Definition: boxblur.h:37
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:92
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:116
X
@ X
Definition: vf_addroi.c:26
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
xf
#define xf(width, name, var, range_min, range_max, subs,...)
Definition: cbs_av1.c:664
av_always_inline
#define av_always_inline
Definition: attributes.h:49
swscale_internal.h
AV_PIX_FMT_NV21
@ AV_PIX_FMT_NV21
as above, but U and V bytes are swapped
Definition: pixfmt.h:90
G
#define G
Definition: huffyuvdsp.h:33
AV_PIX_FMT_NV12
@ AV_PIX_FMT_NV12
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:89
AV_PIX_FMT_UYVY422
@ AV_PIX_FMT_UYVY422
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:81
B
#define B
Definition: huffyuvdsp.h:32
ff_yuv2rgb_init_tables_ppc
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
Definition: yuv2rgb_altivec.c:598
AV_PIX_FMT_YUV410P
@ AV_PIX_FMT_YUV410P
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:72
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
h
h
Definition: vp9dsp_template.c:2038
SwsContext
Definition: swscale_internal.h:298
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
rgb2rgb.h
swscale.h
av_get_pix_fmt_name
const char * av_get_pix_fmt_name(enum AVPixelFormat pix_fmt)
Return the short name for a pixel format, NULL in case pix_fmt is unknown.
Definition: pixdesc.c:2582