FFmpeg
yuv2rgb_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /*
24  * Convert I420 YV12 to RGB in various formats,
25  * it rejects images that are not in 420 formats,
26  * it rejects images that don't have widths of multiples of 16,
27  * it rejects images that don't have heights of multiples of 2.
28  * Reject defers to C simulation code.
29  *
30  * Lots of optimizations to be done here.
31  *
32  * 1. Need to fix saturation code. I just couldn't get it to fly with packs
33  * and adds, so we currently use max/min to clip.
34  *
35  * 2. The inefficient use of chroma loading needs a bit of brushing up.
36  *
37  * 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38  * pipeline stalls.
39  *
40  *
41  * MODIFIED to calculate coeffs from currently selected color space.
42  * MODIFIED core to be a macro where you specify the output format.
43  * ADDED UYVY conversion which is never called due to some thing in swscale.
44  * CORRECTED algorithm selection to be strict on input formats.
45  * ADDED runtime detection of AltiVec.
46  *
47  * ADDED altivec_yuv2packedX vertical scl + RGB converter
48  *
49  * March 27,2004
50  * PERFORMANCE ANALYSIS
51  *
52  * The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53  * used as test.
54  * The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55  * same sequence.
56  *
57  * 720 * 480 * 30 ~10MPS
58  *
59  * so we have roughly 10 clocks per pixel. This is too high, something has
60  * to be wrong.
61  *
62  * OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63  * need for vec_min.
64  *
65  * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
66  * have the input video frame, it was just decompressed so it probably resides
67  * in L1 caches. However, we are creating the output video stream. This needs
68  * to use the DSTST instruction to optimize for the cache. We couple this with
69  * the fact that we are not going to be visiting the input buffer again so we
70  * mark it Least Recently Used. This shaves 25% of the processor cycles off.
71  *
72  * Now memcpy is the largest mips consumer in the system, probably due
73  * to the inefficient X11 stuff.
74  *
75  * GL libraries seem to be very slow on this machine 1.33Ghz PB running
76  * Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77  * a versioning issue, however I have libGL.1.2.dylib for both
78  * machines. (We need to figure this out now.)
79  *
80  * GL2 libraries work now with patch for RGB32.
81  *
82  * NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83  *
84  * Integrated luma prescaling adjustment for saturation/contrast/brightness
85  * adjustment.
86  */
87 
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
97 #include "libavutil/attributes.h"
98 #include "libavutil/cpu.h"
99 #include "libavutil/mem_internal.h"
100 #include "libavutil/pixdesc.h"
101 #include "yuv2rgb_altivec.h"
102 
103 #if HAVE_ALTIVEC
104 
105 #undef PROFILE_THE_BEAST
106 #undef INC_SCALING
107 
/* Short aliases for the 8-bit element types used throughout this file. */
typedef signed char   sbyte;
typedef unsigned char ubyte;
110 
111 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
112  * homogeneous vector registers x0,x1,x2 are interleaved with the
113  * following technique:
114  *
115  * o0 = vec_mergeh(x0, x1);
116  * o1 = vec_perm(o0, x2, perm_rgb_0);
117  * o2 = vec_perm(o0, x2, perm_rgb_1);
118  * o3 = vec_mergel(x0, x1);
119  * o4 = vec_perm(o3, o2, perm_rgb_2);
120  * o5 = vec_perm(o3, o2, perm_rgb_3);
121  *
122  * perm_rgb_0: o0(RG).h v1(B) --> o1*
123  * 0 1 2 3 4
124  * rgbr|gbrg|brgb|rgbr
125  * 0010 0100 1001 0010
126  * 0102 3145 2673 894A
127  *
128  * perm_rgb_1: o0(RG).h v1(B) --> o2
129  * 0 1 2 3 4
130  * gbrg|brgb|bbbb|bbbb
131  * 0100 1001 1111 1111
132  * B5CD 6EF7 89AB CDEF
133  *
134  * perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
135  * 0 1 2 3 4
136  * gbrg|brgb|rgbr|gbrg
137  * 1111 1111 0010 0100
138  * 89AB CDEF 0182 3945
139  *
140  * perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
141  * 0 1 2 3 4
142  * brgb|rgbr|gbrg|brgb
143  * 1001 0010 0100 1001
144  * a67b 89cA BdCD eEFf
145  */
146 static const vector unsigned char
147  perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
148  0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
149  perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
150  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
151  perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
152  0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
153  perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
154  0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
155 
/* Interleave three planar 16-byte vectors into three packed vectors
 * (y0, y1, y2) holding 16 three-byte pixels. x0 and x1 are merged
 * pairwise first, then bytes of x2 are woven in with the perm_rgb_*
 * tables. Note that the parameter list names the inputs in the order
 * (x2, x1, x0). */
#define vec_merge3(x2, x1, x0, y0, y1, y2)                  \
    do {                                                    \
        __typeof__(x0) m3_head, m3_carry, m3_tail;          \
        m3_head  = vec_mergeh(x0, x1);                      \
        y0       = vec_perm(m3_head, x2, perm_rgb_0);       \
        m3_carry = vec_perm(m3_head, x2, perm_rgb_1);       \
        m3_tail  = vec_mergel(x0, x1);                      \
        y1       = vec_perm(m3_tail, m3_carry, perm_rgb_2); \
        y2       = vec_perm(m3_tail, m3_carry, perm_rgb_3); \
    } while (0)
166 
/* Interleave x0, x1, x2 with vec_merge3(x0, x1, x2, ...) and store the
 * three resulting vectors (48 bytes) at ptr, then advance ptr by three
 * vectors. ptr must be a modifiable vector pointer. */
#define vec_mstbgr24(x0, x1, x2, ptr)                   \
    do {                                                \
        __typeof__(x0) px24_a, px24_b, px24_c;          \
        vec_merge3(x0, x1, x2, px24_a, px24_b, px24_c); \
        vec_st(px24_a, 0 * 16, ptr);                    \
        vec_st(px24_b, 1 * 16, ptr);                    \
        vec_st(px24_c, 2 * 16, ptr);                    \
        ptr += 3;                                       \
    } while (0)
175 
/* Same as vec_mstbgr24() but hands the inputs to vec_merge3() in the
 * opposite order (x2, x1, x0), which swaps the first and third byte of
 * every stored pixel. Advances ptr by three vectors. */
#define vec_mstrgb24(x0, x1, x2, ptr)                   \
    do {                                                \
        __typeof__(x0) px24_a, px24_b, px24_c;          \
        vec_merge3(x2, x1, x0, px24_a, px24_b, px24_c); \
        vec_st(px24_a, 0 * 16, ptr);                    \
        vec_st(px24_b, 1 * 16, ptr);                    \
        vec_st(px24_c, 2 * 16, ptr);                    \
        ptr += 3;                                       \
    } while (0)
184 
185 /* pack the pixels in rgb0 format
186  * msb R
187  * lsb 0
188  */
/* Interleave four planar byte vectors x0..x3 (all of type T) into 16
 * four-byte pixels, byte order x0 x1 x2 x3, store the 64 bytes at ptr
 * and advance ptr by four vectors. */
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr)                              \
    do {                                                                  \
        T p01, p23;                                                       \
        /* first eight pixels */                                          \
        p01 = vec_mergeh(x0, x1);                                         \
        p23 = vec_mergeh(x2, x3);                                         \
        vec_st((T) vec_mergeh((vector unsigned short) p01,                \
                              (vector unsigned short) p23),               \
               0 * 16, (T *) ptr);                                        \
        vec_st((T) vec_mergel((vector unsigned short) p01,                \
                              (vector unsigned short) p23),               \
               1 * 16, (T *) ptr);                                        \
        /* last eight pixels */                                           \
        p01 = vec_mergel(x0, x1);                                         \
        p23 = vec_mergel(x2, x3);                                         \
        vec_st((T) vec_mergeh((vector unsigned short) p01,                \
                              (vector unsigned short) p23),               \
               2 * 16, (T *) ptr);                                        \
        vec_st((T) vec_mergel((vector unsigned short) p01,                \
                              (vector unsigned short) p23),               \
               3 * 16, (T *) ptr);                                        \
        ptr += 4;                                                         \
    } while (0)
210 
211 /*
212  * 1 0 1.4021 | | Y |
213  * 1 -0.3441 -0.7142 |x| Cb|
214  * 1 1.7718 0 | | Cr|
215  *
216  *
217  * Y: [-128 127]
218  * Cb/Cr : [-128 127]
219  *
220  * typical YUV conversion works on Y: 0-255 this version has been
221  * optimized for JPEG decoding.
222  */
223 
/* vec_unh(x) / vec_unl(x): widen the high / low eight bytes of x to
 * 16-bit lanes by pairing every byte with a zero byte. The pairing is
 * done with vec_mergeh/vec_mergel on little-endian builds and with an
 * explicit vec_perm() (zero byte first) on big-endian builds. */
#if !HAVE_BIGENDIAN
#define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 })
#define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 })
#else
#define vec_unh(x)                                                  \
    (vector signed short)                                           \
        vec_perm(x, (__typeof__(x)) { 0 },                          \
                 ((vector unsigned char) {                          \
                     0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
                     0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))

#define vec_unl(x)                                                  \
    (vector signed short)                                           \
        vec_perm(x, (__typeof__(x)) { 0 },                          \
                 ((vector unsigned char) {                          \
                     0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
                     0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#endif
242 
/* Clamp every 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                              \
    vec_max(vec_min(x, vec_splat((vector signed short) { 235 }, 0)), \
            vec_splat((vector signed short) { 16 }, 0))
247 
/* Raise negative lanes of x and y to zero, then pack both vectors into
 * one vector of 16 unsigned bytes with vec_packs(). */
#define vec_packclp(x, y)                                           \
    (vector unsigned char)                                          \
        vec_packs((vector unsigned short) vec_max(x, vec_splat_s16(0)), \
                  (vector unsigned short) vec_max(y, vec_splat_s16(0)))
254 
255 static inline void cvtyuvtoRGB(SwsInternal *c, vector signed short Y,
256  vector signed short U, vector signed short V,
257  vector signed short *R, vector signed short *G,
258  vector signed short *B)
259 {
260  vector signed short vx, ux, uvx;
261 
262  Y = vec_mradds(Y, c->CY, c->OY);
263  U = vec_sub(U, (vector signed short)
264  vec_splat((vector signed short) { 128 }, 0));
265  V = vec_sub(V, (vector signed short)
266  vec_splat((vector signed short) { 128 }, 0));
267 
268  // ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
269  ux = vec_sl(U, c->CSHIFT);
270  *B = vec_mradds(ux, c->CBU, Y);
271 
272  // vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
273  vx = vec_sl(V, c->CSHIFT);
274  *R = vec_mradds(vx, c->CRV, Y);
275 
276  // uvx = ((CGU * u) + (CGV * v)) >> 15;
277  uvx = vec_mradds(U, c->CGU, Y);
278  *G = vec_mradds(V, c->CGV, uvx);
279 }
280 
281 /*
282  * ------------------------------------------------------------------------------
283  * CS converters
284  * ------------------------------------------------------------------------------
285  */
286 
287 #if !HAVE_VSX
288 static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr)
289 {
290  const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset);
291  vector unsigned char align_perm = vec_lvsl(offset, addr);
292 
293  return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
294 }
295 #endif /* !HAVE_VSX */
296 
/*
 * DEFCSP420_CVT(name, out_pixels) defines altivec_<name>(), a converter
 * with the SwsFunc argument list that reads three planes (in[0] luma,
 * in[1] and in[2] chroma) and writes packed pixels through out_pixels().
 *
 * Rows are handled in pairs: every inner pass reads 16 luma samples from
 * each of two consecutive rows plus 8 samples from each chroma plane,
 * and the same chroma terms are added to both rows. Only srcSliceH / 2
 * row pairs and src_w / 16 groups per row are processed.
 * The function returns srcSliceH.
 */
#define DEFCSP420_CVT(name, out_pixels)                                        \
static int altivec_ ## name(SwsInternal *c, const unsigned char *const *in,    \
                            const int *instrides, int srcSliceY,               \
                            int srcSliceH, unsigned char *const *oplanes,      \
                            const int *outstrides)                             \
{                                                                              \
    const int w         = c->opts.src_w;                                       \
    const int row_pairs = srcSliceH / 2;                                       \
    /* the inner loop moves the luma pointers by w and the chroma pointers */  \
    /* by w / 2; these are the remaining steps to the next row pair        */  \
    const int luma_skip = instrides[0] * 2 - w;                                \
    const int cb_skip   = instrides[1] - w / 2;                                \
    const int cr_skip   = instrides[2] - w / 2;                                \
    /* control word shared by the two vec_dstst() hints issued per pair */     \
    const int dst_ctl   = 0x02000002 | (((w * 3 + 32) / 32) << 16);            \
                                                                               \
    const vector signed short zero16 = (vector signed short) { 0 };            \
    const vector signed char  bias8  =                                         \
        (vector signed char) vec_splat((vector signed char) { 128 }, 0);       \
                                                                               \
    const vector signed short   lCY     = c->CY;                               \
    const vector signed short   lOY     = c->OY;                               \
    const vector signed short   lCRV    = c->CRV;                              \
    const vector signed short   lCBU    = c->CBU;                              \
    const vector signed short   lCGU    = c->CGU;                              \
    const vector signed short   lCGV    = c->CGV;                              \
    const vector unsigned short lCSHIFT = c->CSHIFT;                           \
                                                                               \
    const ubyte *y1i = in[0];                                                  \
    const ubyte *y2i = in[0] + instrides[0];                                   \
    const ubyte *ui  = in[1];                                                  \
    const ubyte *vi  = in[2];                                                  \
    int i, j;                                                                  \
                                                                               \
    for (i = 0; i < row_pairs; i++) {                                          \
        vector unsigned char *oute = (vector unsigned char *)                  \
            (oplanes[0] + outstrides[0] * (srcSliceY + i * 2));                \
        vector unsigned char *outo = oute + (outstrides[0] >> 4);              \
                                                                               \
        vec_dstst(outo, dst_ctl, 0);                                           \
        vec_dstst(oute, dst_ctl, 1);                                           \
                                                                               \
        for (j = 0; j < w / 16; j++) {                                         \
            vector unsigned char y_top = vec_xl(0, y1i);                       \
            vector unsigned char y_bot = vec_xl(0, y2i);                       \
            vector signed char cb8 = (vector signed char)                      \
                vec_sub((vector signed char) vec_xl(0, ui), bias8);            \
            vector signed char cr8 = (vector signed char)                      \
                vec_sub((vector signed char) vec_xl(0, vi), bias8);            \
            vector signed short U = vec_unpackh(cb8);                          \
            vector signed short V = vec_unpackh(cr8);                          \
            vector signed short Y0, Y1, Y2, Y3;                                \
            vector signed short ux, vx, uvx;                                   \
            vector signed short ux0, ux1, vx0, vx1, uvx0, uvx1;                \
            vector signed short R0, G0, B0, R1, G1, B1;                        \
            vector unsigned char R, G, B;                                      \
                                                                               \
            Y0 = vec_unh(y_top);                                               \
            Y1 = vec_unl(y_top);                                               \
            Y2 = vec_unh(y_bot);                                               \
            Y3 = vec_unl(y_bot);                                               \
                                                                               \
            Y0 = vec_mradds(Y0, lCY, lOY);                                     \
            Y1 = vec_mradds(Y1, lCY, lOY);                                     \
            Y2 = vec_mradds(Y2, lCY, lOY);                                     \
            Y3 = vec_mradds(Y3, lCY, lOY);                                     \
                                                                               \
            /* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */                    \
            ux  = vec_mradds(vec_sl(U, lCSHIFT), lCBU, zero16);                \
            ux0 = vec_mergeh(ux, ux);                                          \
            ux1 = vec_mergel(ux, ux);                                          \
                                                                               \
            /* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15 */                    \
            vx  = vec_mradds(vec_sl(V, lCSHIFT), lCRV, zero16);                \
            vx0 = vec_mergeh(vx, vx);                                          \
            vx1 = vec_mergel(vx, vx);                                          \
                                                                               \
            /* uvx = ((CGU * u) + (CGV * v)) >> 15 */                          \
            uvx  = vec_mradds(U, lCGU, zero16);                                \
            uvx  = vec_mradds(V, lCGV, uvx);                                   \
            uvx0 = vec_mergeh(uvx, uvx);                                       \
            uvx1 = vec_mergel(uvx, uvx);                                       \
                                                                               \
            /* first row of the pair */                                        \
            R0 = vec_add(Y0, vx0);                                             \
            G0 = vec_add(Y0, uvx0);                                            \
            B0 = vec_add(Y0, ux0);                                             \
            R1 = vec_add(Y1, vx1);                                             \
            G1 = vec_add(Y1, uvx1);                                            \
            B1 = vec_add(Y1, ux1);                                             \
                                                                               \
            R = vec_packclp(R0, R1);                                           \
            G = vec_packclp(G0, G1);                                           \
            B = vec_packclp(B0, B1);                                           \
                                                                               \
            out_pixels(R, G, B, oute);                                         \
                                                                               \
            /* second row of the pair, same chroma terms */                    \
            R0 = vec_add(Y2, vx0);                                             \
            G0 = vec_add(Y2, uvx0);                                            \
            B0 = vec_add(Y2, ux0);                                             \
            R1 = vec_add(Y3, vx1);                                             \
            G1 = vec_add(Y3, uvx1);                                            \
            B1 = vec_add(Y3, ux1);                                             \
                                                                               \
            R = vec_packclp(R0, R1);                                           \
            G = vec_packclp(G0, G1);                                           \
            B = vec_packclp(B0, B1);                                           \
                                                                               \
            out_pixels(R, G, B, outo);                                         \
                                                                               \
            y1i += 16;                                                         \
            y2i += 16;                                                         \
            ui  += 8;                                                          \
            vi  += 8;                                                          \
        }                                                                      \
                                                                               \
        ui  += cb_skip;                                                        \
        vi  += cr_skip;                                                        \
        y1i += luma_skip;                                                      \
        y2i += luma_skip;                                                      \
    }                                                                          \
    return srcSliceH;                                                          \
}
436 
/* Vector of the same type as v with every byte set to 255; used as the
 * fourth (alpha) channel by the 32-bit writers below. */
#define vec_opaque_like(v) \
    ((__typeof__(v)) vec_splat((__typeof__(v)){ 255 }, 0))

/* Pixel writers: (a, b, c) are the R, G and B byte vectors supplied by
 * the converters, ptr is the output vector pointer, which is advanced. */
#define out_abgr(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), vec_opaque_like(a), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), c, b, a, vec_opaque_like(a), ptr)
#define out_rgba(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), a, b, c, vec_opaque_like(a), ptr)
#define out_argb(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), vec_opaque_like(a), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
447 
448 DEFCSP420_CVT(yuv2_abgr, out_abgr)
449 DEFCSP420_CVT(yuv2_bgra, out_bgra)
450 DEFCSP420_CVT(yuv2_rgba, out_rgba)
451 DEFCSP420_CVT(yuv2_argb, out_argb)
452 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
453 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
454 
455 // uyvy|uyvy|uyvy|uyvy
456 // 0123 4567 89ab cdef
457 static const vector unsigned char
458  demux_u = { 0x10, 0x00, 0x10, 0x00,
459  0x10, 0x04, 0x10, 0x04,
460  0x10, 0x08, 0x10, 0x08,
461  0x10, 0x0c, 0x10, 0x0c },
462  demux_v = { 0x10, 0x02, 0x10, 0x02,
463  0x10, 0x06, 0x10, 0x06,
464  0x10, 0x0A, 0x10, 0x0A,
465  0x10, 0x0E, 0x10, 0x0E },
466  demux_y = { 0x10, 0x01, 0x10, 0x03,
467  0x10, 0x05, 0x10, 0x07,
468  0x10, 0x09, 0x10, 0x0B,
469  0x10, 0x0D, 0x10, 0x0F };
470 
471 /*
472  * this is so I can play live CCIR raw video
473  */
474 static int altivec_uyvy_rgb32(SwsInternal *c, const unsigned char *const *in,
475  const int *instrides, int srcSliceY, int srcSliceH,
476  unsigned char *const *oplanes, const int *outstrides)
477 {
478  int w = c->opts.src_w;
479  int h = srcSliceH;
480  int i, j;
481  vector unsigned char uyvy;
482  vector signed short Y, U, V;
483  vector signed short R0, G0, B0, R1, G1, B1;
484  vector unsigned char R, G, B;
485  vector unsigned char *out;
486  const ubyte *img;
487 
488  img = in[0];
489  out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
490 
491  for (i = 0; i < h; i++)
492  for (j = 0; j < w / 16; j++) {
493  uyvy = vec_ld(0, img);
494 
495  U = (vector signed short)
496  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
497  V = (vector signed short)
498  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
499  Y = (vector signed short)
500  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
501 
502  cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
503 
504  uyvy = vec_ld(16, img);
505 
506  U = (vector signed short)
507  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
508  V = (vector signed short)
509  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
510  Y = (vector signed short)
511  vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
512 
513  cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
514 
515  R = vec_packclp(R0, R1);
516  G = vec_packclp(G0, G1);
517  B = vec_packclp(B0, B1);
518 
519  // vec_mstbgr24 (R,G,B, out);
520  out_rgba(R, G, B, out);
521 
522  img += 32;
523  }
524  return srcSliceH;
525 }
526 
527 #endif /* HAVE_ALTIVEC */
528 
529 /* Ok currently the acceleration routine only supports
530  * inputs of widths a multiple of 16
531  * and heights a multiple of 2
532  *
533  * So we just fall back to the C codes for this.
534  */
536 {
537 #if HAVE_ALTIVEC
539  return NULL;
540 
541  /*
542  * and this seems not to matter too much I tried a bunch of
543  * videos with abnormal widths and MPlayer crashes elsewhere.
544  * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
545  * boom with X11 bad match.
546  *
547  */
548  if ((c->opts.src_w & 0xf) != 0)
549  return NULL;
550 
551  switch (c->opts.src_format) {
552  case AV_PIX_FMT_YUV410P:
553  case AV_PIX_FMT_YUV420P:
554  /*case IMGFMT_CLPL: ??? */
555  case AV_PIX_FMT_GRAY8:
556  case AV_PIX_FMT_NV12:
557  case AV_PIX_FMT_NV21:
558  if ((c->opts.src_h & 0x1) != 0)
559  return NULL;
560 
561 /*
562  * The below accelerations for YUV2RGB are known broken.
563  * See: 'fate-checkasm-sw_yuv2rgb' with --enable-altivec
564  * They are disabled for the moment, until such time as
565  * they can be repaired.
566  */
567 #if 0
568  switch (c->opts.dst_format) {
569  case AV_PIX_FMT_RGB24:
570  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
571  return altivec_yuv2_rgb24;
572  case AV_PIX_FMT_BGR24:
573  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
574  return altivec_yuv2_bgr24;
575  case AV_PIX_FMT_ARGB:
576  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
577  return altivec_yuv2_argb;
578  case AV_PIX_FMT_ABGR:
579  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
580  return altivec_yuv2_abgr;
581  case AV_PIX_FMT_RGBA:
582  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
583  return altivec_yuv2_rgba;
584  case AV_PIX_FMT_BGRA:
585  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
586  return altivec_yuv2_bgra;
587  default: return NULL;
588  }
589 #endif /* disabled YUV2RGB acceleration */
590  break;
591 
592  case AV_PIX_FMT_UYVY422:
593  switch (c->opts.dst_format) {
594  case AV_PIX_FMT_BGR32:
595  av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
596  return altivec_uyvy_rgb32;
597  default: return NULL;
598  }
599  break;
600  }
601 #endif /* HAVE_ALTIVEC */
602 
603  return NULL;
604 }
605 
607  const int inv_table[4],
608  int brightness,
609  int contrast,
610  int saturation)
611 {
612 #if HAVE_ALTIVEC
613  union {
614  DECLARE_ALIGNED(16, signed short, tmp)[8];
615  vector signed short vec;
616  } buf;
617 
619  return;
620 
621  buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
622  buf.tmp[1] = -256 * brightness; // oy
623  buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
624  buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
625  buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
626  buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
627 
628  c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
629  c->CY = vec_splat((vector signed short) buf.vec, 0);
630  c->OY = vec_splat((vector signed short) buf.vec, 1);
631  c->CRV = vec_splat((vector signed short) buf.vec, 2);
632  c->CBU = vec_splat((vector signed short) buf.vec, 3);
633  c->CGU = vec_splat((vector signed short) buf.vec, 4);
634  c->CGV = vec_splat((vector signed short) buf.vec, 5);
635  return;
636 #endif /* HAVE_ALTIVEC */
637 }
638 
639 #if HAVE_ALTIVEC
640 
641 static av_always_inline void yuv2packedX_altivec(SwsInternal *c,
642  const int16_t *lumFilter,
643  const int16_t **lumSrc,
644  int lumFilterSize,
645  const int16_t *chrFilter,
646  const int16_t **chrUSrc,
647  const int16_t **chrVSrc,
648  int chrFilterSize,
649  const int16_t **alpSrc,
650  uint8_t *dest,
651  int dstW, int dstY,
652  enum AVPixelFormat target)
653 {
654  int i, j;
655  vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
656  vector signed short R0, G0, B0, R1, G1, B1;
657 
658  vector unsigned char R, G, B;
659  vector unsigned char *out, *nout;
660 
661  vector signed short RND = vec_splat_s16(1 << 3);
662  vector unsigned short SCL = vec_splat_u16(4);
663  DECLARE_ALIGNED(16, unsigned int, scratch)[16];
664 
665  vector signed short *YCoeffs, *CCoeffs;
666 
667  YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
668  CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
669 
670  out = (vector unsigned char *) dest;
671 
672  for (i = 0; i < dstW; i += 16) {
673  Y0 = RND;
674  Y1 = RND;
675  /* extract 16 coeffs from lumSrc */
676  for (j = 0; j < lumFilterSize; j++) {
677  X0 = vec_ld(0, &lumSrc[j][i]);
678  X1 = vec_ld(16, &lumSrc[j][i]);
679  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
680  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
681  }
682 
683  U = RND;
684  V = RND;
685  /* extract 8 coeffs from U,V */
686  for (j = 0; j < chrFilterSize; j++) {
687  X = vec_ld(0, &chrUSrc[j][i / 2]);
688  U = vec_mradds(X, CCoeffs[j], U);
689  X = vec_ld(0, &chrVSrc[j][i / 2]);
690  V = vec_mradds(X, CCoeffs[j], V);
691  }
692 
693  /* scale and clip signals */
694  Y0 = vec_sra(Y0, SCL);
695  Y1 = vec_sra(Y1, SCL);
696  U = vec_sra(U, SCL);
697  V = vec_sra(V, SCL);
698 
699  Y0 = vec_clip_s16(Y0);
700  Y1 = vec_clip_s16(Y1);
701  U = vec_clip_s16(U);
702  V = vec_clip_s16(V);
703 
704  /* now we have
705  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
706  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
707  *
708  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
709  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
710  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
711  */
712 
713  U0 = vec_mergeh(U, U);
714  V0 = vec_mergeh(V, V);
715 
716  U1 = vec_mergel(U, U);
717  V1 = vec_mergel(V, V);
718 
719  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
720  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
721 
722  R = vec_packclp(R0, R1);
723  G = vec_packclp(G0, G1);
724  B = vec_packclp(B0, B1);
725 
726  switch (target) {
727  case AV_PIX_FMT_ABGR:
728  out_abgr(R, G, B, out);
729  break;
730  case AV_PIX_FMT_BGRA:
731  out_bgra(R, G, B, out);
732  break;
733  case AV_PIX_FMT_RGBA:
734  out_rgba(R, G, B, out);
735  break;
736  case AV_PIX_FMT_ARGB:
737  out_argb(R, G, B, out);
738  break;
739  case AV_PIX_FMT_RGB24:
740  out_rgb24(R, G, B, out);
741  break;
742  case AV_PIX_FMT_BGR24:
743  out_bgr24(R, G, B, out);
744  break;
745  default:
746  {
747  /* If this is reached, the caller should have called yuv2packedXinC
748  * instead. */
749  static int printed_error_message;
750  if (!printed_error_message) {
752  "altivec_yuv2packedX doesn't support %s output\n",
753  av_get_pix_fmt_name(c->opts.dst_format));
754  printed_error_message = 1;
755  }
756  return;
757  }
758  }
759  }
760 
761  if (i < dstW) {
762  i -= 16;
763 
764  Y0 = RND;
765  Y1 = RND;
766  /* extract 16 coeffs from lumSrc */
767  for (j = 0; j < lumFilterSize; j++) {
768  X0 = vec_ld(0, &lumSrc[j][i]);
769  X1 = vec_ld(16, &lumSrc[j][i]);
770  Y0 = vec_mradds(X0, YCoeffs[j], Y0);
771  Y1 = vec_mradds(X1, YCoeffs[j], Y1);
772  }
773 
774  U = RND;
775  V = RND;
776  /* extract 8 coeffs from U,V */
777  for (j = 0; j < chrFilterSize; j++) {
778  X = vec_ld(0, &chrUSrc[j][i / 2]);
779  U = vec_mradds(X, CCoeffs[j], U);
780  X = vec_ld(0, &chrVSrc[j][i / 2]);
781  V = vec_mradds(X, CCoeffs[j], V);
782  }
783 
784  /* scale and clip signals */
785  Y0 = vec_sra(Y0, SCL);
786  Y1 = vec_sra(Y1, SCL);
787  U = vec_sra(U, SCL);
788  V = vec_sra(V, SCL);
789 
790  Y0 = vec_clip_s16(Y0);
791  Y1 = vec_clip_s16(Y1);
792  U = vec_clip_s16(U);
793  V = vec_clip_s16(V);
794 
795  /* now we have
796  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
797  * U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
798  *
799  * Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
800  * U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
801  * V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
802  */
803 
804  U0 = vec_mergeh(U, U);
805  V0 = vec_mergeh(V, V);
806 
807  U1 = vec_mergel(U, U);
808  V1 = vec_mergel(V, V);
809 
810  cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
811  cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
812 
813  R = vec_packclp(R0, R1);
814  G = vec_packclp(G0, G1);
815  B = vec_packclp(B0, B1);
816 
817  nout = (vector unsigned char *) scratch;
818  switch (target) {
819  case AV_PIX_FMT_ABGR:
820  out_abgr(R, G, B, nout);
821  break;
822  case AV_PIX_FMT_BGRA:
823  out_bgra(R, G, B, nout);
824  break;
825  case AV_PIX_FMT_RGBA:
826  out_rgba(R, G, B, nout);
827  break;
828  case AV_PIX_FMT_ARGB:
829  out_argb(R, G, B, nout);
830  break;
831  case AV_PIX_FMT_RGB24:
832  out_rgb24(R, G, B, nout);
833  break;
834  case AV_PIX_FMT_BGR24:
835  out_bgr24(R, G, B, nout);
836  break;
837  default:
838  /* Unreachable, I think. */
840  "altivec_yuv2packedX doesn't support %s output\n",
841  av_get_pix_fmt_name(c->opts.dst_format));
842  return;
843  }
844 
845  memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
846  }
847 }
848 
/* Define ff_yuv2<suffix>_X_altivec(), which forwards all of its
 * arguments to yuv2packedX_altivec() with pixfmt as the target format. */
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt)                             \
void ff_yuv2 ## suffix ## _X_altivec(SwsInternal *c,                    \
                                     const int16_t *lumFilter,          \
                                     const int16_t **lumSrc,            \
                                     int lumFilterSize,                 \
                                     const int16_t *chrFilter,          \
                                     const int16_t **chrUSrc,           \
                                     const int16_t **chrVSrc,           \
                                     int chrFilterSize,                 \
                                     const int16_t **alpSrc,            \
                                     uint8_t *dest, int dstW, int dstY) \
{                                                                       \
    yuv2packedX_altivec(c,                                              \
                        lumFilter, lumSrc, lumFilterSize,               \
                        chrFilter, chrUSrc, chrVSrc, chrFilterSize,     \
                        alpSrc,                                         \
                        dest, dstW, dstY,                               \
                        pixfmt);                                        \
}
866 
867 YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
868 YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
869 YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
870 YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
871 YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
872 YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
873 
874 #endif /* HAVE_ALTIVEC */
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:215
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
mem_internal.h
out
FILE * out
Definition: movenc.c:55
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:477
R0
#define R0(v, w, x, y, z, i)
Definition: sha.c:57
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
pixdesc.h
w
uint8_t w
Definition: llviddspenc.c:38
R
#define R
Definition: huffyuv.h:44
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
R1
#define R1
Definition: simple_idct.c:171
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
B1
@ B1
Definition: mvs.c:527
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:209
av_cold
#define av_cold
Definition: attributes.h:90
B
#define B
Definition: huffyuv.h:42
ff_yuv2rgb_init_tables_ppc
av_cold void ff_yuv2rgb_init_tables_ppc(SwsInternal *c, const int inv_table[4], int brightness, int contrast, int saturation)
Definition: yuv2rgb_altivec.c:606
AV_PIX_FMT_YUV420P
@ AV_PIX_FMT_YUV420P
planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
Definition: pixfmt.h:73
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
ff_yuv2rgb_init_ppc
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsInternal *c)
Definition: yuv2rgb_altivec.c:535
NULL
#define NULL
Definition: coverity.c:32
V
#define V
Definition: avdct.c:31
AV_PIX_FMT_GRAY8
@ AV_PIX_FMT_GRAY8
Y , 8bpp.
Definition: pixfmt.h:81
AV_CPU_FLAG_ALTIVEC
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:61
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:102
cpu.h
X
@ X
Definition: vf_addroi.c:27
img
#define img
Definition: vf_colormatrix.c:114
yuv2rgb_altivec.h
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
attributes.h
Y
#define Y
Definition: boxblur.h:37
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
xf
#define xf(width, name, var, range_min, range_max, subs,...)
Definition: cbs_av1.c:598
av_always_inline
#define av_always_inline
Definition: attributes.h:49
swscale_internal.h
AV_PIX_FMT_NV21
@ AV_PIX_FMT_NV21
as above, but U and V bytes are swapped
Definition: pixfmt.h:97
B0
@ B0
Definition: mvs.c:526
SwsInternal
Definition: swscale_internal.h:317
AV_PIX_FMT_NV12
@ AV_PIX_FMT_NV12
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (firs...
Definition: pixfmt.h:96
AV_PIX_FMT_UYVY422
@ AV_PIX_FMT_UYVY422
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:88
U
#define U(x)
Definition: vpx_arith.h:37
G
#define G
Definition: huffyuv.h:43
SwsFunc
int(* SwsFunc)(SwsInternal *c, const uint8_t *const src[], const int srcStride[], int srcSliceY, int srcSliceH, uint8_t *const dst[], const int dstStride[])
Definition: swscale_internal.h:92
AV_PIX_FMT_YUV410P
@ AV_PIX_FMT_YUV410P
planar YUV 4:1:0, 9bpp, (1 Cr & Cb sample per 4x4 Y samples)
Definition: pixfmt.h:79
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
h
h
Definition: vp9dsp_template.c:2070
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
rgb2rgb.h
swscale.h
av_get_pix_fmt_name
const char * av_get_pix_fmt_name(enum AVPixelFormat pix_fmt)
Return the short name for a pixel format, NULL in case pix_fmt is unknown.
Definition: pixdesc.c:3090