FFmpeg
swscale_vsx.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include <inttypes.h>
25 
26 #include "config.h"
27 #include "libswscale/swscale.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/cpu.h"
31 #include "yuv2rgb_altivec.h"
33 
34 #if HAVE_VSX
35 #define vzero vec_splat_s32(0)
36 
37 #if !HAVE_BIGENDIAN
/*
 * Little-endian VSX versions of the helper macros expanded by
 * swscale_ppc_template.c. Several of them read names that must exist at the
 * expansion site (ls, joffset, xoffset, filterSize, vzero) and accept
 * arguments they do not use (c, perm, p, per, v0, v1, vf0, vf1).
 * Expression arguments are parenthesized so that compound expressions
 * expand with the intended precedence.
 */

/* Save the current value of a in ls, then reload a from s at byte offset
 * ((b) << 1) + 16. */
#define GET_LS(a,b,c,s) {\
        ls  = a;\
        a = vec_vsx_ld(((b) << 1)  + 16, s);\
    }

/* Multiply filter with the eight 16-bit values GET_LS leaves in ls and add
 * the resulting 32-bit products to the accumulators d1 and d2. */
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
        vector signed short ls;\
        vector signed int   vf1, vf2, i1, i2;\
        GET_LS(l1, x, perm, src);\
        i1  = vec_mule(filter, ls);\
        i2  = vec_mulo(filter, ls);\
        vf1 = vec_mergeh(i1, i2);\
        vf2 = vec_mergel(i1, i2);\
        d1 = vec_add(d1, vf1);\
        d2 = vec_add(d2, vf2);\
    } while (0)

/* Load one vector from f at byte offset joffset. */
#define LOAD_FILTER(vf,f) {\
        vf = vec_vsx_ld(joffset, f);\
}
/* Load one vector from s at byte offset xoffset. */
#define LOAD_L1(ll1,s,p){\
        ll1  = vec_vsx_ld(xoffset, s);\
}

// The shift by 3 in GET_VF4 below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
    vf = (vector signed short)vec_vsx_ld((a) << 3, f);\
    vf = vec_mergeh(vf, (vector signed short)vzero);\
}
/* Nothing to do for these two in the VSX variant. */
#define FIRST_LOAD(sv, pos, s, per) {}
#define UPDATE_PTR(s0, d0, s1, d1) {}
/* Load one vector from s at byte offset (pos) + (a). */
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    vf = vec_vsx_ld((pos) + (a), s);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
/* Load one vector from f at byte offset a * 2 * filterSize + b * 2 + off. */
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf = vec_vsx_ld(((a) * 2 * filterSize) + ((b) * 2) + (off), f);\
}
81 
82 #define FUNC(name) name ## _vsx
83 #include "swscale_ppc_template.c"
84 #undef FUNC
85 
86 #undef vzero
87 
88 #endif /* !HAVE_BIGENDIAN */
89 
/**
 * Scalar 8-bit single-plane output for the columns [start, dstW).
 *
 * Each sample has dither[(column + offset) & 7] added, is shifted right by 7
 * and is clipped with av_clip_uint8() before being stored.
 */
static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset, int start)
{
    int x = start;

    while (x < dstW) {
        const int dithered = src[x] + dither[(x + offset) & 7];

        dest[x] = av_clip_uint8(dithered >> 7);
        x++;
    }
}
99 
100 static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
101  const uint8_t *dither, int offset)
102 {
103  const int dst_u = -(uintptr_t)dest & 15;
104  int i, j;
105  LOCAL_ALIGNED(16, int16_t, val, [16]);
106  const vec_u16 shifts = (vec_u16) {7, 7, 7, 7, 7, 7, 7, 7};
107  vec_s16 vi, vileft, ditherleft, ditherright;
108  vec_u8 vd;
109 
110  for (j = 0; j < 16; j++) {
111  val[j] = dither[(dst_u + offset + j) & 7];
112  }
113 
114  ditherleft = vec_ld(0, val);
115  ditherright = vec_ld(0, &val[8]);
116 
117  yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
118 
119  for (i = dst_u; i < dstW - 15; i += 16) {
120 
121  vi = vec_vsx_ld(0, &src[i]);
122  vi = vec_adds(ditherleft, vi);
123  vileft = vec_sra(vi, shifts);
124 
125  vi = vec_vsx_ld(0, &src[i + 8]);
126  vi = vec_adds(ditherright, vi);
127  vi = vec_sra(vi, shifts);
128 
129  vd = vec_packsu(vileft, vi);
130  vec_st(vd, 0, &dest[i]);
131  }
132 
133  yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
134 }
135 
136 #if !HAVE_BIGENDIAN
137 
/* Store av_clip_uintp2(val >> shift, output_bits) at pos as a 16-bit value,
 * through AV_WB16 when big_endian is set and AV_WL16 otherwise.
 * big_endian, shift and output_bits are taken from the expansion site. */
#define output_pixel(pos, val) \
    if (big_endian) { \
        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    } else { \
        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    }

/**
 * Scalar single-plane output to output_bits-wide 16-bit samples for the
 * columns [start, dstW): adds the rounding term 1 << (shift - 1), with
 * shift = 15 - output_bits, then shifts, clips and stores via output_pixel.
 */
static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    const int shift = 15 - output_bits;
    const int rounding = 1 << (shift - 1);
    int x;

    for (x = start; x < dstW; x++) {
        const int val = src[x] + rounding;

        output_pixel(&dest[x], val);
    }
}
156 
157 static av_always_inline void yuv2plane1_nbps_vsx(const int16_t *src,
158  uint16_t *dest, int dstW,
159  const int big_endian,
160  const int output_bits)
161 {
162  const int dst_u = -(uintptr_t)dest & 7;
163  const int shift = 15 - output_bits;
164  const int add = (1 << (shift - 1));
165  const int clip = (1 << output_bits) - 1;
166  const vec_u16 vadd = (vec_u16) {add, add, add, add, add, add, add, add};
167  const vec_u16 vswap = (vec_u16) vec_splat_u16(big_endian ? 8 : 0);
168  const vec_u16 vshift = (vec_u16) vec_splat_u16(shift);
169  const vec_u16 vlargest = (vec_u16) {clip, clip, clip, clip, clip, clip, clip, clip};
170  vec_u16 v;
171  int i;
172 
173  yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
174 
175  for (i = dst_u; i < dstW - 7; i += 8) {
176  v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
177  v = vec_add(v, vadd);
178  v = vec_sr(v, vshift);
179  v = vec_min(v, vlargest);
180  v = vec_rl(v, vswap);
181  vec_st(v, 0, &dest[i]);
182  }
183 
184  yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
185 }
186 
/**
 * Scalar vertical filter to output_bits-wide 16-bit samples for the columns
 * [start, dstW).
 *
 * For each column the filterSize products src[j][i] * filter[j] are summed
 * on top of the rounding term 1 << (shift - 1), with
 * shift = 11 + 16 - output_bits, and the sum is stored via output_pixel.
 */
static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
                              const int16_t **src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    const int shift = 11 + 16 - output_bits;
    int x, tap;

    for (x = start; x < dstW; x++) {
        int val = 1 << (shift - 1);

        for (tap = 0; tap < filterSize; tap++) {
            val += src[tap][x] * filter[tap];
        }

        output_pixel(&dest[x], val);
    }
}
204 
205 static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
206  const int16_t **src, uint16_t *dest, int dstW,
207  int big_endian, int output_bits)
208 {
209  const int dst_u = -(uintptr_t)dest & 7;
210  const int shift = 11 + 16 - output_bits;
211  const int add = (1 << (shift - 1));
212  const int clip = (1 << output_bits) - 1;
213  const uint16_t swap = big_endian ? 8 : 0;
214  const vec_u32 vadd = (vec_u32) {add, add, add, add};
215  const vec_u32 vshift = (vec_u32) {shift, shift, shift, shift};
216  const vec_u16 vswap = (vec_u16) {swap, swap, swap, swap, swap, swap, swap, swap};
217  const vec_u16 vlargest = (vec_u16) {clip, clip, clip, clip, clip, clip, clip, clip};
218  const vec_s16 vzero = vec_splat_s16(0);
219  const vec_u8 vperm = (vec_u8) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
220  vec_s16 vfilter[MAX_FILTER_SIZE], vin;
221  vec_u16 v;
222  vec_u32 vleft, vright, vtmp;
223  int i, j;
224 
225  for (i = 0; i < filterSize; i++) {
226  vfilter[i] = (vec_s16) {filter[i], filter[i], filter[i], filter[i],
227  filter[i], filter[i], filter[i], filter[i]};
228  }
229 
230  yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
231 
232  for (i = dst_u; i < dstW - 7; i += 8) {
233  vleft = vright = vadd;
234 
235  for (j = 0; j < filterSize; j++) {
236  vin = vec_vsx_ld(0, &src[j][i]);
237  vtmp = (vec_u32) vec_mule(vin, vfilter[j]);
238  vleft = vec_add(vleft, vtmp);
239  vtmp = (vec_u32) vec_mulo(vin, vfilter[j]);
240  vright = vec_add(vright, vtmp);
241  }
242 
243  vleft = vec_sra(vleft, vshift);
244  vright = vec_sra(vright, vshift);
245  v = vec_packsu(vleft, vright);
246  v = (vec_u16) vec_max((vec_s16) v, vzero);
247  v = vec_min(v, vlargest);
248  v = vec_rl(v, vswap);
249  v = vec_perm(v, v, vperm);
250  vec_st(v, 0, &dest[i]);
251  }
252 
253  yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
254 }
255 
256 
257 #undef output_pixel
258 
/* Store bias + av_clip_<signedness>16(val >> shift) at pos as a 16-bit
 * value, through AV_WB16 when big_endian is set and AV_WL16 otherwise.
 * big_endian and shift are taken from the expansion site. */
#define output_pixel(pos, val, bias, signedness) \
    if (big_endian) { \
        AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    } else { \
        AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    }

/**
 * Scalar single-plane 16-bit output for the columns [start, dstW): adds the
 * rounding term 1 << (shift - 1) with shift = 3, then shifts, clips to
 * unsigned 16 bits and stores via output_pixel. output_bits is not read.
 */
static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
                            int big_endian, int output_bits, int start)
{
    const int shift = 3;
    const int rounding = 1 << (shift - 1);
    int x;

    for (x = start; x < dstW; x++) {
        const int val = src[x] + rounding;

        output_pixel(&dest[x], val, 0, uint);
    }
}
277 
278 static av_always_inline void yuv2plane1_16_vsx(const int32_t *src,
279  uint16_t *dest, int dstW,
280  const int big_endian,
281  int output_bits)
282 {
283  const int dst_u = -(uintptr_t)dest & 7;
284  const int shift = 3;
285  const int add = (1 << (shift - 1));
286  const vec_u32 vadd = (vec_u32) {add, add, add, add};
287  const vec_u16 vswap = (vec_u16) vec_splat_u16(big_endian ? 8 : 0);
288  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
289  vec_u32 v, v2;
290  vec_u16 vd;
291  int i;
292 
293  yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
294 
295  for (i = dst_u; i < dstW - 7; i += 8) {
296  v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
297  v = vec_add(v, vadd);
298  v = vec_sr(v, vshift);
299 
300  v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
301  v2 = vec_add(v2, vadd);
302  v2 = vec_sr(v2, vshift);
303 
304  vd = vec_packsu(v, v2);
305  vd = vec_rl(vd, vswap);
306 
307  vec_st(vd, 0, &dest[i]);
308  }
309 
310  yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
311 }
312 
313 #if HAVE_POWER8
314 
315 static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
316  const int32_t **src, uint16_t *dest, int dstW,
317  int big_endian, int output_bits, int start)
318 {
319  int i;
320  int shift = 15;
321 
322  for (i = start; i < dstW; i++) {
323  int val = 1 << (shift - 1);
324  int j;
325 
326  /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
327  * filters (or anything with negative coeffs, the range can be slightly
328  * wider in both directions. To account for this overflow, we subtract
329  * a constant so it always fits in the signed range (assuming a
330  * reasonable filterSize), and re-add that at the end. */
331  val -= 0x40000000;
332  for (j = 0; j < filterSize; j++)
333  val += src[j][i] * (unsigned)filter[j];
334 
335  output_pixel(&dest[i], val, 0x8000, int);
336  }
337 }
338 
339 static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
340  const int32_t **src, uint16_t *dest, int dstW,
341  int big_endian, int output_bits)
342 {
343  const int dst_u = -(uintptr_t)dest & 7;
344  const int shift = 15;
345  const int bias = 0x8000;
346  const int add = (1 << (shift - 1)) - 0x40000000;
347  const uint16_t swap = big_endian ? 8 : 0;
348  const vec_u32 vadd = (vec_u32) {add, add, add, add};
349  const vec_u32 vshift = (vec_u32) {shift, shift, shift, shift};
350  const vec_u16 vswap = (vec_u16) {swap, swap, swap, swap, swap, swap, swap, swap};
351  const vec_u16 vbias = (vec_u16) {bias, bias, bias, bias, bias, bias, bias, bias};
352  vec_s32 vfilter[MAX_FILTER_SIZE];
353  vec_u16 v;
354  vec_u32 vleft, vright, vtmp;
355  vec_s32 vin32l, vin32r;
356  int i, j;
357 
358  for (i = 0; i < filterSize; i++) {
359  vfilter[i] = (vec_s32) {filter[i], filter[i], filter[i], filter[i]};
360  }
361 
362  yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
363 
364  for (i = dst_u; i < dstW - 7; i += 8) {
365  vleft = vright = vadd;
366 
367  for (j = 0; j < filterSize; j++) {
368  vin32l = vec_vsx_ld(0, &src[j][i]);
369  vin32r = vec_vsx_ld(0, &src[j][i + 4]);
370 
371  vtmp = (vec_u32) vec_mul(vin32l, vfilter[j]);
372  vleft = vec_add(vleft, vtmp);
373  vtmp = (vec_u32) vec_mul(vin32r, vfilter[j]);
374  vright = vec_add(vright, vtmp);
375  }
376 
377  vleft = vec_sra(vleft, vshift);
378  vright = vec_sra(vright, vshift);
379  v = (vec_u16) vec_packs((vec_s32) vleft, (vec_s32) vright);
380  v = vec_add(v, vbias);
381  v = vec_rl(v, vswap);
382  vec_st(v, 0, &dest[i]);
383  }
384 
385  yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
386 }
387 
388 #endif /* HAVE_POWER8 */
389 
/*
 * Generators for the per-format entry points. Each generated function has
 * the uint8_t-based signature shown below and forwards to the
 * yuv2plane1_<template_size>_vsx / yuv2planeX_<template_size>_vsx worker,
 * casting src to typeX_t and dest to uint16_t and passing is_be and bits
 * as constants. The dither and offset parameters are not forwarded.
 */

/* Unfiltered variant: yuv2plane1_<bits><BE_LE>_vsx. */
#define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
                             uint8_t *dest, int dstW, \
                             const uint8_t *dither, int offset) \
{ \
    yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
                         (uint16_t *) dest, dstW, is_be, bits); \
}

/* Vertically filtered variant: yuv2planeX_<bits><BE_LE>_vsx. */
#define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
                              const int16_t **src, uint8_t *dest, int dstW, \
                              const uint8_t *dither, int offset)\
{ \
    yuv2planeX_## template_size ## _vsx(filter, \
                         filterSize, (const typeX_t **) src, \
                         (uint16_t *) dest, dstW, is_be, bits); \
}

/* Both variants at once. */
#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
    yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
    yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t)

/* 9- to 14-bit outputs share the "nbps" workers on 16-bit intermediates. */
yuv2NBPS( 9, BE, 1, nbps, int16_t)
yuv2NBPS( 9, LE, 0, nbps, int16_t)
yuv2NBPS(10, BE, 1, nbps, int16_t)
yuv2NBPS(10, LE, 0, nbps, int16_t)
yuv2NBPS(12, BE, 1, nbps, int16_t)
yuv2NBPS(12, LE, 0, nbps, int16_t)
yuv2NBPS(14, BE, 1, nbps, int16_t)
yuv2NBPS(14, LE, 0, nbps, int16_t)

/* 16-bit output uses the "16" workers on 32-bit intermediates; the filtered
 * worker only exists when HAVE_POWER8 is set. */
yuv2NBPS1(16, BE, 1, 16, int32_t)
yuv2NBPS1(16, LE, 0, 16, int32_t)
#if HAVE_POWER8
yuv2NBPSX(16, BE, 1, 16, int32_t)
yuv2NBPSX(16, LE, 0, 16, int32_t)
#endif
428 
/* Clamp one 32-bit channel vector to [zero32, rgbclip], then shift it right
 * by shift22. */
#define WRITERGB_CLAMP(v) \
    v = vec_max(v, zero32); \
    v = vec_min(v, rgbclip); \
    v = vec_sr(v, shift22);

/* Interleave three byte channels (first, gd, last) through the perm3*
 * tables, store two vectors at dest and advance dest by 24 bytes.
 * NOTE(review): the second store is issued at offset 16 while dest only
 * advances by 24, so it appears to reach past the 24 bytes of this group;
 * confirm the destination row has room for that. */
#define WRITERGB_STORE3(first, last) \
    out0 = vec_perm(first, gd, perm3rg0); \
    out0 = vec_perm(out0, last, perm3tb0); \
    out1 = vec_perm(first, gd, perm3rg1); \
    out1 = vec_perm(out1, last, perm3tb1); \
\
    vec_vsx_st(out0, 0, dest); \
    vec_vsx_st(out1, 16, dest); \
\
    dest += 24;

/* Interleave four byte channels in the order c0, c1, c2, c3, store two
 * vectors at dest and advance dest by 32 bytes. */
#define WRITERGB_STORE4(c0, c1, c2, c3) \
    out0 = vec_mergeh(c0, c1); \
    out1 = vec_mergeh(c2, c3); \
\
    tmp8 = (vec_u8) vec_mergeh((vec_u16) out0, (vec_u16) out1); \
    vec_vsx_st(tmp8, 0, dest); \
    tmp8 = (vec_u8) vec_mergel((vec_u16) out0, (vec_u16) out1); \
    vec_vsx_st(tmp8, 16, dest); \
\
    dest += 32;

/*
 * Convert the 32-bit accumulators R_l/R_r, G_l/G_r, B_l/B_r to bytes and
 * write them to dest in the layout selected by target, advancing dest.
 * Formats not listed in the switch write nothing. Everything it names
 * (the accumulators, rd16/gd16/bd16, rd/gd/bd/ad, out0/out1/tmp8, zero16,
 * zero32, rgbclip, shift22, the perm3* tables, target, dest) must exist at
 * the expansion site.
 */
#define WRITERGB \
    WRITERGB_CLAMP(R_l) \
    WRITERGB_CLAMP(R_r) \
    WRITERGB_CLAMP(G_l) \
    WRITERGB_CLAMP(G_r) \
    WRITERGB_CLAMP(B_l) \
    WRITERGB_CLAMP(B_r) \
\
    rd16 = vec_packsu(R_l, R_r); \
    gd16 = vec_packsu(G_l, G_r); \
    bd16 = vec_packsu(B_l, B_r); \
    rd = vec_packsu(rd16, zero16); \
    gd = vec_packsu(gd16, zero16); \
    bd = vec_packsu(bd16, zero16); \
\
    switch(target) { \
    case AV_PIX_FMT_RGB24: \
        WRITERGB_STORE3(rd, bd) \
        break; \
    case AV_PIX_FMT_BGR24: \
        WRITERGB_STORE3(bd, rd) \
        break; \
    case AV_PIX_FMT_BGRA: \
        WRITERGB_STORE4(bd, gd, rd, ad) \
        break; \
    case AV_PIX_FMT_RGBA: \
        WRITERGB_STORE4(rd, gd, bd, ad) \
        break; \
    case AV_PIX_FMT_ARGB: \
        WRITERGB_STORE4(ad, rd, gd, bd) \
        break; \
    case AV_PIX_FMT_ABGR: \
        WRITERGB_STORE4(ad, bd, gd, rd) \
        break; \
    }
526 
527 static av_always_inline void
528 yuv2rgb_full_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
529  const int16_t **lumSrc, int lumFilterSize,
530  const int16_t *chrFilter, const int16_t **chrUSrc,
531  const int16_t **chrVSrc, int chrFilterSize,
532  const int16_t **alpSrc, uint8_t *dest,
533  int dstW, int y, enum AVPixelFormat target, int hasAlpha)
534 {
535  vec_s16 vv;
536  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
537  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
538  vec_s32 tmp, tmp2, tmp3, tmp4;
539  vec_u16 rd16, gd16, bd16;
540  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
541  vec_s16 vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
542  const vec_s32 ystart = vec_splats(1 << 9);
543  const vec_s32 uvstart = vec_splats((1 << 9) - (128 << 19));
544  const vec_u16 zero16 = vec_splat_u16(0);
545  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
546  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
547  const vec_s32 y_add = vec_splats(1 << 21);
548  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
549  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
550  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
551  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
552  const vec_s32 rgbclip = vec_splats(1 << 30);
553  const vec_s32 zero32 = vec_splat_s32(0);
554  const vec_u32 shift22 = vec_splats(22U);
555  const vec_u32 shift10 = vec_splat_u32(10);
556  int i, j;
557 
558  // Various permutations
559  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
560  0x1, 0x11, 0,
561  0x2, 0x12, 0,
562  0x3, 0x13, 0,
563  0x4, 0x14, 0,
564  0x5 };
565  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
566  0x6, 0x16, 0,
567  0x7, 0x17, 0 };
568  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
569  0x3, 0x4, 0x11,
570  0x6, 0x7, 0x12,
571  0x9, 0xa, 0x13,
572  0xc, 0xd, 0x14,
573  0xf };
574  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
575  0x2, 0x3, 0x16,
576  0x5, 0x6, 0x17 };
577 
578  ad = vec_splats((uint8_t) 255);
579 
580  for (i = 0; i < lumFilterSize; i++)
581  vlumFilter[i] = vec_splats(lumFilter[i]);
582  for (i = 0; i < chrFilterSize; i++)
583  vchrFilter[i] = vec_splats(chrFilter[i]);
584 
585  for (i = 0; i < dstW; i += 8) {
586  vy32_l =
587  vy32_r = ystart;
588  vu32_l =
589  vu32_r =
590  vv32_l =
591  vv32_r = uvstart;
592 
593  for (j = 0; j < lumFilterSize; j++) {
594  vv = vec_ld(0, &lumSrc[j][i]);
595  tmp = vec_mule(vv, vlumFilter[j]);
596  tmp2 = vec_mulo(vv, vlumFilter[j]);
597  tmp3 = vec_mergeh(tmp, tmp2);
598  tmp4 = vec_mergel(tmp, tmp2);
599 
600  vy32_l = vec_adds(vy32_l, tmp3);
601  vy32_r = vec_adds(vy32_r, tmp4);
602  }
603 
604  for (j = 0; j < chrFilterSize; j++) {
605  vv = vec_ld(0, &chrUSrc[j][i]);
606  tmp = vec_mule(vv, vchrFilter[j]);
607  tmp2 = vec_mulo(vv, vchrFilter[j]);
608  tmp3 = vec_mergeh(tmp, tmp2);
609  tmp4 = vec_mergel(tmp, tmp2);
610 
611  vu32_l = vec_adds(vu32_l, tmp3);
612  vu32_r = vec_adds(vu32_r, tmp4);
613 
614  vv = vec_ld(0, &chrVSrc[j][i]);
615  tmp = vec_mule(vv, vchrFilter[j]);
616  tmp2 = vec_mulo(vv, vchrFilter[j]);
617  tmp3 = vec_mergeh(tmp, tmp2);
618  tmp4 = vec_mergel(tmp, tmp2);
619 
620  vv32_l = vec_adds(vv32_l, tmp3);
621  vv32_r = vec_adds(vv32_r, tmp4);
622  }
623 
624  vy32_l = vec_sra(vy32_l, shift10);
625  vy32_r = vec_sra(vy32_r, shift10);
626  vu32_l = vec_sra(vu32_l, shift10);
627  vu32_r = vec_sra(vu32_r, shift10);
628  vv32_l = vec_sra(vv32_l, shift10);
629  vv32_r = vec_sra(vv32_r, shift10);
630 
631  vy32_l = vec_sub(vy32_l, y_offset);
632  vy32_r = vec_sub(vy32_r, y_offset);
633  vy32_l = vec_mul(vy32_l, y_coeff);
634  vy32_r = vec_mul(vy32_r, y_coeff);
635  vy32_l = vec_add(vy32_l, y_add);
636  vy32_r = vec_add(vy32_r, y_add);
637 
638  R_l = vec_mul(vv32_l, v2r_coeff);
639  R_l = vec_add(R_l, vy32_l);
640  R_r = vec_mul(vv32_r, v2r_coeff);
641  R_r = vec_add(R_r, vy32_r);
642  G_l = vec_mul(vv32_l, v2g_coeff);
643  tmp32 = vec_mul(vu32_l, u2g_coeff);
644  G_l = vec_add(G_l, vy32_l);
645  G_l = vec_add(G_l, tmp32);
646  G_r = vec_mul(vv32_r, v2g_coeff);
647  tmp32 = vec_mul(vu32_r, u2g_coeff);
648  G_r = vec_add(G_r, vy32_r);
649  G_r = vec_add(G_r, tmp32);
650 
651  B_l = vec_mul(vu32_l, u2b_coeff);
652  B_l = vec_add(B_l, vy32_l);
653  B_r = vec_mul(vu32_r, u2b_coeff);
654  B_r = vec_add(B_r, vy32_r);
655 
656  WRITERGB
657  }
658 }
659 
660 #define SETUP(x, buf0, alpha1, buf1, alpha) { \
661  x = vec_ld(0, buf0); \
662  tmp = vec_mule(x, alpha1); \
663  tmp2 = vec_mulo(x, alpha1); \
664  tmp3 = vec_mergeh(tmp, tmp2); \
665  tmp4 = vec_mergel(tmp, tmp2); \
666 \
667  x = vec_ld(0, buf1); \
668  tmp = vec_mule(x, alpha); \
669  tmp2 = vec_mulo(x, alpha); \
670  tmp5 = vec_mergeh(tmp, tmp2); \
671  tmp6 = vec_mergel(tmp, tmp2); \
672 \
673  tmp3 = vec_add(tmp3, tmp5); \
674  tmp4 = vec_add(tmp4, tmp6); \
675 }
676 
677 
678 static av_always_inline void
679 yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
680  const int16_t *ubuf[2], const int16_t *vbuf[2],
681  const int16_t *abuf[2], uint8_t *dest, int dstW,
682  int yalpha, int uvalpha, int y,
683  enum AVPixelFormat target, int hasAlpha)
684 {
685  const int16_t *buf0 = buf[0], *buf1 = buf[1],
686  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
687  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
688  *abuf0 = hasAlpha ? abuf[0] : NULL,
689  *abuf1 = hasAlpha ? abuf[1] : NULL;
690  const int16_t yalpha1 = 4096 - yalpha;
691  const int16_t uvalpha1 = 4096 - uvalpha;
692  vec_s16 vy, vu, vv, A = vec_splat_s16(0);
693  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
694  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
695  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
696  vec_u16 rd16, gd16, bd16;
697  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
698  const vec_s16 vyalpha1 = vec_splats(yalpha1);
699  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
700  const vec_s16 vyalpha = vec_splats((int16_t) yalpha);
701  const vec_s16 vuvalpha = vec_splats((int16_t) uvalpha);
702  const vec_u16 zero16 = vec_splat_u16(0);
703  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
704  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
705  const vec_s32 y_add = vec_splats(1 << 21);
706  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
707  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
708  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
709  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
710  const vec_s32 rgbclip = vec_splats(1 << 30);
711  const vec_s32 zero32 = vec_splat_s32(0);
712  const vec_u32 shift19 = vec_splats(19U);
713  const vec_u32 shift22 = vec_splats(22U);
714  const vec_u32 shift10 = vec_splat_u32(10);
715  const vec_s32 dec128 = vec_splats(128 << 19);
716  const vec_s32 add18 = vec_splats(1 << 18);
717  int i;
718 
719  // Various permutations
720  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
721  0x1, 0x11, 0,
722  0x2, 0x12, 0,
723  0x3, 0x13, 0,
724  0x4, 0x14, 0,
725  0x5 };
726  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
727  0x6, 0x16, 0,
728  0x7, 0x17, 0 };
729  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
730  0x3, 0x4, 0x11,
731  0x6, 0x7, 0x12,
732  0x9, 0xa, 0x13,
733  0xc, 0xd, 0x14,
734  0xf };
735  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
736  0x2, 0x3, 0x16,
737  0x5, 0x6, 0x17 };
738 
739  av_assert2(yalpha <= 4096U);
740  av_assert2(uvalpha <= 4096U);
741 
742  for (i = 0; i < dstW; i += 8) {
743  SETUP(vy, &buf0[i], vyalpha1, &buf1[i], vyalpha);
744  vy32_l = vec_sra(tmp3, shift10);
745  vy32_r = vec_sra(tmp4, shift10);
746 
747  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
748  tmp3 = vec_sub(tmp3, dec128);
749  tmp4 = vec_sub(tmp4, dec128);
750  vu32_l = vec_sra(tmp3, shift10);
751  vu32_r = vec_sra(tmp4, shift10);
752 
753  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
754  tmp3 = vec_sub(tmp3, dec128);
755  tmp4 = vec_sub(tmp4, dec128);
756  vv32_l = vec_sra(tmp3, shift10);
757  vv32_r = vec_sra(tmp4, shift10);
758 
759  if (hasAlpha) {
760  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
761  tmp3 = vec_add(tmp3, add18);
762  tmp4 = vec_add(tmp4, add18);
763  tmp3 = vec_sra(tmp3, shift19);
764  tmp4 = vec_sra(tmp4, shift19);
765  A = vec_packs(tmp3, tmp4);
766  ad = vec_packsu(A, (vec_s16) zero16);
767  } else {
768  ad = vec_splats((uint8_t) 255);
769  }
770 
771  vy32_l = vec_sub(vy32_l, y_offset);
772  vy32_r = vec_sub(vy32_r, y_offset);
773  vy32_l = vec_mul(vy32_l, y_coeff);
774  vy32_r = vec_mul(vy32_r, y_coeff);
775  vy32_l = vec_add(vy32_l, y_add);
776  vy32_r = vec_add(vy32_r, y_add);
777 
778  R_l = vec_mul(vv32_l, v2r_coeff);
779  R_l = vec_add(R_l, vy32_l);
780  R_r = vec_mul(vv32_r, v2r_coeff);
781  R_r = vec_add(R_r, vy32_r);
782  G_l = vec_mul(vv32_l, v2g_coeff);
783  tmp32 = vec_mul(vu32_l, u2g_coeff);
784  G_l = vec_add(G_l, vy32_l);
785  G_l = vec_add(G_l, tmp32);
786  G_r = vec_mul(vv32_r, v2g_coeff);
787  tmp32 = vec_mul(vu32_r, u2g_coeff);
788  G_r = vec_add(G_r, vy32_r);
789  G_r = vec_add(G_r, tmp32);
790 
791  B_l = vec_mul(vu32_l, u2b_coeff);
792  B_l = vec_add(B_l, vy32_l);
793  B_r = vec_mul(vu32_r, u2b_coeff);
794  B_r = vec_add(B_r, vy32_r);
795 
796  WRITERGB
797  }
798 }
799 
800 static av_always_inline void
801 yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
802  const int16_t *ubuf[2], const int16_t *vbuf[2],
803  const int16_t *abuf[2], uint8_t *dest, int dstW,
804  int yalpha, int uvalpha, int y,
805  enum AVPixelFormat target, int hasAlpha)
806 {
807  const int16_t *buf0 = buf[0], *buf1 = buf[1],
808  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
809  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
810  *abuf0 = hasAlpha ? abuf[0] : NULL,
811  *abuf1 = hasAlpha ? abuf[1] : NULL;
812  const int16_t yalpha1 = 4096 - yalpha;
813  const int16_t uvalpha1 = 4096 - uvalpha;
814  vec_s16 vy, vu, vv, A = vec_splat_s16(0);
815  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
816  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
817  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
818  vec_u16 rd16, gd16, bd16;
819  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
820  const vec_s16 vyalpha1 = vec_splats(yalpha1);
821  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
822  const vec_s16 vyalpha = vec_splats((int16_t) yalpha);
823  const vec_s16 vuvalpha = vec_splats((int16_t) uvalpha);
824  const vec_u16 zero16 = vec_splat_u16(0);
825  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
826  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
827  const vec_s32 y_add = vec_splats(1 << 21);
828  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
829  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
830  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
831  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
832  const vec_s32 rgbclip = vec_splats(1 << 30);
833  const vec_s32 zero32 = vec_splat_s32(0);
834  const vec_u32 shift19 = vec_splats(19U);
835  const vec_u32 shift22 = vec_splats(22U);
836  const vec_u32 shift10 = vec_splat_u32(10);
837  const vec_s32 dec128 = vec_splats(128 << 19);
838  const vec_s32 add18 = vec_splats(1 << 18);
839  int i;
840 
841  // Various permutations
842  const vec_u8 doubleleft = (vec_u8) {0, 1, 2, 3,
843  0, 1, 2, 3,
844  4, 5, 6, 7,
845  4, 5, 6, 7 };
846  const vec_u8 doubleright = (vec_u8) {8, 9, 10, 11,
847  8, 9, 10, 11,
848  12, 13, 14, 15,
849  12, 13, 14, 15 };
850  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
851  0x1, 0x11, 0,
852  0x2, 0x12, 0,
853  0x3, 0x13, 0,
854  0x4, 0x14, 0,
855  0x5 };
856  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
857  0x6, 0x16, 0,
858  0x7, 0x17, 0 };
859  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
860  0x3, 0x4, 0x11,
861  0x6, 0x7, 0x12,
862  0x9, 0xa, 0x13,
863  0xc, 0xd, 0x14,
864  0xf };
865  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
866  0x2, 0x3, 0x16,
867  0x5, 0x6, 0x17 };
868 
869  av_assert2(yalpha <= 4096U);
870  av_assert2(uvalpha <= 4096U);
871 
872  for (i = 0; i < (dstW + 1) >> 1; i += 8) {
873  SETUP(vy, &buf0[i * 2], vyalpha1, &buf1[i * 2], vyalpha);
874  vy32_l = vec_sra(tmp3, shift10);
875  vy32_r = vec_sra(tmp4, shift10);
876 
877  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
878  tmp3 = vec_sub(tmp3, dec128);
879  tmp4 = vec_sub(tmp4, dec128);
880  vu32_l = vec_sra(tmp3, shift10);
881  vu32_r = vec_sra(tmp4, shift10);
882 
883  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
884  tmp3 = vec_sub(tmp3, dec128);
885  tmp4 = vec_sub(tmp4, dec128);
886  vv32_l = vec_sra(tmp3, shift10);
887  vv32_r = vec_sra(tmp4, shift10);
888 
889  if (hasAlpha) {
890  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
891  tmp3 = vec_add(tmp3, add18);
892  tmp4 = vec_add(tmp4, add18);
893  tmp3 = vec_sra(tmp3, shift19);
894  tmp4 = vec_sra(tmp4, shift19);
895  A = vec_packs(tmp3, tmp4);
896  ad = vec_packsu(A, (vec_s16) zero16);
897  } else {
898  ad = vec_splats((uint8_t) 255);
899  }
900 
901  vy32_l = vec_sub(vy32_l, y_offset);
902  vy32_r = vec_sub(vy32_r, y_offset);
903  vy32_l = vec_mul(vy32_l, y_coeff);
904  vy32_r = vec_mul(vy32_r, y_coeff);
905  vy32_l = vec_add(vy32_l, y_add);
906  vy32_r = vec_add(vy32_r, y_add);
907 
908  // Use the first UV half
909  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
910  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
911  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
912  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
913 
914  R_l = vec_mul(vvd32_l, v2r_coeff);
915  R_l = vec_add(R_l, vy32_l);
916  R_r = vec_mul(vvd32_r, v2r_coeff);
917  R_r = vec_add(R_r, vy32_r);
918  G_l = vec_mul(vvd32_l, v2g_coeff);
919  tmp32 = vec_mul(vud32_l, u2g_coeff);
920  G_l = vec_add(G_l, vy32_l);
921  G_l = vec_add(G_l, tmp32);
922  G_r = vec_mul(vvd32_r, v2g_coeff);
923  tmp32 = vec_mul(vud32_r, u2g_coeff);
924  G_r = vec_add(G_r, vy32_r);
925  G_r = vec_add(G_r, tmp32);
926 
927  B_l = vec_mul(vud32_l, u2b_coeff);
928  B_l = vec_add(B_l, vy32_l);
929  B_r = vec_mul(vud32_r, u2b_coeff);
930  B_r = vec_add(B_r, vy32_r);
931 
932  WRITERGB
933 
934  // New Y for the second half
935  SETUP(vy, &buf0[i * 2 + 8], vyalpha1, &buf1[i * 2 + 8], vyalpha);
936  vy32_l = vec_sra(tmp3, shift10);
937  vy32_r = vec_sra(tmp4, shift10);
938 
939  vy32_l = vec_sub(vy32_l, y_offset);
940  vy32_r = vec_sub(vy32_r, y_offset);
941  vy32_l = vec_mul(vy32_l, y_coeff);
942  vy32_r = vec_mul(vy32_r, y_coeff);
943  vy32_l = vec_add(vy32_l, y_add);
944  vy32_r = vec_add(vy32_r, y_add);
945 
946  // Second UV half
947  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
948  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
949  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
950  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
951 
952  R_l = vec_mul(vvd32_l, v2r_coeff);
953  R_l = vec_add(R_l, vy32_l);
954  R_r = vec_mul(vvd32_r, v2r_coeff);
955  R_r = vec_add(R_r, vy32_r);
956  G_l = vec_mul(vvd32_l, v2g_coeff);
957  tmp32 = vec_mul(vud32_l, u2g_coeff);
958  G_l = vec_add(G_l, vy32_l);
959  G_l = vec_add(G_l, tmp32);
960  G_r = vec_mul(vvd32_r, v2g_coeff);
961  tmp32 = vec_mul(vud32_r, u2g_coeff);
962  G_r = vec_add(G_r, vy32_r);
963  G_r = vec_add(G_r, tmp32);
964 
965  B_l = vec_mul(vud32_l, u2b_coeff);
966  B_l = vec_add(B_l, vy32_l);
967  B_r = vec_mul(vud32_r, u2b_coeff);
968  B_r = vec_add(B_r, vy32_r);
969 
970  WRITERGB
971  }
972 }
973 
974 #undef SETUP
975 
976 static av_always_inline void
977 yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
978  const int16_t *ubuf[2], const int16_t *vbuf[2],
979  const int16_t *abuf0, uint8_t *dest, int dstW,
980  int uvalpha, int y, enum AVPixelFormat target,
981  int hasAlpha)
982 {
983  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
984  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
985  vec_s16 vy, vu, vv, A = vec_splat_s16(0), tmp16;
986  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
987  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
988  vec_u16 rd16, gd16, bd16;
989  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
990  const vec_u16 zero16 = vec_splat_u16(0);
991  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
992  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
993  const vec_s32 y_add = vec_splats(1 << 21);
994  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
995  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
996  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
997  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
998  const vec_s32 rgbclip = vec_splats(1 << 30);
999  const vec_s32 zero32 = vec_splat_s32(0);
1000  const vec_u32 shift2 = vec_splat_u32(2);
1001  const vec_u32 shift22 = vec_splats(22U);
1002  const vec_u16 sub7 = vec_splats((uint16_t) (128 << 7));
1003  const vec_u16 sub8 = vec_splats((uint16_t) (128 << 8));
1004  const vec_s16 mul4 = vec_splat_s16(4);
1005  const vec_s16 mul8 = vec_splat_s16(8);
1006  const vec_s16 add64 = vec_splat_s16(64);
1007  const vec_u16 shift7 = vec_splat_u16(7);
1008  const vec_s16 max255 = vec_splat_s16(255);
1009  int i;
1010 
1011  // Various permutations
1012  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
1013  0x1, 0x11, 0,
1014  0x2, 0x12, 0,
1015  0x3, 0x13, 0,
1016  0x4, 0x14, 0,
1017  0x5 };
1018  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
1019  0x6, 0x16, 0,
1020  0x7, 0x17, 0 };
1021  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
1022  0x3, 0x4, 0x11,
1023  0x6, 0x7, 0x12,
1024  0x9, 0xa, 0x13,
1025  0xc, 0xd, 0x14,
1026  0xf };
1027  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
1028  0x2, 0x3, 0x16,
1029  0x5, 0x6, 0x17 };
1030 
1031  for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding bytes.
1032  vy = vec_ld(0, &buf0[i]);
1033  vy32_l = vec_unpackh(vy);
1034  vy32_r = vec_unpackl(vy);
1035  vy32_l = vec_sl(vy32_l, shift2);
1036  vy32_r = vec_sl(vy32_r, shift2);
1037 
1038  vu = vec_ld(0, &ubuf0[i]);
1039  vv = vec_ld(0, &vbuf0[i]);
1040  if (uvalpha < 2048) {
1041  vu = (vec_s16) vec_sub((vec_u16) vu, sub7);
1042  vv = (vec_s16) vec_sub((vec_u16) vv, sub7);
1043 
1044  tmp32 = vec_mule(vu, mul4);
1045  tmp32_2 = vec_mulo(vu, mul4);
1046  vu32_l = vec_mergeh(tmp32, tmp32_2);
1047  vu32_r = vec_mergel(tmp32, tmp32_2);
1048  tmp32 = vec_mule(vv, mul4);
1049  tmp32_2 = vec_mulo(vv, mul4);
1050  vv32_l = vec_mergeh(tmp32, tmp32_2);
1051  vv32_r = vec_mergel(tmp32, tmp32_2);
1052  } else {
1053  tmp16 = vec_ld(0, &ubuf1[i]);
1054  vu = vec_add(vu, tmp16);
1055  vu = (vec_s16) vec_sub((vec_u16) vu, sub8);
1056  tmp16 = vec_ld(0, &vbuf1[i]);
1057  vv = vec_add(vv, tmp16);
1058  vv = (vec_s16) vec_sub((vec_u16) vv, sub8);
1059 
1060  vu32_l = vec_mule(vu, mul8);
1061  vu32_r = vec_mulo(vu, mul8);
1062  vv32_l = vec_mule(vv, mul8);
1063  vv32_r = vec_mulo(vv, mul8);
1064  }
1065 
1066  if (hasAlpha) {
1067  A = vec_ld(0, &abuf0[i]);
1068  A = vec_add(A, add64);
1069  A = vec_sr(A, shift7);
1070  A = vec_max(A, max255);
1071  ad = vec_packsu(A, (vec_s16) zero16);
1072  } else {
1073  ad = vec_splats((uint8_t) 255);
1074  }
1075 
1076  vy32_l = vec_sub(vy32_l, y_offset);
1077  vy32_r = vec_sub(vy32_r, y_offset);
1078  vy32_l = vec_mul(vy32_l, y_coeff);
1079  vy32_r = vec_mul(vy32_r, y_coeff);
1080  vy32_l = vec_add(vy32_l, y_add);
1081  vy32_r = vec_add(vy32_r, y_add);
1082 
1083  R_l = vec_mul(vv32_l, v2r_coeff);
1084  R_l = vec_add(R_l, vy32_l);
1085  R_r = vec_mul(vv32_r, v2r_coeff);
1086  R_r = vec_add(R_r, vy32_r);
1087  G_l = vec_mul(vv32_l, v2g_coeff);
1088  tmp32 = vec_mul(vu32_l, u2g_coeff);
1089  G_l = vec_add(G_l, vy32_l);
1090  G_l = vec_add(G_l, tmp32);
1091  G_r = vec_mul(vv32_r, v2g_coeff);
1092  tmp32 = vec_mul(vu32_r, u2g_coeff);
1093  G_r = vec_add(G_r, vy32_r);
1094  G_r = vec_add(G_r, tmp32);
1095 
1096  B_l = vec_mul(vu32_l, u2b_coeff);
1097  B_l = vec_add(B_l, vy32_l);
1098  B_r = vec_mul(vu32_r, u2b_coeff);
1099  B_r = vec_add(B_r, vy32_r);
1100 
1101  WRITERGB
1102  }
1103 }
1104 
1105 static av_always_inline void
1106 yuv2rgb_1_vsx_template(SwsContext *c, const int16_t *buf0,
1107  const int16_t *ubuf[2], const int16_t *vbuf[2],
1108  const int16_t *abuf0, uint8_t *dest, int dstW,
1109  int uvalpha, int y, enum AVPixelFormat target,
1110  int hasAlpha)
1111 {
1112  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1113  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1114  vec_s16 vy, vu, vv, A = vec_splat_s16(0), tmp16;
1115  vec_s32 vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
1116  vec_s32 vud32_l, vud32_r, vvd32_l, vvd32_r;
1117  vec_s32 R_l, R_r, G_l, G_r, B_l, B_r;
1118  vec_u16 rd16, gd16, bd16;
1119  vec_u8 rd, bd, gd, ad, out0, out1, tmp8;
1120  const vec_u16 zero16 = vec_splat_u16(0);
1121  const vec_s32 y_offset = vec_splats(c->yuv2rgb_y_offset);
1122  const vec_s32 y_coeff = vec_splats(c->yuv2rgb_y_coeff);
1123  const vec_s32 y_add = vec_splats(1 << 21);
1124  const vec_s32 v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
1125  const vec_s32 v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
1126  const vec_s32 u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
1127  const vec_s32 u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
1128  const vec_s32 rgbclip = vec_splats(1 << 30);
1129  const vec_s32 zero32 = vec_splat_s32(0);
1130  const vec_u32 shift2 = vec_splat_u32(2);
1131  const vec_u32 shift22 = vec_splats(22U);
1132  const vec_u16 sub7 = vec_splats((uint16_t) (128 << 7));
1133  const vec_u16 sub8 = vec_splats((uint16_t) (128 << 8));
1134  const vec_s16 mul4 = vec_splat_s16(4);
1135  const vec_s16 mul8 = vec_splat_s16(8);
1136  const vec_s16 add64 = vec_splat_s16(64);
1137  const vec_u16 shift7 = vec_splat_u16(7);
1138  const vec_s16 max255 = vec_splat_s16(255);
1139  int i;
1140 
1141  // Various permutations
1142  const vec_u8 doubleleft = (vec_u8) {0, 1, 2, 3,
1143  0, 1, 2, 3,
1144  4, 5, 6, 7,
1145  4, 5, 6, 7 };
1146  const vec_u8 doubleright = (vec_u8) {8, 9, 10, 11,
1147  8, 9, 10, 11,
1148  12, 13, 14, 15,
1149  12, 13, 14, 15 };
1150  const vec_u8 perm3rg0 = (vec_u8) {0x0, 0x10, 0,
1151  0x1, 0x11, 0,
1152  0x2, 0x12, 0,
1153  0x3, 0x13, 0,
1154  0x4, 0x14, 0,
1155  0x5 };
1156  const vec_u8 perm3rg1 = (vec_u8) { 0x15, 0,
1157  0x6, 0x16, 0,
1158  0x7, 0x17, 0 };
1159  const vec_u8 perm3tb0 = (vec_u8) {0x0, 0x1, 0x10,
1160  0x3, 0x4, 0x11,
1161  0x6, 0x7, 0x12,
1162  0x9, 0xa, 0x13,
1163  0xc, 0xd, 0x14,
1164  0xf };
1165  const vec_u8 perm3tb1 = (vec_u8) { 0x0, 0x15,
1166  0x2, 0x3, 0x16,
1167  0x5, 0x6, 0x17 };
1168 
1169  for (i = 0; i < (dstW + 1) >> 1; i += 8) { // The x86 asm also overwrites padding bytes.
1170  vy = vec_ld(0, &buf0[i * 2]);
1171  vy32_l = vec_unpackh(vy);
1172  vy32_r = vec_unpackl(vy);
1173  vy32_l = vec_sl(vy32_l, shift2);
1174  vy32_r = vec_sl(vy32_r, shift2);
1175 
1176  vu = vec_ld(0, &ubuf0[i]);
1177  vv = vec_ld(0, &vbuf0[i]);
1178  if (uvalpha < 2048) {
1179  vu = (vec_s16) vec_sub((vec_u16) vu, sub7);
1180  vv = (vec_s16) vec_sub((vec_u16) vv, sub7);
1181 
1182  tmp32 = vec_mule(vu, mul4);
1183  tmp32_2 = vec_mulo(vu, mul4);
1184  vu32_l = vec_mergeh(tmp32, tmp32_2);
1185  vu32_r = vec_mergel(tmp32, tmp32_2);
1186  tmp32 = vec_mule(vv, mul4);
1187  tmp32_2 = vec_mulo(vv, mul4);
1188  vv32_l = vec_mergeh(tmp32, tmp32_2);
1189  vv32_r = vec_mergel(tmp32, tmp32_2);
1190  } else {
1191  tmp16 = vec_ld(0, &ubuf1[i]);
1192  vu = vec_add(vu, tmp16);
1193  vu = (vec_s16) vec_sub((vec_u16) vu, sub8);
1194  tmp16 = vec_ld(0, &vbuf1[i]);
1195  vv = vec_add(vv, tmp16);
1196  vv = (vec_s16) vec_sub((vec_u16) vv, sub8);
1197 
1198  vu32_l = vec_mule(vu, mul8);
1199  vu32_r = vec_mulo(vu, mul8);
1200  vv32_l = vec_mule(vv, mul8);
1201  vv32_r = vec_mulo(vv, mul8);
1202  }
1203 
1204  if (hasAlpha) {
1205  A = vec_ld(0, &abuf0[i]);
1206  A = vec_add(A, add64);
1207  A = vec_sr(A, shift7);
1208  A = vec_max(A, max255);
1209  ad = vec_packsu(A, (vec_s16) zero16);
1210  } else {
1211  ad = vec_splats((uint8_t) 255);
1212  }
1213 
1214  vy32_l = vec_sub(vy32_l, y_offset);
1215  vy32_r = vec_sub(vy32_r, y_offset);
1216  vy32_l = vec_mul(vy32_l, y_coeff);
1217  vy32_r = vec_mul(vy32_r, y_coeff);
1218  vy32_l = vec_add(vy32_l, y_add);
1219  vy32_r = vec_add(vy32_r, y_add);
1220 
1221  // Use the first UV half
1222  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
1223  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
1224  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
1225  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
1226 
1227  R_l = vec_mul(vvd32_l, v2r_coeff);
1228  R_l = vec_add(R_l, vy32_l);
1229  R_r = vec_mul(vvd32_r, v2r_coeff);
1230  R_r = vec_add(R_r, vy32_r);
1231  G_l = vec_mul(vvd32_l, v2g_coeff);
1232  tmp32 = vec_mul(vud32_l, u2g_coeff);
1233  G_l = vec_add(G_l, vy32_l);
1234  G_l = vec_add(G_l, tmp32);
1235  G_r = vec_mul(vvd32_r, v2g_coeff);
1236  tmp32 = vec_mul(vud32_r, u2g_coeff);
1237  G_r = vec_add(G_r, vy32_r);
1238  G_r = vec_add(G_r, tmp32);
1239 
1240  B_l = vec_mul(vud32_l, u2b_coeff);
1241  B_l = vec_add(B_l, vy32_l);
1242  B_r = vec_mul(vud32_r, u2b_coeff);
1243  B_r = vec_add(B_r, vy32_r);
1244 
1245  WRITERGB
1246 
1247  // New Y for the second half
1248  vy = vec_ld(16, &buf0[i * 2]);
1249  vy32_l = vec_unpackh(vy);
1250  vy32_r = vec_unpackl(vy);
1251  vy32_l = vec_sl(vy32_l, shift2);
1252  vy32_r = vec_sl(vy32_r, shift2);
1253 
1254  vy32_l = vec_sub(vy32_l, y_offset);
1255  vy32_r = vec_sub(vy32_r, y_offset);
1256  vy32_l = vec_mul(vy32_l, y_coeff);
1257  vy32_r = vec_mul(vy32_r, y_coeff);
1258  vy32_l = vec_add(vy32_l, y_add);
1259  vy32_r = vec_add(vy32_r, y_add);
1260 
1261  // Second UV half
1262  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
1263  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
1264  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
1265  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
1266 
1267  R_l = vec_mul(vvd32_l, v2r_coeff);
1268  R_l = vec_add(R_l, vy32_l);
1269  R_r = vec_mul(vvd32_r, v2r_coeff);
1270  R_r = vec_add(R_r, vy32_r);
1271  G_l = vec_mul(vvd32_l, v2g_coeff);
1272  tmp32 = vec_mul(vud32_l, u2g_coeff);
1273  G_l = vec_add(G_l, vy32_l);
1274  G_l = vec_add(G_l, tmp32);
1275  G_r = vec_mul(vvd32_r, v2g_coeff);
1276  tmp32 = vec_mul(vud32_r, u2g_coeff);
1277  G_r = vec_add(G_r, vy32_r);
1278  G_r = vec_add(G_r, tmp32);
1279 
1280  B_l = vec_mul(vud32_l, u2b_coeff);
1281  B_l = vec_add(B_l, vy32_l);
1282  B_r = vec_mul(vud32_r, u2b_coeff);
1283  B_r = vec_add(B_r, vy32_r);
1284 
1285  WRITERGB
1286  }
1287 }
1288 
1289 #undef WRITERGB
1290 
1291 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1292 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1293  const int16_t **lumSrc, int lumFilterSize, \
1294  const int16_t *chrFilter, const int16_t **chrUSrc, \
1295  const int16_t **chrVSrc, int chrFilterSize, \
1296  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1297  int y) \
1298 { \
1299  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1300  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1301  alpSrc, dest, dstW, y, fmt, hasAlpha); \
1302 }
1303 
1304 #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
1305 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1306  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1307  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1308  int yalpha, int uvalpha, int y) \
1309 { \
1310  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1311  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1312 }
1313 
1314 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1315 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1316  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1317  const int16_t *abuf0, uint8_t *dest, int dstW, \
1318  int uvalpha, int y) \
1319 { \
1320  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1321  dstW, uvalpha, y, fmt, hasAlpha); \
1322 }
1323 
1324 YUV2RGBWRAPPER(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1325 YUV2RGBWRAPPER(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1326 YUV2RGBWRAPPER(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1327 YUV2RGBWRAPPER(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1328 
1329 YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1330 YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1331 
1332 YUV2RGBWRAPPERX2(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1333 YUV2RGBWRAPPERX2(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1334 YUV2RGBWRAPPERX2(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1335 YUV2RGBWRAPPERX2(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1336 
1337 YUV2RGBWRAPPERX2(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1338 YUV2RGBWRAPPERX2(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1339 
1340 YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1341 YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1342 YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1343 YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1344 
1345 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1346 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1347 
1348 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1349 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1350 YUV2RGBWRAPPERX2(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1351 YUV2RGBWRAPPERX2(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1352 
1353 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1354 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1355 
1356 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1357 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1358 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1359 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1360 
1361 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1362 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1363 
1364 static av_always_inline void
1365 write422(const vec_s16 vy1, const vec_s16 vy2,
1366  const vec_s16 vu, const vec_s16 vv,
1367  uint8_t *dest, const enum AVPixelFormat target)
1368 {
1369  vec_u8 vd1, vd2, tmp;
1370  const vec_u8 yuyv1 = (vec_u8) {
1371  0x0, 0x10, 0x1, 0x18,
1372  0x2, 0x11, 0x3, 0x19,
1373  0x4, 0x12, 0x5, 0x1a,
1374  0x6, 0x13, 0x7, 0x1b };
1375  const vec_u8 yuyv2 = (vec_u8) {
1376  0x8, 0x14, 0x9, 0x1c,
1377  0xa, 0x15, 0xb, 0x1d,
1378  0xc, 0x16, 0xd, 0x1e,
1379  0xe, 0x17, 0xf, 0x1f };
1380  const vec_u8 yvyu1 = (vec_u8) {
1381  0x0, 0x18, 0x1, 0x10,
1382  0x2, 0x19, 0x3, 0x11,
1383  0x4, 0x1a, 0x5, 0x12,
1384  0x6, 0x1b, 0x7, 0x13 };
1385  const vec_u8 yvyu2 = (vec_u8) {
1386  0x8, 0x1c, 0x9, 0x14,
1387  0xa, 0x1d, 0xb, 0x15,
1388  0xc, 0x1e, 0xd, 0x16,
1389  0xe, 0x1f, 0xf, 0x17 };
1390  const vec_u8 uyvy1 = (vec_u8) {
1391  0x10, 0x0, 0x18, 0x1,
1392  0x11, 0x2, 0x19, 0x3,
1393  0x12, 0x4, 0x1a, 0x5,
1394  0x13, 0x6, 0x1b, 0x7 };
1395  const vec_u8 uyvy2 = (vec_u8) {
1396  0x14, 0x8, 0x1c, 0x9,
1397  0x15, 0xa, 0x1d, 0xb,
1398  0x16, 0xc, 0x1e, 0xd,
1399  0x17, 0xe, 0x1f, 0xf };
1400 
1401  vd1 = vec_packsu(vy1, vy2);
1402  vd2 = vec_packsu(vu, vv);
1403 
1404  switch (target) {
1405  case AV_PIX_FMT_YUYV422:
1406  tmp = vec_perm(vd1, vd2, yuyv1);
1407  vec_st(tmp, 0, dest);
1408  tmp = vec_perm(vd1, vd2, yuyv2);
1409  vec_st(tmp, 16, dest);
1410  break;
1411  case AV_PIX_FMT_YVYU422:
1412  tmp = vec_perm(vd1, vd2, yvyu1);
1413  vec_st(tmp, 0, dest);
1414  tmp = vec_perm(vd1, vd2, yvyu2);
1415  vec_st(tmp, 16, dest);
1416  break;
1417  case AV_PIX_FMT_UYVY422:
1418  tmp = vec_perm(vd1, vd2, uyvy1);
1419  vec_st(tmp, 0, dest);
1420  tmp = vec_perm(vd1, vd2, uyvy2);
1421  vec_st(tmp, 16, dest);
1422  break;
1423  }
1424 }
1425 
1426 static av_always_inline void
1427 yuv2422_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
1428  const int16_t **lumSrc, int lumFilterSize,
1429  const int16_t *chrFilter, const int16_t **chrUSrc,
1430  const int16_t **chrVSrc, int chrFilterSize,
1431  const int16_t **alpSrc, uint8_t *dest, int dstW,
1432  int y, enum AVPixelFormat target)
1433 {
1434  int i, j;
1435  vec_s16 vy1, vy2, vu, vv;
1436  vec_s32 vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
1437  vec_s16 vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
1438  const vec_s32 start = vec_splats(1 << 18);
1439  const vec_u32 shift19 = vec_splats(19U);
1440 
1441  for (i = 0; i < lumFilterSize; i++)
1442  vlumFilter[i] = vec_splats(lumFilter[i]);
1443  for (i = 0; i < chrFilterSize; i++)
1444  vchrFilter[i] = vec_splats(chrFilter[i]);
1445 
1446  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1447  vy32[0] =
1448  vy32[1] =
1449  vy32[2] =
1450  vy32[3] =
1451  vu32[0] =
1452  vu32[1] =
1453  vv32[0] =
1454  vv32[1] = start;
1455 
1456  for (j = 0; j < lumFilterSize; j++) {
1457  vv = vec_ld(0, &lumSrc[j][i * 2]);
1458  tmp = vec_mule(vv, vlumFilter[j]);
1459  tmp2 = vec_mulo(vv, vlumFilter[j]);
1460  tmp3 = vec_mergeh(tmp, tmp2);
1461  tmp4 = vec_mergel(tmp, tmp2);
1462 
1463  vy32[0] = vec_adds(vy32[0], tmp3);
1464  vy32[1] = vec_adds(vy32[1], tmp4);
1465 
1466  vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
1467  tmp = vec_mule(vv, vlumFilter[j]);
1468  tmp2 = vec_mulo(vv, vlumFilter[j]);
1469  tmp3 = vec_mergeh(tmp, tmp2);
1470  tmp4 = vec_mergel(tmp, tmp2);
1471 
1472  vy32[2] = vec_adds(vy32[2], tmp3);
1473  vy32[3] = vec_adds(vy32[3], tmp4);
1474  }
1475 
1476  for (j = 0; j < chrFilterSize; j++) {
1477  vv = vec_ld(0, &chrUSrc[j][i]);
1478  tmp = vec_mule(vv, vchrFilter[j]);
1479  tmp2 = vec_mulo(vv, vchrFilter[j]);
1480  tmp3 = vec_mergeh(tmp, tmp2);
1481  tmp4 = vec_mergel(tmp, tmp2);
1482 
1483  vu32[0] = vec_adds(vu32[0], tmp3);
1484  vu32[1] = vec_adds(vu32[1], tmp4);
1485 
1486  vv = vec_ld(0, &chrVSrc[j][i]);
1487  tmp = vec_mule(vv, vchrFilter[j]);
1488  tmp2 = vec_mulo(vv, vchrFilter[j]);
1489  tmp3 = vec_mergeh(tmp, tmp2);
1490  tmp4 = vec_mergel(tmp, tmp2);
1491 
1492  vv32[0] = vec_adds(vv32[0], tmp3);
1493  vv32[1] = vec_adds(vv32[1], tmp4);
1494  }
1495 
1496  for (j = 0; j < 4; j++) {
1497  vy32[j] = vec_sra(vy32[j], shift19);
1498  }
1499  for (j = 0; j < 2; j++) {
1500  vu32[j] = vec_sra(vu32[j], shift19);
1501  vv32[j] = vec_sra(vv32[j], shift19);
1502  }
1503 
1504  vy1 = vec_packs(vy32[0], vy32[1]);
1505  vy2 = vec_packs(vy32[2], vy32[3]);
1506  vu = vec_packs(vu32[0], vu32[1]);
1507  vv = vec_packs(vv32[0], vv32[1]);
1508 
1509  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1510  }
1511 }
1512 
1513 #define SETUP(x, buf0, buf1, alpha) { \
1514  x = vec_ld(0, buf0); \
1515  tmp = vec_mule(x, alpha); \
1516  tmp2 = vec_mulo(x, alpha); \
1517  tmp3 = vec_mergeh(tmp, tmp2); \
1518  tmp4 = vec_mergel(tmp, tmp2); \
1519 \
1520  x = vec_ld(0, buf1); \
1521  tmp = vec_mule(x, alpha); \
1522  tmp2 = vec_mulo(x, alpha); \
1523  tmp5 = vec_mergeh(tmp, tmp2); \
1524  tmp6 = vec_mergel(tmp, tmp2); \
1525 \
1526  tmp3 = vec_add(tmp3, tmp5); \
1527  tmp4 = vec_add(tmp4, tmp6); \
1528 \
1529  tmp3 = vec_sra(tmp3, shift19); \
1530  tmp4 = vec_sra(tmp4, shift19); \
1531  x = vec_packs(tmp3, tmp4); \
1532 }
1533 
1534 static av_always_inline void
1535 yuv2422_2_vsx_template(SwsContext *c, const int16_t *buf[2],
1536  const int16_t *ubuf[2], const int16_t *vbuf[2],
1537  const int16_t *abuf[2], uint8_t *dest, int dstW,
1538  int yalpha, int uvalpha, int y,
1539  enum AVPixelFormat target)
1540 {
1541  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1542  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1543  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1544  const int16_t yalpha1 = 4096 - yalpha;
1545  const int16_t uvalpha1 = 4096 - uvalpha;
1546  vec_s16 vy1, vy2, vu, vv;
1547  vec_s32 tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
1548  const vec_s16 vyalpha1 = vec_splats(yalpha1);
1549  const vec_s16 vuvalpha1 = vec_splats(uvalpha1);
1550  const vec_u32 shift19 = vec_splats(19U);
1551  int i;
1552  av_assert2(yalpha <= 4096U);
1553  av_assert2(uvalpha <= 4096U);
1554 
1555  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1556 
1557  SETUP(vy1, &buf0[i * 2], &buf1[i * 2], vyalpha1)
1558  SETUP(vy2, &buf0[(i + 4) * 2], &buf1[(i + 4) * 2], vyalpha1)
1559  SETUP(vu, &ubuf0[i], &ubuf1[i], vuvalpha1)
1560  SETUP(vv, &vbuf0[i], &vbuf1[i], vuvalpha1)
1561 
1562  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1563  }
1564 }
1565 
1566 #undef SETUP
1567 
1568 static av_always_inline void
1569 yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
1570  const int16_t *ubuf[2], const int16_t *vbuf[2],
1571  const int16_t *abuf0, uint8_t *dest, int dstW,
1572  int uvalpha, int y, enum AVPixelFormat target)
1573 {
1574  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1575  vec_s16 vy1, vy2, vu, vv, tmp;
1576  const vec_s16 add64 = vec_splats((int16_t) 64);
1577  const vec_s16 add128 = vec_splats((int16_t) 128);
1578  const vec_u16 shift7 = vec_splat_u16(7);
1579  const vec_u16 shift8 = vec_splat_u16(8);
1580  int i;
1581 
1582  if (uvalpha < 2048) {
1583  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1584  vy1 = vec_ld(0, &buf0[i * 2]);
1585  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1586  vu = vec_ld(0, &ubuf0[i]);
1587  vv = vec_ld(0, &vbuf0[i]);
1588 
1589  vy1 = vec_add(vy1, add64);
1590  vy2 = vec_add(vy2, add64);
1591  vu = vec_add(vu, add64);
1592  vv = vec_add(vv, add64);
1593 
1594  vy1 = vec_sra(vy1, shift7);
1595  vy2 = vec_sra(vy2, shift7);
1596  vu = vec_sra(vu, shift7);
1597  vv = vec_sra(vv, shift7);
1598 
1599  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1600  }
1601  } else {
1602  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1603  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1604  vy1 = vec_ld(0, &buf0[i * 2]);
1605  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1606  vu = vec_ld(0, &ubuf0[i]);
1607  tmp = vec_ld(0, &ubuf1[i]);
1608  vu = vec_adds(vu, tmp);
1609  vv = vec_ld(0, &vbuf0[i]);
1610  tmp = vec_ld(0, &vbuf1[i]);
1611  vv = vec_adds(vv, tmp);
1612 
1613  vy1 = vec_add(vy1, add64);
1614  vy2 = vec_add(vy2, add64);
1615  vu = vec_adds(vu, add128);
1616  vv = vec_adds(vv, add128);
1617 
1618  vy1 = vec_sra(vy1, shift7);
1619  vy2 = vec_sra(vy2, shift7);
1620  vu = vec_sra(vu, shift8);
1621  vv = vec_sra(vv, shift8);
1622 
1623  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1624  }
1625  }
1626 }
1627 
1628 #define YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1629 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1630  const int16_t **lumSrc, int lumFilterSize, \
1631  const int16_t *chrFilter, const int16_t **chrUSrc, \
1632  const int16_t **chrVSrc, int chrFilterSize, \
1633  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1634  int y) \
1635 { \
1636  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1637  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1638  alpSrc, dest, dstW, y, fmt); \
1639 }
1640 
1641 #define YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1642 YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1643 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1644  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1645  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1646  int yalpha, int uvalpha, int y) \
1647 { \
1648  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1649  dest, dstW, yalpha, uvalpha, y, fmt); \
1650 }
1651 
1652 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
1653 YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1654 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1655  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1656  const int16_t *abuf0, uint8_t *dest, int dstW, \
1657  int uvalpha, int y) \
1658 { \
1659  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
1660  abuf0, dest, dstW, uvalpha, \
1661  y, fmt); \
1662 }
1663 
1664 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
1665 YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
1666 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
1667 
1668 static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
1669  const uint8_t *src, int srcW, int xInc)
1670 {
1671  int i;
1672  unsigned int xpos = 0, xx;
1673  vec_u8 vin, vin2, vperm;
1674  vec_s8 vmul, valpha;
1675  vec_s16 vtmp, vtmp2, vtmp3, vtmp4;
1676  vec_u16 vd_l, vd_r, vcoord16[2];
1677  vec_u32 vcoord[4];
1678  const vec_u32 vadd = (vec_u32) {
1679  0,
1680  xInc * 1,
1681  xInc * 2,
1682  xInc * 3,
1683  };
1684  const vec_u16 vadd16 = (vec_u16) { // Modulo math
1685  0,
1686  xInc * 1,
1687  xInc * 2,
1688  xInc * 3,
1689  xInc * 4,
1690  xInc * 5,
1691  xInc * 6,
1692  xInc * 7,
1693  };
1694  const vec_u32 vshift16 = vec_splats((uint32_t) 16);
1695  const vec_u16 vshift9 = vec_splat_u16(9);
1696  const vec_u8 vzero = vec_splat_u8(0);
1697  const vec_u16 vshift = vec_splat_u16(7);
1698 
1699  for (i = 0; i < dstWidth; i += 16) {
1700  vcoord16[0] = vec_splats((uint16_t) xpos);
1701  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1702 
1703  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1704  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1705 
1706  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1707  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1708  valpha = (vec_s8) vec_pack(vcoord16[0], vcoord16[1]);
1709 
1710  xx = xpos >> 16;
1711  vin = vec_vsx_ld(0, &src[xx]);
1712 
1713  vcoord[0] = vec_splats(xpos & 0xffff);
1714  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1715  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1716  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1717 
1718  vcoord[0] = vec_add(vcoord[0], vadd);
1719  vcoord[1] = vec_add(vcoord[1], vadd);
1720  vcoord[2] = vec_add(vcoord[2], vadd);
1721  vcoord[3] = vec_add(vcoord[3], vadd);
1722 
1723  vcoord[0] = vec_sr(vcoord[0], vshift16);
1724  vcoord[1] = vec_sr(vcoord[1], vshift16);
1725  vcoord[2] = vec_sr(vcoord[2], vshift16);
1726  vcoord[3] = vec_sr(vcoord[3], vshift16);
1727 
1728  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1729  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1730  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1731 
1732  vin = vec_perm(vin, vin, vperm);
1733 
1734  vin2 = vec_vsx_ld(1, &src[xx]);
1735  vin2 = vec_perm(vin2, vin2, vperm);
1736 
1737  vmul = (vec_s8) vec_sub(vin2, vin);
1738  vtmp = vec_mule(vmul, valpha);
1739  vtmp2 = vec_mulo(vmul, valpha);
1740  vtmp3 = vec_mergeh(vtmp, vtmp2);
1741  vtmp4 = vec_mergel(vtmp, vtmp2);
1742 
1743  vd_l = (vec_u16) vec_mergeh(vin, vzero);
1744  vd_r = (vec_u16) vec_mergel(vin, vzero);
1745  vd_l = vec_sl(vd_l, vshift);
1746  vd_r = vec_sl(vd_r, vshift);
1747 
1748  vd_l = vec_add(vd_l, (vec_u16) vtmp3);
1749  vd_r = vec_add(vd_r, (vec_u16) vtmp4);
1750 
1751  vec_st((vec_s16) vd_l, 0, &dst[i]);
1752  vec_st((vec_s16) vd_r, 0, &dst[i + 8]);
1753 
1754  xpos += xInc * 16;
1755  }
1756  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1757  dst[i] = src[srcW-1]*128;
1758 }
1759 
1760 #define HCSCALE(in, out) \
1761  vin = vec_vsx_ld(0, &in[xx]); \
1762  vin = vec_perm(vin, vin, vperm); \
1763 \
1764  vin2 = vec_vsx_ld(1, &in[xx]); \
1765  vin2 = vec_perm(vin2, vin2, vperm); \
1766 \
1767  vtmp = vec_mule(vin, valphaxor); \
1768  vtmp2 = vec_mulo(vin, valphaxor); \
1769  vtmp3 = vec_mergeh(vtmp, vtmp2); \
1770  vtmp4 = vec_mergel(vtmp, vtmp2); \
1771 \
1772  vtmp = vec_mule(vin2, valpha); \
1773  vtmp2 = vec_mulo(vin2, valpha); \
1774  vd_l = vec_mergeh(vtmp, vtmp2); \
1775  vd_r = vec_mergel(vtmp, vtmp2); \
1776 \
1777  vd_l = vec_add(vd_l, vtmp3); \
1778  vd_r = vec_add(vd_r, vtmp4); \
1779 \
1780  vec_st((vec_s16) vd_l, 0, &out[i]); \
1781  vec_st((vec_s16) vd_r, 0, &out[i + 8])
1782 
1783 static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
1784  int dstWidth, const uint8_t *src1,
1785  const uint8_t *src2, int srcW, int xInc)
1786 {
1787  int i;
1788  unsigned int xpos = 0, xx;
1789  vec_u8 vin, vin2, vperm;
1790  vec_u8 valpha, valphaxor;
1791  vec_u16 vtmp, vtmp2, vtmp3, vtmp4;
1792  vec_u16 vd_l, vd_r, vcoord16[2];
1793  vec_u32 vcoord[4];
1794  const vec_u8 vxor = vec_splats((uint8_t) 127);
1795  const vec_u32 vadd = (vec_u32) {
1796  0,
1797  xInc * 1,
1798  xInc * 2,
1799  xInc * 3,
1800  };
1801  const vec_u16 vadd16 = (vec_u16) { // Modulo math
1802  0,
1803  xInc * 1,
1804  xInc * 2,
1805  xInc * 3,
1806  xInc * 4,
1807  xInc * 5,
1808  xInc * 6,
1809  xInc * 7,
1810  };
1811  const vec_u32 vshift16 = vec_splats((uint32_t) 16);
1812  const vec_u16 vshift9 = vec_splat_u16(9);
1813 
1814  for (i = 0; i < dstWidth; i += 16) {
1815  vcoord16[0] = vec_splats((uint16_t) xpos);
1816  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1817 
1818  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1819  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1820 
1821  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1822  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1823  valpha = vec_pack(vcoord16[0], vcoord16[1]);
1824  valphaxor = vec_xor(valpha, vxor);
1825 
1826  xx = xpos >> 16;
1827 
1828  vcoord[0] = vec_splats(xpos & 0xffff);
1829  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1830  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1831  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1832 
1833  vcoord[0] = vec_add(vcoord[0], vadd);
1834  vcoord[1] = vec_add(vcoord[1], vadd);
1835  vcoord[2] = vec_add(vcoord[2], vadd);
1836  vcoord[3] = vec_add(vcoord[3], vadd);
1837 
1838  vcoord[0] = vec_sr(vcoord[0], vshift16);
1839  vcoord[1] = vec_sr(vcoord[1], vshift16);
1840  vcoord[2] = vec_sr(vcoord[2], vshift16);
1841  vcoord[3] = vec_sr(vcoord[3], vshift16);
1842 
1843  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1844  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1845  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1846 
1847  HCSCALE(src1, dst1);
1848  HCSCALE(src2, dst2);
1849 
1850  xpos += xInc * 16;
1851  }
1852  for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1853  dst1[i] = src1[srcW-1]*128;
1854  dst2[i] = src2[srcW-1]*128;
1855  }
1856 }
1857 
1858 #undef HCSCALE
1859 
1860 static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1861  const uint8_t *src, const int16_t *filter,
1862  const int32_t *filterPos, int filterSize)
1863 {
1864  int i, j;
1865  int32_t *dst = (int32_t *) _dst;
1866  vec_s16 vfilter, vin;
1867  vec_u8 vin8;
1868  vec_s32 vout;
1869  const vec_u8 vzero = vec_splat_u8(0);
1870  const vec_u8 vunusedtab[8] = {
1871  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1872  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1873  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1874  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1875  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1876  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1877  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1878  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1879  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1880  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1881  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1882  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1883  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1884  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1885  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1886  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1887  };
1888  const vec_u8 vunused = vunusedtab[filterSize % 8];
1889 
1890  if (filterSize == 1) {
1891  for (i = 0; i < dstW; i++) {
1892  int srcPos = filterPos[i];
1893  int val = 0;
1894  for (j = 0; j < filterSize; j++) {
1895  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
1896  }
1897  dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
1898  }
1899  } else {
1900  for (i = 0; i < dstW; i++) {
1901  const int srcPos = filterPos[i];
1902  vout = vec_splat_s32(0);
1903  for (j = 0; j < filterSize; j += 8) {
1904  vin8 = vec_vsx_ld(0, &src[srcPos + j]);
1905  vin = (vec_s16) vec_mergeh(vin8, vzero);
1906  if (j + 8 > filterSize) // Remove the unused elements on the last round
1907  vin = vec_perm(vin, (vec_s16) vzero, vunused);
1908 
1909  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1910  vout = vec_msums(vin, vfilter, vout);
1911  }
1912  vout = vec_sums(vout, (vec_s32) vzero);
1913  dst[i] = FFMIN(vout[3] >> 3, (1 << 19) - 1);
1914  }
1915  }
1916 }
1917 
1918 static void hScale16To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1919  const uint8_t *_src, const int16_t *filter,
1920  const int32_t *filterPos, int filterSize)
1921 {
1923  int i, j;
1924  int32_t *dst = (int32_t *) _dst;
1925  const uint16_t *src = (const uint16_t *) _src;
1926  int bits = desc->comp[0].depth - 1;
1927  int sh = bits - 4;
1928  vec_s16 vfilter, vin;
1929  vec_s32 vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
1930  const vec_u8 vzero = vec_splat_u8(0);
1931  const vec_u8 vunusedtab[8] = {
1932  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1933  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1934  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1935  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1936  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1937  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1938  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1939  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1940  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1941  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1942  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1943  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1944  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1945  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1946  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1947  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1948  };
1949  const vec_u8 vunused = vunusedtab[filterSize % 8];
1950 
1951  if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
1952  sh = 9;
1953  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */
1954  sh = 16 - 1 - 4;
1955  }
1956 
1957  if (filterSize == 1) {
1958  for (i = 0; i < dstW; i++) {
1959  int srcPos = filterPos[i];
1960  int val = 0;
1961 
1962  for (j = 0; j < filterSize; j++) {
1963  val += src[srcPos + j] * filter[filterSize * i + j];
1964  }
1965  // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1966  dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1967  }
1968  } else {
1969  for (i = 0; i < dstW; i++) {
1970  const int srcPos = filterPos[i];
1971  vout = vec_splat_s32(0);
1972  for (j = 0; j < filterSize; j += 8) {
1973  vin = (vec_s16) vec_vsx_ld(0, &src[srcPos + j]);
1974  if (j + 8 > filterSize) // Remove the unused elements on the last round
1975  vin = vec_perm(vin, (vec_s16) vzero, vunused);
1976 
1977  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1978  vfilter32_l = vec_unpackh(vfilter);
1979  vfilter32_r = vec_unpackl(vfilter);
1980 
1981  vtmp = (vec_s32) vec_mergeh(vin, (vec_s16) vzero);
1982  vtmp2 = (vec_s32) vec_mergel(vin, (vec_s16) vzero);
1983 
1984  vtmp = vec_mul(vtmp, vfilter32_l);
1985  vtmp2 = vec_mul(vtmp2, vfilter32_r);
1986 
1987  vout = vec_adds(vout, vtmp);
1988  vout = vec_adds(vout, vtmp2);
1989  }
1990  vout = vec_sums(vout, (vec_s32) vzero);
1991  dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1);
1992  }
1993  }
1994 }
1995 
1996 static void hScale16To15_vsx(SwsContext *c, int16_t *dst, int dstW,
1997  const uint8_t *_src, const int16_t *filter,
1998  const int32_t *filterPos, int filterSize)
1999 {
2001  int i, j;
2002  const uint16_t *src = (const uint16_t *) _src;
2003  int sh = desc->comp[0].depth - 1;
2004  vec_s16 vfilter, vin;
2005  vec_s32 vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
2006  const vec_u8 vzero = vec_splat_u8(0);
2007  const vec_u8 vunusedtab[8] = {
2008  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2009  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
2010  (vec_u8) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
2011  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2012  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
2013  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2014  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
2015  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2016  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2017  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2018  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2019  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2020  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2021  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
2022  (vec_u8) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2023  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
2024  };
2025  const vec_u8 vunused = vunusedtab[filterSize % 8];
2026 
2027  if (sh<15) {
2028  sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
2029  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */
2030  sh = 16 - 1;
2031  }
2032 
2033  if (filterSize == 1) {
2034  for (i = 0; i < dstW; i++) {
2035  int srcPos = filterPos[i];
2036  int val = 0;
2037 
2038  for (j = 0; j < filterSize; j++) {
2039  val += src[srcPos + j] * filter[filterSize * i + j];
2040  }
2041  // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2042  dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2043  }
2044  } else {
2045  for (i = 0; i < dstW; i++) {
2046  const int srcPos = filterPos[i];
2047  vout = vec_splat_s32(0);
2048  for (j = 0; j < filterSize; j += 8) {
2049  vin = (vec_s16) vec_vsx_ld(0, &src[srcPos + j]);
2050  if (j + 8 > filterSize) // Remove the unused elements on the last round
2051  vin = vec_perm(vin, (vec_s16) vzero, vunused);
2052 
2053  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
2054  vfilter32_l = vec_unpackh(vfilter);
2055  vfilter32_r = vec_unpackl(vfilter);
2056 
2057  vtmp = (vec_s32) vec_mergeh(vin, (vec_s16) vzero);
2058  vtmp2 = (vec_s32) vec_mergel(vin, (vec_s16) vzero);
2059 
2060  vtmp = vec_mul(vtmp, vfilter32_l);
2061  vtmp2 = vec_mul(vtmp2, vfilter32_r);
2062 
2063  vout = vec_adds(vout, vtmp);
2064  vout = vec_adds(vout, vtmp2);
2065  }
2066  vout = vec_sums(vout, (vec_s32) vzero);
2067  dst[i] = FFMIN(vout[3] >> sh, (1 << 15) - 1);
2068  }
2069  }
2070 }
2071 
2072 #endif /* !HAVE_BIGENDIAN */
2073 
2074 #endif /* HAVE_VSX */
2075 
2077 {
2078 #if HAVE_VSX
2079  enum AVPixelFormat dstFormat = c->dstFormat;
2080  const int cpu_flags = av_get_cpu_flags();
2081  const unsigned char power8 = HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8;
2082 
2083  if (!(cpu_flags & AV_CPU_FLAG_VSX))
2084  return;
2085 
2086 #if !HAVE_BIGENDIAN
2087  if (c->srcBpc == 8) {
2088  if (c->dstBpc <= 14) {
2089  c->hyScale = c->hcScale = hScale_real_vsx;
2090  if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
2091  c->hyscale_fast = hyscale_fast_vsx;
2092  c->hcscale_fast = hcscale_fast_vsx;
2093  }
2094  } else {
2095  c->hyScale = c->hcScale = hScale8To19_vsx;
2096  }
2097  } else {
2098  if (power8) {
2099  c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_vsx
2100  : hScale16To15_vsx;
2101  }
2102  }
2103  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
2104  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
2105  !c->needAlpha) {
2106  c->yuv2planeX = yuv2planeX_vsx;
2107  }
2108 #endif
2109 
2110  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
2111  switch (c->dstBpc) {
2112  case 8:
2113  c->yuv2plane1 = yuv2plane1_8_vsx;
2114  break;
2115 #if !HAVE_BIGENDIAN
2116  case 9:
2117  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx : yuv2plane1_9LE_vsx;
2118  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx : yuv2planeX_9LE_vsx;
2119  break;
2120  case 10:
2121  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx;
2122  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx : yuv2planeX_10LE_vsx;
2123  break;
2124  case 12:
2125  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx;
2126  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx : yuv2planeX_12LE_vsx;
2127  break;
2128  case 14:
2129  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx;
2130  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx : yuv2planeX_14LE_vsx;
2131  break;
2132  case 16:
2133  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx;
2134 #if HAVE_POWER8
2135  if (cpu_flags & AV_CPU_FLAG_POWER8) {
2136  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : yuv2planeX_16LE_vsx;
2137  }
2138 #endif /* HAVE_POWER8 */
2139  break;
2140 #endif /* !HAVE_BIGENDIAN */
2141  }
2142  }
2143 
2144  if (c->flags & SWS_BITEXACT)
2145  return;
2146 
2147 #if !HAVE_BIGENDIAN
2148  if (c->flags & SWS_FULL_CHR_H_INT) {
2149  switch (dstFormat) {
2150  case AV_PIX_FMT_RGB24:
2151  if (power8) {
2152  c->yuv2packed1 = yuv2rgb24_full_1_vsx;
2153  c->yuv2packed2 = yuv2rgb24_full_2_vsx;
2154  c->yuv2packedX = yuv2rgb24_full_X_vsx;
2155  }
2156  break;
2157  case AV_PIX_FMT_BGR24:
2158  if (power8) {
2159  c->yuv2packed1 = yuv2bgr24_full_1_vsx;
2160  c->yuv2packed2 = yuv2bgr24_full_2_vsx;
2161  c->yuv2packedX = yuv2bgr24_full_X_vsx;
2162  }
2163  break;
2164  case AV_PIX_FMT_BGRA:
2165  if (power8) {
2166  if (!c->needAlpha) {
2167  c->yuv2packed1 = yuv2bgrx32_full_1_vsx;
2168  c->yuv2packed2 = yuv2bgrx32_full_2_vsx;
2169  c->yuv2packedX = yuv2bgrx32_full_X_vsx;
2170  }
2171  }
2172  break;
2173  case AV_PIX_FMT_RGBA:
2174  if (power8) {
2175  if (!c->needAlpha) {
2176  c->yuv2packed1 = yuv2rgbx32_full_1_vsx;
2177  c->yuv2packed2 = yuv2rgbx32_full_2_vsx;
2178  c->yuv2packedX = yuv2rgbx32_full_X_vsx;
2179  }
2180  }
2181  break;
2182  case AV_PIX_FMT_ARGB:
2183  if (power8) {
2184  if (!c->needAlpha) {
2185  c->yuv2packed1 = yuv2xrgb32_full_1_vsx;
2186  c->yuv2packed2 = yuv2xrgb32_full_2_vsx;
2187  c->yuv2packedX = yuv2xrgb32_full_X_vsx;
2188  }
2189  }
2190  break;
2191  case AV_PIX_FMT_ABGR:
2192  if (power8) {
2193  if (!c->needAlpha) {
2194  c->yuv2packed1 = yuv2xbgr32_full_1_vsx;
2195  c->yuv2packed2 = yuv2xbgr32_full_2_vsx;
2196  c->yuv2packedX = yuv2xbgr32_full_X_vsx;
2197  }
2198  }
2199  break;
2200  }
2201  } else { /* !SWS_FULL_CHR_H_INT */
2202  switch (dstFormat) {
2203  case AV_PIX_FMT_YUYV422:
2204  c->yuv2packed1 = yuv2yuyv422_1_vsx;
2205  c->yuv2packed2 = yuv2yuyv422_2_vsx;
2206  c->yuv2packedX = yuv2yuyv422_X_vsx;
2207  break;
2208  case AV_PIX_FMT_YVYU422:
2209  c->yuv2packed1 = yuv2yvyu422_1_vsx;
2210  c->yuv2packed2 = yuv2yvyu422_2_vsx;
2211  c->yuv2packedX = yuv2yvyu422_X_vsx;
2212  break;
2213  case AV_PIX_FMT_UYVY422:
2214  c->yuv2packed1 = yuv2uyvy422_1_vsx;
2215  c->yuv2packed2 = yuv2uyvy422_2_vsx;
2216  c->yuv2packedX = yuv2uyvy422_X_vsx;
2217  break;
2218  case AV_PIX_FMT_BGRA:
2219  if (power8) {
2220  if (!c->needAlpha) {
2221  c->yuv2packed1 = yuv2bgrx32_1_vsx;
2222  c->yuv2packed2 = yuv2bgrx32_2_vsx;
2223  }
2224  }
2225  break;
2226  case AV_PIX_FMT_RGBA:
2227  if (power8) {
2228  if (!c->needAlpha) {
2229  c->yuv2packed1 = yuv2rgbx32_1_vsx;
2230  c->yuv2packed2 = yuv2rgbx32_2_vsx;
2231  }
2232  }
2233  break;
2234  case AV_PIX_FMT_ARGB:
2235  if (power8) {
2236  if (!c->needAlpha) {
2237  c->yuv2packed1 = yuv2xrgb32_1_vsx;
2238  c->yuv2packed2 = yuv2xrgb32_2_vsx;
2239  }
2240  }
2241  break;
2242  case AV_PIX_FMT_ABGR:
2243  if (power8) {
2244  if (!c->needAlpha) {
2245  c->yuv2packed1 = yuv2xbgr32_1_vsx;
2246  c->yuv2packed2 = yuv2xbgr32_2_vsx;
2247  }
2248  }
2249  break;
2250  case AV_PIX_FMT_RGB24:
2251  if (power8) {
2252  c->yuv2packed1 = yuv2rgb24_1_vsx;
2253  c->yuv2packed2 = yuv2rgb24_2_vsx;
2254  }
2255  break;
2256  case AV_PIX_FMT_BGR24:
2257  if (power8) {
2258  c->yuv2packed1 = yuv2bgr24_1_vsx;
2259  c->yuv2packed2 = yuv2bgr24_2_vsx;
2260  }
2261  break;
2262  }
2263  }
2264 #endif /* !HAVE_BIGENDIAN */
2265 
2266 #endif /* HAVE_VSX */
2267 }
packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
Definition: pixfmt.h:81
#define vec_s8
Definition: util_altivec.h:35
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:340
#define NULL
Definition: coverity.c:32
void(* hcScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
const char const char void * val
Definition: avisynth_c.h:863
#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)
Definition: output.c:1801
static av_always_inline int isAnyRGB(enum AVPixelFormat pix_fmt)
static int shift(int a, int b)
Definition: sonic.c:82
const AVPixFmtDescriptor * av_pix_fmt_desc_get(enum AVPixelFormat pix_fmt)
Definition: pixdesc.c:2522
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:68
const char * desc
Definition: nvenc.c:68
#define mul8(a, b)
#define vec_s32
Definition: util_altivec.h:39
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
Definition: output.c:1788
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
void(* hyScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
Scale one horizontal line of input data using a filter over the input lines, to produce one (differently sized) line of output data.
static atomic_int cpu_flags
Definition: cpu.c:50
void(* hyscale_fast)(struct SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
Scale one horizontal line of input data using a bilinear filter to produce one line of output data...
#define src
Definition: vp8dsp.c:254
Macro definitions for various function/variable attributes.
AVComponentDescriptor comp[4]
Parameters that describe how pixels are packed.
Definition: pixdesc.h:117
uint8_t
#define av_cold
Definition: attributes.h:82
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:64
8 bits with AV_PIX_FMT_RGB32 palette
Definition: pixfmt.h:77
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
#define AV_PIX_FMT_FLAG_FLOAT
The pixel format contains IEEE-754 floating point values.
Definition: pixdesc.h:188
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
#define SWS_FULL_CHR_H_INT
Definition: swscale.h:79
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:94
#define SWS_FAST_BILINEAR
Definition: swscale.h:58
external API header
enum AVPixelFormat dstFormat
Destination pixel format.
#define A(x)
Definition: vp56_arith.h:28
yuv2packedX_fn yuv2packedX
#define vec_s16
Definition: util_altivec.h:37
#define U(x)
Definition: vp56_arith.h:37
const AVS_VideoInfo * vi
Definition: avisynth_c.h:887
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:95
static const uint8_t dither[8][8]
Definition: vf_fspp.c:57
yuv2packed1_fn yuv2packed1
uint8_t bits
Definition: vp3data.h:202
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
void(* hcscale_fast)(struct SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:92
int chrDstW
Width of destination chroma planes.
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:93
uint64_t flags
Combination of AV_PIX_FMT_FLAG_...
Definition: pixdesc.h:106
#define FFMIN(a, b)
Definition: common.h:96
yuv2planar1_fn yuv2plane1
int32_t
packed YUV 4:2:2, 16bpp, Y0 Cr Y1 Cb
Definition: pixfmt.h:210
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:69
#define vec_u8
Definition: util_altivec.h:34
int dstW
Width of destination luma/alpha planes.
#define MAX_FILTER_SIZE
Definition: af_dynaudnorm.c:33
static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
#define src1
Definition: h264pred.c:139
#define vec_u32
Definition: util_altivec.h:38
Descriptor that unambiguously describes how the bits of a pixel are stored in the up to 4 data planes...
Definition: pixdesc.h:81
yuv2planarX_fn yuv2planeX
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
Definition: pixfmt.h:67
void * buf
Definition: avisynth_c.h:766
#define AV_CPU_FLAG_VSX
ISA 2.06.
Definition: cpu.h:61
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:93
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
Definition: output.c:1812
Contains misc utility macros and inline functions.
#define SWS_BITEXACT
Definition: swscale.h:84
#define AV_CPU_FLAG_POWER8
ISA 2.07.
Definition: cpu.h:62
yuv2NBPS(yuv2NBPS(9, yuv2NBPS(BE, yuv2NBPS(1, yuv2NBPS(10, int16_t)
Definition: output.c:369
int
yuv2packed2_fn yuv2packed2
#define LOCAL_ALIGNED(a, t, v,...)
Definition: internal.h:114
static double clip(void *opaque, double val)
Clip value val in the minval - maxval range.
Definition: vf_lut.c:162
static const int shift2[6]
Definition: dxa.c:51
static const uint8_t shifts[2][12]
Definition: camellia.c:174
enum AVPixelFormat srcFormat
Source pixel format.
#define output_pixel(pos, val, bias, signedness)
Definition: output.c:887
av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
Definition: swscale_vsx.c:2076
#define vec_u16
Definition: util_altivec.h:36
void INT64 start
Definition: avisynth_c.h:766
#define av_always_inline
Definition: attributes.h:39
int chrSrcW
Width of source chroma planes.
int depth
Number of bits in the component.
Definition: pixdesc.h:58
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:341
#define YUV2PACKEDWRAPPER(name, base, ext, fmt)
Definition: output.c:709
int srcW
Width of source luma/alpha planes.
int flags
Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
AVPixelFormat
Pixel format.
Definition: pixfmt.h:64
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
static uint8_t tmp[11]
Definition: aes_ctr.c:26