/*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"

#if HAVE_VSX
#define vzero vec_splat_s32(0)

#if !HAVE_BIGENDIAN
#define GET_LS(a,b,c,s) {\
        ls = a;\
        a = vec_vsx_ld(((b) << 1) + 16, s);\
    }

#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
        vector signed short ls;\
        vector signed int   vf1, vf2, i1, i2;\
        GET_LS(l1, x, perm, src);\
        i1  = vec_mule(filter, ls);\
        i2  = vec_mulo(filter, ls);\
        vf1 = vec_mergeh(i1, i2);\
        vf2 = vec_mergel(i1, i2);\
        d1  = vec_add(d1, vf1);\
        d2  = vec_add(d2, vf2);\
    } while (0)
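
/* vec_mule/vec_mulo return the 32-bit products of the even- and odd-numbered
 * 16-bit lanes respectively, so for source lanes {s0..s7} and filter f:
 *     i1 = {s0*f0, s2*f2, s4*f4, s6*f6}
 *     i2 = {s1*f1, s3*f3, s5*f5, s7*f7}
 *     vec_mergeh(i1, i2) = {s0*f0, s1*f1, s2*f2, s3*f3}
 *     vec_mergel(i1, i2) = {s4*f4, s5*f5, s6*f6, s7*f7}
 * i.e. the merge pair restores ascending lane order at 32-bit width before
 * the accumulation into d1/d2. */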

#define LOAD_FILTER(vf,f) {\
        vf = vec_vsx_ld(joffset, f);\
}
#define LOAD_L1(ll1,s,p){\
        ll1 = vec_vsx_ld(xoffset, s);\
}

// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
    vf = (vector signed short) vec_vsx_ld(a << 3, f);\
    vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {}
#define UPDATE_PTR(s0, d0, s1, d1) {}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    vf = vec_vsx_ld(pos + a, s);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
}

#define FUNC(name) name ## _vsx
#include "swscale_ppc_template.c"
#undef FUNC

#undef vzero

#endif /* !HAVE_BIGENDIAN */

static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset, int start)
{
    int i;
    for (i = start; i < dstW; i++) {
        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
        dest[i] = av_clip_uint8(val);
    }
}
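
/* Scalar fallback used for the unaligned head and the sub-vector tail of a
 * row: each output adds an 8-entry dither to the 15-bit intermediate (7
 * fractional bits) and drops to 8 bits, e.g. (4096 + 32) >> 7 = 32 before
 * the uint8 clip. */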

static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    const int dst_u = -(uintptr_t)dest & 15;
    int i, j;
    LOCAL_ALIGNED(16, int16_t, val, [16]);
    const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
    vector int16_t vi, vileft, ditherleft, ditherright;
    vector uint8_t vd;

    for (j = 0; j < 16; j++) {
        val[j] = dither[(dst_u + offset + j) & 7];
    }

    ditherleft  = vec_ld(0, val);
    ditherright = vec_ld(0, &val[8]);

    yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);

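    /* dst_u = -(uintptr_t)dest & 15 is the number of output bytes before the
     * next 16-byte boundary: the scalar helper above covers that unaligned
     * head, the loop below stores whole aligned 16-pixel blocks with vec_st,
     * and a final scalar call handles the tail from the last i to dstW. */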
    for (i = dst_u; i < dstW - 15; i += 16) {

        vi = vec_vsx_ld(0, &src[i]);
        vi = vec_adds(ditherleft, vi);
        vileft = vec_sra(vi, shifts);

        vi = vec_vsx_ld(0, &src[i + 8]);
        vi = vec_adds(ditherright, vi);
        vi = vec_sra(vi, shifts);

        vd = vec_packsu(vileft, vi);
        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
}

#if !HAVE_BIGENDIAN

#define output_pixel(pos, val) \
    if (big_endian) { \
        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    } else { \
        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    }
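
/* E.g. with output_bits = 10 (so shift = 5), output_pixel(&dest[i], val)
 * stores av_clip_uintp2(val >> 5, 10) with AV_WB16 or AV_WL16, placing the
 * 16-bit sample in the requested endianness whatever the host order is. */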

static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    int i;
    int shift = 15 - output_bits;

    for (i = start; i < dstW; i++) {
        int val = src[i] + (1 << (shift - 1));
        output_pixel(&dest[i], val);
    }
}

static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
                                int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 15 - output_bits;
    const int add = (1 << (shift - 1));
    const int clip = (1 << output_bits) - 1;
    const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift);
    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
    vector uint16_t v;
    int i;

    yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);

    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);
        v = vec_min(v, vlargest);
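        /* vec_rl by 8 bits per halfword swaps the two bytes of each lane:
         * an in-register endianness conversion when big_endian is set, and
         * a rotate by 0 (a no-op) otherwise. */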
        v = vec_rl(v, vswap);
        vec_st(v, 0, &dest[i]);
    }

    yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
}

static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
                              const int16_t **src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    int i;
    int shift = 11 + 16 - output_bits;

    for (i = start; i < dstW; i++) {
        int val = 1 << (shift - 1);
        int j;

        for (j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];

        output_pixel(&dest[i], val);
    }
}

static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
                                const int16_t **src, uint16_t *dest, int dstW,
                                int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 11 + 16 - output_bits;
    const int add = (1 << (shift - 1));
    const int clip = (1 << output_bits) - 1;
    const uint16_t swap = big_endian ? 8 : 0;
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
    const vector int16_t vzero = vec_splat_s16(0);
    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
    vector uint16_t v;
    vector uint32_t vleft, vright, vtmp;
    int i, j;

    for (i = 0; i < filterSize; i++) {
        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], filter[i],
                                       filter[i], filter[i], filter[i], filter[i]};
    }

    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);

    for (i = dst_u; i < dstW - 7; i += 8) {
        vleft = vright = vadd;

        for (j = 0; j < filterSize; j++) {
            vin = vec_vsx_ld(0, &src[j][i]);
            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
            vleft = vec_add(vleft, vtmp);
            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
            vright = vec_add(vright, vtmp);
        }

        vleft = vec_sra(vleft, vshift);
        vright = vec_sra(vright, vshift);
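        /* vleft/vright hold the even- and odd-lane filter sums, so the
         * packsu result is ordered {0,2,4,6,1,3,5,7}; vperm re-interleaves
         * the two halves into ascending pixel order before the store. */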
        v = vec_packsu(vleft, vright);
        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
        v = vec_min(v, vlargest);
        v = vec_rl(v, vswap);
        v = vec_perm(v, v, vperm);
        vec_st(v, 0, &dest[i]);
    }

    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
}

#undef output_pixel

#define output_pixel(pos, val, bias, signedness) \
    if (big_endian) { \
        AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    } else { \
        AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    }

static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
                            int big_endian, int output_bits, int start)
{
    int i;
    const int shift = 3;

    for (i = start; i < dstW; i++) {
        int val = src[i] + (1 << (shift - 1));
        output_pixel(&dest[i], val, 0, uint);
    }
}

static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 3;
    const int add = (1 << (shift - 1));
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
    vector uint32_t v, v2;
    vector uint16_t vd;
    int i;

    yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);

    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);

        v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
        v2 = vec_add(v2, vadd);
        v2 = vec_sr(v2, vshift);

        vd = vec_packsu(v, v2);
        vd = vec_rl(vd, vswap);

        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
}

#if HAVE_POWER8

static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
                            const int32_t **src, uint16_t *dest, int dstW,
                            int big_endian, int output_bits, int start)
{
    int i;
    int shift = 15;

    for (i = start; i < dstW; i++) {
        int val = 1 << (shift - 1);
        int j;

        /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
         * filters (or anything with negative coeffs), the range can be slightly
         * wider in both directions. To account for this overflow, we subtract
         * a constant so it always fits in the signed range (assuming a
         * reasonable filterSize), and re-add that at the end. */
        val -= 0x40000000;
        for (j = 0; j < filterSize; j++)
            val += src[j][i] * (unsigned)filter[j];

        output_pixel(&dest[i], val, 0x8000, int);
    }
}
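
/* Worked through: val starts at (1 << 14) - 0x40000000; after the filter sum
 * and the >> 15, the pre-subtracted 0x40000000 contributes exactly -0x8000,
 * which the 0x8000 bias in output_pixel(..., 0x8000, int) adds back, so the
 * pair is a net no-op that only exists to keep the signed 32-bit accumulator
 * from overflowing. */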

static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
                              const int32_t **src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 15;
    const int bias = 0x8000;
    const int add = (1 << (shift - 1)) - 0x40000000;
    const uint16_t swap = big_endian ? 8 : 0;
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
    const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias, bias, bias, bias, bias, bias};
    vector int32_t vfilter[MAX_FILTER_SIZE];
    vector uint16_t v;
    vector uint32_t vleft, vright, vtmp;
    vector int32_t vin32l, vin32r;
    int i, j;

    for (i = 0; i < filterSize; i++) {
        vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i], filter[i]};
    }

    yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);

    for (i = dst_u; i < dstW - 7; i += 8) {
        vleft = vright = vadd;

        for (j = 0; j < filterSize; j++) {
            vin32l = vec_vsx_ld(0, &src[j][i]);
            vin32r = vec_vsx_ld(0, &src[j][i + 4]);

            vtmp = (vector uint32_t) vec_mul(vin32l, vfilter[j]);
            vleft = vec_add(vleft, vtmp);
            vtmp = (vector uint32_t) vec_mul(vin32r, vfilter[j]);
            vright = vec_add(vright, vtmp);
        }

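        /* The vec_sra + vec_packs steps below mirror the scalar
         * av_clip_int16(val >> 15): the signed pack saturates to 16 bits,
         * and adding vbias (0x8000) afterwards maps the signed result onto
         * the unsigned 16-bit output range. */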
        vleft = vec_sra(vleft, vshift);
        vright = vec_sra(vright, vshift);
        v = (vector uint16_t) vec_packs((vector int32_t) vleft, (vector int32_t) vright);
        v = vec_add(v, vbias);
        v = vec_rl(v, vswap);
        vec_st(v, 0, &dest[i]);
    }

    yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
}

#endif /* HAVE_POWER8 */

#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
    yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
    yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t)

#define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
                                                 uint8_t *dest, int dstW, \
                                                 const uint8_t *dither, int offset) \
{ \
    yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
                                         (uint16_t *) dest, dstW, is_be, bits); \
}

#define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
                                                 const int16_t **src, uint8_t *dest, int dstW, \
                                                 const uint8_t *dither, int offset)\
{ \
    yuv2planeX_ ## template_size ## _vsx(filter, \
                                         filterSize, (const typeX_t **) src, \
                                         (uint16_t *) dest, dstW, is_be, bits); \
}

yuv2NBPS( 9, BE, 1, nbps, int16_t)
yuv2NBPS( 9, LE, 0, nbps, int16_t)
yuv2NBPS(10, BE, 1, nbps, int16_t)
yuv2NBPS(10, LE, 0, nbps, int16_t)
yuv2NBPS(12, BE, 1, nbps, int16_t)
yuv2NBPS(12, LE, 0, nbps, int16_t)
yuv2NBPS(14, BE, 1, nbps, int16_t)
yuv2NBPS(14, LE, 0, nbps, int16_t)

yuv2NBPS1(16, BE, 1, 16, int32_t)
yuv2NBPS1(16, LE, 0, 16, int32_t)
#if HAVE_POWER8
yuv2NBPSX(16, BE, 1, 16, int32_t)
yuv2NBPSX(16, LE, 0, 16, int32_t)
#endif
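
/* For example, yuv2NBPS(9, LE, 0, nbps, int16_t) expands to the pair
 * yuv2plane1_9LE_vsx() and yuv2planeX_9LE_vsx(): thin wrappers that cast the
 * buffers and call yuv2plane1_nbps_vsx()/yuv2planeX_nbps_vsx() with
 * is_be = 0 and bits = 9. */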

#define WRITERGB \
    R_l = vec_max(R_l, zero32); \
    R_r = vec_max(R_r, zero32); \
    G_l = vec_max(G_l, zero32); \
    G_r = vec_max(G_r, zero32); \
    B_l = vec_max(B_l, zero32); \
    B_r = vec_max(B_r, zero32); \
\
    R_l = vec_min(R_l, rgbclip); \
    R_r = vec_min(R_r, rgbclip); \
    G_l = vec_min(G_l, rgbclip); \
    G_r = vec_min(G_r, rgbclip); \
    B_l = vec_min(B_l, rgbclip); \
    B_r = vec_min(B_r, rgbclip); \
\
    R_l = vec_sr(R_l, shift22); \
    R_r = vec_sr(R_r, shift22); \
    G_l = vec_sr(G_l, shift22); \
    G_r = vec_sr(G_r, shift22); \
    B_l = vec_sr(B_l, shift22); \
    B_r = vec_sr(B_r, shift22); \
\
    rd16 = vec_packsu(R_l, R_r); \
    gd16 = vec_packsu(G_l, G_r); \
    bd16 = vec_packsu(B_l, B_r); \
    rd = vec_packsu(rd16, zero16); \
    gd = vec_packsu(gd16, zero16); \
    bd = vec_packsu(bd16, zero16); \
\
    switch (target) { \
    case AV_PIX_FMT_RGB24: \
        out0 = vec_perm(rd, gd, perm3rg0); \
        out0 = vec_perm(out0, bd, perm3tb0); \
        out1 = vec_perm(rd, gd, perm3rg1); \
        out1 = vec_perm(out1, bd, perm3tb1); \
\
        vec_vsx_st(out0, 0, dest); \
        vec_vsx_st(out1, 16, dest); \
\
        dest += 24; \
        break; \
    case AV_PIX_FMT_BGR24: \
        out0 = vec_perm(bd, gd, perm3rg0); \
        out0 = vec_perm(out0, rd, perm3tb0); \
        out1 = vec_perm(bd, gd, perm3rg1); \
        out1 = vec_perm(out1, rd, perm3tb1); \
\
        vec_vsx_st(out0, 0, dest); \
        vec_vsx_st(out1, 16, dest); \
\
        dest += 24; \
        break; \
    case AV_PIX_FMT_BGRA: \
        out0 = vec_mergeh(bd, gd); \
        out1 = vec_mergeh(rd, ad); \
\
        tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 0, dest); \
        tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 16, dest); \
\
        dest += 32; \
        break; \
    case AV_PIX_FMT_RGBA: \
        out0 = vec_mergeh(rd, gd); \
        out1 = vec_mergeh(bd, ad); \
\
        tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 0, dest); \
        tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 16, dest); \
\
        dest += 32; \
        break; \
    case AV_PIX_FMT_ARGB: \
        out0 = vec_mergeh(ad, rd); \
        out1 = vec_mergeh(gd, bd); \
\
        tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 0, dest); \
        tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 16, dest); \
\
        dest += 32; \
        break; \
    case AV_PIX_FMT_ABGR: \
        out0 = vec_mergeh(ad, bd); \
        out1 = vec_mergeh(gd, rd); \
\
        tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 0, dest); \
        tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
        vec_vsx_st(tmp8, 16, dest); \
\
        dest += 32; \
        break; \
    }
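
/* For the 4-byte formats the two vec_mergeh passes interleave the channel
 * bytes into 8 BGRA/RGBA/ARGB/ABGR pixels. For the 24-bit formats,
 * perm3rg0/perm3rg1 interleave the first two channels' bytes with gaps and
 * perm3tb0/perm3tb1 drop the third channel into those gaps; note the two
 * 16-byte stores write 32 bytes while dest only advances by 24, so the
 * 8-byte tail is scratch that the next iteration (or the row padding)
 * overwrites. */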

static av_always_inline void
yuv2rgb_full_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
                            const int16_t **lumSrc, int lumFilterSize,
                            const int16_t *chrFilter, const int16_t **chrUSrc,
                            const int16_t **chrVSrc, int chrFilterSize,
                            const int16_t **alpSrc, uint8_t *dest,
                            int dstW, int y, enum AVPixelFormat target, int hasAlpha)
{
    vector int16_t vv;
    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
    vector int32_t tmp, tmp2, tmp3, tmp4;
    vector uint16_t rd16, gd16, bd16;
    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
    vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
    const vector int32_t ystart = vec_splats(1 << 9);
    const vector int32_t uvstart = vec_splats((1 << 9) - (128 << 19));
    const vector uint16_t zero16 = vec_splat_u16(0);
    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
    const vector int32_t y_add = vec_splats(1 << 21);
    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
    const vector int32_t rgbclip = vec_splats(1 << 30);
    const vector int32_t zero32 = vec_splat_s32(0);
    const vector uint32_t shift22 = vec_splats(22U);
    const vector uint32_t shift10 = vec_splat_u32(10);
    int i, j;

    // Various permutations
    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10,    0,
                                                      0x1, 0x11,    0,
                                                      0x2, 0x12,    0,
                                                      0x3, 0x13,    0,
                                                      0x4, 0x14,    0,
                                                      0x5 };
    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15,    0,
                                                      0x6, 0x16,    0,
                                                      0x7, 0x17,    0 };
    const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
                                                      0x3, 0x4, 0x11,
                                                      0x6, 0x7, 0x12,
                                                      0x9, 0xa, 0x13,
                                                      0xc, 0xd, 0x14,
                                                      0xf };
    const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
                                                      0x2, 0x3, 0x16,
                                                      0x5, 0x6, 0x17 };

    ad = vec_splats((uint8_t) 255);

    for (i = 0; i < lumFilterSize; i++)
        vlumFilter[i] = vec_splats(lumFilter[i]);
    for (i = 0; i < chrFilterSize; i++)
        vchrFilter[i] = vec_splats(chrFilter[i]);

    for (i = 0; i < dstW; i += 8) {
        vy32_l =
        vy32_r = ystart;
        vu32_l =
        vu32_r =
        vv32_l =
        vv32_r = uvstart;

        for (j = 0; j < lumFilterSize; j++) {
            vv = vec_ld(0, &lumSrc[j][i]);
            tmp = vec_mule(vv, vlumFilter[j]);
            tmp2 = vec_mulo(vv, vlumFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vy32_l = vec_adds(vy32_l, tmp3);
            vy32_r = vec_adds(vy32_r, tmp4);
        }

        for (j = 0; j < chrFilterSize; j++) {
            vv = vec_ld(0, &chrUSrc[j][i]);
            tmp = vec_mule(vv, vchrFilter[j]);
            tmp2 = vec_mulo(vv, vchrFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vu32_l = vec_adds(vu32_l, tmp3);
            vu32_r = vec_adds(vu32_r, tmp4);

            vv = vec_ld(0, &chrVSrc[j][i]);
            tmp = vec_mule(vv, vchrFilter[j]);
            tmp2 = vec_mulo(vv, vchrFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vv32_l = vec_adds(vv32_l, tmp3);
            vv32_r = vec_adds(vv32_r, tmp4);
        }

        vy32_l = vec_sra(vy32_l, shift10);
        vy32_r = vec_sra(vy32_r, shift10);
        vu32_l = vec_sra(vu32_l, shift10);
        vu32_r = vec_sra(vu32_r, shift10);
        vv32_l = vec_sra(vv32_l, shift10);
        vv32_r = vec_sra(vv32_r, shift10);

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        R_l = vec_mul(vv32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vv32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vv32_l, v2g_coeff);
        tmp32 = vec_mul(vu32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vv32_r, v2g_coeff);
        tmp32 = vec_mul(vu32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vu32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vu32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB
    }
}

#define SETUP(x, buf0, alpha1, buf1, alpha) { \
    x = vec_ld(0, buf0); \
    tmp = vec_mule(x, alpha1); \
    tmp2 = vec_mulo(x, alpha1); \
    tmp3 = vec_mergeh(tmp, tmp2); \
    tmp4 = vec_mergel(tmp, tmp2); \
\
    x = vec_ld(0, buf1); \
    tmp = vec_mule(x, alpha); \
    tmp2 = vec_mulo(x, alpha); \
    tmp5 = vec_mergeh(tmp, tmp2); \
    tmp6 = vec_mergel(tmp, tmp2); \
\
    tmp3 = vec_add(tmp3, tmp5); \
    tmp4 = vec_add(tmp4, tmp6); \
}
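
/* SETUP leaves the two-row blend buf0[i] * alpha1 + buf1[i] * alpha in
 * tmp3/tmp4 as 32-bit sums (even/odd multiply + merge again). With
 * alpha1 = 4096 - alpha this is the usual linear interpolation, e.g. for
 * luma Y = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 10, the >> 10 being
 * applied by the caller via shift10. */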

static av_always_inline void
yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf[2], uint8_t *dest, int dstW,
                            int yalpha, int uvalpha, int y,
                            enum AVPixelFormat target, int hasAlpha)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    const int16_t yalpha1 = 4096 - yalpha;
    const int16_t uvalpha1 = 4096 - uvalpha;
    vector int16_t vy, vu, vv, A = vec_splat_s16(0);
    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
    vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
    vector uint16_t rd16, gd16, bd16;
    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
    const vector int16_t vyalpha1 = vec_splats(yalpha1);
    const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
    const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
    const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
    const vector uint16_t zero16 = vec_splat_u16(0);
    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
    const vector int32_t y_add = vec_splats(1 << 21);
    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
    const vector int32_t rgbclip = vec_splats(1 << 30);
    const vector int32_t zero32 = vec_splat_s32(0);
    const vector uint32_t shift19 = vec_splats(19U);
    const vector uint32_t shift22 = vec_splats(22U);
    const vector uint32_t shift10 = vec_splat_u32(10);
    const vector int32_t dec128 = vec_splats(128 << 19);
    const vector int32_t add18 = vec_splats(1 << 18);
    int i;

    // Various permutations
    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10,    0,
                                                      0x1, 0x11,    0,
                                                      0x2, 0x12,    0,
                                                      0x3, 0x13,    0,
                                                      0x4, 0x14,    0,
                                                      0x5 };
    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15,    0,
                                                      0x6, 0x16,    0,
                                                      0x7, 0x17,    0 };
    const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
                                                      0x3, 0x4, 0x11,
                                                      0x6, 0x7, 0x12,
                                                      0x9, 0xa, 0x13,
                                                      0xc, 0xd, 0x14,
                                                      0xf };
    const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
                                                      0x2, 0x3, 0x16,
                                                      0x5, 0x6, 0x17 };

    av_assert2(yalpha  <= 4096U);
    av_assert2(uvalpha <= 4096U);

    for (i = 0; i < dstW; i += 8) {
        SETUP(vy, &buf0[i], vyalpha1, &buf1[i], vyalpha);
        vy32_l = vec_sra(tmp3, shift10);
        vy32_r = vec_sra(tmp4, shift10);

        SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
        tmp3 = vec_sub(tmp3, dec128);
        tmp4 = vec_sub(tmp4, dec128);
        vu32_l = vec_sra(tmp3, shift10);
        vu32_r = vec_sra(tmp4, shift10);

        SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
        tmp3 = vec_sub(tmp3, dec128);
        tmp4 = vec_sub(tmp4, dec128);
        vv32_l = vec_sra(tmp3, shift10);
        vv32_r = vec_sra(tmp4, shift10);

        if (hasAlpha) {
            SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
            tmp3 = vec_add(tmp3, add18);
            tmp4 = vec_add(tmp4, add18);
            tmp3 = vec_sra(tmp3, shift19);
            tmp4 = vec_sra(tmp4, shift19);
            A = vec_packs(tmp3, tmp4);
            ad = vec_packsu(A, (vector int16_t) zero16);
        } else {
            ad = vec_splats((uint8_t) 255);
        }

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        R_l = vec_mul(vv32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vv32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vv32_l, v2g_coeff);
        tmp32 = vec_mul(vu32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vv32_r, v2g_coeff);
        tmp32 = vec_mul(vu32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vu32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vu32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB
    }
}

static av_always_inline void
yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf[2], uint8_t *dest, int dstW,
                       int yalpha, int uvalpha, int y,
                       enum AVPixelFormat target, int hasAlpha)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    const int16_t yalpha1 = 4096 - yalpha;
    const int16_t uvalpha1 = 4096 - uvalpha;
    vector int16_t vy, vu, vv, A = vec_splat_s16(0);
    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
    vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
    vector uint16_t rd16, gd16, bd16;
    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
    const vector int16_t vyalpha1 = vec_splats(yalpha1);
    const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
    const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
    const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
    const vector uint16_t zero16 = vec_splat_u16(0);
    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
    const vector int32_t y_add = vec_splats(1 << 21);
    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
    const vector int32_t rgbclip = vec_splats(1 << 30);
    const vector int32_t zero32 = vec_splat_s32(0);
    const vector uint32_t shift19 = vec_splats(19U);
    const vector uint32_t shift22 = vec_splats(22U);
    const vector uint32_t shift10 = vec_splat_u32(10);
    const vector int32_t dec128 = vec_splats(128 << 19);
    const vector int32_t add18 = vec_splats(1 << 18);
    int i;

    // Various permutations
    const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
                                                        0, 1, 2, 3,
                                                        4, 5, 6, 7,
                                                        4, 5, 6, 7 };
    const vector uint8_t doubleright = (vector uint8_t) {8,  9,  10, 11,
                                                         8,  9,  10, 11,
                                                         12, 13, 14, 15,
                                                         12, 13, 14, 15 };
    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10,    0,
                                                      0x1, 0x11,    0,
                                                      0x2, 0x12,    0,
                                                      0x3, 0x13,    0,
                                                      0x4, 0x14,    0,
                                                      0x5 };
    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15,    0,
                                                      0x6, 0x16,    0,
                                                      0x7, 0x17,    0 };
    const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
                                                      0x3, 0x4, 0x11,
                                                      0x6, 0x7, 0x12,
                                                      0x9, 0xa, 0x13,
                                                      0xc, 0xd, 0x14,
                                                      0xf };
    const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
                                                      0x2, 0x3, 0x16,
                                                      0x5, 0x6, 0x17 };

    av_assert2(yalpha  <= 4096U);
    av_assert2(uvalpha <= 4096U);

    for (i = 0; i < (dstW + 1) >> 1; i += 8) {
        SETUP(vy, &buf0[i * 2], vyalpha1, &buf1[i * 2], vyalpha);
        vy32_l = vec_sra(tmp3, shift10);
        vy32_r = vec_sra(tmp4, shift10);

        SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
        tmp3 = vec_sub(tmp3, dec128);
        tmp4 = vec_sub(tmp4, dec128);
        vu32_l = vec_sra(tmp3, shift10);
        vu32_r = vec_sra(tmp4, shift10);

        SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
        tmp3 = vec_sub(tmp3, dec128);
        tmp4 = vec_sub(tmp4, dec128);
        vv32_l = vec_sra(tmp3, shift10);
        vv32_r = vec_sra(tmp4, shift10);

        if (hasAlpha) {
            SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
            tmp3 = vec_add(tmp3, add18);
            tmp4 = vec_add(tmp4, add18);
            tmp3 = vec_sra(tmp3, shift19);
            tmp4 = vec_sra(tmp4, shift19);
            A = vec_packs(tmp3, tmp4);
            ad = vec_packsu(A, (vector int16_t) zero16);
        } else {
            ad = vec_splats((uint8_t) 255);
        }

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        // Use the first UV half
        vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
        vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
        vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
        vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);

        R_l = vec_mul(vvd32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vvd32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vvd32_l, v2g_coeff);
        tmp32 = vec_mul(vud32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vvd32_r, v2g_coeff);
        tmp32 = vec_mul(vud32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vud32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vud32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB

        // New Y for the second half
        SETUP(vy, &buf0[i * 2 + 8], vyalpha1, &buf1[i * 2 + 8], vyalpha);
        vy32_l = vec_sra(tmp3, shift10);
        vy32_r = vec_sra(tmp4, shift10);

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        // Second UV half
        vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
        vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
        vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
        vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);

        R_l = vec_mul(vvd32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vvd32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vvd32_l, v2g_coeff);
        tmp32 = vec_mul(vud32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vvd32_r, v2g_coeff);
        tmp32 = vec_mul(vud32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vud32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vud32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB
    }
}

#undef SETUP

static av_always_inline void
yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf0, uint8_t *dest, int dstW,
                            int uvalpha, int y, enum AVPixelFormat target,
                            int hasAlpha)
{
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
    vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
    vector uint16_t rd16, gd16, bd16;
    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
    const vector uint16_t zero16 = vec_splat_u16(0);
    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
    const vector int32_t y_add = vec_splats(1 << 21);
    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
    const vector int32_t rgbclip = vec_splats(1 << 30);
    const vector int32_t zero32 = vec_splat_s32(0);
    const vector uint32_t shift2 = vec_splat_u32(2);
    const vector uint32_t shift22 = vec_splats(22U);
    const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
    const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
    const vector int16_t mul4 = vec_splat_s16(4);
    const vector int16_t mul8 = vec_splat_s16(8);
    const vector int16_t add64 = vec_splats((int16_t) 64);
    const vector uint16_t shift7 = vec_splat_u16(7);
    const vector int16_t max255 = vec_splats((int16_t) 255);
    int i;

    // Various permutations
    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10,    0,
                                                      0x1, 0x11,    0,
                                                      0x2, 0x12,    0,
                                                      0x3, 0x13,    0,
                                                      0x4, 0x14,    0,
                                                      0x5 };
    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15,    0,
                                                      0x6, 0x16,    0,
                                                      0x7, 0x17,    0 };
    const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
                                                      0x3, 0x4, 0x11,
                                                      0x6, 0x7, 0x12,
                                                      0x9, 0xa, 0x13,
                                                      0xc, 0xd, 0x14,
                                                      0xf };
    const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
                                                      0x2, 0x3, 0x16,
                                                      0x5, 0x6, 0x17 };

    for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding bytes.
        vy = vec_ld(0, &buf0[i]);
        vy32_l = vec_unpackh(vy);
        vy32_r = vec_unpackl(vy);
        vy32_l = vec_sl(vy32_l, shift2);
        vy32_r = vec_sl(vy32_r, shift2);

        vu = vec_ld(0, &ubuf0[i]);
        vv = vec_ld(0, &vbuf0[i]);
        if (uvalpha < 2048) {
            vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub7);
            vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub7);

            tmp32 = vec_mule(vu, mul4);
            tmp32_2 = vec_mulo(vu, mul4);
            vu32_l = vec_mergeh(tmp32, tmp32_2);
            vu32_r = vec_mergel(tmp32, tmp32_2);
            tmp32 = vec_mule(vv, mul4);
            tmp32_2 = vec_mulo(vv, mul4);
            vv32_l = vec_mergeh(tmp32, tmp32_2);
            vv32_r = vec_mergel(tmp32, tmp32_2);
        } else {
            tmp16 = vec_ld(0, &ubuf1[i]);
            vu = vec_add(vu, tmp16);
            vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub8);
            tmp16 = vec_ld(0, &vbuf1[i]);
            vv = vec_add(vv, tmp16);
            vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub8);

            vu32_l = vec_mule(vu, mul8);
            vu32_r = vec_mulo(vu, mul8);
            vv32_l = vec_mule(vv, mul8);
            vv32_r = vec_mulo(vv, mul8);
        }

        if (hasAlpha) {
            A = vec_ld(0, &abuf0[i]);
            A = vec_add(A, add64);
            A = vec_sr(A, shift7);
            A = vec_min(A, max255);
            ad = vec_packsu(A, (vector int16_t) zero16);
        } else {
            ad = vec_splats((uint8_t) 255);
        }

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        R_l = vec_mul(vv32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vv32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vv32_l, v2g_coeff);
        tmp32 = vec_mul(vu32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vv32_r, v2g_coeff);
        tmp32 = vec_mul(vu32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vu32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vu32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB
    }
}

static av_always_inline void
yuv2rgb_1_vsx_template(SwsContext *c, const int16_t *buf0,
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf0, uint8_t *dest, int dstW,
                       int uvalpha, int y, enum AVPixelFormat target,
                       int hasAlpha)
{
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
    vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
    vector int32_t vud32_l, vud32_r, vvd32_l, vvd32_r;
    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
    vector uint16_t rd16, gd16, bd16;
    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
    const vector uint16_t zero16 = vec_splat_u16(0);
    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
    const vector int32_t y_add = vec_splats(1 << 21);
    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
    const vector int32_t rgbclip = vec_splats(1 << 30);
    const vector int32_t zero32 = vec_splat_s32(0);
    const vector uint32_t shift2 = vec_splat_u32(2);
    const vector uint32_t shift22 = vec_splats(22U);
    const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
    const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
    const vector int16_t mul4 = vec_splat_s16(4);
    const vector int16_t mul8 = vec_splat_s16(8);
    const vector int16_t add64 = vec_splats((int16_t) 64);
    const vector uint16_t shift7 = vec_splat_u16(7);
    const vector int16_t max255 = vec_splats((int16_t) 255);
    int i;

    // Various permutations
    const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
                                                        0, 1, 2, 3,
                                                        4, 5, 6, 7,
                                                        4, 5, 6, 7 };
    const vector uint8_t doubleright = (vector uint8_t) {8,  9,  10, 11,
                                                         8,  9,  10, 11,
                                                         12, 13, 14, 15,
                                                         12, 13, 14, 15 };
    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10,    0,
                                                      0x1, 0x11,    0,
                                                      0x2, 0x12,    0,
                                                      0x3, 0x13,    0,
                                                      0x4, 0x14,    0,
                                                      0x5 };
    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15,    0,
                                                      0x6, 0x16,    0,
                                                      0x7, 0x17,    0 };
    const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
                                                      0x3, 0x4, 0x11,
                                                      0x6, 0x7, 0x12,
                                                      0x9, 0xa, 0x13,
                                                      0xc, 0xd, 0x14,
                                                      0xf };
    const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
                                                      0x2, 0x3, 0x16,
                                                      0x5, 0x6, 0x17 };

    for (i = 0; i < (dstW + 1) >> 1; i += 8) { // The x86 asm also overwrites padding bytes.
        vy = vec_ld(0, &buf0[i * 2]);
        vy32_l = vec_unpackh(vy);
        vy32_r = vec_unpackl(vy);
        vy32_l = vec_sl(vy32_l, shift2);
        vy32_r = vec_sl(vy32_r, shift2);

        vu = vec_ld(0, &ubuf0[i]);
        vv = vec_ld(0, &vbuf0[i]);
        if (uvalpha < 2048) {
            vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub7);
            vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub7);

            tmp32 = vec_mule(vu, mul4);
            tmp32_2 = vec_mulo(vu, mul4);
            vu32_l = vec_mergeh(tmp32, tmp32_2);
            vu32_r = vec_mergel(tmp32, tmp32_2);
            tmp32 = vec_mule(vv, mul4);
            tmp32_2 = vec_mulo(vv, mul4);
            vv32_l = vec_mergeh(tmp32, tmp32_2);
            vv32_r = vec_mergel(tmp32, tmp32_2);
        } else {
            tmp16 = vec_ld(0, &ubuf1[i]);
            vu = vec_add(vu, tmp16);
            vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub8);
            tmp16 = vec_ld(0, &vbuf1[i]);
            vv = vec_add(vv, tmp16);
            vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub8);

            vu32_l = vec_mule(vu, mul8);
            vu32_r = vec_mulo(vu, mul8);
            vv32_l = vec_mule(vv, mul8);
            vv32_r = vec_mulo(vv, mul8);
        }

        if (hasAlpha) {
            A = vec_ld(0, &abuf0[i]);
            A = vec_add(A, add64);
            A = vec_sr(A, shift7);
            A = vec_min(A, max255);
            ad = vec_packsu(A, (vector int16_t) zero16);
        } else {
            ad = vec_splats((uint8_t) 255);
        }

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        // Use the first UV half
        vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
        vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
        vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
        vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);

        R_l = vec_mul(vvd32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vvd32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vvd32_l, v2g_coeff);
        tmp32 = vec_mul(vud32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vvd32_r, v2g_coeff);
        tmp32 = vec_mul(vud32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vud32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vud32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB

        // New Y for the second half
        vy = vec_ld(16, &buf0[i * 2]);
        vy32_l = vec_unpackh(vy);
        vy32_r = vec_unpackl(vy);
        vy32_l = vec_sl(vy32_l, shift2);
        vy32_r = vec_sl(vy32_r, shift2);

        vy32_l = vec_sub(vy32_l, y_offset);
        vy32_r = vec_sub(vy32_r, y_offset);
        vy32_l = vec_mul(vy32_l, y_coeff);
        vy32_r = vec_mul(vy32_r, y_coeff);
        vy32_l = vec_add(vy32_l, y_add);
        vy32_r = vec_add(vy32_r, y_add);

        // Second UV half
        vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
        vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
        vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
        vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);

        R_l = vec_mul(vvd32_l, v2r_coeff);
        R_l = vec_add(R_l, vy32_l);
        R_r = vec_mul(vvd32_r, v2r_coeff);
        R_r = vec_add(R_r, vy32_r);
        G_l = vec_mul(vvd32_l, v2g_coeff);
        tmp32 = vec_mul(vud32_l, u2g_coeff);
        G_l = vec_add(G_l, vy32_l);
        G_l = vec_add(G_l, tmp32);
        G_r = vec_mul(vvd32_r, v2g_coeff);
        tmp32 = vec_mul(vud32_r, u2g_coeff);
        G_r = vec_add(G_r, vy32_r);
        G_r = vec_add(G_r, tmp32);

        B_l = vec_mul(vud32_l, u2b_coeff);
        B_l = vec_add(B_l, vy32_l);
        B_r = vec_mul(vud32_r, u2b_coeff);
        B_r = vec_add(B_r, vy32_r);

        WRITERGB
    }
}

#undef WRITERGB

#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
                                  const int16_t **lumSrc, int lumFilterSize, \
                                  const int16_t *chrFilter, const int16_t **chrUSrc, \
                                  const int16_t **chrVSrc, int chrFilterSize, \
                                  const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                  int y) \
{ \
    name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                    alpSrc, dest, dstW, y, fmt, hasAlpha); \
}

#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                  const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                  int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
                                    dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
}

#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                  const int16_t *abuf0, uint8_t *dest, int dstW, \
                                  int uvalpha, int y) \
{ \
    name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                    dstW, uvalpha, y, fmt, hasAlpha); \
}

YUV2RGBWRAPPER(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPER(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPER(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
YUV2RGBWRAPPER(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)

YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)

YUV2RGBWRAPPERX2(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPERX2(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPERX2(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
YUV2RGBWRAPPERX2(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)

YUV2RGBWRAPPERX2(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPERX2(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)

YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)

YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)

YUV2RGBWRAPPERX2(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPERX2(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPERX2(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
YUV2RGBWRAPPERX2(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)

YUV2RGBWRAPPERX2(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPERX2(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)

YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)

YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)

static av_always_inline void
write422(const vector int16_t vy1, const vector int16_t vy2,
         const vector int16_t vu, const vector int16_t vv,
         uint8_t *dest, const enum AVPixelFormat target)
{
    vector uint8_t vd1, vd2, tmp;
    const vector uint8_t yuyv1 = (vector uint8_t) {
        0x0, 0x10, 0x1, 0x18,
        0x2, 0x11, 0x3, 0x19,
        0x4, 0x12, 0x5, 0x1a,
        0x6, 0x13, 0x7, 0x1b };
    const vector uint8_t yuyv2 = (vector uint8_t) {
        0x8, 0x14, 0x9, 0x1c,
        0xa, 0x15, 0xb, 0x1d,
        0xc, 0x16, 0xd, 0x1e,
        0xe, 0x17, 0xf, 0x1f };
    const vector uint8_t yvyu1 = (vector uint8_t) {
        0x0, 0x18, 0x1, 0x10,
        0x2, 0x19, 0x3, 0x11,
        0x4, 0x1a, 0x5, 0x12,
        0x6, 0x1b, 0x7, 0x13 };
    const vector uint8_t yvyu2 = (vector uint8_t) {
        0x8, 0x1c, 0x9, 0x14,
        0xa, 0x1d, 0xb, 0x15,
        0xc, 0x1e, 0xd, 0x16,
        0xe, 0x1f, 0xf, 0x17 };
    const vector uint8_t uyvy1 = (vector uint8_t) {
        0x10, 0x0, 0x18, 0x1,
        0x11, 0x2, 0x19, 0x3,
        0x12, 0x4, 0x1a, 0x5,
        0x13, 0x6, 0x1b, 0x7 };
    const vector uint8_t uyvy2 = (vector uint8_t) {
        0x14, 0x8, 0x1c, 0x9,
        0x15, 0xa, 0x1d, 0xb,
        0x16, 0xc, 0x1e, 0xd,
        0x17, 0xe, 0x1f, 0xf };

    vd1 = vec_packsu(vy1, vy2);
    vd2 = vec_packsu(vu, vv);

    switch (target) {
    case AV_PIX_FMT_YUYV422:
        tmp = vec_perm(vd1, vd2, yuyv1);
        vec_st(tmp, 0, dest);
        tmp = vec_perm(vd1, vd2, yuyv2);
        vec_st(tmp, 16, dest);
        break;
    case AV_PIX_FMT_YVYU422:
        tmp = vec_perm(vd1, vd2, yvyu1);
        vec_st(tmp, 0, dest);
        tmp = vec_perm(vd1, vd2, yvyu2);
        vec_st(tmp, 16, dest);
        break;
    case AV_PIX_FMT_UYVY422:
        tmp = vec_perm(vd1, vd2, uyvy1);
        vec_st(tmp, 0, dest);
        tmp = vec_perm(vd1, vd2, uyvy2);
        vec_st(tmp, 16, dest);
        break;
    }
}
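
/* vd1 packs the 16 luma samples, vd2 the 8 U then 8 V samples, so in the
 * permutes indices 0x0-0xf select luma and 0x10-0x1f chroma: yuyv1 =
 * {0x0, 0x10, 0x1, 0x18, ...} emits Y0 U0 Y1 V0 Y2 U1 Y3 V1 ..., the byte
 * order YUYV422 requires; yvyu and uyvy are the same picks reordered. */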

static av_always_inline void
yuv2422_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
                       const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrUSrc,
                       const int16_t **chrVSrc, int chrFilterSize,
                       const int16_t **alpSrc, uint8_t *dest, int dstW,
                       int y, enum AVPixelFormat target)
{
    int i, j;
    vector int16_t vy1, vy2, vu, vv;
    vector int32_t vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
    vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
    const vector int32_t start = vec_splats(1 << 18);
    const vector uint32_t shift19 = vec_splats(19U);

    for (i = 0; i < lumFilterSize; i++)
        vlumFilter[i] = vec_splats(lumFilter[i]);
    for (i = 0; i < chrFilterSize; i++)
        vchrFilter[i] = vec_splats(chrFilter[i]);

    for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
        vy32[0] =
        vy32[1] =
        vy32[2] =
        vy32[3] =
        vu32[0] =
        vu32[1] =
        vv32[0] =
        vv32[1] = start;

        for (j = 0; j < lumFilterSize; j++) {
            vv = vec_ld(0, &lumSrc[j][i * 2]);
            tmp = vec_mule(vv, vlumFilter[j]);
            tmp2 = vec_mulo(vv, vlumFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vy32[0] = vec_adds(vy32[0], tmp3);
            vy32[1] = vec_adds(vy32[1], tmp4);

            vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
            tmp = vec_mule(vv, vlumFilter[j]);
            tmp2 = vec_mulo(vv, vlumFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vy32[2] = vec_adds(vy32[2], tmp3);
            vy32[3] = vec_adds(vy32[3], tmp4);
        }

        for (j = 0; j < chrFilterSize; j++) {
            vv = vec_ld(0, &chrUSrc[j][i]);
            tmp = vec_mule(vv, vchrFilter[j]);
            tmp2 = vec_mulo(vv, vchrFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vu32[0] = vec_adds(vu32[0], tmp3);
            vu32[1] = vec_adds(vu32[1], tmp4);

            vv = vec_ld(0, &chrVSrc[j][i]);
            tmp = vec_mule(vv, vchrFilter[j]);
            tmp2 = vec_mulo(vv, vchrFilter[j]);
            tmp3 = vec_mergeh(tmp, tmp2);
            tmp4 = vec_mergel(tmp, tmp2);

            vv32[0] = vec_adds(vv32[0], tmp3);
            vv32[1] = vec_adds(vv32[1], tmp4);
        }

        for (j = 0; j < 4; j++) {
            vy32[j] = vec_sra(vy32[j], shift19);
        }
        for (j = 0; j < 2; j++) {
            vu32[j] = vec_sra(vu32[j], shift19);
            vv32[j] = vec_sra(vv32[j], shift19);
        }

        vy1 = vec_packs(vy32[0], vy32[1]);
        vy2 = vec_packs(vy32[2], vy32[3]);
        vu = vec_packs(vu32[0], vu32[1]);
        vv = vec_packs(vv32[0], vv32[1]);

        write422(vy1, vy2, vu, vv, &dest[i * 4], target);
    }
}

#define SETUP(x, buf0, buf1, alpha) { \
    x = vec_ld(0, buf0); \
    tmp = vec_mule(x, alpha); \
    tmp2 = vec_mulo(x, alpha); \
    tmp3 = vec_mergeh(tmp, tmp2); \
    tmp4 = vec_mergel(tmp, tmp2); \
\
    x = vec_ld(0, buf1); \
    tmp = vec_mule(x, alpha); \
    tmp2 = vec_mulo(x, alpha); \
    tmp5 = vec_mergeh(tmp, tmp2); \
    tmp6 = vec_mergel(tmp, tmp2); \
\
    tmp3 = vec_add(tmp3, tmp5); \
    tmp4 = vec_add(tmp4, tmp6); \
\
    tmp3 = vec_sra(tmp3, shift19); \
    tmp4 = vec_sra(tmp4, shift19); \
    x = vec_packs(tmp3, tmp4); \
}
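
/* Unlike the 5-argument SETUP above, this variant weights both rows with
 * the single alpha passed in, i.e. x = ((buf0[i] + buf1[i]) * alpha) >> 19,
 * and folds the shift and the signed pack back to 16 bits into the macro,
 * so x returns as eight ready-to-pack samples. */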

static av_always_inline void
yuv2422_2_vsx_template(SwsContext *c, const int16_t *buf[2],
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf[2], uint8_t *dest, int dstW,
                       int yalpha, int uvalpha, int y,
                       enum AVPixelFormat target)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    const int16_t yalpha1 = 4096 - yalpha;
    const int16_t uvalpha1 = 4096 - uvalpha;
    vector int16_t vy1, vy2, vu, vv;
    vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
    const vector int16_t vyalpha1 = vec_splats(yalpha1);
    const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
    const vector uint32_t shift19 = vec_splats(19U);
    int i;
    av_assert2(yalpha  <= 4096U);
    av_assert2(uvalpha <= 4096U);

    for (i = 0; i < ((dstW + 1) >> 1); i += 8) {

        SETUP(vy1, &buf0[i * 2], &buf1[i * 2], vyalpha1)
        SETUP(vy2, &buf0[(i + 4) * 2], &buf1[(i + 4) * 2], vyalpha1)
        SETUP(vu, &ubuf0[i], &ubuf1[i], vuvalpha1)
        SETUP(vv, &vbuf0[i], &vbuf1[i], vuvalpha1)

        write422(vy1, vy2, vu, vv, &dest[i * 4], target);
    }
}

#undef SETUP

static av_always_inline void
yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf0, uint8_t *dest, int dstW,
                       int uvalpha, int y, enum AVPixelFormat target)
{
    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    vector int16_t vy1, vy2, vu, vv, tmp;
    const vector int16_t add64 = vec_splats((int16_t) 64);
    const vector int16_t add128 = vec_splats((int16_t) 128);
    const vector uint16_t shift7 = vec_splat_u16(7);
    const vector uint16_t shift8 = vec_splat_u16(8);
    int i;

    if (uvalpha < 2048) {
        for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
            vy1 = vec_ld(0, &buf0[i * 2]);
            vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
            vu = vec_ld(0, &ubuf0[i]);
            vv = vec_ld(0, &vbuf0[i]);

            vy1 = vec_add(vy1, add64);
            vy2 = vec_add(vy2, add64);
            vu = vec_add(vu, add64);
            vv = vec_add(vv, add64);

            vy1 = vec_sra(vy1, shift7);
            vy2 = vec_sra(vy2, shift7);
            vu = vec_sra(vu, shift7);
            vv = vec_sra(vv, shift7);

            write422(vy1, vy2, vu, vv, &dest[i * 4], target);
        }
    } else {
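        /* Two chroma rows available: average them with rounding,
         * U = (ubuf0[i] + ubuf1[i] + 128) >> 8, instead of the single-row
         * (u + 64) >> 7 rounding used above. */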
1598  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1599  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1600  vy1 = vec_ld(0, &buf0[i * 2]);
1601  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1602  vu = vec_ld(0, &ubuf0[i]);
1603  tmp = vec_ld(0, &ubuf1[i]);
1604  vu = vec_adds(vu, tmp);
1605  vv = vec_ld(0, &vbuf0[i]);
1606  tmp = vec_ld(0, &vbuf1[i]);
1607  vv = vec_adds(vv, tmp);
1608 
1609  vy1 = vec_add(vy1, add64);
1610  vy2 = vec_add(vy2, add64);
1611  vu = vec_adds(vu, add128);
1612  vv = vec_adds(vv, add128);
1613 
1614  vy1 = vec_sra(vy1, shift7);
1615  vy2 = vec_sra(vy2, shift7);
1616  vu = vec_sra(vu, shift8);
1617  vv = vec_sra(vv, shift8);
1618 
1619  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1620  }
1621  }
1622 }
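/* Rounding in the unscaled ("_1") case mirrors the scalar template: with a
 * single row the 15-bit values are reduced as (val + 64) >> 7, and when two
 * chroma rows are averaged (uvalpha >= 2048) as (u0 + u1 + 128) >> 8, with
 * vec_adds keeping the chroma sums saturated rather than wrapping. */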
1623 
1624 #define YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1625 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1626  const int16_t **lumSrc, int lumFilterSize, \
1627  const int16_t *chrFilter, const int16_t **chrUSrc, \
1628  const int16_t **chrVSrc, int chrFilterSize, \
1629  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1630  int y) \
1631 { \
1632  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1633  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1634  alpSrc, dest, dstW, y, fmt); \
1635 }
1636 
1637 #define YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1638 YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1639 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1640  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1641  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1642  int yalpha, int uvalpha, int y) \
1643 { \
1644  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1645  dest, dstW, yalpha, uvalpha, y, fmt); \
1646 }
1647 
1648 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
1649 YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1650 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1651  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1652  const int16_t *abuf0, uint8_t *dest, int dstW, \
1653  int uvalpha, int y) \
1654 { \
1655  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
1656  abuf0, dest, dstW, uvalpha, \
1657  y, fmt); \
1658 }
1659 
1660 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
1661 YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
1662 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
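/* For illustration, YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
 * expands mechanically to three thin entry points,
 *
 *     static void yuv2yuyv422_1_vsx(...) { yuv2422_1_vsx_template(..., AV_PIX_FMT_YUYV422); }
 *     static void yuv2yuyv422_2_vsx(...) { yuv2422_2_vsx_template(..., AV_PIX_FMT_YUYV422); }
 *     static void yuv2yuyv422_X_vsx(...) { yuv2422_X_vsx_template(..., AV_PIX_FMT_YUYV422); }
 *
 * so write422() can resolve the byte order at compile time. */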
1663 
1664 static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
1665  const uint8_t *src, int srcW, int xInc)
1666 {
1667  int i;
1668  unsigned int xpos = 0, xx;
1669  vector uint8_t vin, vin2, vperm;
1670  vector int8_t vmul, valpha;
1671  vector int16_t vtmp, vtmp2, vtmp3, vtmp4;
1672  vector uint16_t vd_l, vd_r, vcoord16[2];
1673  vector uint32_t vcoord[4];
1674  const vector uint32_t vadd = (vector uint32_t) {
1675  0,
1676  xInc * 1,
1677  xInc * 2,
1678  xInc * 3,
1679  };
1680  const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math: wrap-around in the 16-bit lanes is intentional
1681  0,
1682  xInc * 1,
1683  xInc * 2,
1684  xInc * 3,
1685  xInc * 4,
1686  xInc * 5,
1687  xInc * 6,
1688  xInc * 7,
1689  };
1690  const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
1691  const vector uint16_t vshift9 = vec_splat_u16(9);
1692  const vector uint8_t vzero = vec_splat_u8(0);
1693  const vector uint16_t vshift = vec_splat_u16(7);
1694 
1695  for (i = 0; i < dstWidth; i += 16) {
1696  vcoord16[0] = vec_splats((uint16_t) xpos);
1697  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1698 
1699  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1700  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1701 
1702  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1703  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1704  valpha = (vector int8_t) vec_pack(vcoord16[0], vcoord16[1]);
1705 
1706  xx = xpos >> 16;
1707  vin = vec_vsx_ld(0, &src[xx]);
1708 
1709  vcoord[0] = vec_splats(xpos & 0xffff);
1710  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1711  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1712  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1713 
1714  vcoord[0] = vec_add(vcoord[0], vadd);
1715  vcoord[1] = vec_add(vcoord[1], vadd);
1716  vcoord[2] = vec_add(vcoord[2], vadd);
1717  vcoord[3] = vec_add(vcoord[3], vadd);
1718 
1719  vcoord[0] = vec_sr(vcoord[0], vshift16);
1720  vcoord[1] = vec_sr(vcoord[1], vshift16);
1721  vcoord[2] = vec_sr(vcoord[2], vshift16);
1722  vcoord[3] = vec_sr(vcoord[3], vshift16);
1723 
1724  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1725  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1726  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1727 
1728  vin = vec_perm(vin, vin, vperm);
1729 
1730  vin2 = vec_vsx_ld(1, &src[xx]);
1731  vin2 = vec_perm(vin2, vin2, vperm);
1732 
1733  vmul = (vector int8_t) vec_sub(vin2, vin);
1734  vtmp = vec_mule(vmul, valpha);
1735  vtmp2 = vec_mulo(vmul, valpha);
1736  vtmp3 = vec_mergeh(vtmp, vtmp2);
1737  vtmp4 = vec_mergel(vtmp, vtmp2);
1738 
1739  vd_l = (vector uint16_t) vec_mergeh(vin, vzero);
1740  vd_r = (vector uint16_t) vec_mergel(vin, vzero);
1741  vd_l = vec_sl(vd_l, vshift);
1742  vd_r = vec_sl(vd_r, vshift);
1743 
1744  vd_l = vec_add(vd_l, (vector uint16_t) vtmp3);
1745  vd_r = vec_add(vd_r, (vector uint16_t) vtmp4);
1746 
1747  vec_st((vector int16_t) vd_l, 0, &dst[i]);
1748  vec_st((vector int16_t) vd_r, 0, &dst[i + 8]);
1749 
1750  xpos += xInc * 16;
1751  }
1752  for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
1753  dst[i] = src[srcW - 1] * 128;
1754 }
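/* Rough scalar equivalent of the vector loop above (16.16 fixed-point source
 * position, 7-bit interpolation weight), as in the generic C path:
 *
 *     for (i = 0; i < dstWidth; i++) {
 *         unsigned xx     = xpos >> 16;
 *         unsigned xalpha = (xpos & 0xffff) >> 9;
 *         dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
 *         xpos  += xInc;
 *     }
 *
 * The vperm shuffle gathers the 16 src[xx] bytes into one vector (and, from
 * the load at offset 1, the matching src[xx + 1] bytes); the trailing scalar
 * loop patches pixels whose source position runs past srcW - 1. */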
1755 
1756 #define HCSCALE(in, out) \
1757  vin = vec_vsx_ld(0, &in[xx]); \
1758  vin = vec_perm(vin, vin, vperm); \
1759 \
1760  vin2 = vec_vsx_ld(1, &in[xx]); \
1761  vin2 = vec_perm(vin2, vin2, vperm); \
1762 \
1763  vtmp = vec_mule(vin, valphaxor); \
1764  vtmp2 = vec_mulo(vin, valphaxor); \
1765  vtmp3 = vec_mergeh(vtmp, vtmp2); \
1766  vtmp4 = vec_mergel(vtmp, vtmp2); \
1767 \
1768  vtmp = vec_mule(vin2, valpha); \
1769  vtmp2 = vec_mulo(vin2, valpha); \
1770  vd_l = vec_mergeh(vtmp, vtmp2); \
1771  vd_r = vec_mergel(vtmp, vtmp2); \
1772 \
1773  vd_l = vec_add(vd_l, vtmp3); \
1774  vd_r = vec_add(vd_r, vtmp4); \
1775 \
1776  vec_st((vector int16_t) vd_l, 0, &out[i]); \
1777  vec_st((vector int16_t) vd_r, 0, &out[i + 8])
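/* Note on the HCSCALE weights: valpha holds 7-bit values, so
 * valphaxor = valpha ^ 127 equals 127 - valpha, and each output is
 * in[xx] * (127 - alpha) + in[xx + 1] * alpha, the same xor trick used by
 * the scalar chroma path to avoid a signed difference. */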
1778 
1779 static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
1780  int dstWidth, const uint8_t *src1,
1781  const uint8_t *src2, int srcW, int xInc)
1782 {
1783  int i;
1784  unsigned int xpos = 0, xx;
1785  vector uint8_t vin, vin2, vperm;
1786  vector uint8_t valpha, valphaxor;
1787  vector uint16_t vtmp, vtmp2, vtmp3, vtmp4;
1788  vector uint16_t vd_l, vd_r, vcoord16[2];
1789  vector uint32_t vcoord[4];
1790  const vector uint8_t vxor = vec_splats((uint8_t) 127);
1791  const vector uint32_t vadd = (vector uint32_t) {
1792  0,
1793  xInc * 1,
1794  xInc * 2,
1795  xInc * 3,
1796  };
1797  const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math: wrap-around in the 16-bit lanes is intentional
1798  0,
1799  xInc * 1,
1800  xInc * 2,
1801  xInc * 3,
1802  xInc * 4,
1803  xInc * 5,
1804  xInc * 6,
1805  xInc * 7,
1806  };
1807  const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
1808  const vector uint16_t vshift9 = vec_splat_u16(9);
1809 
1810  for (i = 0; i < dstWidth; i += 16) {
1811  vcoord16[0] = vec_splats((uint16_t) xpos);
1812  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1813 
1814  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1815  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1816 
1817  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1818  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1819  valpha = vec_pack(vcoord16[0], vcoord16[1]);
1820  valphaxor = vec_xor(valpha, vxor);
1821 
1822  xx = xpos >> 16;
1823 
1824  vcoord[0] = vec_splats(xpos & 0xffff);
1825  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1826  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1827  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1828 
1829  vcoord[0] = vec_add(vcoord[0], vadd);
1830  vcoord[1] = vec_add(vcoord[1], vadd);
1831  vcoord[2] = vec_add(vcoord[2], vadd);
1832  vcoord[3] = vec_add(vcoord[3], vadd);
1833 
1834  vcoord[0] = vec_sr(vcoord[0], vshift16);
1835  vcoord[1] = vec_sr(vcoord[1], vshift16);
1836  vcoord[2] = vec_sr(vcoord[2], vshift16);
1837  vcoord[3] = vec_sr(vcoord[3], vshift16);
1838 
1839  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1840  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1841  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1842 
1843  HCSCALE(src1, dst1);
1844  HCSCALE(src2, dst2);
1845 
1846  xpos += xInc * 16;
1847  }
1848  for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
1849  dst1[i] = src1[srcW - 1] * 128;
1850  dst2[i] = src2[srcW - 1] * 128;
1851  }
1852 }
1853 
1854 #undef HCSCALE
1855 
1856 static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1857  const uint8_t *src, const int16_t *filter,
1858  const int32_t *filterPos, int filterSize)
1859 {
1860  int i, j;
1861  int32_t *dst = (int32_t *) _dst;
1862  vector int16_t vfilter, vin;
1863  vector uint8_t vin8;
1864  vector int32_t vout;
1865  const vector uint8_t vzero = vec_splat_u8(0);
1866  const vector uint8_t vunusedtab[8] = {
1867  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1868  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1869  (vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1870  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1871  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1872  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1873  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1874  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1875  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1876  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1877  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1878  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1879  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1880  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1881  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1882  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1883  };
1884  const vector uint8_t vunused = vunusedtab[filterSize % 8];
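 /* vunusedtab[k] keeps the first k int16 elements (all 16 bytes for k == 0)
  * and selects the remainder from the second vec_perm operand, which is
  * always zero here, so a partial final round of the filter loop adds
  * nothing spurious to the accumulator. */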
1885 
1886  if (filterSize == 1) {
1887  for (i = 0; i < dstW; i++) {
1888  int srcPos = filterPos[i];
1889  int val = 0;
1890  for (j = 0; j < filterSize; j++) {
1891  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
1892  }
1893  dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic filter's coefficients can overflow, hence the clip
1894  }
1895  } else {
1896  for (i = 0; i < dstW; i++) {
1897  const int srcPos = filterPos[i];
1898  vout = vec_splat_s32(0);
1899  for (j = 0; j < filterSize; j += 8) {
1900  vin8 = vec_vsx_ld(0, &src[srcPos + j]);
1901  vin = (vector int16_t) vec_mergeh(vin8, vzero);
1902  if (j + 8 > filterSize) // Remove the unused elements on the last round
1903  vin = vec_perm(vin, (vector int16_t) vzero, vunused);
1904 
1905  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1906  vout = vec_msums(vin, vfilter, vout);
1907  }
1908  vout = vec_sums(vout, (vector int32_t) vzero);
1909  dst[i] = FFMIN(vout[3] >> 3, (1 << 19) - 1);
1910  }
1911  }
1912 }
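/* Each vec_msums step folds two int16 products into every int32 lane (with
 * saturation), and vec_sums then sums the four lanes into element 3, so per
 * output this computes, roughly,
 *
 *     val = sum over j of src[srcPos + j] * filter[filterSize * i + j];
 *     dst[i] = FFMIN(val >> 3, (1 << 19) - 1);
 *
 * the same clipped result as the filterSize == 1 path. */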
1913 
1914 static void hScale16To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1915  const uint8_t *_src, const int16_t *filter,
1916  const int32_t *filterPos, int filterSize)
1917 {
1918  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
1919  int i, j;
1920  int32_t *dst = (int32_t *) _dst;
1921  const uint16_t *src = (const uint16_t *) _src;
1922  int bits = desc->comp[0].depth - 1;
1923  int sh = bits - 4;
1924  vector int16_t vfilter, vin;
1925  vector int32_t vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
1926  const vector uint8_t vzero = vec_splat_u8(0);
1927  const vector uint8_t vunusedtab[8] = {
1928  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1929  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1930  (vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1931  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1932  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1933  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1934  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1935  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1936  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1937  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1938  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1939  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1940  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1941  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1942  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1943  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1944  };
1945  const vector uint8_t vunused = vunusedtab[filterSize % 8];
1946 
1947  if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8) && desc->comp[0].depth < 16) {
1948  sh = 9;
1949  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
1950  sh = 16 - 1 - 4;
1951  }
1952 
1953  if (filterSize == 1) {
1954  for (i = 0; i < dstW; i++) {
1955  int srcPos = filterPos[i];
1956  int val = 0;
1957 
1958  for (j = 0; j < filterSize; j++) {
1959  val += src[srcPos + j] * filter[filterSize * i + j];
1960  }
1961  // filter=14 bit, input=16 bit, output=30 bit, so >> sh (11 for 16 bpc input) makes 19 bit
1962  dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1963  }
1964  } else {
1965  for (i = 0; i < dstW; i++) {
1966  const int srcPos = filterPos[i];
1967  vout = vec_splat_s32(0);
1968  for (j = 0; j < filterSize; j += 8) {
1969  vin = (vector int16_t) vec_vsx_ld(0, &src[srcPos + j]);
1970  if (j + 8 > filterSize) // Remove the unused elements on the last round
1971  vin = vec_perm(vin, (vector int16_t) vzero, vunused);
1972 
1973  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1974  vfilter32_l = vec_unpackh(vfilter);
1975  vfilter32_r = vec_unpackl(vfilter);
1976 
1977  vtmp = (vector int32_t) vec_mergeh(vin, (vector int16_t) vzero);
1978  vtmp2 = (vector int32_t) vec_mergel(vin, (vector int16_t) vzero);
1979 
1980  vtmp = vec_mul(vtmp, vfilter32_l);
1981  vtmp2 = vec_mul(vtmp2, vfilter32_r);
1982 
1983  vout = vec_adds(vout, vtmp);
1984  vout = vec_adds(vout, vtmp2);
1985  }
1986  vout = vec_sums(vout, (vector int32_t) vzero);
1987  dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1);
1988  }
1989  }
1990 }
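/* The 16-bit samples are unsigned, so the vec_msums path used for 8-bit
 * input would misread values >= 0x8000; instead the samples are
 * zero-extended (merging with zero is a zero-extend on little-endian), the
 * taps sign-extended with vec_unpackh/vec_unpackl, and everything multiplied
 * in 32 bits. The 32-bit vec_mul needs ISA 2.07, which is why the init code
 * below gates these functions on AV_CPU_FLAG_POWER8. */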
1991 
1992 static void hScale16To15_vsx(SwsContext *c, int16_t *dst, int dstW,
1993  const uint8_t *_src, const int16_t *filter,
1994  const int32_t *filterPos, int filterSize)
1995 {
1996  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
1997  int i, j;
1998  const uint16_t *src = (const uint16_t *) _src;
1999  int sh = desc->comp[0].depth - 1;
2000  vector int16_t vfilter, vin;
2001  vector int32_t vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
2002  const vector uint8_t vzero = vec_splat_u8(0);
2003  const vector uint8_t vunusedtab[8] = {
2004  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2005  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
2006  (vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
2007  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2008  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
2009  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2010  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
2011  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2012  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2013  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2014  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2015  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2016  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2017  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
2018  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2019  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
2020  };
2021  const vector uint8_t vunused = vunusedtab[filterSize % 8];
2022 
2023  if (sh < 15) {
2024  sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
2025  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
2026  sh = 16 - 1;
2027  }
2028 
2029  if (filterSize == 1) {
2030  for (i = 0; i < dstW; i++) {
2031  int srcPos = filterPos[i];
2032  int val = 0;
2033 
2034  for (j = 0; j < filterSize; j++) {
2035  val += src[srcPos + j] * filter[filterSize * i + j];
2036  }
2037  // filter=14 bit, input=16 bit, output=30 bit, so >> sh (15 for 16 bpc input) makes 15 bit
2038  dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2039  }
2040  } else {
2041  for (i = 0; i < dstW; i++) {
2042  const int srcPos = filterPos[i];
2043  vout = vec_splat_s32(0);
2044  for (j = 0; j < filterSize; j += 8) {
2045  vin = (vector int16_t) vec_vsx_ld(0, &src[srcPos + j]);
2046  if (j + 8 > filterSize) // Remove the unused elements on the last round
2047  vin = vec_perm(vin, (vector int16_t) vzero, vunused);
2048 
2049  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
2050  vfilter32_l = vec_unpackh(vfilter);
2051  vfilter32_r = vec_unpackl(vfilter);
2052 
2053  vtmp = (vector int32_t) vec_mergeh(vin, (vector int16_t) vzero);
2054  vtmp2 = (vector int32_t) vec_mergel(vin, (vector int16_t) vzero);
2055 
2056  vtmp = vec_mul(vtmp, vfilter32_l);
2057  vtmp2 = vec_mul(vtmp2, vfilter32_r);
2058 
2059  vout = vec_adds(vout, vtmp);
2060  vout = vec_adds(vout, vtmp2);
2061  }
2062  vout = vec_sums(vout, (vector int32_t) vzero);
2063  dst[i] = FFMIN(vout[3] >> sh, (1 << 15) - 1);
2064  }
2065  }
2066 }
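/* Same structure as hScale16To19_vsx; only the normalization differs: the
 * result is shifted down by sh and clipped to 15 bits instead of 19. */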
2067 
2068 #endif /* !HAVE_BIGENDIAN */
2069 
2070 #endif /* HAVE_VSX */
2071 
2072 av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
2073 {
2074 #if HAVE_VSX
2075  enum AVPixelFormat dstFormat = c->dstFormat;
2076  const int cpu_flags = av_get_cpu_flags();
2077  const unsigned char power8 = HAVE_POWER8 && (cpu_flags & AV_CPU_FLAG_POWER8);
2078 
2079  if (!(cpu_flags & AV_CPU_FLAG_VSX))
2080  return;
2081 
2082 #if !HAVE_BIGENDIAN
2083  if (c->srcBpc == 8) {
2084  if (c->dstBpc <= 14) {
2085  c->hyScale = c->hcScale = hScale_real_vsx;
2086  if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
2087  c->hyscale_fast = hyscale_fast_vsx;
2088  c->hcscale_fast = hcscale_fast_vsx;
2089  }
2090  } else {
2091  c->hyScale = c->hcScale = hScale8To19_vsx;
2092  }
2093  } else {
2094  if (power8) {
2095  c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_vsx
2096  : hScale16To15_vsx;
2097  }
2098  }
2099  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
2100  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
2101  !c->needAlpha) {
2102  c->yuv2planeX = yuv2planeX_vsx;
2103  }
2104 #endif
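 /* yuv2planeX_vsx (from swscale_ppc_template.c) only handles plain 8-bit
  * planar output, hence the guards above; for the excluded formats the
  * generic implementations installed before this init remain in place. */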
2105 
2106  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
2107  switch (c->dstBpc) {
2108  case 8:
2109  c->yuv2plane1 = yuv2plane1_8_vsx;
2110  break;
2111 #if !HAVE_BIGENDIAN
2112  case 9:
2113  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx : yuv2plane1_9LE_vsx;
2114  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx : yuv2planeX_9LE_vsx;
2115  break;
2116  case 10:
2117  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx;
2118  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx : yuv2planeX_10LE_vsx;
2119  break;
2120  case 12:
2121  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx;
2122  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx : yuv2planeX_12LE_vsx;
2123  break;
2124  case 14:
2125  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx;
2126  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx : yuv2planeX_14LE_vsx;
2127  break;
2128  case 16:
2129  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx;
2130 #if HAVE_POWER8
2131  if (cpu_flags & AV_CPU_FLAG_POWER8) {
2132  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : yuv2planeX_16LE_vsx;
2133  }
2134 #endif /* HAVE_POWER8 */
2135  break;
2136 #endif /* !HAVE_BIGENDIAN */
2137  }
2138  }
2139 
2140  if (c->flags & SWS_BITEXACT)
2141  return;
2142 
2143 #if !HAVE_BIGENDIAN
2144  if (c->flags & SWS_FULL_CHR_H_INT) {
2145  switch (dstFormat) {
2146  case AV_PIX_FMT_RGB24:
2147  if (power8) {
2148  c->yuv2packed1 = yuv2rgb24_full_1_vsx;
2149  c->yuv2packed2 = yuv2rgb24_full_2_vsx;
2150  c->yuv2packedX = yuv2rgb24_full_X_vsx;
2151  }
2152  break;
2153  case AV_PIX_FMT_BGR24:
2154  if (power8) {
2155  c->yuv2packed1 = yuv2bgr24_full_1_vsx;
2156  c->yuv2packed2 = yuv2bgr24_full_2_vsx;
2157  c->yuv2packedX = yuv2bgr24_full_X_vsx;
2158  }
2159  break;
2160  case AV_PIX_FMT_BGRA:
2161  if (power8) {
2162  if (!c->needAlpha) {
2163  c->yuv2packed1 = yuv2bgrx32_full_1_vsx;
2164  c->yuv2packed2 = yuv2bgrx32_full_2_vsx;
2165  c->yuv2packedX = yuv2bgrx32_full_X_vsx;
2166  }
2167  }
2168  break;
2169  case AV_PIX_FMT_RGBA:
2170  if (power8) {
2171  if (!c->needAlpha) {
2172  c->yuv2packed1 = yuv2rgbx32_full_1_vsx;
2173  c->yuv2packed2 = yuv2rgbx32_full_2_vsx;
2174  c->yuv2packedX = yuv2rgbx32_full_X_vsx;
2175  }
2176  }
2177  break;
2178  case AV_PIX_FMT_ARGB:
2179  if (power8) {
2180  if (!c->needAlpha) {
2181  c->yuv2packed1 = yuv2xrgb32_full_1_vsx;
2182  c->yuv2packed2 = yuv2xrgb32_full_2_vsx;
2183  c->yuv2packedX = yuv2xrgb32_full_X_vsx;
2184  }
2185  }
2186  break;
2187  case AV_PIX_FMT_ABGR:
2188  if (power8) {
2189  if (!c->needAlpha) {
2190  c->yuv2packed1 = yuv2xbgr32_full_1_vsx;
2191  c->yuv2packed2 = yuv2xbgr32_full_2_vsx;
2192  c->yuv2packedX = yuv2xbgr32_full_X_vsx;
2193  }
2194  }
2195  break;
2196  }
2197  } else { /* !SWS_FULL_CHR_H_INT */
2198  switch (dstFormat) {
2199  case AV_PIX_FMT_YUYV422:
2200  c->yuv2packed1 = yuv2yuyv422_1_vsx;
2201  c->yuv2packed2 = yuv2yuyv422_2_vsx;
2202  c->yuv2packedX = yuv2yuyv422_X_vsx;
2203  break;
2204  case AV_PIX_FMT_YVYU422:
2205  c->yuv2packed1 = yuv2yvyu422_1_vsx;
2206  c->yuv2packed2 = yuv2yvyu422_2_vsx;
2207  c->yuv2packedX = yuv2yvyu422_X_vsx;
2208  break;
2209  case AV_PIX_FMT_UYVY422:
2210  c->yuv2packed1 = yuv2uyvy422_1_vsx;
2211  c->yuv2packed2 = yuv2uyvy422_2_vsx;
2212  c->yuv2packedX = yuv2uyvy422_X_vsx;
2213  break;
2214  case AV_PIX_FMT_BGRA:
2215  if (power8) {
2216  if (!c->needAlpha) {
2217  c->yuv2packed1 = yuv2bgrx32_1_vsx;
2218  c->yuv2packed2 = yuv2bgrx32_2_vsx;
2219  }
2220  }
2221  break;
2222  case AV_PIX_FMT_RGBA:
2223  if (power8) {
2224  if (!c->needAlpha) {
2225  c->yuv2packed1 = yuv2rgbx32_1_vsx;
2226  c->yuv2packed2 = yuv2rgbx32_2_vsx;
2227  }
2228  }
2229  break;
2230  case AV_PIX_FMT_ARGB:
2231  if (power8) {
2232  if (!c->needAlpha) {
2233  c->yuv2packed1 = yuv2xrgb32_1_vsx;
2234  c->yuv2packed2 = yuv2xrgb32_2_vsx;
2235  }
2236  }
2237  break;
2238  case AV_PIX_FMT_ABGR:
2239  if (power8) {
2240  if (!c->needAlpha) {
2241  c->yuv2packed1 = yuv2xbgr32_1_vsx;
2242  c->yuv2packed2 = yuv2xbgr32_2_vsx;
2243  }
2244  }
2245  break;
2246  case AV_PIX_FMT_RGB24:
2247  if (power8) {
2248  c->yuv2packed1 = yuv2rgb24_1_vsx;
2249  c->yuv2packed2 = yuv2rgb24_2_vsx;
2250  }
2251  break;
2252  case AV_PIX_FMT_BGR24:
2253  if (power8) {
2254  c->yuv2packed1 = yuv2bgr24_1_vsx;
2255  c->yuv2packed2 = yuv2bgr24_2_vsx;
2256  }
2257  break;
2258  }
2259  }
2260 #endif /* !HAVE_BIGENDIAN */
2261 
2262 #endif /* HAVE_VSX */
2263 }