105 #undef PROFILE_THE_BEAST
/* Short aliases for the unsigned and signed char types; ubyte is the element
 * type of the raw plane pointers used further down in this file. */
108 typedef unsigned char ubyte;
109 typedef signed char sbyte;
/* Permute-control vectors used by vec_merge3() as the selector argument of
 * vec_perm(). Per the AltiVec definition of vec_perm(), indices 0x00-0x0f
 * pick a byte from the first operand and 0x10-0x1f pick one from the second.
 *
 * perm_rgb_0 and the first half of perm_rgb_1 follow a repeating "two bytes
 * from operand one, one byte from operand two" pattern; the second half of
 * perm_rgb_1 passes the last eight bytes of operand two through unchanged.
 * perm_rgb_2 begins by copying the first eight bytes of operand two and then
 * resumes the two-plus-one pattern, which perm_rgb_3 completes. */
146 static const vector
unsigned char
147 perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
148 0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
149 perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
150 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
151 perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
152 0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
153 perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
154 0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
/* Interleave three vectors into three output vectors. x0 and x1 are merged
 * pairwise first (vec_mergeh / vec_mergel), then x2 is woven in through the
 * perm_rgb_* selectors, so y0, y1 and y2 hold, in order, groups built from
 * one element each of x0, x1 and x2.
 *
 * Note the parameter order (x2, x1, x0). Each input is expanded more than
 * once, so pass expressions without side effects. y0, y1 and y2 must be
 * assignable. */
156 #define vec_merge3(x2, x1, x0, y0, y1, y2) \
158 __typeof__(x0) o0, o2, o3; \
159 o0 = vec_mergeh(x0, x1); \
160 y0 = vec_perm(o0, x2, perm_rgb_0); \
161 o2 = vec_perm(o0, x2, perm_rgb_1); \
162 o3 = vec_mergel(x0, x1); \
163 y1 = vec_perm(o3, o2, perm_rgb_2); \
164 y2 = vec_perm(o3, o2, perm_rgb_3); \
/* Interleave x0, x1 and x2 via vec_merge3(x0, x1, x2, ...) and store the three
 * resulting vectors through ptr. Because vec_merge3() names its inputs in
 * the order (x2, x1, x0), each stored group is ordered x2, x1, x0.
 *
 * ptr is post-incremented three times, so it must be a modifiable lvalue and
 * ends up pointing past the data written. */
167 #define vec_mstbgr24(x0, x1, x2, ptr) \
169 __typeof__(x0) _0, _1, _2; \
170 vec_merge3(x0, x1, x2, _0, _1, _2); \
171 vec_st(_0, 0, ptr++); \
172 vec_st(_1, 0, ptr++); \
173 vec_st(_2, 0, ptr++); \
/* Interleave x0, x1 and x2 via vec_merge3(x2, x1, x0, ...) and store the three
 * resulting vectors through ptr. The inputs are handed to vec_merge3() in
 * reversed order, so each stored group is ordered x0, x1, x2.
 *
 * ptr is post-incremented three times, so it must be a modifiable lvalue and
 * ends up pointing past the data written. */
176 #define vec_mstrgb24(x0, x1, x2, ptr) \
178 __typeof__(x0) _0, _1, _2; \
179 vec_merge3(x2, x1, x0, _0, _1, _2); \
180 vec_st(_0, 0, ptr++); \
181 vec_st(_1, 0, ptr++); \
182 vec_st(_2, 0, ptr++); \
/* Interleave four vectors x0..x3 into groups ordered (x0, x1, x2, x3) and store
 * the four resulting vectors of type T at byte offsets 0, 16, 32 and 48 from
 * ptr.
 *
 * x0/x1 and x2/x3 are first merged element-wise; the two intermediate results
 * are then merged again as vectors of unsigned short, which places one
 * x0/x1 pair next to the matching x2/x3 pair. The first two stores come from
 * the vec_mergeh() halves of the inputs, the last two from the vec_mergel()
 * halves. ptr is cast to (T *) for every store. */
189 #define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
192 _0 = vec_mergeh(x0, x1); \
193 _1 = vec_mergeh(x2, x3); \
194 _2 = (T) vec_mergeh((vector unsigned short) _0, \
195 (vector unsigned short) _1); \
196 _3 = (T) vec_mergel((vector unsigned short) _0, \
197 (vector unsigned short) _1); \
198 vec_st(_2, 0 * 16, (T *) ptr); \
199 vec_st(_3, 1 * 16, (T *) ptr); \
200 _0 = vec_mergel(x0, x1); \
201 _1 = vec_mergel(x2, x3); \
202 _2 = (T) vec_mergeh((vector unsigned short) _0, \
203 (vector unsigned short) _1); \
204 _3 = (T) vec_mergel((vector unsigned short) _0, \
205 (vector unsigned short) _1); \
206 vec_st(_2, 2 * 16, (T *) ptr); \
207 vec_st(_3, 3 * 16, (T *) ptr); \
226 (vector signed short) \
227 vec_perm(x, (__typeof__(x)) { 0 }, \
228 ((vector unsigned char) { \
229 0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
230 0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
233 (vector signed short) \
234 vec_perm(x, (__typeof__(x)) { 0 }, \
235 ((vector unsigned char) { \
236 0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
237 0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
/* vec_unh(x) / vec_unl(x): interleave the high (vec_mergeh) or low (vec_mergel)
 * half of x with an all-zero vector of the same type and reinterpret the
 * result as vector signed short. */
239 #define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 })
240 #define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 })
/* Clamp every element of a vector signed short to the range [16, 235]. */
243 #define vec_clip_s16(x) \
244 vec_max(vec_min(x, ((vector signed short) { \
245 235, 235, 235, 235, 235, 235, 235, 235 })), \
246 ((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
/* Pack two vectors of signed 16-bit values into one vector unsigned char.
 * Negative elements are first raised to zero with vec_max(); the results are
 * then reinterpreted as unsigned short and narrowed with vec_packs(), which
 * per the AltiVec definition saturates on overflow. x supplies the first
 * operand of the pack and y the second. */
248 #define vec_packclp(x, y) \
249 (vector unsigned char) \
250 vec_packs((vector unsigned short) \
251 vec_max(x, ((vector signed short) { 0 })), \
252 (vector unsigned short) \
253 vec_max(y, ((vector signed short) { 0 })))
/**
 * Convert vectors of Y, U and V samples (signed 16-bit elements) to R, G and
 * B using the coefficient vectors stored in the context.
 *
 * @param c        context supplying CY, OY, CSHIFT, CBU, CRV, CGU and CGV
 * @param Y, U, V  input samples; passed by value, so the caller's vectors
 *                 are left untouched
 * @param R, G, B  receive the converted channels as signed 16-bit elements;
 *                 narrowing to bytes is left to the caller (the callers in
 *                 this file use vec_packclp())
 */
255 static inline void cvtyuvtoRGB(
SwsInternal *
c, vector
signed short Y,
256 vector
signed short U, vector
signed short V,
257 vector
signed short *
R, vector
signed short *
G,
258 vector
signed short *
B)
260 vector
signed short vx, ux, uvx;
/* Scale luma by CY and add the OY offset in one multiply-round-add. */
262 Y = vec_mradds(
Y,
c->CY,
c->OY);
/* Remove the 128 bias from both chroma vectors. */
263 U = vec_sub(
U, (vector
signed short)
264 vec_splat((vector
signed short) { 128 }, 0));
265 V = vec_sub(
V, (vector
signed short)
266 vec_splat((vector
signed short) { 128 }, 0));
/* Blue: U pre-shifted left by CSHIFT, multiplied by CBU, added to luma. */
269 ux = vec_sl(
U,
c->CSHIFT);
270 *
B = vec_mradds(ux,
c->CBU,
Y);
/* Red: V pre-shifted left by CSHIFT, multiplied by CRV, added to luma. */
273 vx = vec_sl(
V,
c->CSHIFT);
274 *
R = vec_mradds(vx,
c->CRV,
Y);
/* Green: the U term (CGU) is accumulated onto luma first, then the V term
 * (CGV) is accumulated onto that; neither input is pre-shifted here. */
277 uvx = vec_mradds(
U,
c->CGU,
Y);
278 *
G = vec_mradds(
V,
c->CGV, uvx);
/**
 * Load one vector from the possibly unaligned address addr + offset.
 *
 * Two consecutive vector elements are read through a vector pointer formed
 * from addr + offset and combined with the permute mask returned by
 * vec_lvsl(offset, addr). The second element (v_addr[1]) is read
 * unconditionally, so memory past the requested vector is accessed as well.
 *
 * @param offset  byte offset added to addr
 * @param addr    base address of the data
 * @return        the combined vector unsigned char
 */
288 static inline vector
unsigned char vec_xl(
signed long long offset,
const ubyte *addr)
290 const vector
unsigned char *v_addr = (
const vector
unsigned char *) (addr +
offset);
291 vector
unsigned char align_perm = vec_lvsl(
offset, addr);
293 return (vector
unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
/* Generate a static converter named altivec_<name> that reads three input
 * planes (in[0] as luma through y1i / y2i, in[1] and in[2] as chroma through
 * ui / vi) and writes packed pixels to oplanes[0] using the out_pixels macro.
 *
 * What the macro body shows:
 *  - The conversion coefficients are copied from the context into locals
 *    (lCY, lOY, lCRV, lCBU, lCGU, lCGV, lCSHIFT) before the loops.
 *  - Rows are processed in pairs: the outer loop runs h / 2 times, y2i starts
 *    one input stride after y1i, and outo is placed outstrides[0] >> 4 vector
 *    elements after oute.
 *  - The inner loop runs w / 16 times; a remainder of w % 16 is not handled
 *    by this loop.
 *  - vec_dstst() is issued on both output rows before the inner loop; per the
 *    AltiVec documentation this is a store-prefetch hint.
 *  - The chroma terms (ux*, vx*, uvx*) are computed once per iteration, each
 *    value is duplicated with vec_mergeh / vec_mergel of the vector with
 *    itself, and the same terms are added to the luma of both rows (Y0 / Y1
 *    for oute, Y2 / Y3 for outo).
 *  - Results are narrowed with vec_packclp() and handed to out_pixels as
 *    (R, G, B, row pointer).
 *  - The input pointers are advanced by instrides_scl[], which holds each
 *    stride minus the bytes consumed along a row (two luma strides minus w
 *    for luma, one chroma stride minus w / 2 for chroma); presumably this
 *    happens once per row pair -- confirm against the loop structure.
 *
 * Parameters of the generated function, as used here: c supplies the
 * coefficients, in / instrides describe the input planes, srcSliceY selects
 * the first output row, and oplanes / outstrides describe the output. */
297 #define DEFCSP420_CVT(name, out_pixels) \
298 static int altivec_ ## name(SwsInternal *c, const unsigned char *const *in, \
299 const int *instrides, int srcSliceY, int srcSliceH, \
300 unsigned char *const *oplanes, const int *outstrides) \
305 int instrides_scl[3]; \
306 vector unsigned char y0, y1; \
308 vector signed char u, v; \
310 vector signed short Y0, Y1, Y2, Y3; \
311 vector signed short U, V; \
312 vector signed short vx, ux, uvx; \
313 vector signed short vx0, ux0, uvx0; \
314 vector signed short vx1, ux1, uvx1; \
315 vector signed short R0, G0, B0; \
316 vector signed short R1, G1, B1; \
317 vector unsigned char R, G, B; \
319 vector signed short lCY = c->CY; \
320 vector signed short lOY = c->OY; \
321 vector signed short lCRV = c->CRV; \
322 vector signed short lCBU = c->CBU; \
323 vector signed short lCGU = c->CGU; \
324 vector signed short lCGV = c->CGV; \
325 vector unsigned short lCSHIFT = c->CSHIFT; \
327 const ubyte *y1i = in[0]; \
328 const ubyte *y2i = in[0] + instrides[0]; \
329 const ubyte *ui = in[1]; \
330 const ubyte *vi = in[2]; \
332 vector unsigned char *oute, *outo; \
335 instrides_scl[0] = instrides[0] * 2 - w; \
337 instrides_scl[1] = instrides[1] - w / 2; \
339 instrides_scl[2] = instrides[2] - w / 2; \
341 for (i = 0; i < h / 2; i++) { \
342 oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
343 (srcSliceY + i * 2)); \
344 outo = oute + (outstrides[0] >> 4); \
345 vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
346 vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
348 for (j = 0; j < w / 16; j++) { \
349 y0 = vec_xl(0, y1i); \
351 y1 = vec_xl(0, y2i); \
353 u = (vector signed char) vec_xl(0, ui); \
355 v = (vector signed char) vec_xl(0, vi); \
357 u = (vector signed char) \
359 (vector signed char) \
360 vec_splat((vector signed char) { 128 }, 0)); \
361 v = (vector signed char) \
363 (vector signed char) \
364 vec_splat((vector signed char) { 128 }, 0)); \
366 U = vec_unpackh(u); \
367 V = vec_unpackh(v); \
374 Y0 = vec_mradds(Y0, lCY, lOY); \
375 Y1 = vec_mradds(Y1, lCY, lOY); \
376 Y2 = vec_mradds(Y2, lCY, lOY); \
377 Y3 = vec_mradds(Y3, lCY, lOY); \
380 ux = vec_sl(U, lCSHIFT); \
381 ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
382 ux0 = vec_mergeh(ux, ux); \
383 ux1 = vec_mergel(ux, ux); \
386 vx = vec_sl(V, lCSHIFT); \
387 vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
388 vx0 = vec_mergeh(vx, vx); \
389 vx1 = vec_mergel(vx, vx); \
392 uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
393 uvx = vec_mradds(V, lCGV, uvx); \
394 uvx0 = vec_mergeh(uvx, uvx); \
395 uvx1 = vec_mergel(uvx, uvx); \
397 R0 = vec_add(Y0, vx0); \
398 G0 = vec_add(Y0, uvx0); \
399 B0 = vec_add(Y0, ux0); \
400 R1 = vec_add(Y1, vx1); \
401 G1 = vec_add(Y1, uvx1); \
402 B1 = vec_add(Y1, ux1); \
404 R = vec_packclp(R0, R1); \
405 G = vec_packclp(G0, G1); \
406 B = vec_packclp(B0, B1); \
408 out_pixels(R, G, B, oute); \
410 R0 = vec_add(Y2, vx0); \
411 G0 = vec_add(Y2, uvx0); \
412 B0 = vec_add(Y2, ux0); \
413 R1 = vec_add(Y3, vx1); \
414 G1 = vec_add(Y3, uvx1); \
415 B1 = vec_add(Y3, ux1); \
416 R = vec_packclp(R0, R1); \
417 G = vec_packclp(G0, G1); \
418 B = vec_packclp(B0, B1); \
421 out_pixels(R, G, B, outo); \
429 ui += instrides_scl[1]; \
430 vi += instrides_scl[2]; \
431 y1i += instrides_scl[0]; \
432 y2i += instrides_scl[0]; \
/* Output adapters. Each takes three channel vectors (a, b, c) -- invoked in
 * this file as (R, G, B) -- and the destination pointer.
 *
 * The four 32-bit variants forward to vec_mstrgb32() and add a vector with
 * every element set to 255 as the fourth channel. The argument order follows
 * the macro name:
 *   out_abgr: 255, c, b, a      out_bgra: c, b, a, 255
 *   out_rgba: a, b, c, 255      out_argb: 255, a, b, c
 * The two 24-bit variants forward unchanged to vec_mstrgb24() and
 * vec_mstbgr24(). */
437 #define out_abgr(a, b, c, ptr) \
438 vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), c, b, a, ptr)
439 #define out_bgra(a, b, c, ptr) \
440 vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr)
441 #define out_rgba(a, b, c, ptr) \
442 vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), ptr)
443 #define out_argb(a, b, c, ptr) \
444 vec_mstrgb32(__typeof__(a), ((__typeof__(a)) vec_splat((__typeof__(a)){ 255 }, 0)), a, b, c, ptr)
445 #define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
446 #define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
/* Instantiate one converter per packed output layout. Each line defines a
 * static function altivec_yuv2_<layout> that writes pixels with the matching
 * out_<layout> adapter. */
448 DEFCSP420_CVT(yuv2_abgr, out_abgr)
449 DEFCSP420_CVT(yuv2_bgra, out_bgra)
450 DEFCSP420_CVT(yuv2_rgba, out_rgba)
451 DEFCSP420_CVT(yuv2_argb, out_argb)
452 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
453 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
/* Permute-control vectors that split one vector of packed input bytes into
 * its components. They are used as vec_perm(data, zero, demux_*): index 0x10
 * selects a byte of the zero operand, the other indices select bytes of the
 * data operand, so every 16-bit result element pairs one zero byte with one
 * data byte.
 *  - demux_u takes data bytes 0, 4, 8 and 12, each used for two adjacent
 *    result elements.
 *  - demux_v takes data bytes 2, 6, 10 and 14, likewise duplicated.
 *  - demux_y takes the odd data bytes 1, 3, ..., 15, one per result element. */
457 static const vector
unsigned char
458 demux_u = { 0x10, 0x00, 0x10, 0x00,
459 0x10, 0x04, 0x10, 0x04,
460 0x10, 0x08, 0x10, 0x08,
461 0x10, 0x0c, 0x10, 0x0c },
462 demux_v = { 0x10, 0x02, 0x10, 0x02,
463 0x10, 0x06, 0x10, 0x06,
464 0x10, 0x0A, 0x10, 0x0A,
465 0x10, 0x0E, 0x10, 0x0E },
466 demux_y = { 0x10, 0x01, 0x10, 0x03,
467 0x10, 0x05, 0x10, 0x07,
468 0x10, 0x09, 0x10, 0x0B,
469 0x10, 0x0D, 0x10, 0x0F };
/**
 * Convert packed input (UYVY, going by the function and variable names) to
 * RGB vectors.
 *
 * The output pointer starts at oplanes[0] + srcSliceY * outstrides[0]. The
 * loops run over h rows and w / 16 column groups. In each inner iteration
 * two vectors are loaded from img with vec_ld() at byte offsets 0 and 16;
 * each is separated into components with the demux_u / demux_v / demux_y
 * selectors against a zero vector and converted by cvtyuvtoRGB(). The two
 * halves are then narrowed to bytes with vec_packclp().
 *
 * NOTE(review): vec_ld() is used for the loads, so img is presumably expected
 * to be 16-byte aligned -- confirm against the callers.
 */
474 static int altivec_uyvy_rgb32(
SwsInternal *
c,
const unsigned char *
const *in,
475 const int *instrides,
int srcSliceY,
int srcSliceH,
476 unsigned char *
const *oplanes,
const int *outstrides)
481 vector
unsigned char uyvy;
482 vector
signed short Y,
U,
V;
483 vector
signed short R0, G0,
B0,
R1, G1,
B1;
484 vector
unsigned char R,
G,
B;
485 vector
unsigned char *
out;
489 out = (vector
unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
491 for (
i = 0;
i <
h;
i++)
492 for (j = 0; j <
w / 16; j++) {
/* First 16 input bytes: demux and convert into R0 / G0 / B0. */
493 uyvy = vec_ld(0,
img);
496 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_u);
498 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_v);
500 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_y);
502 cvtyuvtoRGB(
c,
Y,
U,
V, &
R0, &G0, &
B0);
/* Next 16 input bytes: demux and convert into R1 / G1 / B1. */
504 uyvy = vec_ld(16,
img);
507 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_u);
509 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_v);
511 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_y);
513 cvtyuvtoRGB(
c,
Y,
U,
V, &
R1, &G1, &
B1);
/* Narrow both halves to one byte vector per channel. */
515 R = vec_packclp(
R0,
R1);
516 G = vec_packclp(G0, G1);
517 B = vec_packclp(
B0,
B1);
548 if ((
c->srcW & 0
xf) != 0)
551 switch (
c->srcFormat) {
558 if ((
c->srcH & 0x1) != 0)
561 switch (
c->dstFormat) {
564 return altivec_yuv2_rgb24;
567 return altivec_yuv2_bgr24;
570 return altivec_yuv2_argb;
573 return altivec_yuv2_abgr;
576 return altivec_yuv2_rgba;
579 return altivec_yuv2_bgra;
580 default:
return NULL;
585 switch (
c->dstFormat) {
588 return altivec_uyvy_rgb32;
589 default:
return NULL;
599 const int inv_table[4],
607 vector
signed short vec;
613 buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9;
614 buf.tmp[1] = -256 * brightness;
615 buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16);
616 buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16);
617 buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16));
618 buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16));
620 c->CSHIFT = (vector
unsigned short) vec_splat_u16(2);
621 c->CY = vec_splat((vector
signed short) buf.vec, 0);
622 c->OY = vec_splat((vector
signed short) buf.vec, 1);
623 c->CRV = vec_splat((vector
signed short) buf.vec, 2);
624 c->CBU = vec_splat((vector
signed short) buf.vec, 3);
625 c->CGU = vec_splat((vector
signed short) buf.vec, 4);
626 c->CGV = vec_splat((vector
signed short) buf.vec, 5);
634 const int16_t *lumFilter,
635 const int16_t **lumSrc,
637 const int16_t *chrFilter,
638 const int16_t **chrUSrc,
639 const int16_t **chrVSrc,
641 const int16_t **alpSrc,
647 vector
signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1,
U,
V;
648 vector
signed short R0, G0,
B0,
R1, G1,
B1;
650 vector
unsigned char R,
G,
B;
651 vector
unsigned char *
out, *nout;
653 vector
signed short RND = vec_splat_s16(1 << 3);
654 vector
unsigned short SCL = vec_splat_u16(4);
657 vector
signed short *YCoeffs, *CCoeffs;
659 YCoeffs =
c->vYCoeffsBank + dstY * lumFilterSize;
660 CCoeffs =
c->vCCoeffsBank + dstY * chrFilterSize;
662 out = (vector
unsigned char *) dest;
664 for (
i = 0;
i < dstW;
i += 16) {
668 for (j = 0; j < lumFilterSize; j++) {
669 X0 = vec_ld(0, &lumSrc[j][
i]);
670 X1 = vec_ld(16, &lumSrc[j][
i]);
671 Y0 = vec_mradds(X0, YCoeffs[j], Y0);
672 Y1 = vec_mradds(X1, YCoeffs[j], Y1);
678 for (j = 0; j < chrFilterSize; j++) {
679 X = vec_ld(0, &chrUSrc[j][
i / 2]);
680 U = vec_mradds(
X, CCoeffs[j],
U);
681 X = vec_ld(0, &chrVSrc[j][
i / 2]);
682 V = vec_mradds(
X, CCoeffs[j],
V);
686 Y0 = vec_sra(Y0, SCL);
687 Y1 = vec_sra(Y1, SCL);
691 Y0 = vec_clip_s16(Y0);
692 Y1 = vec_clip_s16(Y1);
705 U0 = vec_mergeh(
U,
U);
706 V0 = vec_mergeh(
V,
V);
708 U1 = vec_mergel(
U,
U);
709 V1 = vec_mergel(
V,
V);
711 cvtyuvtoRGB(
c, Y0, U0, V0, &
R0, &G0, &
B0);
712 cvtyuvtoRGB(
c, Y1, U1, V1, &
R1, &G1, &
B1);
714 R = vec_packclp(
R0,
R1);
715 G = vec_packclp(G0, G1);
716 B = vec_packclp(
B0,
B1);
741 static int printed_error_message;
742 if (!printed_error_message) {
744 "altivec_yuv2packedX doesn't support %s output\n",
746 printed_error_message = 1;
759 for (j = 0; j < lumFilterSize; j++) {
760 X0 = vec_ld(0, &lumSrc[j][
i]);
761 X1 = vec_ld(16, &lumSrc[j][
i]);
762 Y0 = vec_mradds(X0, YCoeffs[j], Y0);
763 Y1 = vec_mradds(X1, YCoeffs[j], Y1);
769 for (j = 0; j < chrFilterSize; j++) {
770 X = vec_ld(0, &chrUSrc[j][
i / 2]);
771 U = vec_mradds(
X, CCoeffs[j],
U);
772 X = vec_ld(0, &chrVSrc[j][
i / 2]);
773 V = vec_mradds(
X, CCoeffs[j],
V);
777 Y0 = vec_sra(Y0, SCL);
778 Y1 = vec_sra(Y1, SCL);
782 Y0 = vec_clip_s16(Y0);
783 Y1 = vec_clip_s16(Y1);
796 U0 = vec_mergeh(
U,
U);
797 V0 = vec_mergeh(
V,
V);
799 U1 = vec_mergel(
U,
U);
800 V1 = vec_mergel(
V,
V);
802 cvtyuvtoRGB(
c, Y0, U0, V0, &
R0, &G0, &
B0);
803 cvtyuvtoRGB(
c, Y1, U1, V1, &
R1, &G1, &
B1);
805 R = vec_packclp(
R0,
R1);
806 G = vec_packclp(G0, G1);
807 B = vec_packclp(
B0,
B1);
809 nout = (vector
unsigned char *) scratch;
812 out_abgr(
R,
G,
B, nout);
815 out_bgra(
R,
G,
B, nout);
818 out_rgba(
R,
G,
B, nout);
821 out_argb(
R,
G,
B, nout);
824 out_rgb24(
R,
G,
B, nout);
827 out_bgr24(
R,
G,
B, nout);
832 "altivec_yuv2packedX doesn't support %s output\n",
837 memcpy(&((uint32_t *) dest)[
i], scratch, (dstW -
i) / 4);
/* Define a public entry point ff_yuv2<suffix>_X_altivec() that forwards all of
 * its arguments, unchanged and in the same order, to yuv2packedX_altivec()
 * and appends pixfmt as the final argument to select the output format. */
841 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
842 void ff_yuv2 ## suffix ## _X_altivec(SwsInternal *c, \
843 const int16_t *lumFilter, \
844 const int16_t **lumSrc, \
846 const int16_t *chrFilter, \
847 const int16_t **chrUSrc, \
848 const int16_t **chrVSrc, \
850 const int16_t **alpSrc, \
851 uint8_t *dest, int dstW, int dstY) \
853 yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
854 chrFilter, chrUSrc, chrVSrc, \
855 chrFilterSize, alpSrc, \
856 dest, dstW, dstY, pixfmt); \