[FFmpeg-cvslog] swscale/ppc: VSX-optimize hscale_fast
Lauri Kasanen
git at videolan.org
Tue Apr 30 14:41:39 EEST 2019
ffmpeg | branch: master | Lauri Kasanen <cand at gmx.com> | Tue Apr 23 14:54:46 2019 +0300| [d0e4d0429ed2841283e31d5f2cb6b1c56210382c] | committer: Lauri Kasanen
swscale/ppc: VSX-optimize hscale_fast
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags fast_bilinear \
-s 2400x720 -f rawvideo -vframes 5 -pix_fmt abgr -nostats test.raw
4.27 speedup for hyscale_fast:
24796 UNITS in hyscale_fast, 4096 runs, 0 skips
5797 UNITS in hyscale_fast, 4096 runs, 0 skips
4.48 speedup for hcscale_fast:
19911 UNITS in hcscale_fast, 4095 runs, 1 skips
4437 UNITS in hcscale_fast, 4096 runs, 0 skips
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d0e4d0429ed2841283e31d5f2cb6b1c56210382c
---
libswscale/ppc/swscale_vsx.c | 196 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 196 insertions(+)
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index ba00791dcb..2e20ab388a 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -1661,6 +1661,198 @@ YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
+static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc)
+{
+ int i;
+ unsigned int xpos = 0, xx;
+ vector uint8_t vin, vin2, vperm;
+ vector int8_t vmul, valpha;
+ vector int16_t vtmp, vtmp2, vtmp3, vtmp4;
+ vector uint16_t vd_l, vd_r, vcoord16[2];
+ vector uint32_t vcoord[4];
+ const vector uint32_t vadd = (vector uint32_t) {
+ 0,
+ xInc * 1,
+ xInc * 2,
+ xInc * 3,
+ };
+ const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
+ 0,
+ xInc * 1,
+ xInc * 2,
+ xInc * 3,
+ xInc * 4,
+ xInc * 5,
+ xInc * 6,
+ xInc * 7,
+ };
+ const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
+ const vector uint16_t vshift9 = vec_splat_u16(9);
+ const vector uint8_t vzero = vec_splat_u8(0);
+ const vector uint16_t vshift = vec_splat_u16(7);
+
+ for (i = 0; i < dstWidth; i += 16) {
+ vcoord16[0] = vec_splats((uint16_t) xpos);
+ vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
+
+ vcoord16[0] = vec_add(vcoord16[0], vadd16);
+ vcoord16[1] = vec_add(vcoord16[1], vadd16);
+
+ vcoord16[0] = vec_sr(vcoord16[0], vshift9);
+ vcoord16[1] = vec_sr(vcoord16[1], vshift9);
+ valpha = (vector int8_t) vec_pack(vcoord16[0], vcoord16[1]);
+
+ xx = xpos >> 16;
+ vin = vec_vsx_ld(0, &src[xx]);
+
+ vcoord[0] = vec_splats(xpos & 0xffff);
+ vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
+ vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
+ vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
+
+ vcoord[0] = vec_add(vcoord[0], vadd);
+ vcoord[1] = vec_add(vcoord[1], vadd);
+ vcoord[2] = vec_add(vcoord[2], vadd);
+ vcoord[3] = vec_add(vcoord[3], vadd);
+
+ vcoord[0] = vec_sr(vcoord[0], vshift16);
+ vcoord[1] = vec_sr(vcoord[1], vshift16);
+ vcoord[2] = vec_sr(vcoord[2], vshift16);
+ vcoord[3] = vec_sr(vcoord[3], vshift16);
+
+ vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
+ vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
+ vperm = vec_pack(vcoord16[0], vcoord16[1]);
+
+ vin = vec_perm(vin, vin, vperm);
+
+ vin2 = vec_vsx_ld(1, &src[xx]);
+ vin2 = vec_perm(vin2, vin2, vperm);
+
+ vmul = (vector int8_t) vec_sub(vin2, vin);
+ vtmp = vec_mule(vmul, valpha);
+ vtmp2 = vec_mulo(vmul, valpha);
+ vtmp3 = vec_mergeh(vtmp, vtmp2);
+ vtmp4 = vec_mergel(vtmp, vtmp2);
+
+ vd_l = (vector uint16_t) vec_mergeh(vin, vzero);
+ vd_r = (vector uint16_t) vec_mergel(vin, vzero);
+ vd_l = vec_sl(vd_l, vshift);
+ vd_r = vec_sl(vd_r, vshift);
+
+ vd_l = vec_add(vd_l, (vector uint16_t) vtmp3);
+ vd_r = vec_add(vd_r, (vector uint16_t) vtmp4);
+
+ vec_st((vector int16_t) vd_l, 0, &dst[i]);
+ vec_st((vector int16_t) vd_r, 0, &dst[i + 8]);
+
+ xpos += xInc * 16;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ dst[i] = src[srcW-1]*128;
+}
+
+#define HCSCALE(in, out) \
+ vin = vec_vsx_ld(0, &in[xx]); \
+ vin = vec_perm(vin, vin, vperm); \
+\
+ vin2 = vec_vsx_ld(1, &in[xx]); \
+ vin2 = vec_perm(vin2, vin2, vperm); \
+\
+ vtmp = vec_mule(vin, valphaxor); \
+ vtmp2 = vec_mulo(vin, valphaxor); \
+ vtmp3 = vec_mergeh(vtmp, vtmp2); \
+ vtmp4 = vec_mergel(vtmp, vtmp2); \
+\
+ vtmp = vec_mule(vin2, valpha); \
+ vtmp2 = vec_mulo(vin2, valpha); \
+ vd_l = vec_mergeh(vtmp, vtmp2); \
+ vd_r = vec_mergel(vtmp, vtmp2); \
+\
+ vd_l = vec_add(vd_l, vtmp3); \
+ vd_r = vec_add(vd_r, vtmp4); \
+\
+ vec_st((vector int16_t) vd_l, 0, &out[i]); \
+ vec_st((vector int16_t) vd_r, 0, &out[i + 8])
+
+static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
+ int dstWidth, const uint8_t *src1,
+ const uint8_t *src2, int srcW, int xInc)
+{
+ int i;
+ unsigned int xpos = 0, xx;
+ vector uint8_t vin, vin2, vperm;
+ vector uint8_t valpha, valphaxor;
+ vector uint16_t vtmp, vtmp2, vtmp3, vtmp4;
+ vector uint16_t vd_l, vd_r, vcoord16[2];
+ vector uint32_t vcoord[4];
+ const vector uint8_t vxor = vec_splats((uint8_t) 127);
+ const vector uint32_t vadd = (vector uint32_t) {
+ 0,
+ xInc * 1,
+ xInc * 2,
+ xInc * 3,
+ };
+ const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
+ 0,
+ xInc * 1,
+ xInc * 2,
+ xInc * 3,
+ xInc * 4,
+ xInc * 5,
+ xInc * 6,
+ xInc * 7,
+ };
+ const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
+ const vector uint16_t vshift9 = vec_splat_u16(9);
+
+ for (i = 0; i < dstWidth; i += 16) {
+ vcoord16[0] = vec_splats((uint16_t) xpos);
+ vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
+
+ vcoord16[0] = vec_add(vcoord16[0], vadd16);
+ vcoord16[1] = vec_add(vcoord16[1], vadd16);
+
+ vcoord16[0] = vec_sr(vcoord16[0], vshift9);
+ vcoord16[1] = vec_sr(vcoord16[1], vshift9);
+ valpha = vec_pack(vcoord16[0], vcoord16[1]);
+ valphaxor = vec_xor(valpha, vxor);
+
+ xx = xpos >> 16;
+
+ vcoord[0] = vec_splats(xpos & 0xffff);
+ vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
+ vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
+ vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
+
+ vcoord[0] = vec_add(vcoord[0], vadd);
+ vcoord[1] = vec_add(vcoord[1], vadd);
+ vcoord[2] = vec_add(vcoord[2], vadd);
+ vcoord[3] = vec_add(vcoord[3], vadd);
+
+ vcoord[0] = vec_sr(vcoord[0], vshift16);
+ vcoord[1] = vec_sr(vcoord[1], vshift16);
+ vcoord[2] = vec_sr(vcoord[2], vshift16);
+ vcoord[3] = vec_sr(vcoord[3], vshift16);
+
+ vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
+ vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
+ vperm = vec_pack(vcoord16[0], vcoord16[1]);
+
+ HCSCALE(src1, dst1);
+ HCSCALE(src2, dst2);
+
+ xpos += xInc * 16;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+ dst1[i] = src1[srcW-1]*128;
+ dst2[i] = src2[srcW-1]*128;
+ }
+}
+
+#undef HCSCALE
+
#endif /* !HAVE_BIGENDIAN */
#endif /* HAVE_VSX */
@@ -1677,6 +1869,10 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
#if !HAVE_BIGENDIAN
if (c->srcBpc == 8 && c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale_real_vsx;
+ if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
+ c->hyscale_fast = hyscale_fast_vsx;
+ c->hcscale_fast = hcscale_fast_vsx;
+ }
}
if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
More information about the ffmpeg-cvslog
mailing list