[FFmpeg-devel] [PATCH] PPC64: Add IBM POWER8 SIMD Implementation

Tue Jun 21 00:45:25 CEST 2016

On Sun, Jun 19, 2016 at 09:57:42PM +0000, Dan Parrot wrote:
> First commit addressing Trac ticket #5570. Functions defined in libswscale/input.c
> have corresponding SIMD definitions in libswscale/ppc/input_vsx.c
> ---
>  libswscale/ppc/Makefile       |    1 +
>  libswscale/ppc/input_vsx.c    | 1070 +++++++++++++++++++++++++++++++++++++++++
>  libswscale/swscale.c          |    3 +
>  libswscale/swscale_internal.h |    1 +
>  4 files changed, 1075 insertions(+)
>  create mode 100644 libswscale/ppc/input_vsx.c
> 
> diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile
> index d1b596e..2482893 100644
> --- a/libswscale/ppc/Makefile
> +++ b/libswscale/ppc/Makefile
> @@ -1,3 +1,4 @@
>  OBJS += ppc/swscale_altivec.o                                           \
> +        ppc/input_vsx.o                                                 \
>          ppc/yuv2rgb_altivec.o                                           \
>          ppc/yuv2yuv_altivec.o                                           \
> diff --git a/libswscale/ppc/input_vsx.c b/libswscale/ppc/input_vsx.c
> new file mode 100644
> index 0000000..adb0e38
> --- /dev/null
> +++ b/libswscale/ppc/input_vsx.c
> @@ -0,0 +1,1070 @@
> +/*
> + * Copyright (C) 2016 Dan Parrot <dan.parrot at mail.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <math.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <string.h>
> +
> +#include "libavutil/avutil.h"
> +#include "libavutil/bswap.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/mathematics.h"
> +#include "libavutil/pixdesc.h"
> +#include "libavutil/avassert.h"
> +#include "config.h"
> +#include "libswscale/rgb2rgb.h"
> +#include "libswscale/swscale.h"
> +#include "libswscale/swscale_internal.h"
> +
> +#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
> +
> +#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b)
> +#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r)
> +
> +#if HAVE_VSX
> +
> +// This is a SIMD version for IBM POWER8 of function rgb64ToY_c_template
> +// in file libswscale/input.c
> +static av_always_inline void
> +rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int width,
> +                        enum AVPixelFormat origin, int32_t *rgb2yuv)
> +{
> +    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
> +    int i, j;
> +    int num_vec, frag;
> +
> +    num_vec = width / 8;
> +    frag    = width % 8;
> +
> +    vector int v_ry = vec_splats((int)ry);
> +    vector int v_gy = vec_splats((int)gy);
> +    vector int v_by = vec_splats((int)by);
> +
> +    int s_opr2;
> +    s_opr2 = (int)(0x2001 << (RGB2YUV_SHIFT-1));
> +
> +    vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT);
> +    vector int v_opr2 = vec_splats((int)s_opr2);
> +
> +    vector int v_r, v_g, v_b, v_tmp;
> +    vector short v_tmpi, v_dst;
> +
> +    for (i = 0; i < num_vec; i++) {
> +        for (j = 7; j >= 0  ; j--) {
> +            int r_b = input_pixel(&src[(i*8+j)*4+0]);
> +            int g   = input_pixel(&src[(i*8+j)*4+1]);
> +            int b_r = input_pixel(&src[(i*8+j)*4+2]);
> +
> +            v_r[j % 4] = r;
> +            v_g[j % 4] = g;
> +            v_b[j % 4] = b;
> +
> +            if (!(j % 4)) {
                       ^

> +                v_tmp = v_ry * v_r;
> +                v_tmp = v_tmp + v_gy * v_g;
> +                v_tmp = v_tmp + v_by * v_b;
> +                v_tmp = v_tmp + v_opr2;
> +                v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1);
> +
> +                v_tmpi  = (vector short)v_tmp;
> +                v_dst[(j / 4) * 4 + 3]  = v_tmpi[6];
                            ^
What is the speed of a division and modulo on PPC compared to a
bitwise AND?

It's also not trivial for the compiler to optimize them out, as it
has to prove the variables are never negative.


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

it is not once nor twice but times without number that the same ideas make
their appearance in the world. -- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160621/5fa8274c/attachment.sig>