[FFmpeg-devel] [PATCH] avfilter/vf_maskedmerge: add SIMD for maskedmerge with 8 bit depth input

Thu Oct 1 20:00:36 CEST 2015

On 10/1/2015 2:25 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>  libavfilter/maskedmerge.h             | 39 +++++++++++++++++++++
>  libavfilter/vf_maskedmerge.c          | 33 ++++++------------
>  libavfilter/x86/Makefile              |  2 ++
>  libavfilter/x86/vf_maskedmerge.asm    | 66 +++++++++++++++++++++++++++++++++++
>  libavfilter/x86/vf_maskedmerge_init.c | 39 +++++++++++++++++++++
>  5 files changed, 156 insertions(+), 23 deletions(-)
>  create mode 100644 libavfilter/maskedmerge.h
>  create mode 100644 libavfilter/x86/vf_maskedmerge.asm
>  create mode 100644 libavfilter/x86/vf_maskedmerge_init.c
> 
> diff --git a/libavfilter/maskedmerge.h b/libavfilter/maskedmerge.h
> new file mode 100644
> index 0000000..b198e65
> --- /dev/null
> +++ b/libavfilter/maskedmerge.h
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2015 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "avfilter.h"
> +#include "framesync.h"
> +
> +typedef struct MaskedMergeContext {
> +    const AVClass *class;
> +    int width[4], height[4];
> +    int nb_planes;
> +    int planes;
> +    int half, depth;
> +    FFFrameSync fs;
> +
> +    void (*maskedmerge)(const uint8_t *bsrc, int blinesize,
> +                        const uint8_t *osrc, int olinesize,
> +                        const uint8_t *msrc, int mlinesize,
> +                        uint8_t *dst, int dlinesize, int w, int h,
> +                        int half, int shift);

Make the pointers the first four arguments, followed by the linesize
ones, then the rest.
It will allow you to get this working on x86_32 with some changes.

Also, linesize arguments should be ptrdiff_t.

[...]

> diff --git a/libavfilter/x86/vf_maskedmerge.asm b/libavfilter/x86/vf_maskedmerge.asm
> new file mode 100644
> index 0000000..462674a
> --- /dev/null
> +++ b/libavfilter/x86/vf_maskedmerge.asm
> @@ -0,0 +1,66 @@
> +;*****************************************************************************
> +;* x86-optimized functions for maskedmerge filter
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"

If this is x86_64 only for now, then you need to put everything
under this line inside an "%if ARCH_X86_64" preprocessor check.

> +
> +SECTION_RODATA
> +
> +pw_128: times 8 dw 128
> +pw_256: times 8 dw 256
> +
> +SECTION .text
> +
> +INIT_XMM sse2
> +cglobal maskedmerge8, 10, 11, 3, 0, bsrc, blinesize, osrc, olinesize, msrc, mlinesize, dst, dlinesize, w, h

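You can remove the 0 if you're not reserving stack space. It's an
optional parameter.
Also, you're using more than 3 xmm regs: the code references registers
up to m7, so the count should be 8. The count matters because x86inc
uses it to decide which callee-saved xmm registers to preserve on Win64.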

> +    mova m7, [pw_128]
> +    pxor m6, m6
> +.nextrow:
> +    mov r10q, 0
> +    %define x r10q
> +
> +    .loop:
> +        movh m0, [bsrcq + x]
> +        movh m1, [osrcq + x]
> +        movh m3, [msrcq + x]
> +        mova m4, [pw_256]

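You're not using m2, so you can load pw_256 into it once outside the
loop, like you did for pw_128, and then copy it into m4 with a
register-to-register mova inside the loop (m4 gets overwritten by the
psubw on every iteration). That is much faster than constantly loading
it from memory.
For that matter, m5 is also unused.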

> +        punpcklbw m0, m6
> +        punpcklbw m1, m6
> +        punpcklbw m3, m6
> +        psubw m4, m3
> +        pmullw m4, m0
> +        pmullw m1, m3
> +        paddw m1, m4
> +        paddw m1, m7
> +        psrlw m1, 8
> +        packuswb m1, m1
> +        movh [dstq + x], m1
> +        add r10q, mmsize / 2
> +        cmp r10q, wq
> +    jl .loop
> +
> +    lea bsrcq, [bsrcq+blinesizeq]
> +    lea osrcq, [osrcq+olinesizeq]
> +    lea msrcq, [msrcq+mlinesizeq]
> +    lea dstq, [dstq+dlinesizeq]

These are simple sums into the same register, so just use add
(e.g. "add bsrcq, blinesizeq").