[FFmpeg-devel] [PATCH] avfilter/vf_maskedmerge: add SIMD for maskedmerge with 8 bit depth input
James Almer
jamrial at gmail.com
Thu Oct 1 20:00:36 CEST 2015
On 10/1/2015 2:25 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
> libavfilter/maskedmerge.h | 39 +++++++++++++++++++++
> libavfilter/vf_maskedmerge.c | 33 ++++++------------
> libavfilter/x86/Makefile | 2 ++
> libavfilter/x86/vf_maskedmerge.asm | 66 +++++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_maskedmerge_init.c | 39 +++++++++++++++++++++
> 5 files changed, 156 insertions(+), 23 deletions(-)
> create mode 100644 libavfilter/maskedmerge.h
> create mode 100644 libavfilter/x86/vf_maskedmerge.asm
> create mode 100644 libavfilter/x86/vf_maskedmerge_init.c
>
> diff --git a/libavfilter/maskedmerge.h b/libavfilter/maskedmerge.h
> new file mode 100644
> index 0000000..b198e65
> --- /dev/null
> +++ b/libavfilter/maskedmerge.h
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright (c) 2015 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "avfilter.h"
> +#include "framesync.h"
> +
> +typedef struct MaskedMergeContext {
> + const AVClass *class;
> + int width[4], height[4];
> + int nb_planes;
> + int planes;
> + int half, depth;
> + FFFrameSync fs;
> +
> + void (*maskedmerge)(const uint8_t *bsrc, int blinesize,
> + const uint8_t *osrc, int olinesize,
> + const uint8_t *msrc, int mlinesize,
> + uint8_t *dst, int dlinesize, int w, int h,
> + int half, int shift);
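Make the four pointers the first four arguments, followed by the four
linesize arguments, then the rest.
That will allow you to get this working on x86_32 with some changes.
Also, the linesize arguments should be ptrdiff_t rather than int, so the
asm can use them as full-width registers (e.g. blinesizeq) without having
to sign-extend them first on x86_64.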
[...]
> diff --git a/libavfilter/x86/vf_maskedmerge.asm b/libavfilter/x86/vf_maskedmerge.asm
> new file mode 100644
> index 0000000..462674a
> --- /dev/null
> +++ b/libavfilter/x86/vf_maskedmerge.asm
> @@ -0,0 +1,66 @@
> +;*****************************************************************************
> +;* x86-optimized functions for maskedmerge filter
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
If this is x86_64 only for now, then you need to put everything
under this line inside an "%if ARCH_X86_64" preprocessor check.
> +
> +SECTION_RODATA
> +
> +pw_128: times 8 dw 128
> +pw_256: times 8 dw 256
> +
> +SECTION .text
> +
> +INIT_XMM sse2
> +cglobal maskedmerge8, 10, 11, 3, 0, bsrc, blinesize, osrc, olinesize, msrc, mlinesize, dst, dlinesize, w, h
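You can remove the 0 if you're not reserving stack space; it's an
optional parameter.
Also, you're using more than 3 xmm regs: the code below touches registers
up to m7, so the xmm register count declared in cglobal needs to be 8.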
> + mova m7, [pw_128]
> + pxor m6, m6
> +.nextrow:
> + mov r10q, 0
> + %define x r10q
> +
> + .loop:
> + movh m0, [bsrcq + x]
> + movh m1, [osrcq + x]
> + movh m3, [msrcq + x]
> + mova m4, [pw_256]
You're not using m2, so you can load pw_256 into it once outside the loop,
like you did for pw_128. That is much faster than reloading it from
memory on every iteration.
For that matter, m5 is also unused.
> + punpcklbw m0, m6
> + punpcklbw m1, m6
> + punpcklbw m3, m6
> + psubw m4, m3
> + pmullw m4, m0
> + pmullw m1, m3
> + paddw m1, m4
> + paddw m1, m7
> + psrlw m1, 8
> + packuswb m1, m1
> + movh [dstq + x], m1
> + add r10q, mmsize / 2
> + cmp r10q, wq
> + jl .loop
> +
> + lea bsrcq, [bsrcq+blinesizeq]
> + lea osrcq, [osrcq+olinesizeq]
> + lea msrcq, [msrcq+mlinesizeq]
> + lea dstq, [dstq+dlinesizeq]
These are simple sums, so just use add.
More information about the ffmpeg-devel
mailing list