[FFmpeg-devel] libavcodec/exr : add X86 64 SIMD for reorder pixels (SSE and AVX2) (v4)
James Almer
jamrial at gmail.com
Sun Sep 17 22:19:44 EEST 2017
On 9/17/2017 3:22 PM, Martin Vignali wrote:
> From 338f96a7f3c0f97cfafc0deda2322695a4006b5a Mon Sep 17 00:00:00 2001
> From: Martin Vignali <martin.vignali at gmail.com>
> Date: Sun, 17 Sep 2017 20:05:16 +0200
> Subject: [PATCH] libavcodec/exr : add X86 64 SIMD for reorder_pixels
>
> ---
> libavcodec/Makefile | 2 +-
> libavcodec/exr.c | 44 ++++++++++++++--------------
> libavcodec/exrdsp.c | 45 +++++++++++++++++++++++++++++
> libavcodec/exrdsp.h | 32 ++++++++++++++++++++
> libavcodec/x86/Makefile | 2 ++
> libavcodec/x86/exrdsp.asm | 69 ++++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/exrdsp_init.c | 43 +++++++++++++++++++++++++++
> 7 files changed, 213 insertions(+), 24 deletions(-)
> create mode 100644 libavcodec/exrdsp.c
> create mode 100644 libavcodec/exrdsp.h
> create mode 100644 libavcodec/x86/exrdsp.asm
> create mode 100644 libavcodec/x86/exrdsp_init.c
>
> diff --git a/libavcodec/Makefile b/libavcodec/Makefile
> index 943e5db511..fad56129a3 100644
> --- a/libavcodec/Makefile
> +++ b/libavcodec/Makefile
> @@ -286,7 +286,7 @@ OBJS-$(CONFIG_EIGHTSVX_FIB_DECODER) += 8svx.o
> OBJS-$(CONFIG_ESCAPE124_DECODER) += escape124.o
> OBJS-$(CONFIG_ESCAPE130_DECODER) += escape130.o
> OBJS-$(CONFIG_EVRC_DECODER) += evrcdec.o acelp_vectors.o lsp.o
> -OBJS-$(CONFIG_EXR_DECODER) += exr.o
> +OBJS-$(CONFIG_EXR_DECODER) += exr.o exrdsp.o
> OBJS-$(CONFIG_FFV1_DECODER) += ffv1dec.o ffv1.o
> OBJS-$(CONFIG_FFV1_ENCODER) += ffv1enc.o ffv1.o
> OBJS-$(CONFIG_FFWAVESYNTH_DECODER) += ffwavesynth.o
> diff --git a/libavcodec/exr.c b/libavcodec/exr.c
> index 759880756d..478c127ebe 100644
> --- a/libavcodec/exr.c
> +++ b/libavcodec/exr.c
> @@ -40,6 +40,7 @@
> #include "libavutil/avassert.h"
> #include "libavutil/common.h"
> #include "libavutil/imgutils.h"
> +#include "libavutil/timer.h"
Not needed.
> #include "libavutil/intfloat.h"
> #include "libavutil/opt.h"
> #include "libavutil/color_utils.h"
> @@ -55,6 +56,7 @@
> #include "internal.h"
> #include "mathops.h"
> #include "thread.h"
> +#include "exrdsp.h"
Add this one above get_bits.h, to keep the includes in alphabetical order.
>
> enum ExrCompr {
> EXR_RAW,
> @@ -121,6 +123,7 @@ typedef struct EXRContext {
> AVClass *class;
> AVFrame *picture;
> AVCodecContext *avctx;
> + ExrDSPContext dsp;
>
> #if HAVE_BIGENDIAN
> BswapDSPContext bbdsp;
> @@ -275,23 +278,7 @@ static void predictor(uint8_t *src, int size)
> }
> }
>
> -static void reorder_pixels(uint8_t *src, uint8_t *dst, int size)
> -{
> - const uint8_t *t1 = src;
> - int half_size = size / 2;
> - const uint8_t *t2 = src + half_size;
> - uint8_t *s = dst;
> - int i;
> -
> - av_assert1(size % 2 == 0);
> -
> - for (i = 0; i < half_size; i++) {
> - *(s++) = *(t1++);
> - *(s++) = *(t2++);
> - }
> -}
> -
> -static int zip_uncompress(const uint8_t *src, int compressed_size,
> +static int zip_uncompress(EXRContext *s, const uint8_t *src, int compressed_size,
> int uncompressed_size, EXRThreadData *td)
> {
> unsigned long dest_len = uncompressed_size;
> @@ -300,13 +287,18 @@ static int zip_uncompress(const uint8_t *src, int compressed_size,
> dest_len != uncompressed_size)
> return AVERROR_INVALIDDATA;
>
> + av_assert1(uncompressed_size % 2 == 0);
> +
> predictor(td->tmp, uncompressed_size);
> - reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
> +
> + //START_TIMER;
Don't add dead benchmarking/debug code.
> + s->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
> + //STOP_TIMER("reorder_pixels_zip");
>
> return 0;
> }
>
> -static int rle_uncompress(const uint8_t *src, int compressed_size,
> +static int rle_uncompress(EXRContext *ctx, const uint8_t *src, int compressed_size,
> int uncompressed_size, EXRThreadData *td)
> {
> uint8_t *d = td->tmp;
> @@ -345,8 +337,10 @@ static int rle_uncompress(const uint8_t *src, int compressed_size,
> if (dend != d)
> return AVERROR_INVALIDDATA;
>
> + av_assert1(uncompressed_size % 2 == 0);
> +
> predictor(td->tmp, uncompressed_size);
> - reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
> + ctx->dsp.reorder_pixels(td->tmp, td->uncompressed_data, uncompressed_size);
>
> return 0;
> }
> @@ -954,6 +948,7 @@ static void unpack_14(const uint8_t b[14], uint16_t s[16])
> }
> }
>
> +
Stray newline.
> static void unpack_3(const uint8_t b[3], uint16_t s[16])
> {
> int i;
> @@ -1000,6 +995,7 @@ static int b44_uncompress(EXRContext *s, const uint8_t *src, int compressed_size
>
> if (src[compressed_size - stay_to_uncompress + 2] == 0xfc) { /* B44A block */
> unpack_3(sr, tmp_buffer);
> +
Same.
> sr += 3;
> stay_to_uncompress -= 3;
> } else {/* B44 Block */
> @@ -1152,7 +1148,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
>
> if (data_size < uncompressed_size) {
> av_fast_padded_malloc(&td->uncompressed_data,
> - &td->uncompressed_size, uncompressed_size);
> + &td->uncompressed_size, uncompressed_size + 64);/* Force 64 padding for AVX2 reorder_pixels dst */
>
> if (!td->uncompressed_data)
> return AVERROR(ENOMEM);
> @@ -1161,7 +1157,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
> switch (s->compression) {
> case EXR_ZIP1:
> case EXR_ZIP16:
> - ret = zip_uncompress(src, data_size, uncompressed_size, td);
> + ret = zip_uncompress(s, src, data_size, uncompressed_size, td);
> break;
> case EXR_PIZ:
> ret = piz_uncompress(s, src, data_size, uncompressed_size, td);
> @@ -1170,7 +1166,7 @@ static int decode_block(AVCodecContext *avctx, void *tdata,
> ret = pxr24_uncompress(s, src, data_size, uncompressed_size, td);
> break;
> case EXR_RLE:
> - ret = rle_uncompress(src, data_size, uncompressed_size, td);
> + ret = rle_uncompress(s, src, data_size, uncompressed_size, td);
> break;
> case EXR_B44:
> case EXR_B44A:
> @@ -1804,6 +1800,8 @@ static av_cold int decode_init(AVCodecContext *avctx)
>
> s->avctx = avctx;
>
> + ff_exrdsp_init(&s->dsp);
> +
> #if HAVE_BIGENDIAN
> ff_bswapdsp_init(&s->bbdsp);
> #endif
> diff --git a/libavcodec/exrdsp.c b/libavcodec/exrdsp.c
> new file mode 100644
> index 0000000000..af47a6f8df
> --- /dev/null
> +++ b/libavcodec/exrdsp.c
> @@ -0,0 +1,45 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "exrdsp.h"
> +#include "config.h"
> +
> +static void reorder_pixels_scalar(uint8_t *src, uint8_t *dst, ptrdiff_t size)
> +{
> + const uint8_t *t1 = src;
> + int half_size = size / 2;
> + const uint8_t *t2 = src + half_size;
> + uint8_t *s = dst;
> + int i;
> +
> + for (i = 0; i < half_size; i++) {
> + *(s++) = *(t1++);
> + *(s++) = *(t2++);
> + }
> +}
> +
> +av_cold void ff_exrdsp_init(ExrDSPContext *c)
> +{
> + c->reorder_pixels = reorder_pixels_scalar;
> +
> + if (ARCH_X86)
> + ff_exrdsp_init_x86(c);
> +}
> diff --git a/libavcodec/exrdsp.h b/libavcodec/exrdsp.h
> new file mode 100644
> index 0000000000..09a76a518e
> --- /dev/null
> +++ b/libavcodec/exrdsp.h
> @@ -0,0 +1,32 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_EXRDSP_H
> +#define AVCODEC_EXRDSP_H
> +
> +#include <stdint.h>
> +#include "libavutil/common.h"
> +
> +typedef struct ExrDSPContext {
> + void (*reorder_pixels)(uint8_t *src, uint8_t *dst, ptrdiff_t size);
> +} ExrDSPContext;
> +
> +void ff_exrdsp_init(ExrDSPContext *c);
> +void ff_exrdsp_init_x86(ExrDSPContext *c);
> +
> +#endif /* AVCODEC_EXRDSP_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index e36644c72a..a805cd37b4 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
> OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
> OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
> OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
> +OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp_init.o
> OBJS-$(CONFIG_OPUS_DECODER) += x86/opus_dsp_init.o
> OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o
> OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
> @@ -153,6 +154,7 @@ X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
> X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
> x86/dirac_dwt.o
> X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
> +X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
> X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
> ifdef CONFIG_GPL
> X86ASM-OBJS-$(CONFIG_FLAC_ENCODER) += x86/flac_dsp_gpl.o
> diff --git a/libavcodec/x86/exrdsp.asm b/libavcodec/x86/exrdsp.asm
> new file mode 100644
> index 0000000000..f609c055b0
> --- /dev/null
> +++ b/libavcodec/x86/exrdsp.asm
> @@ -0,0 +1,69 @@
> +;******************************************************************************
> +;* X86 Optimized functions for Open Exr Decoder
> +;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
> +;*
> +;* reorder_pixels based on patch by John Loy
> +;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +;------------------------------------------------------------------------------
> +; void ff_reorder_pixels(uint8_t *src, uint8_t *dst, ptrdiff_t size)
> +;------------------------------------------------------------------------------
> +
> +%macro REORDER_PIXELS 0
> +cglobal reorder_pixels, 3,4,3, src1, dst, size, src2
> + lea src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
> + add dstq, sizeq ; dst offset by size
> + shr sizeq, 1 ; half_size
> + add src1q, sizeq ; offset src by half_size
> + neg sizeq ; size = offset for dst, src1, src2
> +.loop:
> +
> +%if cpuflag(avx2)
> + vpermq m0, [src1q + sizeq], 0xd8; load first part
> + vpermq m1, [src2q + sizeq], 0xd8; load second part
> +
> + vpunpcklbw m2, m0, m1 ; interleaved part 1
> + vmovdqa [dstq + sizeq*2], m2 ; copy to dst
> +
> + vpunpckhbw m0, m0, m1 ; interleaved part 2
> + vmovdqa [dstq + sizeq*2 + mmsize], m0 ; copy to dst
> +%else
> + mova m0, [src1q+sizeq] ; load first part
> + movu m1, [src2q+sizeq] ; load second part
> + SBUTTERFLY bw, 0, 1, 2 ; interleaved
> + mova [dstq+2*sizeq ], m0 ; copy to dst
> + mova [dstq+2*sizeq+mmsize], m1
> +%endif
> + add sizeq, mmsize
> + jl .loop
> + RET
You can reuse the SBUTTERFLY and the two mova stores in the AVX2 version
as well. The resulting assembly is essentially the same, and it will look
much cleaner here:
%if cpuflag(avx2)
vpermq m0, [src1q+sizeq], 0xd8 ; load first part
vpermq m1, [src2q+sizeq], 0xd8 ; load second part
%else
mova m0, [src1q+sizeq] ; load first part
movu m1, [src2q+sizeq] ; load second part
%endif
SBUTTERFLY bw, 0, 1, 2 ; interleaved
mova [dstq+2*sizeq ], m0 ; copy to dst
mova [dstq+2*sizeq+mmsize], m1
> +%endmacro
> +
> +INIT_XMM sse2
> +REORDER_PIXELS
> +
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +REORDER_PIXELS
> +%endif
> diff --git a/libavcodec/x86/exrdsp_init.c b/libavcodec/x86/exrdsp_init.c
> new file mode 100644
> index 0000000000..49fd00e640
> --- /dev/null
> +++ b/libavcodec/x86/exrdsp_init.c
> @@ -0,0 +1,43 @@
> +/*
> + * OpenEXR (.exr) image decoder
> + *
> + * Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/exrdsp.h"
> +
> +void ff_reorder_pixels_sse2(uint8_t *src, uint8_t *dst, ptrdiff_t size);
> +
> +void ff_reorder_pixels_avx2(uint8_t *src, uint8_t *dst, ptrdiff_t size);
> +
> +av_cold void ff_exrdsp_init_x86(ExrDSPContext *dsp)
> +{
> +#if ARCH_X86_64
The functions are being assembled on x86_32, and they should work just
fine with such targets. So why limit the initialization to x86_64 only here?
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (EXTERNAL_SSE2(cpu_flags)) {
> + dsp->reorder_pixels = ff_reorder_pixels_sse2;
> + }
> + if (EXTERNAL_AVX2(cpu_flags)) {
Use EXTERNAL_AVX2_FAST(cpu_flags) here instead.
The AVX2 function uses YMM registers, meaning it will be slow on certain
AMD CPUs. The _FAST version of the macro makes sure the function is not
selected on those.
> + dsp->reorder_pixels = ff_reorder_pixels_avx2;
> + }
> +#endif /* ARCH_X86_64 */
> +}
> --
> 2.11.0 (Apple Git-81)
>
fate-exr passes on mingw-w64 as well.
More information about the ffmpeg-devel
mailing list