[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions
Rostislav Pehlivanov
atomnuker at gmail.com
Thu Jul 26 18:29:11 EEST 2018
On 26 July 2018 at 12:28, James Darnley <jdarnley at obe.tv> wrote:
> Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
> relevant transform.
> C: 119fps
> SSE2: 204fps
> AVX: 206fps
> AVX2: 221fps
>
> timer measurements, haar horizontal compose:
> sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C
> avx: 3.68x faster (45143 vs. 12275 decicycles) compared with C
> avx2: 5.16x faster (45143 vs. 8742 decicycles) compared with C
> haar vertical compose:
> sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C
> avx: 1.58x faster (31792 vs. 20090 decicycles) compared with C
> avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C
> ---
> libavcodec/dirac_dwt.c | 7 +-
> libavcodec/dirac_dwt.h | 1 +
> libavcodec/x86/Makefile | 6 +-
> libavcodec/x86/dirac_dwt_10bit.asm | 160 ++++++++++++++++++++++++++
> libavcodec/x86/dirac_dwt_init_10bit.c | 76 ++++++++++++
> 5 files changed, 247 insertions(+), 3 deletions(-)
> create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
> create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c
>
> diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
> index cc08f8865a..86bee5bb9b 100644
> --- a/libavcodec/dirac_dwt.c
> +++ b/libavcodec/dirac_dwt.c
> @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p,
> enum dwt_type type,
> return AVERROR_INVALIDDATA;
> }
>
> - if (ARCH_X86 && bit_depth == 8)
> +#if ARCH_X86
> + if (bit_depth == 8)
> ff_spatial_idwt_init_x86(d, type);
> + else if (bit_depth == 10)
> + ff_spatial_idwt_init_10bit_x86(d, type);
> +#endif
> +
> return 0;
> }
>
> diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
> index 994dc21d70..1ad7b9a821 100644
> --- a/libavcodec/dirac_dwt.h
> +++ b/libavcodec/dirac_dwt.h
> @@ -88,6 +88,7 @@ enum dwt_type {
> int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
> int decomposition_count, int bit_depth);
> void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
> +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
>
> void ff_spatial_idwt_slice2(DWTContext *d, int y);
>
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 2350c8bbee..590d83c167 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) +=
> x86/blockdsp_init.o
> OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
> OBJS-$(CONFIG_DCT) += x86/dct_init.o
> OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
> - x86/dirac_dwt_init.o
> + x86/dirac_dwt_init.o \
> + x86/dirac_dwt_init_10bit.o
> OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
> OBJS-$(CONFIG_FFT) += x86/fft_init.o
> OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
> @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
> X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
> X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
> X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
> - x86/dirac_dwt.o
> + x86/dirac_dwt.o \
> + x86/dirac_dwt_10bit.o
> X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
> X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
> X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
> diff --git a/libavcodec/x86/dirac_dwt_10bit.asm
> b/libavcodec/x86/dirac_dwt_10bit.asm
> new file mode 100644
> index 0000000000..baea91329e
> --- /dev/null
> +++ b/libavcodec/x86/dirac_dwt_10bit.asm
> @@ -0,0 +1,160 @@
> +;**********************************************************
> ********************
> +;* x86 optimized discrete 10-bit wavelet trasnform
> +;* Copyright (c) 2018 James Darnley
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> +;**********************************************************
> ********************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +cextern pd_1
> +
> +SECTION .text
> +
> +%macro HAAR_VERTICAL 0
>
+
>
See below.
+cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
> + DECLARE_REG_TMP 4,5
> +
> + mova m2, [pd_1]
> + mov r3d, wd
> + and wd, ~(mmsize/4 - 1)
> + shl wd, 2
> + add b0q, wq
> + add b1q, wq
> + neg wq
> +
> + ALIGN 16
> + .loop_simd:
> + mova m0, [b0q + wq]
> + mova m1, [b1q + wq]
> + paddd m3, m1, m2
> + psrad m3, 1
> + psubd m0, m3
> + paddd m1, m0
> + mova [b0q + wq], m0
> + mova [b1q + wq], m1
> + add wq, mmsize
> + jl .loop_simd
> +
> + and r3d, mmsize/4 - 1
> + jz .end
> + .loop_scalar:
> + mov t0d, [b0q]
> + mov t1d, [b1q]
> + mov r2d, t1d
> + add r2d, 1
> + sar r2d, 1
> + sub t0d, r2d
> + add t1d, t0d
> + mov [b0q], t0d
> + mov [b1q], t1d
> +
> + add b0q, 4
> + add b1q, 4
> + sub r3d, 1
> + jg .loop_scalar
> +
> + .end:
> +RET
> +
> +%endmacro
> +
> +%macro HAAR_HORIZONTAL 0
>
+
>
Could you remove this newline from every patch? All asm I've written and
seen keep them without a newline. It made me think there's something in the
asm which checked the value of the macro, not that the entire function is
macro'd.
+cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
> x, b2
> + DECLARE_REG_TMP 2,5
> + %if ARCH_X86_64
> + %define tail r6d
> + %else
> + %define tail dword wm
> + %endif
> +
> + mova m2, [pd_1]
> + xor xd, xd
> + shr wd, 1
> + mov tail, wd
> + lea b2q, [bq + 4*wq]
> +
> + ALIGN 16
> + .loop_lo:
> + mova m0, [bq + 4*xq]
> + movu m1, [b2q + 4*xq]
> + paddd m1, m2
> + psrad m1, 1
> + psubd m0, m1
> + mova [temp_q + 4*xq], m0
> + add xd, mmsize/4
> + cmp xd, wd
> + jl .loop_lo
> +
> + xor xd, xd
> + and wd, ~(mmsize/4 - 1)
> +
> + ALIGN 16
> + .loop_hi:
> + mova m0, [temp_q + 4*xq]
> + movu m1, [b2q + 4*xq]
> + paddd m1, m0
> + paddd m0, m2
> + paddd m1, m2
> + psrad m0, 1
> + psrad m1, 1
> + SBUTTERFLY dq, 0,1,3
> + %if cpuflag(avx2)
> + SBUTTERFLY dqqq, 0,1,3
> + %endif
> + mova [bq + 8*xq], m0
> + mova [bq + 8*xq + mmsize], m1
> + add xd, mmsize/4
> + cmp xd, wd
> + jl .loop_hi
> +
> + and tail, mmsize/4 - 1
> + jz .end
> + .loop_scalar:
> + mov t0d, [temp_q + 4*xq]
> + mov t1d, [b2q + 4*xq]
> + add t1d, t0d
> + add t0d, 1
> + add t1d, 1
> + sar t0d, 1
> + sar t1d, 1
> + mov [bq + 8*xq], t0d
> + mov [bq + 8*xq + 4], t1d
> + add xq, 1
> + sub tail, 1
> + jg .loop_scalar
> +
> + .end:
> +REP_RET
> +
> +%endmacro
> +
> +INIT_XMM sse2
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
> +
> +INIT_XMM avx
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
>
You're not using any avx functions in that version, not unless a macro'd
instruction inserts one for you. I think you should remove the avx version
then.
Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
version you can just make a single macro to do both versions at the same
time.
+
> +INIT_YMM avx2
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
> b/libavcodec/x86/dirac_dwt_init_10bit.c
> new file mode 100644
> index 0000000000..289862d728
> --- /dev/null
> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> @@ -0,0 +1,76 @@
> +/*
> + * x86 optimized discrete wavelet transform
> + * Copyright (c) 2018 James Darnley
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/dirac_dwt.h"
> +
> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> width_align);
> +
> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> width_align);
> +
> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type
> type)
> +{
> +#if HAVE_X86ASM
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (EXTERNAL_SSE2(cpu_flags)) {
> + switch (type) {
> + case DWT_DIRAC_HAAR0:
> + d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_sse2;
> + break;
> + case DWT_DIRAC_HAAR1:
> + d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_sse2;
> + d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_sse2;
> + break;
> + }
> + }
> +
> + if (EXTERNAL_AVX(cpu_flags)) {
> + switch (type) {
> + case DWT_DIRAC_HAAR0:
> + d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx;
> + break;
> + case DWT_DIRAC_HAAR1:
> + d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_avx;
> + d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx;
> + break;
> + }
>
We keep switches and cases on the same line.
+ }
> +
> + if (EXTERNAL_AVX2(cpu_flags)) {
> + switch (type) {
> + case DWT_DIRAC_HAAR0:
> + d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx2;
> + break;
> + case DWT_DIRAC_HAAR1:
> + d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_avx2;
> + d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx2;
> + break;
> + }
Same.
More information about the ffmpeg-devel
mailing list