[FFmpeg-devel] [PATCH 1/6] diracdec: add 10-bit Haar SIMD functions
James Darnley
jdarnley at obe.tv
Thu Jul 19 17:52:47 EEST 2018
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C: 119fps
SSE2: 204fps
AVX: 206fps
AVX2: 221fps
---
libavcodec/dirac_dwt.c | 7 +-
libavcodec/dirac_dwt.h | 1 +
libavcodec/x86/Makefile | 6 +-
libavcodec/x86/dirac_dwt_10bit.asm | 113 +++++++++++++++++++++
libavcodec/x86/dirac_dwt_init_10bit.c | 136 ++++++++++++++++++++++++++
5 files changed, 260 insertions(+), 3 deletions(-)
create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c
diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
index cc08f8865a..86bee5bb9b 100644
--- a/libavcodec/dirac_dwt.c
+++ b/libavcodec/dirac_dwt.c
@@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
return AVERROR_INVALIDDATA;
}
- if (ARCH_X86 && bit_depth == 8)
+#if ARCH_X86
+ if (bit_depth == 8)
ff_spatial_idwt_init_x86(d, type);
+ else if (bit_depth == 10)
+ ff_spatial_idwt_init_10bit_x86(d, type);
+#endif
+
return 0;
}
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
index 994dc21d70..1ad7b9a821 100644
--- a/libavcodec/dirac_dwt.h
+++ b/libavcodec/dirac_dwt.h
@@ -88,6 +88,7 @@ enum dwt_type {
int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
int decomposition_count, int bit_depth);
void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
+void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
void ff_spatial_idwt_slice2(DWTContext *d, int y);
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 2350c8bbee..590d83c167 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
- x86/dirac_dwt_init.o
+ x86/dirac_dwt_init.o \
+ x86/dirac_dwt_init_10bit.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o
@@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o
X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
- x86/dirac_dwt.o
+ x86/dirac_dwt.o \
+ x86/dirac_dwt_10bit.o
X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o
X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
new file mode 100644
index 0000000000..dc3830615e
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -0,0 +1,113 @@
+;******************************************************************************
+;* x86 optimized discrete 10-bit wavelet transform
+;* Copyright (c) 2018 James Darnley
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pd_1
+
+SECTION .text
+
+%macro HAAR_VERTICAL 0
+
+cglobal vertical_compose_haar_10bit, 3, 3, 4, b0, b1, w
+ mova m2, [pd_1]
+ shl wd, 2
+ add b0q, wq
+ add b1q, wq
+ neg wq
+
+ ALIGN 16
+ .loop:
+ mova m0, [b0q + wq]
+ mova m1, [b1q + wq]
+ paddd m3, m1, m2
+ psrad m3, 1
+ psubd m0, m3
+ paddd m1, m0
+ mova [b0q + wq], m0
+ mova [b1q + wq], m1
+ add wq, mmsize
+ jl .loop
+RET
+
+%endmacro
+
+%macro HAAR_HORIZONTAL 0
+
+cglobal horizontal_compose_haar_10bit, 3, 6, 4, b, temp_, w, x, b2
+ mova m2, [pd_1]
+ xor xd, xd
+ shr wd, 1
+ lea b2q, [bq + 4*wq]
+
+ ALIGN 16
+ .loop_lo:
+ mova m0, [bq + 4*xq]
+ movu m1, [b2q + 4*xq]
+ paddd m1, m2
+ psrad m1, 1
+ psubd m0, m1
+ mova [temp_q + 4*xq], m0
+ add xd, mmsize/4
+ cmp xd, wd
+ jl .loop_lo
+
+ xor xd, xd
+ and wd, ~(mmsize/4 - 1)
+ cmp wd, mmsize/4
+ jl .end
+
+ ALIGN 16
+ .loop_hi:
+ mova m0, [temp_q + 4*xq]
+ movu m1, [b2q + 4*xq]
+ paddd m1, m0
+ paddd m0, m2
+ paddd m1, m2
+ psrad m0, 1
+ psrad m1, 1
+ SBUTTERFLY dq, 0,1,3
+ %if cpuflag(avx2)
+ SBUTTERFLY dqqq, 0,1,3
+ %endif
+ mova [bq + 8*xq], m0
+ mova [bq + 8*xq + mmsize], m1
+ add xd, mmsize/4
+ cmp xd, wd
+ jl .loop_hi
+ .end:
+REP_RET
+
+%endmacro
+
+INIT_XMM sse2
+HAAR_HORIZONTAL
+HAAR_VERTICAL
+
+INIT_XMM avx
+HAAR_HORIZONTAL
+HAAR_VERTICAL
+
+INIT_YMM avx2
+HAAR_HORIZONTAL
+HAAR_VERTICAL
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
new file mode 100644
index 0000000000..939950e3ff
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -0,0 +1,136 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2018 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
+void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
+void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
+
+void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
+void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
+void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
+
+static void vertical_compose_haar_sse2(int32_t *b0, int32_t *b1, int width)
+{
+ int i, width_align = width & ~3;
+ ff_vertical_compose_haar_10bit_sse2(b0, b1, width_align);
+ for(i=width_align; i<width; i++) {
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]);
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]);
+ }
+}
+
+static void vertical_compose_haar_avx(int32_t *b0, int32_t *b1, int width)
+{
+ int i, width_align = width & ~3;
+ ff_vertical_compose_haar_10bit_avx(b0, b1, width_align);
+ for(i=width_align; i<width; i++) {
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]);
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]);
+ }
+}
+
+static void vertical_compose_haar_avx2(int32_t *b0, int32_t *b1, int width)
+{
+ int i, width_align = width & ~7;
+ ff_vertical_compose_haar_10bit_avx2(b0, b1, width_align);
+ for(i=width_align; i<width; i++) {
+ b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]);
+ b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]);
+ }
+}
+
+static void horizontal_compose_haar_sse2(int32_t *b, int32_t *tmp, int width)
+{
+ int i = width/2 & ~3;
+ ff_horizontal_compose_haar_10bit_sse2(b, tmp, width);
+ for (; i < width/2; i++) {
+ b[2*i ] = (tmp[i] + 1) >> 1;
+ b[2*i+1] = (COMPOSE_HAARiH0(b[i + width/2], tmp[i]) + 1) >> 1;
+ }
+}
+
+static void horizontal_compose_haar_avx(int32_t *b, int32_t *tmp, int width)
+{
+ int i = width/2 & ~3;
+ ff_horizontal_compose_haar_10bit_avx(b, tmp, width);
+ for (; i < width/2; i++) {
+ b[2*i ] = (tmp[i] + 1) >> 1;
+ b[2*i+1] = (COMPOSE_HAARiH0(b[i + width/2], tmp[i]) + 1) >> 1;
+ }
+}
+
+static void horizontal_compose_haar_avx2(int32_t *b, int32_t *tmp, int width)
+{
+ int i = width/2 & ~7;
+ ff_horizontal_compose_haar_10bit_avx2(b, tmp, width);
+ for (; i < width/2; i++) {
+ b[2*i ] = (tmp[i] + 1) >> 1;
+ b[2*i+1] = (COMPOSE_HAARiH0(b[i + width/2], tmp[i]) + 1) >> 1;
+ }
+}
+
+av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_X86ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ switch (type) {
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->horizontal_compose = (void*)horizontal_compose_haar_sse2;
+ d->vertical_compose = (void*)vertical_compose_haar_sse2;
+ break;
+ }
+ }
+
+ if (EXTERNAL_AVX(cpu_flags)) {
+ switch (type) {
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_avx;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->horizontal_compose = (void*)horizontal_compose_haar_avx;
+ d->vertical_compose = (void*)vertical_compose_haar_avx;
+ break;
+ }
+ }
+
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ switch (type) {
+ case DWT_DIRAC_HAAR0:
+ d->vertical_compose = (void*)vertical_compose_haar_avx2;
+ break;
+ case DWT_DIRAC_HAAR1:
+ d->horizontal_compose = (void*)horizontal_compose_haar_avx2;
+ d->vertical_compose = (void*)vertical_compose_haar_avx2;
+ break;
+ }
+ }
+
+#endif // HAVE_X86ASM
+}
--
2.17.1
More information about the ffmpeg-devel
mailing list