[FFmpeg-devel] [PATCH 3/6] diracdec: add 10-bit Deslauriers-Dubuc 9, 7 (9_7) vertical high-pass function
James Darnley
jdarnley at obe.tv
Thu Jul 19 17:52:49 EEST 2018
Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C: 84fps
SSE2: 111fps
AVX2: 115fps
---
libavcodec/x86/dirac_dwt_10bit.asm | 38 +++++++++++++++++++++++++++
libavcodec/x86/dirac_dwt_init_10bit.c | 16 +++++++++++
2 files changed, 54 insertions(+)
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
index c00de32bfe..681de5e1df 100644
--- a/libavcodec/x86/dirac_dwt_10bit.asm
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -25,6 +25,7 @@ SECTION_RODATA
cextern pd_1
pd_2: times 4 dd 2
+pd_8: times 4 dd 8
SECTION .text
@@ -153,7 +154,44 @@ RET
%endmacro
+%macro DD97_VERTICAL_HI 0
+
+cglobal dd97_vertical_hi, 6, 6, 8, b0, b1, b2, b3, b4, w
+ mova m7, [pd_8]
+ shl wd, 2
+ add b0q, wq
+ add b1q, wq
+ add b2q, wq
+ add b3q, wq
+ add b4q, wq
+ neg wq
+
+ ALIGN 16
+ .loop:
+ mova m0, [b0q + wq]
+ mova m1, [b1q + wq]
+ mova m2, [b2q + wq]
+ mova m3, [b3q + wq]
+ mova m4, [b4q + wq]
+ pslld m5, m1, 3
+ pslld m6, m3, 3
+ paddd m5, m1
+ paddd m6, m3
+ psubd m5, m0
+ psubd m6, m4
+ paddd m5, m7
+ paddd m5, m6
+ psrad m5, 4
+ paddd m2, m5
+ mova [b2q + wq], m2
+ add wq, mmsize
+ jl .loop
+RET
+
+%endmacro
+
INIT_XMM sse2
+DD97_VERTICAL_HI
HAAR_HORIZONTAL
HAAR_VERTICAL
LEGALL53_VERTICAL_HI
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
index 88cf267d14..e7e7534050 100644
--- a/libavcodec/x86/dirac_dwt_init_10bit.c
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -23,6 +23,8 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dirac_dwt.h"
+void ff_dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int32_t *b3, int32_t *b4, int width);
+
void ff_legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
void ff_legall53_vertical_lo_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int width);
@@ -110,6 +112,16 @@ static void legall53_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2, int
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]);
}
+static void dd97_vertical_hi_sse2(int32_t *b0, int32_t *b1, int32_t *b2,
+ int32_t *b3, int32_t *b4, int width)
+{
+ int i = width & ~3;
+ ff_dd97_vertical_hi_sse2(b0, b1, b2, b3, b4, i);
+ for(; i<width; i++)
+ b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]);
+
+}
+
av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
@@ -117,6 +129,10 @@ av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
if (EXTERNAL_SSE2(cpu_flags)) {
switch (type) {
+ case DWT_DIRAC_DD9_7:
+ d->vertical_compose_h0 = (void*)dd97_vertical_hi_sse2;
+ d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
+ break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_h0 = (void*)legall53_vertical_hi_sse2;
d->vertical_compose_l0 = (void*)legall53_vertical_lo_sse2;
--
2.17.1
More information about the ffmpeg-devel
mailing list