[FFmpeg-devel] [PATCH][RFC] JPEG2000: SSE optimisation for DWT decoding
maxime taisant
maximetaisant at hotmail.fr
Thu Jul 20 19:07:42 EEST 2017
From: Maxime Taisant <maximetaisant at hotmail.fr>
Hi,
I am currently working on SSE optimisations for the dwt functions used to decode JPEG2000.
For the moment, I have only managed to produce an SSE-optimized version of the sr_1d97_float function (with relatively good results).
I would like to have some comments on my work so far, to know whether I am on the right track or whether there are some parts that I need to improve or modify.
Thank you.
---
libavcodec/jpeg2000dwt.c | 5 +-
libavcodec/jpeg2000dwt.h | 2 +
libavcodec/x86/jpeg2000dsp.asm | 268 ++++++++++++++++++++++++++++++++++++++
libavcodec/x86/jpeg2000dsp_init.c | 3 +
4 files changed, 277 insertions(+), 1 deletion(-)
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 55dd5e89b5..b2a952aa29 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -425,7 +425,10 @@ static void dwt_decode97_float(DWTContext *s, float *t)
for (i = 1 - mh; i < lh; i += 2, j++)
l[i] = data[w * lp + j];
- sr_1d97_float(line, mh, mh + lh);
+ if (ARCH_X86)
+ ff_sr_1d97_float_sse(line, mh, mh + lh);
+ else
+ sr_1d97_float(line, mh, mh + lh);
for (i = 0; i < lh; i++)
data[w * lp + i] = l[i];
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..59dec14478 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -65,4 +65,6 @@ int ff_dwt_decode(DWTContext *s, void *t);
void ff_dwt_destroy(DWTContext *s);
+void ff_sr_1d97_float_sse(float *p, int i0, int i1);
+
#endif /* AVCODEC_JPEG2000DWT_H */
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
index 56b5fbd606..dabfb914b8 100644
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -29,6 +29,16 @@ pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772
+F_LFTG_K: dd 1.230174104914001
+F_LFTG_X: dd 0.812893066115961
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924
+F_LFTG_BETA: times 8 dd 0.052980118572961
+F_LFTG_GAMMA: times 8 dd 0.882911075530934
+F_LFTG_DELTA: times 8 dd 0.443506852043971
+
+TWO: dd 2.0
+
SECTION .text
;***********************************************************************
@@ -142,3 +152,261 @@ RCT_INT
INIT_YMM avx2
RCT_INT
%endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *p, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 0                                      ; emits sr_1d97_float for the instruction set selected by the preceding INIT_* line
+cglobal sr_1d97_float, 3, 5, 10, p, i0, i1, tmp0, tmp1    ; NOTE(review): 10 vector registers requested, only m0-m5 are referenced below
+ mov tmp0d, i0d              ; i0 and i1 are C ints: test them through the 32-bit register views,
+ mov tmp1d, i1d              ; the upper halves of the 64-bit argument registers are not guaranteed
+ add tmp0d, 1                ; tmp0 = i0 + 1
+ cmp tmp1d, tmp0d
+ jg .extend                  ; i1 > i0 + 1: run the full lifting path
+ sub tmp0d, 2                ; tmp0 = i0 - 1, ZF set when i0 == 1
+ jnz .else
+ movss m0, [pq+4]            ; i0 == 1: p[1] *= F_LFTG_K / 2
+ movss m1, [F_LFTG_K]
+ movss m2, [TWO]
+ divss m1, m2
+ mulss m0, m1
+ movss [pq+4], m0
+ jmp .end
+
+.else:
+ movss m0, [pq]              ; i0 != 1: p[0] *= F_LFTG_X
+ movss m1, [F_LFTG_X]
+ mulss m0, m1
+ movss [pq], m0
+ jmp .end
+
+.extend:
+ shl i0d, 2                  ; i0 and i1 become byte offsets (4 bytes per float); the 32-bit shift
+ shl i1d, 2                  ; also zero-extends them, so the 64-bit uses below are well defined
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ movups m0, [pq+tmp0q+4]     ; load p[i0+1 .. i0+4]; NOTE(review): all four are read before any mirrored store - confirm for short lines
+ shufps m0, m0, 0x1B         ; reverse the four lanes
+ movups [pq+tmp0q-16], m0    ; store to p[i0-4 .. i0-1], i.e. p[i0-k] = p[i0+k] for k = 1..4
+ movups m0, [pq+tmp1q-20]    ; load p[i1-5 .. i1-2]
+ shufps m0, m0, 0x1B         ; reverse the four lanes
+ movups [pq+tmp1q], m0       ; store to p[i1 .. i1+3], i.e. p[i1+k-1] = p[i1-k-1] for k = 1..4
+
+ movups m3, [F_LFTG_DELTA]   ; pass 1: x -= DELTA * (left neighbour + right neighbour)
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1                ; index registers hold half the byte offset (addresses use pq+2*tmp0q)
+ sub tmp0q, 4                ; first updated sample is p[i0-2]; NOTE(review): odd index when i0 is odd - check against the scalar code
+ shr tmp1q, 1
+ add tmp1q, 8                ; exclusive end: tmp1 = 2*i1 + 8; NOTE(review): verify the bound for odd i1
+ cmp tmp0q, tmp1q
+ jge .beginloop2             ; empty range
+.loop1:
+ add tmp0q, 12               ; last of the next four indices (step 4 each)
+ cmp tmp0q, tmp1q
+ jge .endloop1               ; fewer than four samples left: finish with the scalar loop
+
+ movups m0, [pq+2*tmp0q-28]  ; eight consecutive floats starting one sample before the first updated one
+ movups m4, [pq+2*tmp0q-12]
+ movups m1, m0
+ shufps m0, m4, 0xDD         ; m0 = elements 1,3,5,7: the samples to update
+ shufps m1, m4, 0x88         ; m1 = elements 0,2,4,6: their left neighbours
+ movups m2, [pq+2*tmp0q-24]
+ movups m5, [pq+2*tmp0q-8]
+ shufps m2, m5, 0xDD         ; m2 = elements 2,4,6,8: their right neighbours
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movups m4, m1
+ shufps m1, m0, 0x44         ; re-interleave untouched and updated samples, low half
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE         ; same for the high half
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-28], m1
+ movups [pq+2*tmp0q-12], m4
+
+ add tmp0q, 4                ; advance past the four samples just processed
+ cmp tmp0q, tmp1q
+ jge .beginloop2
+ jmp .loop1
+
+.endloop1:
+ sub tmp0q, 12               ; undo the look-ahead
+.littleloop1:
+ movss m0, [pq+2*tmp0q]      ; scalar tail, one sample per iteration
+ movss m1, [pq+2*tmp0q-4]
+ movss m2, [pq+2*tmp0q+4]
+ addss m1, m2
+ mulss m1, m3                ; low lane of m3 holds DELTA
+ subss m0, m1
+ movss [pq+2*tmp0q], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop1
+
+.beginloop2:
+ movups m3, [F_LFTG_GAMMA]   ; pass 2: x -= GAMMA * (left + right), on the samples one position after those of pass 1
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1
+ sub tmp0q, 4                ; first updated sample is p[i0-1]
+ shr tmp1q, 1
+ add tmp1q, 4                ; exclusive end: tmp1 = 2*i1 + 4
+ cmp tmp0q, tmp1q
+ jge .beginloop3
+.loop2:
+ add tmp0q, 12
+ cmp tmp0q, tmp1q
+ jge .endloop2
+
+ movups m0, [pq+2*tmp0q-24]  ; same de-interleave / update / re-interleave scheme as pass 1, shifted by one float
+ movups m4, [pq+2*tmp0q-8]
+ movups m1, m0
+ shufps m0, m4, 0xDD         ; samples to update
+ shufps m1, m4, 0x88         ; left neighbours
+ movups m2, [pq+2*tmp0q-20]
+ movups m5, [pq+2*tmp0q-4]
+ shufps m2, m5, 0xDD         ; right neighbours
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movups m4, m1
+ shufps m1, m0, 0x44
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-24], m1
+ movups [pq+2*tmp0q-8], m4
+
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jge .beginloop3
+ jmp .loop2
+
+.endloop2:
+ sub tmp0q, 12               ; undo the look-ahead
+.littleloop2:
+ movss m0, [pq+2*tmp0q+4]    ; scalar tail of pass 2
+ movss m1, [pq+2*tmp0q]
+ movss m2, [pq+2*tmp0q+8]
+ addss m1, m2
+ mulss m1, m3
+ subss m0, m1
+ movss [pq+2*tmp0q+4], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop2
+
+.beginloop3:
+ movups m3, [F_LFTG_BETA]    ; pass 3: x += BETA * (left + right), same sample positions as pass 1
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1
+ sub tmp0q, 4                ; NOTE(review): same start and end as pass 1 - confirm the scalar code uses the same range here
+ shr tmp1q, 1
+ add tmp1q, 8
+ cmp tmp0q, tmp1q
+ jge .beginloop4
+.loop3:
+ add tmp0q, 12
+ cmp tmp0q, tmp1q
+ jge .endloop3
+
+ movups m0, [pq+2*tmp0q-28]
+ movups m4, [pq+2*tmp0q-12]
+ movups m1, m0
+ shufps m0, m4, 0xDD         ; samples to update
+ shufps m1, m4, 0x88         ; left neighbours
+ movups m2, [pq+2*tmp0q-24]
+ movups m5, [pq+2*tmp0q-8]
+ shufps m2, m5, 0xDD         ; right neighbours
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2                ; this pass adds instead of subtracting
+ movups m4, m1
+ shufps m1, m0, 0x44
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-28], m1
+ movups [pq+2*tmp0q-12], m4
+
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jge .beginloop4
+ jmp .loop3
+
+.endloop3:
+ sub tmp0q, 12               ; undo the look-ahead
+.littleloop3:
+ movss m0, [pq+2*tmp0q]      ; scalar tail of pass 3
+ movss m1, [pq+2*tmp0q-4]
+ movss m2, [pq+2*tmp0q+4]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [pq+2*tmp0q], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop3
+
+.beginloop4:
+ movups m3, [F_LFTG_ALPHA]   ; pass 4: x += ALPHA * (left + right), same sample positions as pass 2
+ mov tmp0q, i0q
+ mov tmp1q, i1q
+ shr tmp0q, 1
+ sub tmp0q, 4                ; NOTE(review): same start and end as pass 2 - confirm the scalar code uses the same range here
+ shr tmp1q, 1
+ add tmp1q, 4
+ cmp tmp0q, tmp1q
+ jge .end
+.loop4:
+ add tmp0q, 12
+ cmp tmp0q, tmp1q
+ jge .endloop4
+
+ movups m0, [pq+2*tmp0q-24]
+ movups m4, [pq+2*tmp0q-8]
+ movups m1, m0
+ shufps m0, m4, 0xDD         ; samples to update
+ shufps m1, m4, 0x88         ; left neighbours
+ movups m2, [pq+2*tmp0q-20]
+ movups m5, [pq+2*tmp0q-4]
+ shufps m2, m5, 0xDD         ; right neighbours
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2
+ movups m4, m1
+ shufps m1, m0, 0x44
+ shufps m1, m1, 0xD8
+ shufps m4, m0, 0xEE
+ shufps m4, m4, 0xD8
+ movups [pq+2*tmp0q-24], m1
+ movups [pq+2*tmp0q-8], m4
+
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jge .end
+ jmp .loop4
+
+.endloop4:
+ sub tmp0q, 12               ; undo the look-ahead
+.littleloop4:
+ movss m0, [pq+2*tmp0q+4]    ; scalar tail of pass 4
+ movss m1, [pq+2*tmp0q]
+ movss m2, [pq+2*tmp0q+8]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [pq+2*tmp0q+4], m0
+ add tmp0q, 4
+ cmp tmp0q, tmp1q
+ jl .littleloop4
+
+.end:
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+SR1D97FLOAT
+
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
index baa81383ea..3d3735c43a 100644
--- a/libavcodec/x86/jpeg2000dsp_init.c
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -23,12 +23,15 @@
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+void ff_sr_1d97_float_sse(float *p, int i0, int i1);
+
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
--
2.11.0
More information about the ffmpeg-devel
mailing list