[FFmpeg-cvslog] x86/dcadec: add ff_lfe_fir1_float_{sse3,avx}
James Almer
git at videolan.org
Tue Feb 23 01:22:43 CET 2016
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Mon Feb 22 19:59:07 2016 -0300| [45d3af90593a8725ea72059fa3572577b30110c8] | committer: James Almer
x86/dcadec: add ff_lfe_fir1_float_{sse3,avx}
Reviewed-by: Christophe Gisquet <christophe.gisquet at gmail.com>
Signed-off-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=45d3af90593a8725ea72059fa3572577b30110c8
---
libavcodec/x86/dcadsp.asm | 79 ++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/dcadsp_init.c | 9 ++++-
2 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fb13957..c5bf21a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -201,3 +201,82 @@ LFE_FIR0_FLOAT
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif
+
+%macro LFE_FIR1_FLOAT 0 ; emits lfe_fir1_float for the instruction set chosen by the preceding INIT_XMM
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
+ shr nblocksd, 2 ; nblocks >>= 2: this is the outer-loop iteration count
+ sub lfeq, 3*sizeof_float ; step lfe back 3 elements so the 4-element load at [lfeq] ends on the caller's element (assumes sizeof_float == 4 - TODO confirm)
+ mov cnt1d, 64*sizeof_float ; cnt1 = byte size of 64 floats
+ mov cnt2d, 64*sizeof_float-16 ; cnt2 = byte offset of the last 16-byte vector inside a 64-float span
+ lea coeffq, [coeffq+cnt1q*4] ; coeff += 4*cnt1 bytes, so [coeffq+cnt1q*4] with negative cnt1 walks forward from the original coeff
+ add samplesq, cnt1q ; samples += cnt1 bytes: negative cnt1 indexes the span before this point, cnt2 the span after it
+ neg cnt1q ; cnt1 becomes negative and counts up toward zero in the inner loop
+
+.loop: ; outer loop: lfe advances by one element per pass
+%if cpuflag(avx)
+ cvtdq2ps m4, [lfeq] ; m4 = 4 int32 values at lfe converted to float, straight from memory
+ shufps m5, m4, m4, q0123 ; m5 = m4 with its four elements in reversed order
+%elif cpuflag(sse2)
+ movu m4, [lfeq] ; separate unaligned load: lfeq moves by sizeof_float per pass, so it is not 16-byte aligned in general
+ cvtdq2ps m4, m4 ; int32 -> float
+ pshufd m5, m4, q0123 ; m5 = m4 with its four elements in reversed order
+%endif
+
+.inner_loop: ; each pass consumes 64 bytes of coeff and writes one vector to each half of the output
+ movaps m6, [coeffq+cnt1q*4 ] ; aligned loads: coeff must be 16-byte aligned
+ movaps m7, [coeffq+cnt1q*4+16]
+ mulps m0, m5, m6 ; 3-operand form (presumably expanded by x86inc on non-AVX builds - TODO confirm)
+ mulps m1, m5, m7
+%if ARCH_X86_64
+ movaps m8, [coeffq+cnt1q*4+32] ; m6-m9 keep the coefficients so they can be reused with m4 below
+ movaps m9, [coeffq+cnt1q*4+48]
+ mulps m2, m5, m8
+ mulps m3, m5, m9
+%else
+ mulps m2, m5, [coeffq+cnt1q*4+32] ; no m8/m9 on this path: multiply from memory instead of keeping the coefficients in registers
+ mulps m3, m5, [coeffq+cnt1q*4+48]
+%endif
+
+ haddps m0, m1 ; pairwise sums of m0 and m1
+ haddps m2, m3 ; pairwise sums of m2 and m3
+ haddps m0, m2 ; m0 = four dot products: m5 against each of the four coefficient vectors
+ movaps [samplesq+cnt1q], m0 ; aligned store into the first span, ascending as cnt1 rises
+
+ mulps m6, m4 ; same coefficients, now against the unreversed lfe vector
+ mulps m7, m4
+%if ARCH_X86_64
+ mulps m8, m4
+ mulps m9, m4
+
+ haddps m6, m7
+ haddps m8, m9
+ haddps m6, m8 ; m6 = four dot products of m4 against the coefficient vectors
+%else
+ mulps m2, m4, [coeffq+cnt1q*4+32] ; coefficients re-read from memory since they were not kept in registers
+ mulps m3, m4, [coeffq+cnt1q*4+48]
+
+ haddps m6, m7
+ haddps m2, m3
+ haddps m6, m2 ; m6 = four dot products of m4 against the coefficient vectors
+%endif
+ movaps [samplesq+cnt2q], m6 ; aligned store into the second span, descending as cnt2 falls
+
+ sub cnt2d, 16 ; second-span offset moves back one vector
+ add cnt1q, 16 ; first-span offset moves forward one vector; sets the flags tested by jl
+ jl .inner_loop ; repeat while cnt1 is still negative
+
+ add lfeq, sizeof_float ; slide the 4-element lfe window forward by one element
+ add samplesq, 128*sizeof_float ; advance output by 128 floats (the two 64-float spans just written)
+ mov cnt1q, -64*sizeof_float ; rewind both counters for the next pass
+ mov cnt2d, 64*sizeof_float-16
+ sub nblocksd, 1
+ jg .loop ; bottom-tested loop: the body runs once even when nblocks>>2 is 0
+ RET
+%endmacro
+
+INIT_XMM sse3
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+LFE_FIR1_FLOAT
+%endif
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index bfe13e5..fc10fb8 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,10 +23,13 @@
#define LFE_FIR_FLOAT_FUNC(opt) \
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
+ const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
const float *filter_coeff, ptrdiff_t npcmblocks);
LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)
@@ -38,8 +41,12 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
- if (EXTERNAL_AVX(cpu_flags))
+ if (EXTERNAL_SSE3(cpu_flags))
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+ if (EXTERNAL_AVX(cpu_flags)) {
s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+ s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
+ }
if (EXTERNAL_FMA3(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}
More information about the ffmpeg-cvslog
mailing list