[FFmpeg-devel] [PATCH 04/11] x86: dcadsp: implement SSE lfe_dir

Michael Niedermayer michaelni at gmx.at
Tue Feb 11 03:01:15 CET 2014


On Fri, Feb 07, 2014 at 10:35:22PM +0100, Christophe Gisquet wrote:
> Hi,
> 
> 2014-02-07 Loren Merritt <lorenm at u.washington.edu>:
> > On Thu, 6 Feb 2014, Christophe Gisquet wrote:
> >
> >> diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
> >> index 03593ce..4a682be 100644
> >> --- a/libavcodec/x86/dcadsp.asm
> >> +++ b/libavcodec/x86/dcadsp.asm
> >> @@ -88,3 +88,108 @@ INT8X8_FMUL_INT32  3
> >>
> >>  INIT_XMM sse4
> >>  INT8X8_FMUL_INT32  3
> >> +
> >> +; %1=v0/v1  %2=in1  %3=in2
> >> +%macro FIR_LOOP 2-3
> >> +.loop%1:
> >> +%define va          m1
> >> +%define vb          m2
> >> +%if %1
> >> +%define OFFSET      0
> >> +%else
> >> +%define OFFSET      NUM_COEF*count
> >> +%endif
> >> +; for v0, incrementint and for v1, decrementing
> >> +    mova        va, [cf0q + OFFSET]
> >> +    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
> >> +%if %0 == 3
> >> +    mova        m4, [cf0q + OFFSET + mmsize]
> >> +    mova     SCALE, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
> >> +%endif
> >> +    mulps       va, %2
> >> +    mulps       vb, %2
> >> +%if %0 == 3
> >> +    mulps       m4, %3
> >> +    mulps    SCALE, %3
> >> +    addps       va, m4
> >> +    addps       vb, SCALE
> >> +%endif
> >> +    ; va = va1 va2 va3 va4
> >> +    ; vb = vb1 vb2 vb3 vb4
> >> +%if %1
> >> +%define   O1    vb
> >> +%define   O2    va
> >> +%else
> >> +%define   O1    va
> >> +%define   O2    vb
> >> +%endif
> >
> > Can this be simplified with
> > %if %1
> > SWAP va, vb
> > %endif
> > and no O1, O2 variables?
> >
> >> +    mova        m4, O1
> >> +    unpcklps    O1, O2 ; va3 vb3 va4 vb4
> >> +    unpckhps    m4, O2 ; va1 vb1 va2 vb2
> >> +    addps       m4, O1 ; va1+3 vb1+3 va2+4 vb2+4
> >> +    movhlps     O2, m4 ; va1+3  vb1+3
> >> +    addps       O2, m4 ; va0..4 vb0..4
> >> +%if %1
> >> +    movh    [outq + count], O2
> >> +    sub       cf0q, 8*NUM_COEF
> >> +%else
> >> +    movh    [outq + count], O2
> >
> > factor out of the %if
> 
> All was ok, so here's a new patch.
> 
> -- 
> Christophe

>  dcadsp.asm    |   99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  dcadsp_init.c |    6 +++
>  2 files changed, 105 insertions(+)
> e15392342e6a1a9c3b150d5dbc5b4054a16e1c91  0004-x86-dcadsp-implement-SSE-lfe_dir.patch
> From b60bf426995afef4e5673412a50a994f6d581b18 Mon Sep 17 00:00:00 2001
> From: Christophe Gisquet <christophe.gisquet at gmail.com>
> Date: Wed, 19 Dec 2012 20:26:05 +0100
> Subject: [PATCH 04/10] x86: dcadsp: implement SSE lfe_dir

I think you can merge the scale factor into the lfe_fir_* tables,
avoiding some instructions.

Also, the coeff table looks constant, so you can reorder it any
way at no cost,
and the whole code looks like a 4-input 128-output or
8-input 64-output matrix multiplication with a constant matrix.

Not sure what's the fastest way to implement this, but
you could form all 4 needed permutations of the input and then do a
simpler 4x(mova, mulps, addps) inner loop.

I may have missed a detail here or there, but I suspect this can
be done more efficiently than how it's implemented (with differently
ordered coeff tables).

> 
> Results for Arrandale/Windows:
> 32: 1670 -> 316
> 64:  728 -> 298
> ---
>  libavcodec/x86/dcadsp.asm    | 99 ++++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/dcadsp_init.c |  6 +++
>  2 files changed, 105 insertions(+)
> 
> diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
> index 214f514..731854e 100644
> --- a/libavcodec/x86/dcadsp.asm
> +++ b/libavcodec/x86/dcadsp.asm
> @@ -88,3 +88,102 @@ INT8X8_FMUL_INT32
>  
>  INIT_XMM sse4
>  INT8X8_FMUL_INT32
> +
> +; %1=v0/v1  %2=in1  %3=in2
> +%macro FIR_LOOP 2-3
> +.loop%1:
> +%define va          m1
> +%define vb          m2
> +%if %1
> +%define OFFSET      0
> +%else
> +%define OFFSET      NUM_COEF*count
> +%endif
> +; for v0, incrementint and for v1, decrementing
> +    mova        va, [cf0q + OFFSET]
> +    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
> +%if %0 == 3
> +    mova        m4, [cf0q + OFFSET + mmsize]
> +    mova     SCALE, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
> +%endif
> +    mulps       va, %2
> +    mulps       vb, %2
> +%if %0 == 3
> +    mulps       m4, %3
> +    mulps    SCALE, %3
> +    addps       va, m4
> +    addps       vb, SCALE
> +%endif
> +    ; va = va1 va2 va3 va4
> +    ; vb = vb1 vb2 vb3 vb4
> +%if %1
> +    SWAP        va, vb
> +%endif
> +    mova        m4, va
> +    unpcklps    va, vb ; va3 vb3 va4 vb4
> +    unpckhps    m4, vb ; va1 vb1 va2 vb2
> +    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
> +    movhlps     vb, m4 ; va1+3  vb1+3
> +    addps       vb, m4 ; va0..4 vb0..4
> +    movh    [outq + count], vb
> +%if %1
> +    sub       cf0q, 8*NUM_COEF
> +%endif
> +    add      count, 8
> +    jl   .loop%1
> +%endmacro
> +
> +; dca_lfe_fir(float *out, float *in, float *coefs, float scale)
> +%macro DCA_LFE_FIR 1
> +cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0, scale
> +
> +%if WIN64
> +    SWAP 0, 3
> +%endif
> +%define SCALE     m0
> +%define IN1       m3
> +%define IN2       m5
> +%define count     inq
> +%define NUM_COEF  4*(2-%1)
> +%define NUM_OUT   32*(%1+1)
> +
> +%if ARCH_X86_32
> +    movss  SCALE, scalem
> +%endif
> +
> +    movu     IN1, [inq + 4 - 1*mmsize]
> +    shufps   IN1, IN1, q0123
> +%if %1 == 0
> +    movu     IN2, [inq + 4 - 2*mmsize]
> +    shufps   IN2, IN2, q0123
> +%endif
> +
> +    mov    count, -4*NUM_OUT
> +    SPLATD SCALE
> +    add     cf0q, 4*NUM_COEF*NUM_OUT
> +    add     outq, 4*NUM_OUT
> +    ; compute v0 first
> +    mulps    IN1, SCALE
> +%if %1 == 0
> +    mulps    IN2, SCALE
> +    FIR_LOOP   0, IN1, IN2
> +%else
> +    FIR_LOOP   0, IN1
> +%endif
> +    shufps   IN1, IN1, q0123
> +    mov    count, -4*NUM_OUT
> +    ; cf1 already correctly positioned
> +    add     outq, 4*NUM_OUT          ; outq now at out2
> +    sub     cf0q, 8*NUM_COEF
> +%if %1 == 0
> +    shufps   IN2, IN2, q0123
> +    FIR_LOOP   1, IN2, IN1
> +%else
> +    FIR_LOOP   1, IN1
> +%endif
> +    RET
> +%endmacro
> +
> +INIT_XMM sse
> +DCA_LFE_FIR 0
> +DCA_LFE_FIR 1
> diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
> index 976d8a3..d649ecd 100644
> --- a/libavcodec/x86/dcadsp_init.c
> +++ b/libavcodec/x86/dcadsp_init.c
> @@ -26,6 +26,10 @@
>  void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
>  void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
>  void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
> +void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs,
> +                         float scale);
> +void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs,
> +                         float scale);
>  
>  av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
>  {
> @@ -35,6 +39,8 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
>  #if ARCH_X86_32
>          s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
>  #endif
> +        s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
> +        s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
>      }
>  
>      if (EXTERNAL_SSE2(cpu_flags)) {
> -- 
> 1.8.0.msysgit.0
> 

> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

it is not once nor twice but times without number that the same ideas make
their appearance in the world. -- Aristotle
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140211/e0d10815/attachment.asc>


More information about the ffmpeg-devel mailing list