[FFmpeg-devel] [PATCH] SIMD-optimized float_to_int32_fmul_scalar()

Michael Niedermayer michaelni
Fri Jan 7 19:31:51 CET 2011


On Fri, Jan 07, 2011 at 01:15:37PM -0500, Justin Ruggles wrote:
> This patch implements float_to_int32_fmul_scalar() for 3dnow, sse, and
> sse2 and uses it in the AC3 encoder.
> 
> benchmarks (in dezicycles) for scale_coefficients() in ac3enc_float.c:
> 
> AMD Athlon 64 X2 6000+ (64-bit Ubuntu)
>     C: 137485
> 3DNow:  52110
>   SSE:  50257
>  SSE2:  53306
> 
> Intel Atom 330 (64-bit Ubuntu)
>     C: 595011
>   SSE: 149121
>  SSE2: 148662
> 
> Thanks,
> Justin
> 

>  ac3enc_float.c    |    6 ++---
>  dsputil.c         |    7 ++++++
>  dsputil.h         |    1 
>  x86/dsputil_mmx.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 74 insertions(+), 3 deletions(-)
> db8bf89c50552f3bc830e6b957f6b7ea4bf92d06  float_to_int32_fmul_scalar.patch
> diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
> index f324636..be2be8c 100644
> --- a/libavcodec/ac3enc_float.c
> +++ b/libavcodec/ac3enc_float.c
> @@ -107,9 +107,9 @@ static int normalize_samples(AC3EncodeContext *s)
>   */
>  static void scale_coefficients(AC3EncodeContext *s)
>  {
> -    int i;
> -    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
> -        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
> +    s->dsp.float_to_int32_fmul_scalar(s->fixed_coef_buffer, s->mdct_coef_buffer,
> +                                      16777216.0f, AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
> +    emms_c();
>  }
>  
>  
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index 29ddb4d..06728ce 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -3866,6 +3866,12 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
>          dst[i] = src[i] * mul;
>  }
>  
> +static void float_to_int32_fmul_scalar_c(int32_t *dst, const float *src, float mul, int len){
> +    int i;
> +    for(i=0; i<len; i++)
> +        dst[i] = lrintf(src[i] * mul);
> +}
> +
>  static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
>                     uint32_t maxi, uint32_t maxisign)
>  {
> @@ -4440,6 +4446,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
>      c->vector_fmul_add = vector_fmul_add_c;
>      c->vector_fmul_window = ff_vector_fmul_window_c;
>      c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
> +    c->float_to_int32_fmul_scalar = float_to_int32_fmul_scalar_c;
>      c->vector_clipf = vector_clipf_c;
>      c->float_to_int16 = ff_float_to_int16_c;
>      c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index 6c56a65..23a52f1 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -381,6 +381,7 @@ typedef struct DSPContext {
>      void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
>      /* assume len is a multiple of 8, and arrays are 16-byte aligned */
>      void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
> +    void (*float_to_int32_fmul_scalar)(int32_t *dst, const float *src, float mul, int len);

The documentation of the alignment requirements and of the constraints on len is missing (must len be a multiple of 16?).


>      void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
>      /**
>       * Multiply a vector of floats by a scalar float.  Source and
> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 909ec41..41c55c1 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -2303,6 +2303,65 @@ static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mu
>      );
>  }
>  
> +static void float_to_int32_fmul_scalar_3dnow(int32_t *dst, const float *src, float mul, int len)
> +{
> +    /* note: pf2id conversion uses truncation, not round-to-nearest */
> +    x86_reg i = (len-4)*4;
> +    __asm__ volatile(
> +        "movq          %3,   %%mm1      \n\t"
> +        "punpckldq  %%mm1,   %%mm1      \n\t"
> +        "1:                             \n\t"
> +        "movq     (%2,%0),   %%mm0      \n\t"
> +        "pfmul      %%mm1,   %%mm0      \n\t"
> +        "pf2id      %%mm0,   %%mm0      \n\t"
> +        "movq       %%mm0, (%1,%0)      \n\t"
> +        "sub $8, %0                     \n\t"
> +        "jge 1b                         \n\t"
> +        "femms                          \n\t"

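Duplicate *emms: femms is executed here, and scale_coefficients() in ac3enc_float.c also calls emms_c() after this function returns.
Also, some of these loops can be unrolled to gain a bit more speed.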

[ ...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

He who knows, does not speak. He who speaks, does not know. -- Lao Tsu
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20110107/15ff79f0/attachment.pgp>



More information about the ffmpeg-devel mailing list