[FFmpeg-devel] [RFC/PATCH] More flexible variants of float_to_int16, WMA optimization, Vorbis

Loren Merritt lorenm
Wed Jul 16 02:32:34 CEST 2008


On Wed, 16 Jul 2008, Siarhei Siamashka wrote:
>
> Well, merging the loops that are run after iFFT and combining them with
> windowing code can probably provide interesting results. At least it should
> eliminate a lot of intermediate load and store operations. Maybe having iFFT
> output processed in a single loop could allow reading old saved data and
> also replace it with new saved data at the same time? At least in some
> simple cases when previous and current blocks have the same size.

sure, I'll try it.
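
Something along those lines should work, presumably with the imdct
post-rotation folded into the same pass. An untested sketch of the
saved-buffer part (same-blocksize case only; the window indexing is
schematic, not vector_fmul_window's exact convention):

static void window_and_save(float *out, float *saved, const float *buf,
                            const float *win, int n4)
{
    int i;
    for (i = 0; i < n4; i++) {
        /* overlap-add: previous block's saved right half against the
         * current block's left half */
        out[i] = saved[i] * win[n4 - 1 - i] + buf[i] * win[i];
        /* refresh saved[] in the same pass instead of a separate memcpy */
        saved[i] = buf[n4 + i];
    }
}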

>> See patch (which won't apply to svn, since it depends on other patches I
>> haven't committed yet, but the strategy should be clear).
>
> Hmm, did you forget to attach this patch?

oops
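
The strategy, in short: give imdct_half separate pointers for the left and
right halves of its output, so the right half can be written straight into
that channel's saved[] buffer (one spare buffer is rotated in with FFSWAP),
and the memcpy after windowing disappears. Condensed from the diff below
(bias and blocksize-switch handling omitted):

    FFSWAP(float*, vc->saved[j], vc->saved[vc->audio_channels]); // rotate spare buffer in
    vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag],
                               buf,          // left half of the imdct output
                               vc->saved[j], // right half, kept for the next frame
                               floor, residue);
    // 'saved' still points at the previous block's data, grabbed before the swap
    vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/4);
    // no trailing memcpy into saved[] any more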

> But could you also benchmark SSE version of float_to_int16_interleave from
> my original submission on the cores where SSE2 was winning? It is quite a bit
> faster than the code from SVN in my tests:
> FLOAT_TO_INT16_INTERLEAVE(sse,
>    "1:                              \n"
>    "cvtps2pi  (%2,%0), %%mm0        \n"
>    "cvtps2pi 8(%2,%0), %%mm2        \n"
>    "cvtps2pi  (%3,%0), %%mm1        \n"
>    "cvtps2pi 8(%3,%0), %%mm3        \n"
>    "add         $16,   %0           \n"
>    "packssdw    %%mm1, %%mm0        \n"
>    "packssdw    %%mm3, %%mm2        \n"
>    "pshufw      $0xD8, %%mm0, %%mm0 \n"
>    "pshufw      $0xD8, %%mm2, %%mm2 \n"
>    "movq        %%mm0, -16(%1,%0)   \n"
>    "movq        %%mm2, -8(%1,%0)    \n"
>    "js 1b                           \n"
>    "emms                            \n"

k8:
1139 float_to_int16_interleave_siarhei
1161 float_to_int16_interleave_sse
1304 float_to_int16_interleave_sse2

conroe:
  978 float_to_int16_interleave_siarhei
1030 float_to_int16_interleave_sse
1071 float_to_int16_interleave_sse2

penryn:
  997 float_to_int16_interleave_siarhei
1062 float_to_int16_interleave_sse
  782 float_to_int16_interleave_sse2

prescott-celeron:
3846 float_to_int16_interleave_siarhei
3500 float_to_int16_interleave_sse
2219 float_to_int16_interleave_sse2
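
For reference, the SSE2 approach being compared converts with cvtps2dq and
interleaves with pack/unpack shuffles. Not the svn code, just an intrinsics
sketch of that idea, assuming the floats are already scaled to the int16
range and n is a multiple of 4:

    #include <emmintrin.h>
    #include <stdint.h>

    static void float_to_int16_interleave2_sketch(int16_t *dst, const float *l,
                                                  const float *r, int n)
    {
        int i;
        for (i = 0; i < n; i += 4) {
            __m128i a = _mm_cvtps_epi32(_mm_loadu_ps(l + i)); // 4 left floats -> int32
            __m128i b = _mm_cvtps_epi32(_mm_loadu_ps(r + i)); // 4 right floats -> int32
            __m128i p = _mm_packs_epi32(a, b);                // saturate: L0..L3 R0..R3
            // interleave words to L0 R0 L1 R1 L2 R2 L3 R3
            __m128i q = _mm_unpacklo_epi16(p, _mm_shuffle_epi32(p, _MM_SHUFFLE(1,0,3,2)));
            _mm_storeu_si128((__m128i*)(dst + 2*i), q);
        }
    }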

--Loren Merritt
Index: vorbis_dec.c
===================================================================
--- vorbis_dec.c	(revision 14251)
+++ vorbis_dec.c	(working copy)
@@ -152,7 +152,7 @@
     uint_fast8_t previous_window;
     float *channel_residues;
     float *channel_floors;
-    float *saved;
+    float **saved;
     uint_fast32_t add_bias; // for float->int conversion
     uint_fast32_t exp_bias;
 } vorbis_context;
@@ -177,6 +177,8 @@
 
     av_freep(&vc->channel_residues);
     av_freep(&vc->channel_floors);
+    for(i=0; i<=vc->audio_channels; i++)
+        av_freep(vc->saved+i);
     av_freep(&vc->saved);
 
     av_freep(&vc->residues);
@@ -844,6 +846,7 @@
 static int vorbis_parse_id_hdr(vorbis_context *vc){
     GetBitContext *gb=&vc->gb;
     uint_fast8_t bl0, bl1;
+    int i;
 
     if ((get_bits(gb, 8)!='v') || (get_bits(gb, 8)!='o') ||
     (get_bits(gb, 8)!='r') || (get_bits(gb, 8)!='b') ||
@@ -893,7 +896,9 @@
 
     vc->channel_residues= av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float));
     vc->channel_floors  = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float));
-    vc->saved           = av_mallocz((vc->blocksize[1]/4)*vc->audio_channels * sizeof(float));
+    vc->saved           = av_malloc((vc->audio_channels+1) * sizeof(float*));
+    for(i=0; i<=vc->audio_channels; i++)
+        vc->saved[i]    = av_mallocz((vc->blocksize[1]/4) * sizeof(float));
     vc->previous_window=0;
 
     ff_mdct_init(&vc->mdct[0], bl0, 1);
@@ -1522,12 +1527,13 @@
         uint_fast16_t bs1=vc->blocksize[1];
         float *residue=vc->channel_residues+res_chan[j]*blocksize/2;
         float *floor=vc->channel_floors+j*blocksize/2;
-        float *saved=vc->saved+j*bs1/4;
+        float *saved=vc->saved[j];
         float *ret=vc->channel_residues+j*retlen;
         float *buf=floor;
         const float *win=vc->win[blockflag&previous_window];
 
-        vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, floor, residue);
+        FFSWAP(float*, vc->saved[j], vc->saved[vc->audio_channels]);
+        vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, vc->saved[j], floor, residue);
 
         if(blockflag == previous_window) {
             vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/4);
@@ -1538,7 +1544,6 @@
             copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
             vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/4);
         }
-        memcpy(saved, buf+blocksize/4, blocksize/4*sizeof(float));
     }
 
     vc->previous_window = blockflag;
Index: dsputil.h
===================================================================
--- dsputil.h	(revision 14207)
+++ dsputil.h	(working copy)
@@ -641,7 +641,7 @@
     void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
     void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp);
-    void (*imdct_half)(struct MDCTContext *s, FFTSample *output,
+    void (*imdct_half)(struct MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                        const FFTSample *input, FFTSample *tmp);
 } FFTContext;
 
@@ -688,15 +688,15 @@
 int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
 void ff_imdct_calc(MDCTContext *s, FFTSample *output,
                 const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
+void ff_imdct_half(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                    const FFTSample *input, FFTSample *tmp);
 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
                         const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                         const FFTSample *input, FFTSample *tmp);
 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
                        const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                        const FFTSample *input, FFTSample *tmp);
 void ff_mdct_calc(MDCTContext *s, FFTSample *out,
                const FFTSample *input, FFTSample *tmp);
Index: mdct.c
===================================================================
--- mdct.c	(revision 14207)
+++ mdct.c	(working copy)
@@ -166,11 +166,12 @@
 /**
  * Compute the middle half of the inverse MDCT of size N = 2^nbits,
  * thus excluding the parts that can be derived by symmetry
- * @param output N/2 samples
+ * @param out_left N/4 samples
+ * @param out_right N/4 samples
  * @param input N/2 samples
  * @param tmp N/2 samples
  */
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
+void ff_imdct_half(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                    const FFTSample *input, FFTSample *tmp)
 {
     int k, n8, n4, n;
@@ -181,11 +182,12 @@
 
     imdct_c(s, input, tmp);
 
+    out_left += n4-1;
     for(k = 0; k < n8; k++) {
-        output[n4-1-2*k]   =  z[n8+k].im;
-        output[n4-1-2*k-1] = -z[n8-k-1].re;
-        output[n4 + 2*k]   = -z[n8+k].re;
-        output[n4 + 2*k+1] =  z[n8-k-1].im;
+        out_left[-2*k]   =  z[n8+k].im;
+        out_left[-2*k-1] = -z[n8-k-1].re;
+        out_right[2*k]   = -z[n8+k].re;
+        out_right[2*k+1] =  z[n8-k-1].im;
     }
 }
 
Index: i386/fft_sse.c
===================================================================
--- i386/fft_sse.c	(revision 14207)
+++ i386/fft_sse.c	(working copy)
@@ -313,7 +313,7 @@
     );
 }
 
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                        const FFTSample *input, FFTSample *tmp)
 {
     x86_reg j, k;
@@ -331,8 +331,8 @@
     asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
     asm volatile(
         "1: \n\t"
-        "movaps     (%3,%1), %%xmm0 \n\t"
-        "movaps     (%3,%0), %%xmm1 \n\t"
+        "movaps     (%4,%1), %%xmm0 \n\t"
+        "movaps     (%4,%0), %%xmm1 \n\t"
         "xorps       %%xmm7, %%xmm0 \n\t"
         "movaps      %%xmm0, %%xmm2 \n\t"
         "shufps $141,%%xmm1, %%xmm0 \n\t"
@@ -340,13 +340,13 @@
         "shufps $54, %%xmm0, %%xmm0 \n\t"
         "shufps $156,%%xmm2, %%xmm2 \n\t"
         "xorps       %%xmm7, %%xmm0 \n\t"
-        "movaps      %%xmm2, (%2,%1) \n\t"
+        "movaps      %%xmm2, (%3,%1) \n\t"
         "movaps      %%xmm0, (%2,%0) \n\t"
         "sub $16, %1 \n\t"
         "add $16, %0 \n\t"
         "jl 1b \n\t"
         :"+r"(j), "+r"(k)
-        :"r"(output+n4), "r"(z+n8)
+        :"r"(out_left+n4), "r"(out_right), "r"(z+n8)
         :"memory"
     );
 }
Index: i386/fft_3dn2.c
===================================================================
--- i386/fft_3dn2.c	(revision 14207)
+++ i386/fft_3dn2.c	(working copy)
@@ -224,7 +224,7 @@
     asm volatile("femms");
 }
 
-void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
                         const FFTSample *input, FFTSample *tmp)
 {
     x86_reg j, k;
@@ -242,20 +242,20 @@
     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
     asm volatile(
         "1: \n\t"
-        "movq    (%3,%1), %%mm0 \n\t" // z[n8+k]
-        "pswapd  (%3,%0), %%mm1 \n\t" // z[n8-1-k]
+        "movq    (%4,%1), %%mm0 \n\t" // z[n8+k]
+        "pswapd  (%4,%0), %%mm1 \n\t" // z[n8-1-k]
         "movq      %%mm0, %%mm2 \n\t"
         "punpckldq %%mm1, %%mm0 \n\t"
         "punpckhdq %%mm2, %%mm1 \n\t"
         "pxor      %%mm7, %%mm0 \n\t"
         "pxor      %%mm7, %%mm1 \n\t"
-        "movq      %%mm0, (%2,%1) \n\t" // output[n4+2*k]   = { -z[n8+k].re, z[n8-1-k].im }
+        "movq      %%mm0, (%3,%1) \n\t" // output[n4+2*k]   = { -z[n8+k].re, z[n8-1-k].im }
         "movq      %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
         "sub $8, %1 \n\t"
         "add $8, %0 \n\t"
         "jl 1b \n\t"
         :"+r"(j), "+r"(k)
-        :"r"(output+n4), "r"(z+n8)
+        :"r"(out_left+n4), "r"(out_right), "r"(z+n8)
         :"memory"
     );
     asm volatile("femms");


