[FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS

Wed Jul 11 13:00:10 CEST 2012

Hello Vitor,
>> Hi,
>
>Hi and sorry for the delay, was busy getting married :-)

Congratulations :)

>
>>>>    libavcodec/dsputil.c              |    1 +
>>>>    libavcodec/dsputil.h              |    1 +
>>>>    libavcodec/fft.c                  |    1 +
>>>>    libavcodec/fft.h                  |   11 +
>>>>    libavcodec/fmtconvert.c           |    1 +
>>>>    libavcodec/fmtconvert.h           |    1 +
>>>>    libavcodec/mips/Makefile          |    4 +
>>>>    libavcodec/mips/dsputil_mips.c    |  168 +++++++++
>>>>    libavcodec/mips/fft_mips.c        |  689 +++++++++++++++++++++++++++++++++++++
>>>>    libavcodec/mips/fft_table.h       |  482 ++++++++++++++++++++++++++
>>>>    libavcodec/mips/fmtconvert_mips.c |  336 ++++++++++++++++++
>>>>    11 files changed, 1695 insertions(+), 0 deletions(-)
>>>>    create mode 100644 libavcodec/mips/dsputil_mips.c
>>>>    create mode 100644 libavcodec/mips/fft_mips.c
>>>>    create mode 100644 libavcodec/mips/fft_table.h
>>>>    create mode 100644 libavcodec/mips/fmtconvert_mips.c
>>>>
>>>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>>>> index 442b900..b7d928f 100644
>>>> --- a/libavcodec/dsputil.c
>>>> +++ b/libavcodec/dsputil.c
>>>> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>>>>        if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
>>>>        if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
>>>>        if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
>>>> +    if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);
>>>
>>>> --- a/libavcodec/fft.h
>>>> +++ b/libavcodec/fft.h
>>>> @@ -38,6 +38,16 @@
>>>>
>>>>    typedef float FFTDouble;
>>>>
>>>> +#if ARCH_MIPS
>>>> +enum _fftConsts{
>>>> +    MIN_LOG2_NFFT = 5, //!<    Specifies miniumum allowed fft size
>>>> +    MAX_LOG2_NFFT = 12 //!<    Specifies maxiumum allowed fft size
>>>> +};
>>>> +
>>>> +#define MAX_FFT_SIZE (1<<    MAX_LOG2_NFFT)
>>>> +#define MIN_FFT_SIZE (1<<    MAX_LOG2_NFFT)
>>>> +
>>>> +#endif
>>>
>>> MIPS-specific code should not be in common code.
>>
>> I will place this in appropriate MIPS file.
>>
>>>
>>>> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
>>>> new file mode 100644
>>>
>>> Nice, can you post the benchmark results of "fft-test -s"?
>>
>> Posted below
>>
>>>
>>>> index 0000000..286c67f
>>>> --- /dev/null
>>>> +++ b/libavcodec/mips/fft_mips.c
>>>> @@ -0,0 +1,689 @@
>>>> +/*
>>>> + * Copyright (c) 2012
>>>> + *      MIPS Technologies, Inc., California.
>>>> + *
>>>> + * Redistribution and use in source and binary forms, with or without
>>>> + * modification, are permitted provided that the following conditions
>>>> + * are met:
>>>> + * 1. Redistributions of source code must retain the above copyright
>>>> + *    notice, this list of conditions and the following disclaimer.
>>>> + * 2. Redistributions in binary form must reproduce the above copyright
>>>> + *    notice, this list of conditions and the following disclaimer in the
>>>> + *    documentation and/or other materials provided with the distribution.
>>>> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
>>>> + *    contributors may be used to endorse or promote products derived from
>>>> + *    this software without specific prior written permission.
>>>> + *
>>>> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
>>>> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>>>> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>>>> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
>>>> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
>>>> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
>>>> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
>>>> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>>>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
>>>> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
>>>> + * SUCH DAMAGE.
>>>> + *
>>>> + * Author:  Stanisalv Ocovaj (socovaj at mips.com)
>>>> + * Author:  Zoran Lukic (zoranl at mips.com)
>>>> + *
>>>> + * Optimized MDCT/IMDCT and FFT transforms
>>>> + *
>>>> + * This file is part of FFmpeg.
>>>> + *
>>>> + * FFmpeg is free software; you can redistribute it and/or
>>>> + * modify it under the terms of the GNU Lesser General Public
>>>> + * License as published by the Free Software Foundation; either
>>>> + * version 2.1 of the License, or (at your option) any later version.
>>>> + *
>>>> + * FFmpeg is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>>> + * Lesser General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU Lesser General Public
>>>> + * License along with FFmpeg; if not, write to the Free Software
>>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>>> + */
>>>> +#include "config.h"
>>>> +#include "libavcodec/fft.h"
>>>> +#include "fft_table.h"
>>>> +
>>>> +/**
>>>> + * FFT transform
>>>> + */
>>>> +
>>>> +#if HAVE_INLINE_ASM
>>>> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
>>>> +
>>>> +    int nbits, i, n, num_transforms, offset, step;
>>>> +    int n4, n2, n34;
>>>> +    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
>>>> +    FFTComplex *tmpz;
>>>> +    float w_re, w_im;
>>>> +    float *w_re_ptr;
>>>> +    const int fft_size = (1<<    s->nbits);
>>>> +    int s_n = s->nbits;
>>>> +    int tem1, tem2;
>>>> +    float pom,  pom1,  pom2,  pom3;
>>>> +    float temp, temp1, temp3, temp4;
>>>> +    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
>>>> +    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
>>>> +
>>>> +    /**
>>>> +    *num_transforms = (0x2aab>>    (16 - s->nbits)) | 1;
>>>> +    */
>>>> +    __asm__ __volatile__ (
>>>> +        "li   %[tem1], 16                                      \n\t"
>>>> +        "sub  %[s_n],  %[tem1], %[s_n]                         \n\t"
>>>> +        "li   %[tem2], 10923                                   \n\t"
>>>> +        "srav %[tem2], %[tem2], %[s_n]                         \n\t"
>>>> +        "ori  %[num_t],%[tem2], 1                              \n\t"
>>>> +        : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
>>>> +          [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
>>>> +    );
>>>> +
>>>> +
>>>> +    for (n=0; n<num_transforms; n++)
>>>> +    {
>>>> +        offset = fft_offsets_lut[n]<<    2;
>>>> +        tmpz = z + offset;
>>>
>>> What is the point of this LUT? If you want your input permutated in some
>>> particular order, you can just use revtab struct.
>>
>> We use this LUT because of the somewhat irregular (when compared to radix-2 FFT) structure
>> of split-radix FFT. It tells us at which starting offsets the sub-transforms of particular
>> size have to be performed. It has nothing to do with input permutation, which is the same
>> as in the original algorithm (we use the same revtab for that).
>
>So, if I understand correctly, your code needs a LUT and the C version
>doesn't because you are not using a recursive algorithm. If this is the
>case, I think it would be worth using large static tables and
>initializing them dynamically in fft_init(). That way, since you will not
>make the binary bigger, you can make your code work for all the
>supported FFT sizes.

You are correct, and the code will be changed according to your suggestion.

>
>>>> +
>>>> +        tmp1 = tmpz[0].re + tmpz[1].re;
>>>> +        tmp5 = tmpz[2].re + tmpz[3].re;
>>>> +        tmp2 = tmpz[0].im + tmpz[1].im;
>>>> +        tmp6 = tmpz[2].im + tmpz[3].im;
>>>> +        tmp3 = tmpz[0].re - tmpz[1].re;
>>>> +        tmp8 = tmpz[2].im - tmpz[3].im;
>>>> +        tmp4 = tmpz[0].im - tmpz[1].im;
>>>> +        tmp7 = tmpz[2].re - tmpz[3].re;
>>>> +
>>>> +        tmpz[0].re = tmp1 + tmp5;
>>>> +        tmpz[2].re = tmp1 - tmp5;
>>>> +        tmpz[0].im = tmp2 + tmp6;
>>>> +        tmpz[2].im = tmp2 - tmp6;
>>>> +        tmpz[1].re = tmp3 + tmp8;
>>>> +        tmpz[3].re = tmp3 - tmp8;
>>>> +        tmpz[1].im = tmp4 - tmp7;
>>>> +        tmpz[3].im = tmp4 + tmp7;
>>>> +
>>>> +}
>>>> +
>>>> +    if (fft_size<    8)
>>>> +        return;
>>>> +
>>>> +    num_transforms = (num_transforms>>    1) | 1;
>>>> +    for (n=0; n<num_transforms; n++)
>>>> +    {
>>>> +        offset = fft_offsets_lut[n]<<    3;
>>>> +        tmpz = z + offset;
>>>> +
>>>> +        __asm__ __volatile__ (
>>>> +            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
>>>> +            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
>>>> +            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
>>>> +            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
>>>> +            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
>>>> +            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
>>>> +            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
>>>> +            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
>>>> +            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
>>>> +            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
>>>> +            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
>>>> +            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
>>>> +            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
>>>> +            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
>>>> +            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
>>>> +            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
>>>> +            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
>>>> +            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
>>>> +            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
>>>> +            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
>>>> +            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
>>>> +            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
>>>> +            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
>>>> +            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
>>>> +            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
>>>> +            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
>>>> +            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
>>>> +            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
>>>> +            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
>>>> +            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
>>>> +            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
>>>> +            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
>>>> +            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
>>>> +            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
>>>> +            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
>>>> +            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
>>>> +            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
>>>> +            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
>>>> +            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
>>>> +            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
>>>> +            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
>>>> +            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
>>>> +            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
>>>> +            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
>>>> +            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
>>>> +            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
>>>> +            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
>>>> +            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
>>>> +            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
>>>> +            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
>>>> +            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
>>>> +            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
>>>> +            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
>>>> +            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
>>>> +            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
>>>> +            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
>>>> +            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
>>>> +            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
>>>> +            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
>>>> +            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
>>>> +            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
>>>> +            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
>>>> +            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
>>>> +            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
>>>> +            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
>>>> +            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
>>>> +            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
>>>> +            : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>>> +              [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5),  [tmp7]"=f"(tmp7),
>>>> +              [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
>>>> +              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
>>>> +            :
>>>> +            : "memory"
>>>> +        );
>>>> +    }
>>>> +
>>>> +    step = 1<<    (MAX_LOG2_NFFT - 4);
>>>> +    n4 = 4;
>>>> +    for (nbits=4; nbits<=s->nbits; nbits++)
>>>> +    {
>>>> +        /*
>>>> +        * num_transforms = (num_transforms>>    1) | 1;
>>>> +        */
>>>> +        __asm__ __volatile__ (
>>>> +            "sra %[num_t], %[num_t], 1               \n\t"
>>>> +            "ori %[num_t], %[num_t], 1               \n\t"
>>>> +
>>>> +            : [num_t] "+r" (num_transforms)
>>>> +        );
>>>> +        n2  = 2 * n4;
>>>> +        n34 = 3 * n4;
>>>> +
>>>> +        for (n=0; n<num_transforms; n++)
>>>> +        {
>>>> +            offset = fft_offsets_lut[n]<<    nbits;
>>>> +            tmpz = z + offset;
>>>> +
>>>> +            tmpz_n2  = tmpz +  n2;
>>>> +            tmpz_n4  = tmpz +  n4;
>>>> +            tmpz_n34 = tmpz +  n34;
>>>> +
>>>> +            __asm__ __volatile__ (
>>>> +                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
>>>> +                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
>>>> +                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
>>>> +                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
>>>> +                "lwc1  %[temp1],0(%[tmpz])               \n\t"
>>>> +                "lwc1  %[temp3],4(%[tmpz])               \n\t"
>>>> +                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
>>>> +                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
>>>> +                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
>>>> +                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
>>>> +                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
>>>> +                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
>>>> +                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
>>>> +                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
>>>> +                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
>>>> +                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
>>>> +                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
>>>> +                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
>>>> +                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
>>>> +                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
>>>> +                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
>>>> +                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
>>>> +                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
>>>> +                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
>>>> +                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
>>>> +                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
>>>> +                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
>>>> +                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
>>>> +                : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
>>>> +                  [tmp1]"=f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
>>>> +                  [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6),       [tmpz_n4]"+r"(tmpz_n4),   [pom3]"=&f"(pom3),
>>>> +                  [temp]"=f"(temp), [temp1]"=f"(temp1),     [temp3]"=f"(temp3),       [temp4]"=f"(temp4)
>>>> +                :
>>>> +                : "memory"
>>>> +            );
>>>> +
>>>> +            w_re_ptr = w_tab + step;
>>>> +
>>>> +            for (i=1; i<n4; i++)
>>>> +            {
>>>> +                w_re = w_re_ptr[0];
>>>> +                w_im = w_re_ptr[MAX_FFT_SIZE/4];
>>>
>>> Can you explain why you cannot use the same cos/sin tab that the C
>>> version uses?
>> We can use them and I will rewrite this part of code to use them.
>>
>> ========================
>> fft-test -s results on MIPS 74Kf board:
>> ========================
>> original fft:
>>
>> FFT 512 test
>> Checking...
>> max:0.000008 e:3.92148e-08
>> Speed test...
>> time: 136.9 us/transform [total time=1.12 s its=8192]
>>
>> ========================================================
>> optimized fft:
>>
>> FFT 512 test
>> Checking...
>> max:0.000005 e:3.86258e-08
>> Speed test...
>> time: 89.7 us/transform [total time=1.47 s its=16384]
>
>Pretty impressive :-D
Thanks.
The guys working on this did a good job :)

-Nedeljko
________________________________________
From: ffmpeg-devel-bounces at ffmpeg.org [ffmpeg-devel-bounces at ffmpeg.org] on behalf of Vitor Sessak [vitor1001 at gmail.com]
Sent: Wednesday, July 04, 2012 15:49
To: Babic, Nedeljko
Cc: Lukac, Zeljko; FFmpeg development discussions and patches
Subject: Re: [FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS

On 06/25/2012 10:59 AM, Babic, Nedeljko wrote:
> Hi,

Hi and sorry for the delay, was busy getting married :-)

>>>    libavcodec/dsputil.c              |    1 +
>>>    libavcodec/dsputil.h              |    1 +
>>>    libavcodec/fft.c                  |    1 +
>>>    libavcodec/fft.h                  |   11 +
>>>    libavcodec/fmtconvert.c           |    1 +
>>>    libavcodec/fmtconvert.h           |    1 +
>>>    libavcodec/mips/Makefile          |    4 +
>>>    libavcodec/mips/dsputil_mips.c    |  168 +++++++++
>>>    libavcodec/mips/fft_mips.c        |  689 +++++++++++++++++++++++++++++++++++++
>>>    libavcodec/mips/fft_table.h       |  482 ++++++++++++++++++++++++++
>>>    libavcodec/mips/fmtconvert_mips.c |  336 ++++++++++++++++++
>>>    11 files changed, 1695 insertions(+), 0 deletions(-)
>>>    create mode 100644 libavcodec/mips/dsputil_mips.c
>>>    create mode 100644 libavcodec/mips/fft_mips.c
>>>    create mode 100644 libavcodec/mips/fft_table.h
>>>    create mode 100644 libavcodec/mips/fmtconvert_mips.c
>>>
>>> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
>>> index 442b900..b7d928f 100644
>>> --- a/libavcodec/dsputil.c
>>> +++ b/libavcodec/dsputil.c
>>> @@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>>>        if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
>>>        if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
>>>        if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
>>> +    if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);
>>
>>> --- a/libavcodec/fft.h
>>> +++ b/libavcodec/fft.h
>>> @@ -38,6 +38,16 @@
>>>
>>>    typedef float FFTDouble;
>>>
>>> +#if ARCH_MIPS
>>> +enum _fftConsts{
>>> +    MIN_LOG2_NFFT = 5, //!<    Specifies miniumum allowed fft size
>>> +    MAX_LOG2_NFFT = 12 //!<    Specifies maxiumum allowed fft size
>>> +};
>>> +
>>> +#define MAX_FFT_SIZE (1<<    MAX_LOG2_NFFT)
>>> +#define MIN_FFT_SIZE (1<<    MAX_LOG2_NFFT)
>>> +
>>> +#endif
>>
>> MIPS-specific code should not be in common code.
>
> I will place this in appropriate MIPS file.
>
>>
>>> diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
>>> new file mode 100644
>>
>> Nice, can you post the benchmark results of "fft-test -s"?
>
> Posted below
>
>>
>>> index 0000000..286c67f
>>> --- /dev/null
>>> +++ b/libavcodec/mips/fft_mips.c
>>> @@ -0,0 +1,689 @@
>>> +/*
>>> + * Copyright (c) 2012
>>> + *      MIPS Technologies, Inc., California.
>>> + *
>>> + * Redistribution and use in source and binary forms, with or without
>>> + * modification, are permitted provided that the following conditions
>>> + * are met:
>>> + * 1. Redistributions of source code must retain the above copyright
>>> + *    notice, this list of conditions and the following disclaimer.
>>> + * 2. Redistributions in binary form must reproduce the above copyright
>>> + *    notice, this list of conditions and the following disclaimer in the
>>> + *    documentation and/or other materials provided with the distribution.
>>> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
>>> + *    contributors may be used to endorse or promote products derived from
>>> + *    this software without specific prior written permission.
>>> + *
>>> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
>>> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>>> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>>> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
>>> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
>>> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
>>> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
>>> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>>> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
>>> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
>>> + * SUCH DAMAGE.
>>> + *
>>> + * Author:  Stanisalv Ocovaj (socovaj at mips.com)
>>> + * Author:  Zoran Lukic (zoranl at mips.com)
>>> + *
>>> + * Optimized MDCT/IMDCT and FFT transforms
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> + */
>>> +#include "config.h"
>>> +#include "libavcodec/fft.h"
>>> +#include "fft_table.h"
>>> +
>>> +/**
>>> + * FFT transform
>>> + */
>>> +
>>> +#if HAVE_INLINE_ASM
>>> +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
>>> +
>>> +    int nbits, i, n, num_transforms, offset, step;
>>> +    int n4, n2, n34;
>>> +    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
>>> +    FFTComplex *tmpz;
>>> +    float w_re, w_im;
>>> +    float *w_re_ptr;
>>> +    const int fft_size = (1<<    s->nbits);
>>> +    int s_n = s->nbits;
>>> +    int tem1, tem2;
>>> +    float pom,  pom1,  pom2,  pom3;
>>> +    float temp, temp1, temp3, temp4;
>>> +    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
>>> +    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
>>> +
>>> +    /**
>>> +    *num_transforms = (0x2aab>>    (16 - s->nbits)) | 1;
>>> +    */
>>> +    __asm__ __volatile__ (
>>> +        "li   %[tem1], 16                                      \n\t"
>>> +        "sub  %[s_n],  %[tem1], %[s_n]                         \n\t"
>>> +        "li   %[tem2], 10923                                   \n\t"
>>> +        "srav %[tem2], %[tem2], %[s_n]                         \n\t"
>>> +        "ori  %[num_t],%[tem2], 1                              \n\t"
>>> +        : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
>>> +          [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
>>> +    );
>>> +
>>> +
>>> +    for (n=0; n<num_transforms; n++)
>>> +    {
>>> +        offset = fft_offsets_lut[n]<<    2;
>>> +        tmpz = z + offset;
>>
>> What is the point of this LUT? If you want your input permutated in some
>> particular order, you can just use revtab struct.
>
> We use this LUT because of the somewhat irregular (when compared to radix-2 FFT) structure
> of split-radix FFT. It tells us at which starting offsets the sub-transforms of particular
> size have to be performed. It has nothing to do with input permutation, which is the same
> as in the original algorithm (we use the same revtab for that).

So, if I understand correctly, your code needs a LUT and the C version
doesn't because you are not using a recursive algorithm. If this is the
case, I think it would be worth using large static tables and
initializing them dynamically in fft_init(). That way, since you will not
make the binary bigger, you can make your code work for all the
supported FFT sizes.

>>> +
>>> +        tmp1 = tmpz[0].re + tmpz[1].re;
>>> +        tmp5 = tmpz[2].re + tmpz[3].re;
>>> +        tmp2 = tmpz[0].im + tmpz[1].im;
>>> +        tmp6 = tmpz[2].im + tmpz[3].im;
>>> +        tmp3 = tmpz[0].re - tmpz[1].re;
>>> +        tmp8 = tmpz[2].im - tmpz[3].im;
>>> +        tmp4 = tmpz[0].im - tmpz[1].im;
>>> +        tmp7 = tmpz[2].re - tmpz[3].re;
>>> +
>>> +        tmpz[0].re = tmp1 + tmp5;
>>> +        tmpz[2].re = tmp1 - tmp5;
>>> +        tmpz[0].im = tmp2 + tmp6;
>>> +        tmpz[2].im = tmp2 - tmp6;
>>> +        tmpz[1].re = tmp3 + tmp8;
>>> +        tmpz[3].re = tmp3 - tmp8;
>>> +        tmpz[1].im = tmp4 - tmp7;
>>> +        tmpz[3].im = tmp4 + tmp7;
>>> +
>>> +}
>>> +
>>> +    if (fft_size<    8)
>>> +        return;
>>> +
>>> +    num_transforms = (num_transforms>>    1) | 1;
>>> +    for (n=0; n<num_transforms; n++)
>>> +    {
>>> +        offset = fft_offsets_lut[n]<<    3;
>>> +        tmpz = z + offset;
>>> +
>>> +        __asm__ __volatile__ (
>>> +            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
>>> +            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom1], 56(%[tmpz])                     \n\t"
>>> +            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom2], 44(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
>>> +            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
>>> +            "add.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re + tmpz[5].re;
>>> +            "add.s %[tmp3], %[tmp3],    %[pom1]             \n\t"  // tmp3 = tmpz[6].re + tmpz[7].re;
>>> +            "add.s %[tmp2], %[tmp2],    %[pom2]             \n\t"  // tmp2 = tmpz[4].im + tmpz[5].im;
>>> +            "lwc1  %[pom],  40(%[tmpz])                     \n\t"
>>> +            "add.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im + tmpz[7].im;
>>> +            "add.s %[tmp5], %[tmp1],    %[tmp3]             \n\t"  // tmp5 = tmp1 + tmp3;
>>> +            "sub.s %[tmp7], %[tmp1],    %[tmp3]             \n\t"  // tmp7 = tmp1 - tmp3;
>>> +            "lwc1  %[tmp1], 32(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom1], 44(%[tmpz])                     \n\t"
>>> +            "add.s %[tmp6], %[tmp2],    %[tmp4]             \n\t"  // tmp6 = tmp2 + tmp4;
>>> +            "sub.s %[tmp8], %[tmp2],    %[tmp4]             \n\t"  // tmp8 = tmp2 - tmp4;
>>> +            "lwc1  %[tmp2], 36(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom2], 56(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom3], 60(%[tmpz])                     \n\t"
>>> +            "lwc1  %[tmp3], 48(%[tmpz])                     \n\t"
>>> +            "lwc1  %[tmp4], 52(%[tmpz])                     \n\t"
>>> +            "sub.s %[tmp1], %[tmp1],    %[pom]              \n\t"  // tmp1 = tmpz[4].re - tmpz[5].re;
>>> +            "lwc1  %[pom],  0(%[tmpz])                      \n\t"
>>> +            "sub.s %[tmp2], %[tmp2],    %[pom1]             \n\t"  // tmp2 = tmpz[4].im - tmpz[5].im;
>>> +            "sub.s %[tmp3], %[tmp3],    %[pom2]             \n\t"  // tmp3 = tmpz[6].re - tmpz[7].re;
>>> +            "lwc1  %[pom2], 4(%[tmpz])                      \n\t"
>>> +            "sub.s %[pom1], %[pom],     %[tmp5]             \n\t"
>>> +            "sub.s %[tmp4], %[tmp4],    %[pom3]             \n\t"  // tmp4 = tmpz[6].im - tmpz[7].im;
>>> +            "add.s %[pom3], %[pom],     %[tmp5]             \n\t"
>>> +            "sub.s %[pom],  %[pom2],    %[tmp6]             \n\t"
>>> +            "add.s %[pom2], %[pom2],    %[tmp6]             \n\t"
>>> +            "swc1  %[pom1], 32(%[tmpz])                     \n\t"  // tmpz[4].re = tmpz[0].re - tmp5;
>>> +            "swc1  %[pom3], 0(%[tmpz])                      \n\t"  // tmpz[0].re = tmpz[0].re + tmp5;
>>> +            "swc1  %[pom],  36(%[tmpz])                     \n\t"  // tmpz[4].im = tmpz[0].im - tmp6;
>>> +            "swc1  %[pom2], 4(%[tmpz])                      \n\t"  // tmpz[0].im = tmpz[0].im + tmp6;
>>> +            "lwc1  %[pom1], 16(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom3], 20(%[tmpz])                     \n\t"
>>> +            "li.s  %[pom],  0.7071067812                    \n\t"  // float pom = 0.7071067812f;
>>> +            "add.s %[temp1],%[tmp1],    %[tmp2]             \n\t"
>>> +            "sub.s %[temp], %[pom1],    %[tmp8]             \n\t"
>>> +            "add.s %[pom2], %[pom3],    %[tmp7]             \n\t"
>>> +            "sub.s %[temp3],%[tmp3],    %[tmp4]             \n\t"
>>> +            "sub.s %[temp4],%[tmp2],    %[tmp1]             \n\t"
>>> +            "swc1  %[temp], 48(%[tmpz])                     \n\t"  // tmpz[6].re = tmpz[2].re - tmp8;
>>> +            "swc1  %[pom2], 52(%[tmpz])                     \n\t"  // tmpz[6].im = tmpz[2].im + tmp7;
>>> +            "add.s %[pom1], %[pom1],    %[tmp8]             \n\t"
>>> +            "sub.s %[pom3], %[pom3],    %[tmp7]             \n\t"
>>> +            "add.s %[tmp3], %[tmp3],    %[tmp4]             \n\t"
>>> +            "mul.s %[tmp5], %[pom],     %[temp1]            \n\t"  // tmp5 = pom * (tmp1 + tmp2);
>>> +            "mul.s %[tmp7], %[pom],     %[temp3]            \n\t"  // tmp7 = pom * (tmp3 - tmp4);
>>> +            "mul.s %[tmp6], %[pom],     %[temp4]            \n\t"  // tmp6 = pom * (tmp2 - tmp1);
>>> +            "mul.s %[tmp8], %[pom],     %[tmp3]             \n\t"  // tmp8 = pom * (tmp3 + tmp4);
>>> +            "swc1  %[pom1], 16(%[tmpz])                     \n\t"  // tmpz[2].re = tmpz[2].re + tmp8;
>>> +            "swc1  %[pom3], 20(%[tmpz])                     \n\t"  // tmpz[2].im = tmpz[2].im - tmp7;
>>> +            "add.s %[tmp1], %[tmp5],    %[tmp7]             \n\t"  // tmp1 = tmp5 + tmp7;
>>> +            "sub.s %[tmp3], %[tmp5],    %[tmp7]             \n\t"  // tmp3 = tmp5 - tmp7;
>>> +            "add.s %[tmp2], %[tmp6],    %[tmp8]             \n\t"  // tmp2 = tmp6 + tmp8;
>>> +            "sub.s %[tmp4], %[tmp6],    %[tmp8]             \n\t"  // tmp4 = tmp6 - tmp8;
>>> +            "lwc1  %[temp], 8(%[tmpz])                      \n\t"
>>> +            "lwc1  %[temp1],12(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom],  24(%[tmpz])                     \n\t"
>>> +            "lwc1  %[pom2], 28(%[tmpz])                     \n\t"
>>> +            "sub.s %[temp4],%[temp],    %[tmp1]             \n\t"
>>> +            "sub.s %[temp3],%[temp1],   %[tmp2]             \n\t"
>>> +            "add.s %[temp], %[temp],    %[tmp1]             \n\t"
>>> +            "add.s %[temp1],%[temp1],   %[tmp2]             \n\t"
>>> +            "sub.s %[pom1], %[pom],     %[tmp4]             \n\t"
>>> +            "add.s %[pom3], %[pom2],    %[tmp3]             \n\t"
>>> +            "add.s %[pom],  %[pom],     %[tmp4]             \n\t"
>>> +            "sub.s %[pom2], %[pom2],    %[tmp3]             \n\t"
>>> +            "swc1  %[temp4],40(%[tmpz])                     \n\t"  // tmpz[5].re = tmpz[1].re - tmp1;
>>> +            "swc1  %[temp3],44(%[tmpz])                     \n\t"  // tmpz[5].im = tmpz[1].im - tmp2;
>>> +            "swc1  %[temp], 8(%[tmpz])                      \n\t"  // tmpz[1].re = tmpz[1].re + tmp1;
>>> +            "swc1  %[temp1],12(%[tmpz])                     \n\t"  // tmpz[1].im = tmpz[1].im + tmp2;
>>> +            "swc1  %[pom1], 56(%[tmpz])                     \n\t"  // tmpz[7].re = tmpz[3].re - tmp4;
>>> +            "swc1  %[pom3], 60(%[tmpz])                     \n\t"  // tmpz[7].im = tmpz[3].im + tmp3;
>>> +            "swc1  %[pom],  24(%[tmpz])                     \n\t"  // tmpz[3].re = tmpz[3].re + tmp4;
>>> +            "swc1  %[pom2], 28(%[tmpz])                     \n\t"  // tmpz[3].im = tmpz[3].im - tmp3;
>>> +            : [tmpz]"+r"(tmpz), [tmp1]"=f"(tmp1), [pom]"=f"(pom),   [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
>>> +              [tmp3]"=f"(tmp3), [tmp2]"=f"(tmp2), [tmp4]"=f"(tmp4), [tmp5]"=f"(tmp5),  [tmp7]"=f"(tmp7),
>>> +              [tmp6]"=f"(tmp6), [tmp8]"=f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
>>> +              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
>>> +            :
>>> +            : "memory"
>>> +        );
>>> +    }
>>> +
>>> +    step = 1<<    (MAX_LOG2_NFFT - 4);
>>> +    n4 = 4;
>>> +    for (nbits=4; nbits<=s->nbits; nbits++)
>>> +    {
>>> +        /*
>>> +        * num_transforms = (num_transforms>>    1) | 1;
>>> +        */
>>> +        __asm__ __volatile__ (
>>> +            "sra %[num_t], %[num_t], 1               \n\t"
>>> +            "ori %[num_t], %[num_t], 1               \n\t"
>>> +
>>> +            : [num_t] "+r" (num_transforms)
>>> +        );
>>> +        n2  = 2 * n4;
>>> +        n34 = 3 * n4;
>>> +
>>> +        for (n=0; n<num_transforms; n++)
>>> +        {
>>> +            offset = fft_offsets_lut[n]<<    nbits;
>>> +            tmpz = z + offset;
>>> +
>>> +            tmpz_n2  = tmpz +  n2;
>>> +            tmpz_n4  = tmpz +  n4;
>>> +            tmpz_n34 = tmpz +  n34;
>>> +
>>> +            __asm__ __volatile__ (
>>> +                "lwc1  %[pom1], 0(%[tmpz_n2])            \n\t"
>>> +                "lwc1  %[pom],  0(%[tmpz_n34])           \n\t"
>>> +                "lwc1  %[pom2], 4(%[tmpz_n2])            \n\t"
>>> +                "lwc1  %[pom3], 4(%[tmpz_n34])           \n\t"
>>> +                "lwc1  %[temp1],0(%[tmpz])               \n\t"
>>> +                "lwc1  %[temp3],4(%[tmpz])               \n\t"
>>> +                "add.s %[tmp5], %[pom1],      %[pom]     \n\t"   //  tmp5 = tmpz[ n2].re + tmpz[n34].re;
>>> +                "sub.s %[tmp1], %[pom1],      %[pom]     \n\t"   //  tmp1 = tmpz[ n2].re - tmpz[n34].re;
>>> +                "add.s %[tmp6], %[pom2],      %[pom3]    \n\t"   //  tmp6 = tmpz[ n2].im + tmpz[n34].im;
>>> +                "sub.s %[tmp2], %[pom2],      %[pom3]    \n\t"   //  tmp2 = tmpz[ n2].im - tmpz[n34].im;
>>> +                "sub.s %[temp], %[temp1],     %[tmp5]    \n\t"
>>> +                "add.s %[temp1],%[temp1],     %[tmp5]    \n\t"
>>> +                "sub.s %[temp4],%[temp3],     %[tmp6]    \n\t"
>>> +                "add.s %[temp3],%[temp3],     %[tmp6]    \n\t"
>>> +                "swc1  %[temp], 0(%[tmpz_n2])            \n\t"   //  tmpz[ n2].re = tmpz[ 0].re - tmp5;
>>> +                "swc1  %[temp1],0(%[tmpz])               \n\t"   //  tmpz[  0].re = tmpz[ 0].re + tmp5;
>>> +                "lwc1  %[pom1], 0(%[tmpz_n4])            \n\t"
>>> +                "swc1  %[temp4],4(%[tmpz_n2])            \n\t"   //  tmpz[ n2].im = tmpz[ 0].im - tmp6;
>>> +                "lwc1  %[temp], 4(%[tmpz_n4])            \n\t"
>>> +                "swc1  %[temp3],4(%[tmpz])               \n\t"   //  tmpz[  0].im = tmpz[ 0].im + tmp6;
>>> +                "sub.s %[pom],  %[pom1],      %[tmp2]    \n\t"
>>> +                "add.s %[pom1], %[pom1],      %[tmp2]    \n\t"
>>> +                "add.s %[temp1],%[temp],      %[tmp1]    \n\t"
>>> +                "sub.s %[temp], %[temp],      %[tmp1]    \n\t"
>>> +                "swc1  %[pom],  0(%[tmpz_n34])           \n\t"   //  tmpz[n34].re = tmpz[n4].re - tmp2;
>>> +                "swc1  %[pom1], 0(%[tmpz_n4])            \n\t"   //  tmpz[ n4].re = tmpz[n4].re + tmp2;
>>> +                "swc1  %[temp1],4(%[tmpz_n34])           \n\t"   //  tmpz[n34].im = tmpz[n4].im + tmp1;
>>> +                "swc1  %[temp], 4(%[tmpz_n4])            \n\t"   //  tmpz[ n4].im = tmpz[n4].im - tmp1;
>>> +                : [tmpz]"+r"(tmpz), [tmpz_n2]"+r"(tmpz_n2), [tmpz_n34]"+r"(tmpz_n34), [tmp5]"=f"(tmp5),
>>> +                  [tmp1]"=f"(tmp1), [pom]"=&f"(pom),        [pom1]"=&f"(pom1),        [pom2]"=&f"(pom2),
>>> +                  [tmp2]"=f"(tmp2), [tmp6]"=f"(tmp6),       [tmpz_n4]"+r"(tmpz_n4),   [pom3]"=&f"(pom3),
>>> +                  [temp]"=f"(temp), [temp1]"=f"(temp1),     [temp3]"=f"(temp3),       [temp4]"=f"(temp4)
>>> +                :
>>> +                : "memory"
>>> +            );
>>> +
>>> +            w_re_ptr = w_tab + step;
>>> +
>>> +            for (i=1; i<n4; i++)
>>> +            {
>>> +                w_re = w_re_ptr[0];
>>> +                w_im = w_re_ptr[MAX_FFT_SIZE/4];
>>
>> Can you explain why you cannot use the same cos/sin tables that the C
>> version uses?
> We can use them, and I will rewrite this part of the code to use them.
>
> ========================
> fft-test -s results on MIPS 74Kf board:
> ========================
> original fft:
>
> FFT 512 test
> Checking...
> max:0.000008 e:3.92148e-08
> Speed test...
> time: 136.9 us/transform [total time=1.12 s its=8192]
>
> ========================================================
> optimized fft:
>
> FFT 512 test
> Checking...
> max:0.000005 e:3.86258e-08
> Speed test...
> time: 89.7 us/transform [total time=1.47 s its=16384]

Pretty impressive :-D

-Vitor

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel at ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel