[FFmpeg-devel] [PATCH] mips: Implementation of AC3 fixed point decoder and optimization for MIPS.

Vitor Sessak vitor1001 at gmail.com
Wed Aug 22 18:56:01 CEST 2012


Hi!

On 08/09/2012 03:50 PM, Nedeljko Babic wrote:
> AC3 fixed point decoder is implemented in C and appropriate functions
>   are optimized for MIPS architecture. Some of DSP, format convert
>   utils and FFT fixed point functions are optimized.
>
> Signed-off-by: Nedeljko Babic <nbabic at mips.com>
> ---
>   doc/mips.txt                            |    6 +
>   libavcodec/allcodecs.c                  |    3 +
>   libavcodec/dsputil.c                    |   24 +
>   libavcodec/dsputil.h                    |    4 +
>   libavcodec/fft.c                        |    1 +
>   libavcodec/fft.h                        |   12 +
>   libavcodec/fmtconvert.c                 |   79 ++
>   libavcodec/fmtconvert.h                 |   57 +-
>   libavcodec/kbdwin.c                     |   32 +
>   libavcodec/kbdwin.h                     |    6 +-
>   libavcodec/mips/Makefile                |    4 +
>   libavcodec/mips/ac3dec_fixed.c          | 1660 +++++++++++++++++++++++++++++++
>   libavcodec/mips/ac3dec_fixed.h          |  234 +++++
>   libavcodec/mips/dsputil_mips_fixed.c    |  153 +++
>   libavcodec/mips/fft_mips_fixed.c        |  906 +++++++++++++++++
>   libavcodec/mips/fft_table_fixed.h       |  105 ++
>   libavcodec/mips/fmtconvert_mips_fixed.c |  226 +++++
>   libavutil/common.h                      |   12 +

Just by looking at the list of files, I'd say that reviewing would be 
far easier if you split the patch: you should first send one patch 
implementing an AC3 fixed point decoder (with no mips-specific code and 
no changes inside libavcodec/mips) and then send a second patch with 
only the MIPS optimizations.

>   18 files changed, 3520 insertions(+), 4 deletions(-)
>   create mode 100644 libavcodec/mips/ac3dec_fixed.c
>   create mode 100644 libavcodec/mips/ac3dec_fixed.h
>   create mode 100644 libavcodec/mips/dsputil_mips_fixed.c
>   create mode 100644 libavcodec/mips/fft_mips_fixed.c
>   create mode 100644 libavcodec/mips/fft_table_fixed.h
>   create mode 100644 libavcodec/mips/fmtconvert_mips_fixed.c
>
> diff --git a/doc/mips.txt b/doc/mips.txt
> index 6fa6fb4..5b2e710 100644
> --- a/doc/mips.txt
> +++ b/doc/mips.txt
> @@ -47,6 +47,8 @@ Files that have MIPS copyright notice in them:
>   * libavutil/mips/
>         libm_mips.h
>   * libavcodec/mips/
> +      ac3dec_fixed.c
> +      ac3dec_fixed.h
>         acelp_filters_mips.c
>         acelp_vectors_mips.c
>         amrwbdec_mips.c
> @@ -57,9 +59,13 @@ Files that have MIPS copyright notice in them:
>         compute_antialias_float.h
>         lsp_mips.h
>         dsputil_mips.c
> +      dsputil_mips_fixed.c
>         fft_mips.c
> +      fft_mips_fixed.c
>         fft_table.h
> +      fft_table_fixed.h
>         fft_init_table.c
>         fmtconvert_mips.c
> +      fmtconvert_mips_fixed.c
>         mpegaudiodsp_mips_fixed.c
>         mpegaudiodsp_mips_float.c
> diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
> index 8305bc2..319286f 100644
> --- a/libavcodec/allcodecs.c
> +++ b/libavcodec/allcodecs.c
> @@ -273,6 +273,9 @@ void avcodec_register_all(void)
>       REGISTER_DECODER (AAC_LATM, aac_latm);
>       REGISTER_ENCDEC  (AC3, ac3);
>       REGISTER_ENCODER (AC3_FIXED, ac3_fixed);
> +#if (ARCH_MIPS)
> +    REGISTER_DECODER (AC3_FIXED, ac3_fixed);
> +#endif /* ARCH_MIPS */
>       REGISTER_ENCDEC  (ALAC, alac);
>       REGISTER_DECODER (ALS, als);
>       REGISTER_DECODER (AMRNB, amrnb);
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index a1d69c4..8f9aa1c 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -2515,6 +2515,26 @@ static void vector_fmul_window_c(float *dst, const float *src0,
>       }
>   }
>
> +#if (ARCH_MIPS)
> +static void vector_fmul_window_fixed_c(int *dst, const int16_t *src0,
> +                                       const int16_t *src1, const int16_t *win, int len)
> +{
> +    int i,j;
> +    dst += len;
> +    win += len;
> +    src0+= len;
> +
> +    for (i=-len, j=len-1; i<0; i++, j--) {
> +        int s0 = src0[i];
> +        int s1 = src1[j];
> +        int wi = win[i];
> +        int wj = win[j];
> +        dst[i] = (s0*wj - s1*wi + 0x4000) >> 15;
> +        dst[j] = (s0*wi + s1*wj + 0x4000) >> 15;
> +    }
> +}
> +#endif /* ARCH_MIPS */

Would there be any speed loss in taking the "0x4000" rounding constant and 
the shift value as function parameters?

>   static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
>                                    int len)
>   {
> @@ -3042,6 +3062,9 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>       c->vector_fmul_reverse = vector_fmul_reverse_c;
>       c->vector_fmul_add = vector_fmul_add_c;
>       c->vector_fmul_window = vector_fmul_window_c;
> +#if (ARCH_MIPS)
> +    c->vector_fmul_window_fixed = vector_fmul_window_fixed_c;
> +#endif
>       c->vector_clipf = vector_clipf_c;
>       c->scalarproduct_int16 = scalarproduct_int16_c;
>       c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
> @@ -3177,6 +3200,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
>       if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
>       if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
>       if (HAVE_MIPSFPU)    ff_dsputil_init_mips  (c, avctx);
> +    if (HAVE_MIPSDSPR2)  ff_dsputil_init_mips_fixed(c);

While I agree that separating fixed and float initialization makes 
sense, as it is today it is not separated. Ideally, one could split 
dsputils into float and non-float code, but that should be done for all 
archs. Just for adding a mips optimization, I'd say you should just put 
it all in ff_dsputil_init_mips() for consistency.

>       for (i = 0; i < 4; i++) {
>           for (j = 0; j < 16; j++) {
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index 18dd316..d437844 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -398,6 +398,9 @@ typedef struct DSPContext {
>       void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
>       /* assume len is a multiple of 4, and arrays are 16-byte aligned */
>       void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
> +#if (ARCH_MIPS)
> +    void (*vector_fmul_window_fixed)(int *dst, const int16_t *src0, const int16_t *src1, const int16_t *win, int len);
> +#endif /* ARCH_MIPS */

That's not OK. DSPUtils should be a generic toolbox with at least a C 
version of every function implemented.

>       /* assume len is a multiple of 8, and arrays are 16-byte aligned */
>       void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
>       /**
> @@ -624,6 +627,7 @@ void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
>   void ff_dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
>   void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
>   void ff_dsputil_init_mips(DSPContext* c, AVCodecContext *avctx);
> +void ff_dsputil_init_mips_fixed(DSPContext* c);
>
>   void ff_dsputil_init_dwt(DSPContext *c);
>   void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
> diff --git a/libavcodec/fft.c b/libavcodec/fft.c
> index 39c8972..a57b62e 100644
> --- a/libavcodec/fft.c
> +++ b/libavcodec/fft.c
> @@ -167,6 +167,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
>   #else
>       if (CONFIG_MDCT)  s->mdct_calcw = ff_mdct_calcw_c;
>       if (ARCH_ARM)     ff_fft_fixed_init_arm(s);
> +    if (ARCH_MIPS)    ff_fft_fixed_init_mips(s);
>   #endif

That looks fine.

>       for(j=4; j<=nbits; j++) {
> diff --git a/libavcodec/fft.h b/libavcodec/fft.h
> index 15e5a12..deabbab 100644
> --- a/libavcodec/fft.h
> +++ b/libavcodec/fft.h
> @@ -80,6 +80,10 @@ struct FFTContext {
>       void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
>       void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
>       void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
> +#if (ARCH_MIPS)
> +    void (*fft_fixed_calc)(struct FFTContext *s, FFTComplex *z);
> +    void (*imdct_fixed_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
> +#endif /* ARCH_MIPS */

Again, this doesn't look good.

>       void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
>       void (*mdct_calcw)(struct FFTContext *s, FFTDouble *output, const FFTSample *input);
>       int fft_permutation;
> @@ -140,6 +144,9 @@ void ff_fft_init_arm(FFTContext *s);
>   void ff_fft_init_mips(FFTContext *s);
>   #else
>   void ff_fft_fixed_init_arm(FFTContext *s);
> +#if (ARCH_MIPS)
> +void ff_fft_fixed_init_mips(FFTContext *s);
> +#endif
>   #endif
>
>   void ff_fft_end(FFTContext *s);
> @@ -147,6 +154,11 @@ void ff_fft_end(FFTContext *s);
>   #define ff_mdct_init FFT_NAME(ff_mdct_init)
>   #define ff_mdct_end  FFT_NAME(ff_mdct_end)
>
> +#if (ARCH_MIPS)
> +int ff_mdct_fixed_init_hardcoded_128(FFTContext *s, int nbits, int inverse, int scale);
> +int ff_mdct_fixed_init_hardcoded(FFTContext *s, int nbits, int inverse, int scale);

Those two belong to something inside libavcodec/mips.

> +int ff_mdct_fixed_init(FFTContext *s, int nbits, int inverse, int scale);
> +#endif /* ARCH_MIPS */
>   int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
>   void ff_mdct_end(FFTContext *s);
>
> diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
> index e47c205..b7b0345 100644
> --- a/libavcodec/fmtconvert.c
> +++ b/libavcodec/fmtconvert.c
> @@ -28,7 +28,18 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
>       for(i=0; i<len; i++)
>           dst[i] = src[i] * mul;
>   }
> +#if (ARCH_MIPS)
> +static void int32_to_fixed_fmul_scalar_c(int16_t *dst, const int *src, int mul, int len) {
> +    int i;
> +    for(i=0; i<len; i++)
> +    dst[i] = (src[i] * mul + 0x8000) >> 16;
> +}
>
> +static av_always_inline int fixed_to_int16_one(const int *src)
> +{
> +    return av_clip_int16_c_fixed(*src);
> +}
> +#endif /* ARCH_MIPS */
>   static av_always_inline int float_to_int16_one(const float *src){
>       return av_clip_int16(lrintf(*src));
>   }
> @@ -56,6 +67,37 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
>       }
>   }
>
> +#if (ARCH_MIPS)
> +static void fixed_to_int16_interleave_c(int16_t *dst, const int **src,
> +                                        long len, int channels)
> +{
> +    int i,j,c;
> +    if(channels==2) {
> +        for(i=0; i<len; i++) {
> +            dst[2*i] = fixed_to_int16_one(src[0]+i);
> +            dst[2*i+1] = fixed_to_int16_one(src[1]+i);
> +        }
> +    }
> +    else {
> +        if(channels==6) {
> +            for(i=0; i<len; i++) {
> +                dst[6*i] = fixed_to_int16_one(src[0]+i);
> +                dst[6*i+1] = fixed_to_int16_one(src[1]+i);
> +                dst[6*i+2] = fixed_to_int16_one(src[2]+i);
> +                dst[6*i+3] = fixed_to_int16_one(src[3]+i);
> +                dst[6*i+4] = fixed_to_int16_one(src[4]+i);
> +                dst[6*i+5] = fixed_to_int16_one(src[5]+i);
> +            }
> +        }
> +        else {
> +            for(c=0; c<channels; c++)
> +                for(i=0, j=c; i<len; i++, j+=channels)
> +                    dst[j] = fixed_to_int16_one(src[c]+i);
> +        }
> +    }
> +}
> +#endif /* ARCH_MIPS */

Again, this should not be MIPS-specific.

>   void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
>                              int channels)
>   {
> @@ -75,9 +117,45 @@ void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
>       }
>   }
>
> +#if (ARCH_MIPS)
> +void ff_fixed_interleave_c(int *dst, const int **src, unsigned int len,
> +                           int channels)

Nor this.

> +{
> +    int j, c;
> +    unsigned int i;
> +    if (channels == 6) {
> +        for (i = 0; i < len; i++) {
> +            dst[6*i]   = src[0][i];
> +            dst[6*i+1] = src[1][i];
> +            dst[6*i+2] = src[2][i];
> +            dst[6*i+3] = src[3][i];
> +            dst[6*i+4] = src[4][i];
> +            dst[6*i+5] = src[5][i];
> +        }
> +    }
> +    else if (channels == 2) {
> +        for (i = 0; i < len; i++) {
> +            dst[2*i]   = src[0][i];
> +            dst[2*i+1] = src[1][i];
> +        }
> +    } else if (channels == 1 && len < INT_MAX / sizeof(int)) {
> +        memcpy(dst, src[0], len * sizeof(int));
> +    } else {
> +        for (c = 0; c < channels; c++)
> +            for (i = 0, j = c; i < len; i++, j += channels)
> +                dst[j] = src[c][i];
> +    }
> +}
> +#endif /* ARCH_MIPS */
> +
>   av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
>   {
>       c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
> +#if (ARCH_MIPS)
> +    c->int32_to_fixed_fmul_scalar = int32_to_fixed_fmul_scalar_c;
> +    c->fixed_to_int16_interleave  = fixed_to_int16_interleave_c;
> +    c->fixed_interleave           = ff_fixed_interleave_c;
> +#endif /* ARCH_MIPS */

And again.

>       c->float_to_int16             = float_to_int16_c;
>       c->float_to_int16_interleave  = float_to_int16_interleave_c;
>       c->float_interleave           = ff_float_interleave_c;
> @@ -86,6 +164,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
>       if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
>       if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
>       if (HAVE_MIPSFPU) ff_fmt_convert_init_mips(c);
> +    if (HAVE_MIPSDSPR1) ff_fmt_convert_init_mips_fixed(c, avctx);
>   }
>
>   /* ffdshow custom code */
> diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
> index ab2caa2..49e7992 100644
> --- a/libavcodec/fmtconvert.h
> +++ b/libavcodec/fmtconvert.h
> @@ -36,7 +36,55 @@ typedef struct FmtConvertContext {
>        *            constraints: multiple of 8
>        */
>       void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
> -
> +#if (ARCH_MIPS)
> +    /**
> +     * Multiply a array of int32_t by a int32_t value and convert to int16_t.
> +     * @param dst destination array of int16_t.
> +     *            constraints: 16-byte aligned
> +     * @param src source array of int32_t.
> +     *            constraints: 16-byte aligned
> +     * @param len number of elements in array.
> +     *            constraints: multiple of 8
> +     */
> +    void (*int32_to_fixed_fmul_scalar)(int16_t *dst, const int *src, int mul, int len);
> +    /**
> +     * Convert an array of int32_t to an array of int16_t.
> +     *
> +     * @param dst destination array of int16_t.
> +     *            constraints: 16-byte aligned
> +     * @param src source array of int32_t.
> +     *            constraints: 16-byte aligned
> +     * @param len number of elements to convert.
> +     *            constraints: multiple of 8
> +     */
> +    void (*fixed_to_int16)(int16_t *dst, const int *src, long len);
> +    /**
> +     * Convert multiple arrays of int32_t to an interleaved array of int16_t.
> +     *
> +     * @param dst destination array of interleaved int16_t.
> +     *            constraints: 16-byte aligned
> +     * @param src source array of int32_t arrays, one for each channel.
> +     *            constraints: 16-byte aligned
> +     * @param len number of elements to convert.
> +     *            constraints: multiple of 8
> +     * @param channels number of channels
> +     */
> +    void (*fixed_to_int16_interleave)(int16_t *dst, const int **src,
> +                                      long len, int channels);
> +    /**
> +     * Convert multiple arrays of int32_t to an array of interleaved int32_t.
> +     *
> +     * @param dst destination array of interleaved int32_t.
> +     *            constraints: 16-byte aligned
> +     * @param src source array of int32_t arrays, one for each channel.
> +     *            constraints: 16-byte aligned
> +     * @param len number of elements to convert.
> +     *            constraints: multiple of 8
> +     * @param channels number of channels
> +     */
> +    void (*fixed_interleave)(int *dst, const int **src, unsigned int len,
> +                             int channels);
> +#endif /* ARCH_MIPS */
>       /**
>        * Convert an array of float to an array of int16_t.
>        *
> @@ -86,7 +134,12 @@ typedef struct FmtConvertContext {
>
>   void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
>                              int channels);
> -
> +#if (ARCH_MIPS)
> +void ff_fixed_interleave_c(int *dst, const int **src, unsigned int len,
> +                           int channels);
> +void fixed_interleave(int *dst, const int **src, unsigned int len, int channels);
> +void ff_fmt_convert_init_mips_fixed(FmtConvertContext *c, AVCodecContext *avctx);
> +#endif /* ARCH_MIPS */
>   av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
>
>   void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
> diff --git a/libavcodec/kbdwin.c b/libavcodec/kbdwin.c
> index 2722312..4f76b20 100644
> --- a/libavcodec/kbdwin.c
> +++ b/libavcodec/kbdwin.c
> @@ -46,3 +46,35 @@ av_cold void ff_kbd_window_init(float *window, float alpha, int n)
>      for (i = 0; i < n; i++)
>          window[i] = sqrt(local_window[i] / sum);
>   }
> +
> +#if (ARCH_MIPS)
> +av_cold void ff_kbd_fixed_window_init(int16_t *window, float alpha, int n)
> +{
> +    int i, j;
> +    double sum = 0.0, bessel, tmp;
> +    double local_window[FF_KBD_WINDOW_MAX];
> +    double alpha2 = (alpha * M_PI / n) * (alpha * M_PI / n);
> +
> +    assert(n <= FF_KBD_WINDOW_MAX);
> +
> +    for (i = 0; i < n; i++) {
> +        tmp = i * (n - i) * alpha2;
> +        bessel = 1.0;
> +        for (j = BESSEL_I0_ITER; j > 0; j--)
> +            bessel = bessel * tmp / (j * j) + 1;
> +        sum += bessel;
> +        local_window[i] = sum;
> +    }
> +
> +    sum++;
> +    for (i = 0; i < n; i++)
> +    {
> +        int tmp;
> +
> +        tmp = (int)(32767*sqrt(local_window[i] / sum) + 0.5);
> +        if (tmp > 32767)
> +            tmp = 32767;
> +        window[i] = (int16_t)tmp;
> +    }
> +}
> +#endif

Again...

> diff --git a/libavcodec/kbdwin.h b/libavcodec/kbdwin.h
> index 4b93975..66621a2 100644
> --- a/libavcodec/kbdwin.h
> +++ b/libavcodec/kbdwin.h
> @@ -18,7 +18,7 @@
>
>   #ifndef AVCODEC_KBDWIN_H
>   #define AVCODEC_KBDWIN_H
> -
> +#include "config.h"
>   /**
>    * Maximum window size for ff_kbd_window_init.
>    */
> @@ -31,5 +31,7 @@
>    * @param   n       size of half window, max FF_KBD_WINDOW_MAX
>    */
>   void ff_kbd_window_init(float *window, float alpha, int n);
> -
> +#if (ARCH_MIPS)
> +void ff_kbd_fixed_window_init(int16_t *window, float alpha, int n);
> +#endif
>   #endif /* AVCODEC_KBDWIN_H */
> diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
> index ff46768..4830039 100644
> --- a/libavcodec/mips/Makefile
> +++ b/libavcodec/mips/Makefile
> @@ -17,3 +17,7 @@ OBJS-$(CONFIG_FFT)                        += mips/fft_init_table.o
>   MIPSFPU-OBJS-$(CONFIG_FFT)                += mips/fft_mips.o
>   MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/fmtconvert_mips.o
>   MIPSFPU-OBJS-$(HAVE_INLINE_ASM)           += mips/dsputil_mips.o
> +MIPSDSPR1-OBJS-$(HAVE_INLINE_ASM)         += mips/fmtconvert_mips_fixed.o
> +MIPSDSPR2-OBJS-$(HAVE_INLINE_ASM)         += mips/dsputil_mips_fixed.o
> +OBJS-$(CONFIG_FFT)                        += mips/fft_mips_fixed.o
> +OBJS-$(CONFIG_AC3_FIXED_DECODER)          += mips/ac3dec_fixed.o
> diff --git a/libavcodec/mips/ac3dec_fixed.c b/libavcodec/mips/ac3dec_fixed.c
> new file mode 100644
> index 0000000..50a30dd
> --- /dev/null
> +++ b/libavcodec/mips/ac3dec_fixed.c

This should definitely not be mips specific. If you are implementing an 
AC3 fixed-point decoder, it'd be very nice if people on every arch could 
use it.

> @@ -0,0 +1,1660 @@
> +/*
> + * Copyright (c) 2012
> + *      MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + *    contributors may be used to endorse or promote products derived from
> + *    this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author:  Stanislav Ocovaj (socovaj at mips.com)
> + *
> + * AC3 fixed-point decoder for MIPS platforms
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define CONFIG_FFT_FLOAT 0
> +
> +#include <stdio.h>
> +#include <stddef.h>
> +#include <math.h>
> +#include <string.h>
> +
> +#include "libavutil/crc.h"
> +#include "libavcodec/internal.h"
> +#include "libavcodec/aac_ac3_parser.h"
> +#include "libavcodec/ac3_parser.h"
> +#include "ac3dec_fixed.h"
> +#include "libavcodec/ac3dec_data.h"
> +#include "libavcodec/ac3dsp.h"
> +#include "libavcodec/kbdwin.h"
> +
> +#define INT2FIXED(x) (((x) << 15)  * ( x < -32767 ? -1 : 1))
> +#define MULT_FINT(x, y)  (((long long)(x) * (y) ) >> 16 )
> +#define ADD_FINT(x, y) ( (x) + (y) )
> +#define SUB_FINT(a, b) ( (a) - (b) )
> +#define SUB_INT_WITH_FINT_AND_CONVERT_TO_FINT(x, y) ( INT2FIXED(x) - (y) )
> +#define DIV_INT_WITH_FINT_AND_CONVERT_TO_FINT(x, y) \
> +                ( (((long long)(x) << 30)  * ( x < -32767 ? -1 : 1) ) / (y) )
> +#define MULT_INT_WITH_FINT_AND_CONVERT_TO_FINT(x, y) \
> +                                    (((long long)INT2FIXED(x) * (y) ) >> 15 )
> +
> +/**
> + * table for ungrouping 3 values in 7 bits.
> + * used for exponents and bap=2 mantissas
> + */
> +static uint8_t ungroup_3_in_7_bits_tab[128][3];
> +
> +
> +/** tables for ungrouping mantissas */
> +static int b1_mantissas[32][3];
> +static int b2_mantissas[128][3];
> +static int b3_mantissas[8];
> +static int b4_mantissas[128][2];
> +static int b5_mantissas[16];
> +
> +/**
> + * Quantization table: levels for symmetric. bits for asymmetric.
> + * reference: Table 7.18 Mapping of bap to Quantizer
> + */
> +static const uint8_t quantization_tab[16] = {
> +    0, 3, 5, 7, 11, 15,
> +    5, 6, 7, 8, 9, 10, 11, 12, 14, 16
> +};
> +
> +/** Adjustments in dB gain */
> +static const int gain_levels_fixed[9] = {
> +    LEVEL_FIXED_PLUS_3DB,
> +    LEVEL_FIXED_PLUS_1POINT5DB,
> +    LEVEL_FIXED_ONE,
> +    LEVEL_FIXED_MINUS_1POINT5DB,
> +    LEVEL_FIXED_MINUS_3DB,
> +    LEVEL_FIXED_MINUS_4POINT5DB,
> +    LEVEL_FIXED_MINUS_6DB,
> +    LEVEL_FIXED_ZERO,
> +    LEVEL_FIXED_MINUS_9DB
> +};
> +
> +/**
> + * Table for center mix levels
> + * reference: Section 5.4.2.4 cmixlev
> + */
> +static const uint8_t center_levels[4] = { 4, 5, 6, 5 };
> +
> +/**
> + * Table for surround mix levels
> + * reference: Section 5.4.2.5 surmixlev
> + */
> +static const uint8_t surround_levels[4] = { 4, 6, 7, 6 };
> +
> +/**
> + * Table for default stereo downmixing coefficients
> + * reference: Section 7.8.2 Downmixing Into Two Channels
> + */
> +static const uint8_t ac3_default_coeffs[8][5][2] = {
> +    { { 2, 7 }, { 7, 2 },                               },
> +    { { 4, 4 },                                         },
> +    { { 2, 7 }, { 7, 2 },                               },
> +    { { 2, 7 }, { 5, 5 }, { 7, 2 },                     },
> +    { { 2, 7 }, { 7, 2 }, { 6, 6 },                     },
> +    { { 2, 7 }, { 5, 5 }, { 7, 2 }, { 8, 8 },           },
> +    { { 2, 7 }, { 7, 2 }, { 6, 7 }, { 7, 6 },           },
> +    { { 2, 7 }, { 5, 5 }, { 7, 2 }, { 6, 7 }, { 7, 6 }, },
> +};
> +
> +/**
> + * Symmetrical Dequantization
> + * reference: Section 7.3.3 Expansion of Mantissas for Symmetrical Quantization
> + *            Tables 7.19 to 7.23
> + */
> +static inline int
> +symmetric_dequant(int code, int levels)
> +{
> +    return ((code - (levels >> 1)) << 24) / levels;
> +}
> +
> +/**
> + * Initialize tables at runtime.
> + */
> +static av_cold void ac3_tables_init(void)
> +{
> +    int i;
> +
> +    /* generate table for ungrouping 3 values in 7 bits
> +       reference: Section 7.1.3 Exponent Decoding */
> +    for(i=0; i<128; i++) {
> +        ungroup_3_in_7_bits_tab[i][0] =  i / 25;
> +        ungroup_3_in_7_bits_tab[i][1] = (i % 25) / 5;
> +        ungroup_3_in_7_bits_tab[i][2] = (i % 25) % 5;
> +    }
> +
> +    /* generate grouped mantissa tables
> +       reference: Section 7.3.5 Ungrouping of Mantissas */
> +    for(i=0; i<32; i++) {
> +        /* bap=1 mantissas */
> +        b1_mantissas[i][0] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][0], 3);
> +        b1_mantissas[i][1] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][1], 3);
> +        b1_mantissas[i][2] = symmetric_dequant(ff_ac3_ungroup_3_in_5_bits_tab[i][2], 3);
> +    }
> +    for(i=0; i<128; i++) {
> +        /* bap=2 mantissas */
> +        b2_mantissas[i][0] = symmetric_dequant(ungroup_3_in_7_bits_tab[i][0], 5);
> +        b2_mantissas[i][1] = symmetric_dequant(ungroup_3_in_7_bits_tab[i][1], 5);
> +        b2_mantissas[i][2] = symmetric_dequant(ungroup_3_in_7_bits_tab[i][2], 5);
> +
> +        /* bap=4 mantissas */
> +        b4_mantissas[i][0] = symmetric_dequant(i / 11, 11);
> +        b4_mantissas[i][1] = symmetric_dequant(i % 11, 11);
> +    }
> +    /* generate ungrouped mantissa tables
> +       reference: Tables 7.21 and 7.23 */
> +    for(i=0; i<7; i++) {
> +        /* bap=3 mantissas */
> +        b3_mantissas[i] = symmetric_dequant(i, 7);
> +    }
> +    for(i=0; i<15; i++) {
> +        /* bap=5 mantissas */
> +        b5_mantissas[i] = symmetric_dequant(i, 15);
> +    }
> +}
> +
> +/**
> + * AVCodec initialization
> + */
> +av_cold int ac3_fixed_decode_init(AVCodecContext *avctx)
> +{
> +    AC3FixedDecodeContext *s = avctx->priv_data;
> +    s->avctx = avctx;
> +
> +    ff_ac3_common_init();
> +    ac3_tables_init();
> +
> +#if !CONFIG_HARDCODED_TABLES
> +    ff_mdct_fixed_init(&s->imdct_256, 8, 1, 1);
> +    ff_mdct_fixed_init(&s->imdct_512, 9, 1, 1);
> +#else
> +    ff_mdct_fixed_init_hardcoded(&s->imdct_256, 8, 1, 1);
> +    ff_mdct_fixed_init_hardcoded_128(&s->imdct_512, 9, 1, 1);
> +#endif
> +
> +    ff_kbd_fixed_window_init(s->window, 5.0, 256);
> +    ff_dsputil_init(&s->dsp, avctx);
> +
> +    ff_fmt_convert_init(&s->fmt_conv, avctx);
> +    av_lfg_init(&s->dith_state, 0);
> +
> +
> +    if (avctx->sample_fmt == AV_SAMPLE_FMT_FLT) {
> +        //DONE s->mul_bias = 1.0f;
> +        s->mul_bias = 65536;
> +        avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
> +    } else {
> +        //DONE s->mul_bias = 32767.0f;
> +        s->mul_bias = 2147418112;
> +        avctx->sample_fmt = AV_SAMPLE_FMT_S16;
> +    }
> +
> +    /* allow downmixing to stereo or mono */
> +    if (avctx->channels > 0 && avctx->request_channels > 0 &&
> +            avctx->request_channels < avctx->channels &&
> +            avctx->request_channels <= 2) {
> +        avctx->channels = avctx->request_channels;
> +    }
> +    s->downmixed = 1;
> +
> +    avcodec_get_frame_defaults(&s->frame);
> +    avctx->coded_frame = &s->frame;
> +
> +    return 0;
> +}
> +
> +/**
> + * Parse the 'sync info' and 'bit stream info' from the AC-3 bitstream.
> + * GetBitContext within AC3DecodeContext must point to
> + * the start of the synchronized AC-3 bitstream.
> + */
> +static int ac3_parse_header_fixed(AC3FixedDecodeContext *s)
> +{
> +    GetBitContext *gbc = &s->gbc;
> +    int i;
> +
> +    /* read the rest of the bsi. read twice for dual mono mode. */
> +    i = !(s->channel_mode);
> +    do {
> +        skip_bits(gbc, 5); /* skip dialog normalization */
> +        if (get_bits1(gbc))
> +            skip_bits(gbc, 8); /* skip compression */
> +        if (get_bits1(gbc))
> +            skip_bits(gbc, 8); /* skip language code */
> +        if (get_bits1(gbc))
> +            skip_bits(gbc, 7); /* skip audio production information */
> +    } while (i--);
> +
> +    skip_bits(gbc, 2); /* skip copyright bit and original bitstream bit */
> +
> +    /* skip the timecodes (or extra bitstream information for Alternate Syntax)
> +       TODO: read & use the xbsi1 downmix levels */
> +    if (get_bits1(gbc))
> +        skip_bits(gbc, 14); /* kip timecode1 / xbsi1 */
> +    if (get_bits1(gbc))
> +        skip_bits(gbc, 14); /* skip timecode2 / xbsi2 */
> +
> +    /* skip additional bitstream info */
> +    if (get_bits1(gbc)) {
> +        i = get_bits(gbc, 6);
> +        do {
> +            skip_bits(gbc, 8);
> +        } while(i--);
> +    }
> +    return 0;
> +}

That's a lot of copy&pasted code. This is pretty bad for maintenance. 
The preferred solution in this case is using templates, in the same way 
it is done for the MP3 decoder. See commit 
b91d46614df189e7905538e7f5c4ed9c7ed0d274 to see how it was done for MP3 
(but it was done the other way around: a float implementation was added 
to the fixed-point one).


> +/*
> + * Copyright (c) 2012
> + *      MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + *    contributors may be used to endorse or promote products derived from
> + *    this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Author:  Dragan Mrdjan (dmrdjan at mips.com)
> + *
> + * DSP utils optimized for MIPS fixed-point platforms
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Reference: libavcodec/dsputil.c
> + */
> +
> +#include "config.h"
> +#include "libavcodec/dsputil.h"
> +
> +static void vector_fmul_window_mips_fixed(int *dst, const int16_t *src0, const int16_t *src1, const int16_t *win, int len)
> +{
> +    int i,j;
> +    int *dst_i, *dst_j;
> +    const int16_t * src0_i, *src1_j;
> +    const int16_t *win_i, *win_j;
> +    int16_t s0, s01, s02, s03, s1, s11, s12, s13;
> +    int16_t wi, wi1, wi2, wi3, wj, wj1, wj2, wj3;
> +
> +    dst += len;
> +    win += len;
> +    src0 += len;
> +
> +    for(i=-len, j=len-1; i<0; i+=4, j-=4) {
> +        dst_i = dst + i;
> +        dst_j = dst + j;
> +        src0_i = src0 + i;
> +        src1_j = src1 + j;
> +        win_i = win + i;
> +        win_j = win + j;
> +
> +        __asm__ volatile (
> +            "lh             %[s0],      0(%[src0_i])                \n\t"
> +            "lh             %[s1],      0(%[src1_j])                \n\t"
> +            "lh             %[wi],      0(%[win_i])                 \n\t"
> +            "lh             %[wj],      0(%[win_j])                 \n\t"
> +            "append         %[s0],      %[s1],          16          \n\t"
> +            "append         %[wj],      %[wi],          16          \n\t"
> +            "mult           $ac0,       $0,             $0          \n\t"
> +            "mulsaq_s.w.ph  $ac0,       %[s0],          %[wj]       \n\t"
> +            "mult           $ac1,       $0,             $0          \n\t"
> +            "dpaqx_s.w.ph   $ac1,       %[s0],          %[wj]       \n\t"
> +            "lh             %[s01],     2(%[src0_i])                \n\t"
> +            "lh             %[s11],     -2(%[src1_j])               \n\t"
> +            "extr_r.w       %[s1],      $ac0,           16          \n\t"
> +            "lh             %[wi1],     2(%[win_i])                 \n\t"
> +            "lh             %[wj1],     -2(%[win_j])                \n\t"
> +            "extr_r.w       %[wj],      $ac1,           16          \n\t"
> +            "append         %[s01],     %[s11],         16          \n\t"
> +            "append         %[wj1],     %[wi1],         16          \n\t"
> +            "mult           $ac2,       $0,             $0          \n\t"
> +            "mulsaq_s.w.ph  $ac2,       %[s01],         %[wj1]      \n\t"
> +            "sw             %[s1],      0(%[dst_i])                 \n\t"
> +            "sw             %[wj],       0(%[dst_j])                \n\t"
> +            "mult           $ac3,       $0,             $0          \n\t"
> +            "dpaqx_s.w.ph   $ac3,       %[s01],         %[wj1]      \n\t"
> +            "extr_r.w       %[s11],     $ac2,           16          \n\t"
> +            "extr_r.w       %[wj1],     $ac3,           16          \n\t"
> +            "lh             %[s02],     4(%[src0_i])                \n\t"
> +            "lh             %[s12],     -4(%[src1_j])               \n\t"
> +            "lh             %[wi2],     4(%[win_i])                 \n\t"
> +            "lh             %[wj2],     -4(%[win_j])                \n\t"
> +            "append         %[s02],     %[s12],         16          \n\t"
> +            "append         %[wj2],     %[wi2],         16          \n\t"
> +            "mult           $ac0,       $0,             $0          \n\t"
> +            "mulsaq_s.w.ph  $ac0,       %[s02],         %[wj2]      \n\t"
> +            "sw             %[s11],     4(%[dst_i])                 \n\t"
> +            "sw             %[wj1],     -4(%[dst_j])                \n\t"
> +            "mult           $ac1,       $0,             $0          \n\t"
> +            "dpaqx_s.w.ph   $ac1,       %[s02],         %[wj2]      \n\t"
> +            "extr_r.w       %[s12],     $ac0,           16          \n\t"
> +            "lh             %[s03],     6(%[src0_i])                \n\t"
> +            "lh             %[s13],     -6(%[src1_j])               \n\t"
> +            "lh             %[wi3],     6(%[win_i])                 \n\t"
> +            "lh             %[wj3],     -6(%[win_j])                \n\t"
> +            "append         %[s03],     %[s13],         16          \n\t"
> +            "append         %[wj3],     %[wi3],         16          \n\t"
> +            "mult           $ac2,       $0,             $0          \n\t"
> +            "mulsaq_s.w.ph  $ac2,       %[s03],         %[wj3]      \n\t"
> +            "sw             %[s12],     8(%[dst_i])                 \n\t"
> +            "extr_r.w       %[wj2],     $ac1,           16          \n\t"
> +            "mult           $ac3,       $0,             $0          \n\t"
> +            "dpaqx_s.w.ph   $ac3,       %[s03],         %[wj3]      \n\t"
> +            "extr_r.w       %[s13],     $ac2,           16          \n\t"
> +            "extr_r.w       %[wj3],     $ac3,           16          \n\t"
> +            "sw             %[wj2],     -8(%[dst_j])                \n\t"
> +            "sw             %[s13],     12(%[dst_i])                \n\t"
> +            "sw             %[wj3],     -12(%[dst_j])               \n\t"
> +
> +            : [s0] "=&r" (s0), [s1] "=&r" (s1), [wi] "=&r" (wi),
> +              [wj] "=&r" (wj), [s03] "=&r" (s03), [s01] "=&r" (s01),
> +              [s11] "=&r" (s11), [wi1] "=&r" (wi1), [wj1] "=&r" (wj1),
> +              [s13] "=&r" (s13), [s02] "=&r" (s02), [s12] "=&r" (s12),
> +              [wi2] "=&r" (wi2), [wj2] "=&r" (wj2), [wi3] "=&r" (wi3),
> +              [wj3] "=&r" (wj3)
> +            : [src0_i] "r" (src0_i), [win_j] "r" (win_j ), [src1_j] "r" (src1_j),
> +              [win_i] "r" (win_i), [dst_i] "r" (dst_i), [dst_j] "r" (dst_j)
> +            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
> +              "$ac3hi", "$ac3lo"
> +        );
> +    }
> +}
> +
> +void ff_dsputil_init_mips_fixed(DSPContext* c) {
> +    c->vector_fmul_window_fixed = vector_fmul_window_mips_fixed;
> +}
> diff --git a/libavcodec/mips/fft_mips_fixed.c b/libavcodec/mips/fft_mips_fixed.c
> new file mode 100644
> index 0000000..9fc9287
> --- /dev/null
> +++ b/libavcodec/mips/fft_mips_fixed.c
> @@ -0,0 +1,906 @@

This also deserves a patch on its own. Your decoder should work fine 
using the C FFT, and the code using a fixed-point FFT can use this 
without needing an AC3 decoder.

If you split your patches into smaller chunks, it is easier to review 
and easier to understand regressions from GIT history.

> +/*
> + * Copyright (c) 2012
> + *      MIPS Technologies, Inc., California.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
> + *    contributors may be used to endorse or promote products derived from
> + *    this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * Authors:  Stanislav Ocovaj (socovaj at mips.com)
> + *           Dragan Mrdjan    (dmrdjan at mips.com)
> + *           Zoran Lukic      (zlukic at mips.com)
> + *           Bojan Zivkovic   (bojan at mips.com)
> + *
> + * Optimization of FFT and MDCT/IMDCT transforms for MIPS fixed-point
> + * architecture
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#define CONFIG_FFT_FLOAT 0
> +#include "libavcodec/fft.h"
> +#include "libavcodec/mips/fft_table.h"
> +
> +#include "fft_table_fixed.h"
> +
> +av_cold int ff_mdct_fixed_init(FFTContext *s, int nbits, int inverse, int scale)
> +{
> +    int n, n4, i;
> +    double alpha, theta;
> +    int tstep;
> +
> +    memset(s, 0, sizeof(*s));
> +    n = 1 << nbits;
> +    s->mdct_bits = nbits;
> +    s->mdct_size = n;
> +    n4 = n >> 2;
> +    s->mdct_permutation = FF_MDCT_PERM_NONE;
> +
> +    if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
> +        goto fail;
> +
> +    s->tcos = av_malloc((n * sizeof(int)) / 2);
> +
> +    if (!s->tcos)
> +        goto fail;
> +
> +    switch (s->mdct_permutation) {
> +    case FF_MDCT_PERM_NONE:
> +        s->tsin = s->tcos + n4;
> +        tstep = 1;
> +        break;
> +    case FF_MDCT_PERM_INTERLEAVE:
> +        s->tsin = s->tcos + 1;
> +        tstep = 2;
> +        break;
> +    default:
> +        goto fail;
> +    }
> +    theta = 0.125 + (scale < 0 ? n4 : 0);
> +
> +    for(i=0;i<n4;i++) {
> +    int tmp;
> +
> +        alpha = 2 * M_PI * (i + theta) / n;
> +        tmp = (int)(-cos(alpha) * 65536);
> +        tmp = (tmp + 1) >> 1;
> +        if (tmp > 32767)
> +          tmp = 32767;
> +        s->tcos[i*tstep] = (FFTSample)tmp;
> +        tmp = (int)(-sin(alpha) * 65536);
> +        tmp = (tmp + 1) >> 1;
> +        if (tmp > 32767)
> +          tmp = 32767;
> +        s->tsin[i*tstep] = tmp;
> +    }
> +
> +    return 0;
> +fail:
> +    ff_mdct_end(s);
> +    return -1;
> +}
> +
> +av_cold int ff_mdct_fixed_init_hardcoded_128(FFTContext *s, int nbits, int inverse, int scale)
> +    {
> +        int n, n4, i;
> +        int tstep;
> +
> +        memset(s, 0, sizeof(*s));
> +        n = 1 << nbits;
> +        s->mdct_bits = nbits;
> +        s->mdct_size = n;
> +        n4 = n >> 2;
> +        s->mdct_permutation = FF_MDCT_PERM_NONE;
> +
> +        if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
> +            goto fail;
> +
> +        s->tcos = av_malloc((n * sizeof(int)) / 2);
> +
> +        if (!s->tcos)
> +            goto fail;
> +
> +        switch (s->mdct_permutation) {
> +        case FF_MDCT_PERM_NONE:
> +            s->tsin = s->tcos + n4;
> +            tstep = 1;
> +            break;
> +        case FF_MDCT_PERM_INTERLEAVE:
> +            s->tsin = s->tcos + 1;
> +            tstep = 2;
> +            break;
> +        default:
> +            goto fail;
> +        }
> +        for(i=0;i<n4;i++) {
> +            s->tcos[i*tstep] = tcos_fixed_128[i];
> +            s->tsin[i*tstep] = tsin_fixed_128[i];
> +        }
> +    return 0;
> +fail:
> +    ff_mdct_end(s);
> +    return -1;
> +}
> +
> +#if HAVE_MIPSDSPR2 && HAVE_INLINE_ASM
> +static void ff_imdct_fixed_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
> +{
> +    int k, n8, n4, n2, n, j, j2;
> +    int ax0, ax1, ax2, ax3;
> +    const uint16_t *revtab = s->revtab;
> +    const FFTSample *tcos = s->tcos;
> +    const FFTSample *tsin = s->tsin;
> +    const FFTSample *in1, *in2, *in3, *in4;
> +    FFTComplex *z = (FFTComplex *)output;
> +
> +    FFTSample t0, t1, t2, t3, t01, t11, t21, t31;
> +
> +    n = 1 << s->mdct_bits;
> +    n2 = n >> 1;
> +    n4 = n >> 2;
> +    n8 = n >> 3;
> +
> +    /* pre rotation */
> +    in1 = input;
> +    in3 = input + 2;
> +    in2 = input + n2 - 1;
> +    in4 = input + n2 - 3;
> +
> +    for(k=0; k<n4; k+=4) {
> +        int k1 = k * 2;
> +        int k2 = k1 + 2;
> +
> +        __asm__ volatile (
> +            "lh             %[ax0],     0(%[in2])                   \n\t"
> +            "lh             %[ax1],     0(%[in1])                   \n\t"
> +            "lhx            %[ax2],     %[k1](%[tcos])              \n\t"
> +            "lhx            %[ax3],     %[k1](%[tsin])              \n\t"
> +            "multu          $ac0,       $0,             $0          \n\t"
> +            "multu          $ac1,       $0,             $0          \n\t"
> +            "append         %[ax0],     %[ax1],         16          \n\t"
> +            "append         %[ax2],     %[ax3],         16          \n\t"
> +            "multu          $ac2,       $0,             $0          \n\t"
> +            "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
> +            "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
> +            "lh             %[ax0],     -4(%[in2])                  \n\t"
> +            "lh             %[ax1],     4(%[in1])                   \n\t"
> +            "lhx            %[ax2],     %[k2](%[tcos])              \n\t"
> +            "lhx            %[ax3],     %[k2](%[tsin])              \n\t"
> +            "append         %[ax0],     %[ax1],         16          \n\t"
> +            "append         %[ax2],     %[ax3],         16          \n\t"
> +            "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
> +            "multu          $ac3,       $0,             $0          \n\t"
> +            "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
> +            "extr_r.w       %[t0],      $ac0,           16          \n\t"
> +            "extr_r.w       %[t2],      $ac1,           16          \n\t"
> +            "extr_r.w       %[t1],      $ac2,           16          \n\t"
> +            "extr_r.w       %[t3],      $ac3,           16          \n\t"
> +
> +            : [ax0] "=&r" (ax0), [ax2] "=&r" (ax2),[ax1]  "=&r"  (ax1), [ax3] "=&r" (ax3),
> +              [t0] "=&r" (t0),  [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
> +            : [in1] "r" (in1), [in2] "r" (in2), [tcos] "r" (tcos),
> +              [tsin] "r" (tsin), [k1] "r" (k1), [k2] "r" (k2)
> +            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
> +              "$ac3hi", "$ac3lo"
> +        );
> +
> +        j  = revtab[k];
> +        j2 = revtab[k+1];
> +
> +        z[j].re = t0;
> +        z[j].im = t2;
> +        z[j2].re = t1;
> +        z[j2].im = t3;
> +
> +        k1 += 4;
> +        k2 += 4;
> +
> +        __asm__ volatile (
> +            "lh             %[ax0],     -8(%[in2])                  \n\t"
> +            "lh             %[ax1],     8(%[in1])                   \n\t"
> +            "lhx            %[ax2],     %[k1](%[tcos])              \n\t"
> +            "lhx            %[ax3],     %[k1](%[tsin])              \n\t"
> +            "multu          $ac0,       $0,             $0          \n\t"
> +            "multu          $ac1,       $0,             $0          \n\t"
> +            "append         %[ax0],     %[ax1],         16          \n\t"
> +            "append         %[ax2],     %[ax3],         16          \n\t"
> +            "multu          $ac2,       $0,             $0          \n\t"
> +            "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
> +            "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
> +            "lh             %[ax0],     -12(%[in2])                 \n\t"
> +            "lh             %[ax1],     12(%[in1])                  \n\t"
> +            "lhx            %[ax2],     %[k2](%[tcos])              \n\t"
> +            "lhx            %[ax3],     %[k2](%[tsin])              \n\t"
> +            "append         %[ax0],     %[ax1],         16          \n\t"
> +            "append         %[ax2],     %[ax3],         16          \n\t"
> +            "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
> +            "multu          $ac3,       $0,             $0          \n\t"
> +            "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
> +            "extr_r.w       %[t0],      $ac0,           16          \n\t"
> +            "extr_r.w       %[t2],      $ac1,           16          \n\t"
> +            "extr_r.w       %[t1],      $ac2,           16          \n\t"
> +            "extr_r.w       %[t3],      $ac3,           16          \n\t"
> +
> +            : [ax0] "=&r" (ax0), [ax2] "=&r" (ax2), [ax1] "=&r" (ax1), [ax3] "=&r" (ax3),
> +              [t0] "=&r" (t0), [t2] "=&r" (t2), [t1] "=r" (t1), [t3] "=r" (t3)
> +            : [in1] "r" (in1), [in2] "r" (in2), [tcos] "r" (tcos),
> +              [tsin] "r"  (tsin),[k1] "r" (k1), [k2] "r" (k2)
> +            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
> +              "$ac3hi", "$ac3lo"
> +        );
> +
> +        j  = revtab[k+2];
> +        j2 = revtab[k+3];
> +
> +        z[j ].re = t0;
> +        z[j ].im = t2;
> +        z[j2].re = t1;
> +        z[j2].im = t3;
> +        in1 += 8;
> +        in2 -= 8;
> +    }
> +
> +    s->fft_fixed_calc(s, z);
> +
> +    /* post rotation + reordering */
> +
> +    for(k=0; k<n8; k+=2 ) {
> +        int k1 = 2 * (n8 - k - 1), k2 = k1 - 2;
> +        int k11 = 2 * (n8 + k), k21 = k11 + 2;
> +        in1 = (const FFTSample*)(z + (n8 - k - 1));
> +        in2 = (const FFTSample*)(z + (n8 + k));
> +
> +         __asm__ volatile (
> +             "lh             %[ax0],     2(%[in1])                   \n\t"
> +             "lh             %[ax1],     0(%[in1])                   \n\t"
> +             "lhx            %[ax2],     %[k1](%[tsin])              \n\t"
> +             "lhx            %[ax3],     %[k1](%[tcos])              \n\t"
> +             "multu          $ac0,       $0,             $0          \n\t"
> +             "multu          $ac1,       $0,             $0          \n\t"
> +             "append         %[ax0],     %[ax1],         16          \n\t"
> +             "append         %[ax2],     %[ax3],         16          \n\t"
> +             "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
> +             "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
> +             "lh             %[ax0],     -2(%[in1])                  \n\t"
> +             "lh             %[ax1],     -4(%[in1])                  \n\t"
> +             "lhx            %[ax2],     %[k2](%[tsin])              \n\t"
> +             "lhx            %[ax3],     %[k2](%[tcos])              \n\t"
> +             "append         %[ax0],     %[ax1],         16          \n\t"
> +             "append         %[ax2],     %[ax3],         16          \n\t"
> +             "multu          $ac2,       $0,             $0          \n\t"
> +             "mulsaq_s.w.ph  $ac2,       %[ax0],         %[ax2]      \n\t"
> +             "multu          $ac3,       $0,             $0          \n\t"
> +             "dpaqx_s.w.ph   $ac3,       %[ax0],         %[ax2]      \n\t"
> +             "extr_r.w       %[t0],      $ac0,           16          \n\t"
> +             "extr_r.w       %[t2],      $ac1,           16          \n\t"
> +             "extr_r.w       %[t1],      $ac2,           16          \n\t"
> +             "extr_r.w       %[t3],      $ac3,           16          \n\t"
> +
> +            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
> +              [t0] "=r" (t0), [t2] "=r" (t2), [t1] "=r" (t1), [t3] "=r" (t3)
> +            : [in1] "r" (in1), [k1] "r" (k1), [tsin] "r" (tsin), [tcos] "r" (tcos),
> +              [z] "r" (z), [k2] "r" (k2)
> +            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
> +              "$ac3hi", "$ac3lo"
> +         );
> +
> +         __asm__ volatile (
> +             "lh             %[ax0],     2(%[in2])                   \n\t"
> +             "lh             %[ax1],     0(%[in2])                   \n\t"
> +             "lhx            %[ax2],     %[k11](%[tsin])             \n\t"
> +             "lhx            %[ax3],     %[k11](%[tcos])             \n\t"
> +             "multu          $ac0,       $0,             $0          \n\t"
> +             "multu          $ac1,       $0,             $0          \n\t"
> +             "append         %[ax0],     %[ax1],         16          \n\t"
> +             "append         %[ax2],     %[ax3],         16          \n\t"
> +             "mulsaq_s.w.ph  $ac0,       %[ax0],         %[ax2]      \n\t"
> +             "dpaqx_s.w.ph   $ac1,       %[ax0],         %[ax2]      \n\t"
> +             "lh             %[ax0],     6(%[in2])                   \n\t"
> +             "lh             %[ax1],     4(%[in2])                   \n\t"
> +             "lhx            %[ax2],     %[k21](%[tsin])             \n\t"
> +             "lhx            %[ax3],     %[k21](%[tcos])             \n\t"
> +             "append         %[ax0],     %[ax1],        16           \n\t"
> +             "append         %[ax2],     %[ax3],        16           \n\t"
> +             "multu          $ac2,       $0,            $0           \n\t"
> +             "mulsaq_s.w.ph  $ac2,       %[ax0],        %[ax2]       \n\t"
> +             "multu          $ac3,       $0,            $0           \n\t"
> +             "dpaqx_s.w.ph   $ac3,       %[ax0],        %[ax2]       \n\t"
> +             "extr_r.w       %[t01],     $ac0,          16           \n\t"
> +             "extr_r.w       %[t21],     $ac1,          16           \n\t"
> +             "extr_r.w       %[t11],     $ac2,          16           \n\t"
> +             "extr_r.w       %[t31],     $ac3,          16           \n\t"
> +
> +            : [ax0] "=&r" (ax0), [ax1] "=&r" (ax1), [ax2] "=&r" (ax2), [ax3] "=&r" (ax3),
> +              [t01] "=r" (t01), [t21] "=r" (t21), [t11] "=r" (t11), [t31] "=r" (t31)
> +            : [in2] "r" (in2), [k11] "r" (k11), [tsin] "r" (tsin),[tcos] "r" (tcos),
> +              [z] "r" (z), [k21] "r" (k21)
> +            : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
> +              "$ac3hi", "$ac3lo"
> +        );
> +
> +        z[n8-k-1].re = t0;
> +        z[n8+k  ].im = t2;
> +        z[n8-k-1].im = t21;
> +        z[n8+k  ].re = t01;
> +
> +        z[n8-k-2].re = t1;
> +        z[n8+k+1].im = t3;
> +        z[n8-k-2].im = t31;
> +        z[n8+k+1].re = t11;
> +        z[n8+k+1].im = t3;
> +    }
> +}
> +#else
> +#define CMUL_SR(dre, dim, are, aim, bre, bim) do { \
> +        (dre) = ( ((are) * (bre) - (aim) * (bim) + 0x4000) >> 15 );  \
> +        (dim) = ( ((are) * (bim) + (aim) * (bre) + 0x4000) >> 15 );  \
> +    } while(0)
> +
> +static void ff_imdct_fixed_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
> +{

Do you really need to recopy the C version here?

-Vitor


More information about the ffmpeg-devel mailing list