[FFmpeg-devel] [PATCH] mips: Optimization of AC3 floating-point encoder

Fri Jul 27 16:01:15 CEST 2012

Signed-off-by: Nedeljko Babic <nbabic at mips.com>
---
 libavcodec/ac3dsp.c                 |    2 +
 libavcodec/ac3dsp.h                 |    1 +
 libavcodec/ac3enc.c                 |    8 +
 libavcodec/ac3enc.h                 |   17 +-
 libavcodec/ac3enc_template.c        |   12 +-
 libavcodec/mips/Makefile            |    2 +
 libavcodec/mips/ac3dsp_mips.c       |  265 ++++++++++++++
 libavcodec/mips/ac3enc_float_mips.c |  651 +++++++++++++++++++++++++++++++++++
 8 files changed, 951 insertions(+), 7 deletions(-)
 create mode 100644 libavcodec/mips/ac3dsp_mips.c
 create mode 100644 libavcodec/mips/ac3enc_float_mips.c

diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 581e5f5..0d23372 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -232,4 +232,6 @@ av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
         ff_ac3dsp_init_arm(c, bit_exact);
     if (HAVE_MMX)
         ff_ac3dsp_init_x86(c, bit_exact);
+    if (ARCH_MIPS)
+        ff_ac3dsp_init_mips(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index f0a6999..7fd57ec 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -136,5 +136,6 @@ typedef struct AC3DSPContext {
 void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact);
 void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 05cddda..f361e83 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -2458,10 +2458,18 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)
         s->mdct_end                     = ff_ac3_fixed_mdct_end;
         s->mdct_init                    = ff_ac3_fixed_mdct_init;
         s->allocate_sample_buffers      = ff_ac3_fixed_allocate_sample_buffers;
+        s->deinterleave_input_samples   = ff_ac3_fixed_deinterleave_input_samples;
+        s->apply_mdct                   = ff_ac3_fixed_apply_mdct;
+        s->apply_channel_coupling       = ff_ac3_fixed_apply_channel_coupling;
     } else if (CONFIG_AC3_ENCODER || CONFIG_EAC3_ENCODER) {
         s->mdct_end                     = ff_ac3_float_mdct_end;
         s->mdct_init                    = ff_ac3_float_mdct_init;
         s->allocate_sample_buffers      = ff_ac3_float_allocate_sample_buffers;
+        s->deinterleave_input_samples   = ff_ac3_float_deinterleave_input_samples;
+        s->apply_mdct                   = ff_ac3_float_apply_mdct;
+        s->apply_channel_coupling       = ff_ac3_float_apply_channel_coupling;
+
+        if(HAVE_MIPSFPU) ff_ac3_float_encode_init_mips(s);
     }
     if (CONFIG_EAC3_ENCODER && s->eac3)
         s->output_frame_header = ff_eac3_output_frame_header;
diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h
index be9dcf2..b6d0974 100644
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -255,7 +255,10 @@ typedef struct AC3EncodeContext {
 
     /* fixed vs. float templated function pointers */
     int  (*allocate_sample_buffers)(struct AC3EncodeContext *s);
-
+    void (*deinterleave_input_samples)(struct AC3EncodeContext *s,
+                                       const SampleType *samples);
+    void (*apply_mdct)(struct AC3EncodeContext *s);
+    void (*apply_channel_coupling)(struct AC3EncodeContext *s);
     /* AC-3 vs. E-AC-3 function pointers */
     void (*output_frame_header)(struct AC3EncodeContext *s);
 } AC3EncodeContext;
@@ -264,6 +267,7 @@ typedef struct AC3EncodeContext {
 extern const uint64_t ff_ac3_channel_layouts[19];
 
 int ff_ac3_encode_init(AVCodecContext *avctx);
+void ff_ac3_float_encode_init_mips(AC3EncodeContext *avctx);
 
 int ff_ac3_encode_close(AVCodecContext *avctx);
 
@@ -300,6 +304,17 @@ int ff_ac3_float_mdct_init(AC3EncodeContext *s);
 int ff_ac3_fixed_allocate_sample_buffers(AC3EncodeContext *s);
 int ff_ac3_float_allocate_sample_buffers(AC3EncodeContext *s);
 
+void ff_ac3_fixed_deinterleave_input_samples(AC3EncodeContext *s,
+                                             const SampleType *samples);
+void ff_ac3_float_deinterleave_input_samples(AC3EncodeContext *s,
+                                             const SampleType *samples);
+
+void ff_ac3_fixed_apply_mdct(AC3EncodeContext *s);
+void ff_ac3_float_apply_mdct(AC3EncodeContext *s);
+
+void ff_ac3_fixed_apply_channel_coupling(AC3EncodeContext *s);
+void ff_ac3_float_apply_channel_coupling(AC3EncodeContext *s);
+
 int ff_ac3_fixed_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                               const AVFrame *frame, int *got_packet_ptr);
 int ff_ac3_float_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index e81bfce..6df8282 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -71,7 +71,7 @@ alloc_fail:
  * Deinterleave input samples.
  * Channels are reordered from FFmpeg's default order to AC-3 order.
  */
-static void deinterleave_input_samples(AC3EncodeContext *s,
+void AC3_NAME(deinterleave_input_samples)(AC3EncodeContext *s,
                                        const SampleType *samples)
 {
     int ch, i;
@@ -101,7 +101,7 @@ static void deinterleave_input_samples(AC3EncodeContext *s,
  * This applies the KBD window and normalizes the input to reduce precision
  * loss due to fixed-point calculations.
  */
-static void apply_mdct(AC3EncodeContext *s)
+void AC3_NAME(apply_mdct)(AC3EncodeContext *s)
 {
     int blk, ch;
 
@@ -131,7 +131,7 @@ static void apply_mdct(AC3EncodeContext *s)
 /*
  * Calculate coupling channel and coupling coordinates.
  */
-static void apply_channel_coupling(AC3EncodeContext *s)
+void AC3_NAME(apply_channel_coupling)(AC3EncodeContext *s)
 {
     LOCAL_ALIGNED_16(CoefType, cpl_coords,      [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
 #if CONFIG_AC3ENC_FLOAT
@@ -407,9 +407,9 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, AVPacket *avpkt,
     if (s->bit_alloc.sr_code == 1 || s->eac3)
         ff_ac3_adjust_frame_size(s);
 
-    deinterleave_input_samples(s, samples);
+    s->deinterleave_input_samples(s, samples);
 
-    apply_mdct(s);
+    s->apply_mdct(s);
 
     if (s->fixed_point)
         scale_coefficients(s);
@@ -421,7 +421,7 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, AVPacket *avpkt,
     ff_ac3_compute_coupling_strategy(s);
 
     if (s->cpl_on)
-        apply_channel_coupling(s);
+        s->apply_channel_coupling(s);
 
     compute_rematrixing_strategy(s);
 
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 13fb324..7e0fa7e 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -17,3 +17,5 @@ MIPSDSPR1-OBJS-$(HAVE_INLINE_ASM)         += mips/fmtconvert_mips_fixed.o
 MIPSDSPR2-OBJS-$(HAVE_INLINE_ASM)         += mips/dsputil_mips_fixed.o
 OBJS-$(CONFIG_FFT)                        += mips/fft_mips_fixed.o
 OBJS-$(CONFIG_AC3_FIXED_DECODER)          += mips/ac3dec_fixed.o
+OBJS-$(CONFIG_AC3DSP)                     += mips/ac3dsp_mips.o
+MIPSFPU-OBJS-$(CONFIG_AC3_ENCODER)        += mips/ac3enc_float_mips.o
diff --git a/libavcodec/mips/ac3dsp_mips.c b/libavcodec/mips/ac3dsp_mips.c
new file mode 100644
index 0000000..fb6b56b
--- /dev/null
+++ b/libavcodec/mips/ac3dsp_mips.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Branimir Vasic (bvasic at mips.com)
+ *
+ * Various AC-3 DSP Utils optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/ac3dsp.c
+ */
+
+#include "config.h"
+#include "libavcodec/ac3dsp.h"
+#include "libavcodec/ac3.h"
+
+static void ac3_extract_exponents_mips(uint8_t *exp, int32_t *coef, int nb_coefs)
+{
+    int idx;
+
+    /* exponent is 24 for a zero coefficient, otherwise 23 - av_log2(|coef|) */
+    for (idx = 0; idx < nb_coefs; idx++) {
+        const int magnitude = abs(coef[idx]);
+        int exponent        = 24;
+        if (magnitude != 0) {
+            exponent = 23 - av_log2(magnitude);
+            if (exponent < 0) {
+                /* negative exponent: store 0 and clip the coefficient itself */
+                coef[idx] = av_clip(coef[idx], -16777215, 16777215);
+                exponent  = 0;
+            }
+        }
+        exp[idx] = exponent;
+    }
+}
+
+#if HAVE_INLINE_ASM
+#if HAVE_MIPSDSPR1
+static void ac3_bit_alloc_calc_bap_mips(int16_t *mask, int16_t *psd,
+                                        int start, int end,
+                                        int snr_offset, int floor,
+                                        const uint8_t *bap_tab, uint8_t *bap)
+{   /* Fill bap[] for bins start..end-1 by looking up bap_tab with a clamped (psd - m) >> 5. */
+    int bin, band, band_end, address;
+    int val, temp1, temp2;
+
+    /* special case, if snr offset is -960, set all bap's to zero */
+    if (snr_offset == -960) {
+        memset(bap, 0, AC3_MAX_COEFS);
+        return;
+    }
+
+    bin  = start;
+    band = ff_ac3_bin_to_band_tab[start]; /* band that contains the first bin */
+    do {
+        int m = (FFMAX(mask[band] - snr_offset - floor, 0) & 0x1FE0) + floor; /* per-band threshold, constant for all bins of the band */
+        band_end = ff_ac3_band_start_tab[++band]; /* first bin of the next band */
+        band_end = FFMIN(band_end, end); /* never run past the requested end bin */
+
+        for (; bin < band_end; bin++) {
+            val = (psd[bin] - m) >> 5;
+            __asm__ __volatile__ ( /* branch-free clamp of val to [0, 63], result in address */
+                "sra    %[temp1],   %[val],     31          \n\t" /* temp1 = sign mask of val (0 or -1) */
+                "xor    %[address], %[temp1],   %[val]      \n\t" /* address = val if val >= 0, else -val - 1 */
+                "addiu  %[temp1],   %[val],     -63         \n\t" /* temp1 = val - 63 */
+                "sra    %[temp2],   %[temp1],   31          \n\t" /* temp2 = sign mask of (val - 63) */
+                "xor    %[temp1],   %[temp1],   %[temp2]    \n\t" /* temp1 = val - 63 if >= 0, else 62 - val */
+                "subu   %[address], %[address], %[temp1]    \n\t" /* difference of the two folded values */
+                "addiu  %[address], %[address], 63          \n\t" /* now 0, 2*val or 2*val+1, or 126 */
+                "sra    %[address], %[address], 1           \n\t" /* halve: 0 below range, val inside, 63 above */
+                : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+                  [address] "=&r" (address)
+                : [val] "r" (val)
+            );
+            bap[bin] = bap_tab[address]; /* address is within 0..63 here */
+        }
+    } while (end > band_end); /* continue with the next band until end is reached */
+}
+
+static void ac3_update_bap_counts_mips(uint16_t mant_cnt[16], uint8_t *bap,
+                                    int len)
+{   /* For each of the len entries of bap[], increment mant_cnt[bap[k]]; entries are walked from the end backwards. */
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; /* NOTE(review): several of these hold addresses in int and use addu - looks 32-bit-pointer only, confirm for 64-bit ABIs */
+
+    __asm__ __volatile__(
+        "andi   %[temp3],   %[len],         3               \n\t" /* temp3 = len & 3: entries left for the tail loop */
+        "addu   %[temp2],   %[bap],         %[len]          \n\t" /* temp2 = bap + len: one past the last entry */
+        "addu   %[temp4],   %[bap],         %[temp3]        \n\t" /* temp4 = address where the 4-at-a-time loop stops */
+        "beq    %[temp2],   %[temp4],       4f              \n\t" /* no full group of 4: go straight to the tail */
+        "1:                                                 \n\t" /* main loop: four entries per pass */
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t" /* load three of the four bap values */
+        "lbu    %[temp5],   -2(%[temp2])                    \n\t"
+        "lbu    %[temp6],   -3(%[temp2])                    \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t" /* index * 2: byte offset into the uint16_t table */
+        "addu   %[temp0],   %[mant_cnt],    %[temp0]        \n\t" /* temp0 = &mant_cnt[first value] */
+        "sll    %[temp5],   %[temp5],       1               \n\t"
+        "addu   %[temp5],   %[mant_cnt],    %[temp5]        \n\t" /* temp5 = &mant_cnt[second value] */
+        "lhu    %[temp1],   0(%[temp0])                     \n\t" /* read-increment-write of the first counter */
+        "sll    %[temp6],   %[temp6],       1               \n\t"
+        "addu   %[temp6],   %[mant_cnt],    %[temp6]        \n\t" /* temp6 = &mant_cnt[third value] */
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t" /* stored before the next counter is read, so repeated values accumulate */
+        "lhu    %[temp1],   0(%[temp5])                     \n\t" /* second counter */
+        "lbu    %[temp7],   -4(%[temp2])                    \n\t" /* fourth bap value */
+        "addiu  %[temp2],   %[temp2],       -4              \n\t" /* step the read pointer back by four entries */
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp5])                     \n\t"
+        "lhu    %[temp1],   0(%[temp6])                     \n\t" /* third counter */
+        "sll    %[temp7],   %[temp7],       1               \n\t"
+        "addu   %[temp7],   %[mant_cnt],    %[temp7]        \n\t" /* temp7 = &mant_cnt[fourth value] */
+        "addiu  %[temp1],   %[temp1],1                      \n\t"
+        "sh     %[temp1],   0(%[temp6])                     \n\t"
+        "lhu    %[temp1],   0(%[temp7])                     \n\t" /* fourth counter */
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp7])                     \n\t"
+        "bne    %[temp2],   %[temp4],       1b              \n\t" /* repeat until only the len & 3 leftovers remain */
+        "4:                                                 \n\t"
+        "beqz   %[temp3],   2f                              \n\t" /* nothing left over: done */
+        "3:                                                 \n\t" /* tail loop: one entry per pass */
+        "addiu  %[temp3],   %[temp3],       -1              \n\t"
+        "lbu    %[temp0],   -1(%[temp2])                    \n\t"
+        "addiu  %[temp2],   %[temp2],       -1              \n\t"
+        "sll    %[temp0],   %[temp0],       1               \n\t"
+        "addu   %[temp0],   %[mant_cnt],    %[temp0]        \n\t"
+        "lhu    %[temp1],   0(%[temp0])                     \n\t"
+        "addiu  %[temp1],   %[temp1],       1               \n\t"
+        "sh     %[temp1],   0(%[temp0])                     \n\t"
+        "bgtz   %[temp3],   3b                              \n\t"
+        "2:                                                 \n\t"
+
+        : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+          [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+          [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
+          [temp6] "=&r" (temp6), [temp7] "=&r" (temp7)
+        : [len] "r" (len), [bap] "r" (bap),
+          [mant_cnt] "r" (mant_cnt)
+        : "memory" /* mant_cnt[] is modified through computed addresses */
+    );
+}
+#endif
+
+#if HAVE_MIPSFPU
+static void float_to_fixed24_mips(int32_t *dst, const float *src, unsigned int len)
+{   /* dst[k] = integer conversion of src[k] * 2^24, handled in groups of eight values. */
+    const float scale = 1 << 24; /* 2^24 */
+    float src0, src1, src2, src3, src4, src5, src6, src7;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    do { /* body runs before len is tested, so at least eight values are always processed */
+        __asm__ __volatile__ (
+            "lwc1       %[src0],    0(%[src])               \n\t" /* load eight consecutive floats */
+            "lwc1       %[src1],    4(%[src])               \n\t"
+            "lwc1       %[src2],    8(%[src])               \n\t"
+            "lwc1       %[src3],    12(%[src])              \n\t"
+            "lwc1       %[src4],    16(%[src])              \n\t"
+            "lwc1       %[src5],    20(%[src])              \n\t"
+            "lwc1       %[src6],    24(%[src])              \n\t"
+            "lwc1       %[src7],    28(%[src])              \n\t"
+            "mul.s      %[src0],    %[src0],    %[scale]    \n\t" /* multiply each by scale */
+            "mul.s      %[src1],    %[src1],    %[scale]    \n\t"
+            "mul.s      %[src2],    %[src2],    %[scale]    \n\t"
+            "mul.s      %[src3],    %[src3],    %[scale]    \n\t"
+            "mul.s      %[src4],    %[src4],    %[scale]    \n\t"
+            "mul.s      %[src5],    %[src5],    %[scale]    \n\t"
+            "mul.s      %[src6],    %[src6],    %[scale]    \n\t"
+            "mul.s      %[src7],    %[src7],    %[scale]    \n\t"
+            "cvt.w.s    %[src0],    %[src0]                 \n\t" /* float -> 32-bit integer word, still in the FP register */
+            "cvt.w.s    %[src1],    %[src1]                 \n\t"
+            "cvt.w.s    %[src2],    %[src2]                 \n\t"
+            "cvt.w.s    %[src3],    %[src3]                 \n\t"
+            "cvt.w.s    %[src4],    %[src4]                 \n\t"
+            "cvt.w.s    %[src5],    %[src5]                 \n\t"
+            "cvt.w.s    %[src6],    %[src6]                 \n\t"
+            "cvt.w.s    %[src7],    %[src7]                 \n\t"
+            "mfc1       %[temp0],   %[src0]                 \n\t" /* move the integer results to general registers */
+            "mfc1       %[temp1],   %[src1]                 \n\t"
+            "mfc1       %[temp2],   %[src2]                 \n\t"
+            "mfc1       %[temp3],   %[src3]                 \n\t"
+            "mfc1       %[temp4],   %[src4]                 \n\t"
+            "mfc1       %[temp5],   %[src5]                 \n\t"
+            "mfc1       %[temp6],   %[src6]                 \n\t"
+            "mfc1       %[temp7],   %[src7]                 \n\t"
+            "sw         %[temp0],   0(%[dst])               \n\t" /* store eight consecutive int32 results */
+            "sw         %[temp1],   4(%[dst])               \n\t"
+            "sw         %[temp2],   8(%[dst])               \n\t"
+            "sw         %[temp3],   12(%[dst])              \n\t"
+            "sw         %[temp4],   16(%[dst])              \n\t"
+            "sw         %[temp5],   20(%[dst])              \n\t"
+            "sw         %[temp6],   24(%[dst])              \n\t"
+            "sw         %[temp7],   28(%[dst])              \n\t"
+
+            : [dst] "+r" (dst), [src] "+r" (src), /* declared read-write, but the asm itself never advances them */
+              [src0] "=&f" (src0), [src1] "=&f" (src1),
+              [src2] "=&f" (src2), [src3] "=&f" (src3),
+              [src4] "=&f" (src4), [src5] "=&f" (src5),
+              [src6] "=&f" (src6), [src7] "=&f" (src7),
+              [temp0] "=r" (temp0), [temp1] "=r" (temp1),
+              [temp2] "=r" (temp2), [temp3] "=r" (temp3),
+              [temp4] "=r" (temp4), [temp5] "=r" (temp5),
+              [temp6] "=r" (temp6), [temp7] "=r" (temp7)
+            : [scale] "f" (scale)
+            : "memory"
+        );
+        src = src + 8; /* pointers are advanced here in C */
+        dst = dst + 8;
+        len -= 8; /* len is unsigned: a value that is not a positive multiple of 8 wraps around instead of reaching 0 */
+    } while (len > 0);
+}
+#endif
+#endif /* HAVE_INLINE_ASM */
+
+void ff_ac3dsp_init_mips(AC3DSPContext *c, int bit_exact) {
+    /* bit_exact is not consulted; the plain-C exponent routine is always installed */
+    c->extract_exponents  = ac3_extract_exponents_mips;
+#if HAVE_INLINE_ASM && HAVE_MIPSDSPR1
+    /* inline-asm routines compiled only under the DSP rev 1 switch */
+    c->update_bap_counts  = ac3_update_bap_counts_mips;
+    c->bit_alloc_calc_bap = ac3_bit_alloc_calc_bap_mips;
+#endif
+#if HAVE_INLINE_ASM && HAVE_MIPSFPU
+    c->float_to_fixed24   = float_to_fixed24_mips;
+#endif
+}
diff --git a/libavcodec/mips/ac3enc_float_mips.c b/libavcodec/mips/ac3enc_float_mips.c
new file mode 100644
index 0000000..d2701a3
--- /dev/null
+++ b/libavcodec/mips/ac3enc_float_mips.c
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Branimir Vasic (bvasic at mips.com)
+ *
+ * Various AC3 floating point encoder functions optimized for MIPS
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Reference: libavcodec/ac3enc_template.c
+ */
+
+#define CONFIG_AC3ENC_FLOAT 1
+#include "libavcodec/ac3enc.h"
+#include "libavcodec/eac3enc.h"
+
+#if HAVE_INLINE_ASM
+static void clip_coefficients(DSPContext *dsp, float *coef, unsigned int len)
+{   /* In-place pass over coef: it is both destination and source of dsp->vector_clipf. */
+    dsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len); /* presumably clamps len values to [COEF_MIN, COEF_MAX] - confirm against vector_clipf */
+}
+
+static CoefType calc_cpl_coord(CoefSumType energy_ch, CoefSumType energy_cpl)
+{
+    /* 0.125 * sqrt(energy_ch / energy_cpl); plain 0.125 unless energy_cpl > 0 */
+    const float ratio  = energy_cpl > 0 ? sqrtf(energy_ch / energy_cpl) : 1.0f;
+    const float scaled = 0.125f * ratio;
+    return FFMIN(scaled, COEF_MAX); /* capped at COEF_MAX */
+}
+
+static void ff_ac3_float_deinterleave_input_samples_mips(AC3EncodeContext *s,
+                                                         const SampleType *samples)
+{   /* De-interleave one frame from samples into s->planar_samples[], remapped through s->channel_map. */
+    int ch, i;
+    int *sptrs[6];
+    int *sptrs_2[2];
+    int *sptr1,*sptr2,*sptr3,*sptr4,*sptr5,*sptr6;
+    const int frame_len = AC3_BLOCK_SIZE * s->num_blocks; /* new samples per channel; not fixed at AC3_FRAME_SIZE */
+    int temp1, temp2, temp3, temp4, temp5, temp6;
+    const int *sptr = (const int*) samples; /* samples are moved as raw 32-bit words by the asm below */
+    /* carry the last block of the previous frame over to the start of each channel buffer */
+    for (ch = 0; ch < s->channels; ch++) {
+        memcpy(&s->planar_samples[ch][0], &s->planar_samples[ch][frame_len],
+               AC3_BLOCK_SIZE * sizeof(s->planar_samples[0][0]));
+    }
+
+    if (s->channels == 6) {
+        sptrs[s->channel_map[0]] = (int*)(&s->planar_samples[0][0]); /* sptrs[j] = buffer that receives input channel j */
+        sptrs[s->channel_map[1]] = (int*)(&s->planar_samples[1][0]);
+        sptrs[s->channel_map[2]] = (int*)(&s->planar_samples[2][0]);
+        sptrs[s->channel_map[3]] = (int*)(&s->planar_samples[3][0]);
+        sptrs[s->channel_map[4]] = (int*)(&s->planar_samples[4][0]);
+        sptrs[s->channel_map[5]] = (int*)(&s->planar_samples[5][0]);
+        sptr1 = sptrs[0] + AC3_BLOCK_SIZE; /* new samples go after the carried-over block */
+        sptr2 = sptrs[1] + AC3_BLOCK_SIZE;
+        sptr3 = sptrs[2] + AC3_BLOCK_SIZE;
+        sptr4 = sptrs[3] + AC3_BLOCK_SIZE;
+        sptr5 = sptrs[4] + AC3_BLOCK_SIZE;
+        sptr6 = sptrs[5] + AC3_BLOCK_SIZE;
+
+        /* deinterleave and remap input samples, four samples per channel per pass */
+        for (i = AC3_BLOCK_SIZE; i < frame_len+AC3_BLOCK_SIZE; i+=4) {
+            __asm__ __volatile__ (
+                "lw    %[temp1],  0(%[sptr])                               \n\t"
+                "lw    %[temp2],  4(%[sptr])                               \n\t"
+                "lw    %[temp3],  8(%[sptr])                               \n\t"
+                "lw    %[temp4],  12(%[sptr])                              \n\t"
+                "lw    %[temp5],  16(%[sptr])                              \n\t"
+                "lw    %[temp6],  20(%[sptr])                              \n\t"
+                "sw    %[temp1],  0(%[sptr1])                              \n\t"
+                "sw    %[temp2],  0(%[sptr2])                              \n\t"
+                "sw    %[temp3],  0(%[sptr3])                              \n\t"
+                "sw    %[temp4],  0(%[sptr4])                              \n\t"
+                "sw    %[temp5],  0(%[sptr5])                              \n\t"
+                "sw    %[temp6],  0(%[sptr6])                              \n\t"
+                "lw    %[temp1],  24(%[sptr])                              \n\t"
+                "lw    %[temp2],  28(%[sptr])                              \n\t"
+                "lw    %[temp3],  32(%[sptr])                              \n\t"
+                "lw    %[temp4],  36(%[sptr])                              \n\t"
+                "lw    %[temp5],  40(%[sptr])                              \n\t"
+                "lw    %[temp6],  44(%[sptr])                              \n\t"
+                "sw    %[temp1],  4(%[sptr1])                              \n\t"
+                "sw    %[temp2],  4(%[sptr2])                              \n\t"
+                "sw    %[temp3],  4(%[sptr3])                              \n\t"
+                "sw    %[temp4],  4(%[sptr4])                              \n\t"
+                "sw    %[temp5],  4(%[sptr5])                              \n\t"
+                "sw    %[temp6],  4(%[sptr6])                              \n\t"
+                "lw    %[temp1],  48(%[sptr])                              \n\t"
+                "lw    %[temp2],  52(%[sptr])                              \n\t"
+                "lw    %[temp3],  56(%[sptr])                              \n\t"
+                "lw    %[temp4],  60(%[sptr])                              \n\t"
+                "lw    %[temp5],  64(%[sptr])                              \n\t"
+                "lw    %[temp6],  68(%[sptr])                              \n\t"
+                "sw    %[temp1],  8(%[sptr1])                              \n\t"
+                "sw    %[temp2],  8(%[sptr2])                              \n\t"
+                "sw    %[temp3],  8(%[sptr3])                              \n\t"
+                "sw    %[temp4],  8(%[sptr4])                              \n\t"
+                "sw    %[temp5],  8(%[sptr5])                              \n\t"
+                "sw    %[temp6],  8(%[sptr6])                              \n\t"
+                "lw    %[temp1],  72(%[sptr])                              \n\t"
+                "lw    %[temp2],  76(%[sptr])                              \n\t"
+                "lw    %[temp3],  80(%[sptr])                              \n\t"
+                "lw    %[temp4],  84(%[sptr])                              \n\t"
+                "lw    %[temp5],  88(%[sptr])                              \n\t"
+                "lw    %[temp6],  92(%[sptr])                              \n\t"
+                "sw    %[temp1],  12(%[sptr1])                             \n\t"
+                "sw    %[temp2],  12(%[sptr2])                             \n\t"
+                "sw    %[temp3],  12(%[sptr3])                             \n\t"
+                "sw    %[temp4],  12(%[sptr4])                             \n\t"
+                "sw    %[temp5],  12(%[sptr5])                             \n\t"
+                "sw    %[temp6],  12(%[sptr6])                             \n\t"
+                "addiu %[sptr],   %[sptr],     96                          \n\t"
+                "addiu %[sptr1],  %[sptr1],    16                          \n\t"
+                "addiu %[sptr2],  %[sptr2],    16                          \n\t"
+                "addiu %[sptr3],  %[sptr3],    16                          \n\t"
+                "addiu %[sptr4],  %[sptr4],    16                          \n\t"
+                "addiu %[sptr5],  %[sptr5],    16                          \n\t"
+                "addiu %[sptr6],  %[sptr6],    16                          \n\t"
+
+                :[temp1] "=&r" (temp1),[temp2] "=&r" (temp2),[temp3] "=&r" (temp3),
+                 [temp4] "=&r" (temp4),[temp5] "=&r" (temp5),[temp6] "=&r" (temp6),
+                 [sptr] "+r" (sptr),[sptr1] "+r" (sptr1), [sptr2] "+r" (sptr2),
+                 [sptr3] "+r" (sptr3), [sptr4] "+r" (sptr4), [sptr5] "+r" (sptr5),
+                 [sptr6] "+r" (sptr6)
+                :
+                : "memory"
+            );
+        }
+    }
+    else if (s->channels == 2){
+        sptrs_2[s->channel_map[0]] = (int*)(&s->planar_samples[0][0]);
+        sptrs_2[s->channel_map[1]] = (int*)(&s->planar_samples[1][0]);
+        sptr1 = sptrs_2[0] + AC3_BLOCK_SIZE;
+        sptr2 = sptrs_2[1] + AC3_BLOCK_SIZE;
+
+        for (i = AC3_BLOCK_SIZE; i < frame_len+AC3_BLOCK_SIZE; i+=4) {
+            __asm__ __volatile__ (
+                "lw    %[temp1],  0(%[sptr])                               \n\t"
+                "lw    %[temp2],  4(%[sptr])                               \n\t"
+                "lw    %[temp3],  8(%[sptr])                               \n\t"
+                "lw    %[temp4],  12(%[sptr])                              \n\t"
+                "sw    %[temp1],  0(%[sptr1])                              \n\t"
+                "sw    %[temp2],  0(%[sptr2])                              \n\t"
+                "sw    %[temp3],  4(%[sptr1])                              \n\t"
+                "sw    %[temp4],  4(%[sptr2])                              \n\t"
+                "lw    %[temp1],  16(%[sptr])                              \n\t"
+                "lw    %[temp2],  20(%[sptr])                              \n\t"
+                "lw    %[temp3],  24(%[sptr])                              \n\t"
+                "lw    %[temp4],  28(%[sptr])                              \n\t"
+                "sw    %[temp1],  8(%[sptr1])                              \n\t"
+                "sw    %[temp2],  8(%[sptr2])                              \n\t"
+                "sw    %[temp3],  12(%[sptr1])                             \n\t"
+                "sw    %[temp4],  12(%[sptr2])                             \n\t"
+                "addiu %[sptr],   %[sptr],     32                          \n\t"
+                "addiu %[sptr1],  %[sptr1],    16                          \n\t"
+                "addiu %[sptr2],  %[sptr2],    16                          \n\t"
+
+                :[temp1] "=&r" (temp1),[temp2] "=&r" (temp2),[temp3] "=&r" (temp3),
+                 [temp4] "=&r" (temp4),[sptr] "+r" (sptr),[sptr1] "+r" (sptr1),
+                 [sptr2] "+r" (sptr2)
+                :
+                : "memory"
+            );
+        }
+    }
+    else
+    {
+        for (ch = 0; ch < s->channels; ch++)
+        {
+            const SampleType *sptr;
+            int sinc;
+            sinc = s->channels;
+            sptr = samples + s->channel_map[ch];
+            for (i = AC3_BLOCK_SIZE; i < frame_len+AC3_BLOCK_SIZE; i++) {
+                s->planar_samples[ch][i] = *sptr;
+                sptr += sinc;
+            }
+        }
+     }
+}
+
+/**
+ * Window the input samples and run the MDCT for every channel and block
+ * (MIPS FPU inline-assembly version).
+ *
+ * For each channel and block, AC3_WINDOW_SIZE samples starting at
+ * s->planar_samples[ch][blk * AC3_BLOCK_SIZE] are multiplied element-wise by
+ * s->mdct_window and stored in s->windowed_samples. The windowed samples are
+ * then passed to s->mdct.mdct_calcw(), which writes block->mdct_coef[ch+1].
+ *
+ * @param s AC-3 encoder context
+ */
+static void ff_ac3_float_apply_mdct_mips(AC3EncodeContext *s)
+{
+    int blk, ch, i;
+    float scr0_1, scr0_2, scr0_3, scr0_4, scr1_1, scr1_2, scr1_3, scr1_4;
+    float *win_smpl, *pl_smpl;
+    const float *mdct_win;
+
+    for (ch = 0; ch < s->channels; ch++) {
+        for (blk = 0; blk < s->num_blocks; blk++) {
+            AC3Block *block = &s->blocks[blk];
+            /* the asm below advances all three pointers, so they are reset
+             * for every block */
+            win_smpl = &s->windowed_samples[0];
+            pl_smpl  = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];
+            mdct_win = (const float*) (&s->mdct_window[0]);
+
+            /* 4 samples per iteration: load sample/window pairs, multiply,
+             * store, then step each pointer by 16 bytes (4 floats).
+             * NOTE(review): assumes AC3_WINDOW_SIZE is a multiple of 4. */
+            for(i=0; i<AC3_WINDOW_SIZE; i+=4){
+                __asm__ __volatile__ (
+                    "lwc1   %[scr0_1],      0(%[pl_smpl])           \n\t"
+                    "lwc1   %[scr1_1],      0(%[mdct_win])          \n\t"
+                    "lwc1   %[scr0_2],      4(%[pl_smpl])           \n\t"
+                    "lwc1   %[scr1_2],      4(%[mdct_win])          \n\t"
+                    "lwc1   %[scr0_3],      8(%[pl_smpl])           \n\t"
+                    "lwc1   %[scr1_3],      8(%[mdct_win])          \n\t"
+                    "lwc1   %[scr0_4],      12(%[pl_smpl])          \n\t"
+                    "lwc1   %[scr1_4],      12(%[mdct_win])         \n\t"
+                    "mul.s  %[scr0_1],      %[scr0_1],  %[scr1_1]   \n\t"
+                    "mul.s  %[scr0_2],      %[scr0_2],  %[scr1_2]   \n\t"
+                    "mul.s  %[scr0_3],      %[scr0_3],  %[scr1_3]   \n\t"
+                    "mul.s  %[scr0_4],      %[scr0_4],  %[scr1_4]   \n\t"
+                    "swc1   %[scr0_1],      0(%[win_smpl])          \n\t"
+                    "swc1   %[scr0_2],      4(%[win_smpl])          \n\t"
+                    "swc1   %[scr0_3],      8(%[win_smpl])          \n\t"
+                    "swc1   %[scr0_4],      12(%[win_smpl])         \n\t"
+                    /* explicit three-operand form, matching the other asm
+                     * blocks in this file */
+                    "addiu  %[pl_smpl],     %[pl_smpl],     16      \n\t"
+                    "addiu  %[mdct_win],    %[mdct_win],    16      \n\t"
+                    "addiu  %[win_smpl],    %[win_smpl],    16      \n\t"
+
+                    : [pl_smpl] "+r" (pl_smpl), [mdct_win] "+r" (mdct_win),
+                      [win_smpl] "+r" (win_smpl), [scr0_1] "=&f" (scr0_1),
+                      [scr0_2] "=&f" (scr0_2), [scr0_3] "=&f" (scr0_3),
+                      [scr0_4] "=&f" (scr0_4), [scr1_1] "=&f" (scr1_1),
+                      [scr1_2] "=&f" (scr1_2), [scr1_3] "=&f" (scr1_3),
+                      [scr1_4] "=&f" (scr1_4)
+                    :
+                    /* the asm reads and writes memory through the pointers */
+                    : "memory"
+                );
+            }
+            s->mdct.mdct_calcw(&s->mdct, block->mdct_coef[ch+1],
+                               s->windowed_samples);
+        }
+    }
+}
+
+/**
+ * Compute channel coupling for all blocks of the current frame
+ * (MIPS FPU inline-assembly version).
+ *
+ * Steps, in order:
+ *  - build the coupling channel as the sum of the full-bandwidth channels
+ *    that are in coupling, then clip it;
+ *  - accumulate the per-band energy of the coupling channel and of each
+ *    coupled channel;
+ *  - derive coupling coordinates and decide for which blocks/channels new
+ *    coordinates are sent;
+ *  - recompute the coordinates over runs of blocks that reuse them;
+ *  - scale the coordinates by 1 << 24 and convert them to integers with
+ *    inline asm, then derive exponents, master exponents and mantissas.
+ *
+ * @param s AC-3 encoder context; the function reads the MDCT coefficients and
+ *          coupling setup from s->blocks and writes the coupling channel
+ *          coefficients, new_cpl_coords, cpl_coord_exp, cpl_master_exp and
+ *          cpl_coord_mant of each block.
+ */
+static void ff_ac3_float_apply_channel_coupling_mips(AC3EncodeContext *s)
+{
+    LOCAL_ALIGNED_16(CoefType, cpl_coords,      [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    int blk, ch, bnd, i, j;
+    CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
+    int cpl_start, num_cpl_coefs;
+    int32_t  *dst;
+    const float *src;
+    unsigned int len;
+    uint8_t *exp;
+    float scale = 1 << 24;
+    float src0, src1, src2, src3, src4, src5, src6, src7;
+    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    int e,v;
+
+    /* each buffer is cleared using its own element size */
+    memset(cpl_coords,       0, AC3_MAX_BLOCKS * sizeof(*cpl_coords));
+    memset(fixed_cpl_coords, 0, AC3_MAX_BLOCKS * sizeof(*fixed_cpl_coords));
+
+    /* align start to 16-byte boundary. align length to multiple of 32.
+        note: coupling start bin % 4 will always be 1 */
+    cpl_start     = s->start_freq[CPL_CH] - 1;
+    num_cpl_coefs = FFALIGN(s->num_cpl_subbands * 12 + 1, 32);
+    cpl_start     = FFMIN(256, cpl_start + num_cpl_coefs) - num_cpl_coefs;
+
+    /* calculate coupling channel from fbw channels */
+    for (blk = 0; blk < s->num_blocks; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        CoefType *cpl_coef = &block->mdct_coef[CPL_CH][cpl_start];
+        if (!block->cpl_in_use)
+            continue;
+        memset(cpl_coef, 0, num_cpl_coefs * sizeof(*cpl_coef));
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            CoefType *ch_coef = &block->mdct_coef[ch][cpl_start];
+            if (!block->channel_in_cpl[ch])
+                continue;
+            for (i = 0; i < num_cpl_coefs; i++)
+                cpl_coef[i] += ch_coef[i];
+        }
+
+        /* coefficients must be clipped in order to be encoded */
+        clip_coefficients(&s->dsp, cpl_coef, num_cpl_coefs);
+    }
+
+    /* calculate energy in each band in coupling channel and each fbw channel */
+    /* TODO: possibly use SIMD to speed up energy calculation */
+    bnd = 0;
+    i = s->start_freq[CPL_CH];
+    while (i < s->cpl_end_freq) {
+        int band_size = s->cpl_band_sizes[bnd];
+        for (ch = CPL_CH; ch <= s->fbw_channels; ch++) {
+            for (blk = 0; blk < s->num_blocks; blk++) {
+                AC3Block *block = &s->blocks[blk];
+                if (!block->cpl_in_use || (ch > CPL_CH && !block->channel_in_cpl[ch]))
+                    continue;
+                for (j = 0; j < band_size; j++) {
+                    /* named 'coef' so it does not shadow the outer 'v' */
+                    CoefType coef = block->mdct_coef[ch][i+j];
+                    MAC_COEF(energy[blk][ch][bnd], coef, coef);
+                }
+            }
+        }
+        i += band_size;
+        bnd++;
+    }
+
+    /* calculate coupling coordinates for all blocks for all channels */
+    for (blk = 0; blk < s->num_blocks; blk++) {
+        AC3Block *block  = &s->blocks[blk];
+        if (!block->cpl_in_use)
+            continue;
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            if (!block->channel_in_cpl[ch])
+                continue;
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy[blk][ch][bnd],
+                                                          energy[blk][CPL_CH][bnd]);
+            }
+        }
+    }
+
+    /* determine which blocks to send new coupling coordinates for */
+    for (blk = 0; blk < s->num_blocks; blk++) {
+        AC3Block *block  = &s->blocks[blk];
+        AC3Block *block0 = blk ? &s->blocks[blk-1] : NULL;
+
+        memset(block->new_cpl_coords, 0, sizeof(block->new_cpl_coords));
+
+        if (block->cpl_in_use) {
+            /* send new coordinates if this is the first block, if previous
+             * block did not use coupling but this block does, the channels
+             * using coupling has changed from the previous block, or the
+             * coordinate difference from the last block for any channel is
+             * greater than a threshold value. */
+            if (blk == 0 || !block0->cpl_in_use) {
+                for (ch = 1; ch <= s->fbw_channels; ch++)
+                    block->new_cpl_coords[ch] = 1;
+            } else {
+                for (ch = 1; ch <= s->fbw_channels; ch++) {
+                    if (!block->channel_in_cpl[ch])
+                        continue;
+                    if (!block0->channel_in_cpl[ch]) {
+                        block->new_cpl_coords[ch] = 1;
+                    } else {
+                        CoefSumType coord_diff = 0;
+                        for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                            coord_diff += FFABS(cpl_coords[blk-1][ch][bnd] -
+                                                cpl_coords[blk  ][ch][bnd]);
+                        }
+                        coord_diff /= s->num_cpl_bands;
+                        if (coord_diff > NEW_CPL_COORD_THRESHOLD)
+                            block->new_cpl_coords[ch] = 1;
+                    }
+                }
+            }
+        }
+    }
+
+    /* calculate final coupling coordinates, taking into account reusing of
+       coordinates in successive blocks */
+    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+        blk = 0;
+        while (blk < s->num_blocks) {
+            /* initialized so that blk still advances if no channel of this
+             * block is in coupling */
+            int blk1 = blk + 1;
+            AC3Block *block  = &s->blocks[blk];
+
+            if (!block->cpl_in_use) {
+                blk++;
+                continue;
+            }
+
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                CoefSumType energy_ch, energy_cpl;
+                if (!block->channel_in_cpl[ch])
+                    continue;
+                energy_cpl = energy[blk][CPL_CH][bnd];
+                energy_ch = energy[blk][ch][bnd];
+                blk1 = blk+1;
+                /* bounds check first: s->blocks[blk1] must not be read when
+                 * blk1 == s->num_blocks */
+                while (blk1 < s->num_blocks && !s->blocks[blk1].new_cpl_coords[ch]) {
+                    if (s->blocks[blk1].cpl_in_use) {
+                        energy_cpl += energy[blk1][CPL_CH][bnd];
+                        energy_ch += energy[blk1][ch][bnd];
+                    }
+                    blk1++;
+                }
+                cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy_ch, energy_cpl);
+            }
+            blk = blk1;
+        }
+    }
+
+    /* calculate exponents/mantissas for coupling coordinates */
+    for (blk = 0; blk < s->num_blocks; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        /* new_cpl_coords is an array, so testing it against NULL was always
+         * false; the per-channel flags are checked further below */
+        if (!block->cpl_in_use)
+            continue;
+        dst = (int32_t*)fixed_cpl_coords[blk][1];
+        src = cpl_coords[blk][1];
+        len = s->fbw_channels * 16;
+        exp = block->cpl_coord_exp[1];
+
+        /* convert 8 coordinates per iteration: multiply by scale (1 << 24),
+         * convert to integer words and compute one exponent for each */
+        do {
+            __asm__ __volatile__ (
+                "lwc1       %[src0],    0(%[src])               \n\t"
+                "lwc1       %[src1],    4(%[src])               \n\t"
+                "lwc1       %[src2],    8(%[src])               \n\t"
+                "lwc1       %[src3],    12(%[src])              \n\t"
+                "lwc1       %[src4],    16(%[src])              \n\t"
+                "lwc1       %[src5],    20(%[src])              \n\t"
+                "lwc1       %[src6],    24(%[src])              \n\t"
+                "lwc1       %[src7],    28(%[src])              \n\t"
+                "mul.s      %[src0],    %[src0],    %[scale]    \n\t"
+                "mul.s      %[src1],    %[src1],    %[scale]    \n\t"
+                "mul.s      %[src2],    %[src2],    %[scale]    \n\t"
+                "mul.s      %[src3],    %[src3],    %[scale]    \n\t"
+                "mul.s      %[src4],    %[src4],    %[scale]    \n\t"
+                "mul.s      %[src5],    %[src5],    %[scale]    \n\t"
+                "mul.s      %[src6],    %[src6],    %[scale]    \n\t"
+                "mul.s      %[src7],    %[src7],    %[scale]    \n\t"
+                "cvt.w.s    %[src0],    %[src0]                 \n\t"
+                "cvt.w.s    %[src1],    %[src1]                 \n\t"
+                "cvt.w.s    %[src2],    %[src2]                 \n\t"
+                "cvt.w.s    %[src3],    %[src3]                 \n\t"
+                "cvt.w.s    %[src4],    %[src4]                 \n\t"
+                "cvt.w.s    %[src5],    %[src5]                 \n\t"
+                "cvt.w.s    %[src6],    %[src6]                 \n\t"
+                "cvt.w.s    %[src7],    %[src7]                 \n\t"
+                "mfc1       %[temp0],   %[src0]                 \n\t"
+                "mfc1       %[temp1],   %[src1]                 \n\t"
+                "mfc1       %[temp2],   %[src2]                 \n\t"
+                "mfc1       %[temp3],   %[src3]                 \n\t"
+                "mfc1       %[temp4],   %[src4]                 \n\t"
+                "mfc1       %[temp5],   %[src5]                 \n\t"
+                "mfc1       %[temp6],   %[src6]                 \n\t"
+                "mfc1       %[temp7],   %[src7]                 \n\t"
+
+                : [src] "+r" (src),
+                  [src0] "=&f" (src0), [src1] "=&f" (src1),
+                  [src2] "=&f" (src2), [src3] "=&f" (src3),
+                  [src4] "=&f" (src4), [src5] "=&f" (src5),
+                  [src6] "=&f" (src6), [src7] "=&f" (src7),
+                  [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+                  [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+                  [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
+                  [temp6] "=&r" (temp6), [temp7] "=&r" (temp7)
+                : [scale] "f" (scale)
+                /* the asm loads cpl_coords through src, so the compiler must
+                 * know it depends on the stores made above */
+                : "memory"
+            );
+
+            /* per value: exponent is 24 for zero, otherwise 23 - log2(|x|);
+             * values that do not fit in 24 bits get exponent 0 and are
+             * clipped */
+            v = abs(temp0);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp0 = av_clip(temp0, -16777215, 16777215);
+                }
+            }
+            exp[0] = e;
+
+            v = abs(temp1);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp1 = av_clip(temp1, -16777215, 16777215);
+                }
+            }
+            exp[1] = e;
+
+            v = abs(temp2);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp2 = av_clip(temp2, -16777215, 16777215);
+                }
+            }
+            exp[2] = e;
+
+            v = abs(temp3);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp3 = av_clip(temp3, -16777215, 16777215);
+                }
+            }
+            exp[3] = e;
+
+            v = abs(temp4);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp4 = av_clip(temp4, -16777215, 16777215);
+                }
+            }
+            exp[4] = e;
+
+            v = abs(temp5);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp5 = av_clip(temp5, -16777215, 16777215);
+                }
+            }
+            exp[5] = e;
+
+            v = abs(temp6);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp6 = av_clip(temp6, -16777215, 16777215);
+                }
+            }
+            exp[6] = e;
+
+            v = abs(temp7);
+            if (v == 0)
+                e = 24;
+            else {
+                e = 23 - av_log2(v);
+                if (e < 0) {
+                    e = 0;
+                    temp7 = av_clip(temp7, -16777215, 16777215);
+                }
+            }
+            exp[7] = e;
+
+            /* store the (possibly clipped) fixed-point coordinates */
+            __asm__ __volatile__ (
+                "sw       %[temp0],    0(%[dst])                \n\t"
+                "sw       %[temp1],    4(%[dst])                \n\t"
+                "sw       %[temp2],    8(%[dst])                \n\t"
+                "sw       %[temp3],    12(%[dst])               \n\t"
+                "sw       %[temp4],    16(%[dst])               \n\t"
+                "sw       %[temp5],    20(%[dst])               \n\t"
+                "sw       %[temp6],    24(%[dst])               \n\t"
+                "sw       %[temp7],    28(%[dst])               \n\t"
+
+                : [dst] "+r" (dst)
+                : [temp0] "r" (temp0), [temp1] "r" (temp1),
+                  [temp2] "r" (temp2), [temp3] "r" (temp3),
+                  [temp4] "r" (temp4), [temp5] "r" (temp5),
+                  [temp6] "r" (temp6), [temp7] "r" (temp7)
+                : "memory"
+            );
+
+            src = src + 8;
+            dst = dst + 8;
+            exp = exp + 8;
+            len -= 8;
+        } while (len > 0);
+
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            int min_exp, max_exp, master_exp;
+
+            if (!block->new_cpl_coords[ch])
+                continue;
+
+            /* determine master exponent */
+            min_exp = max_exp = block->cpl_coord_exp[ch][0];
+            for (bnd = 1; bnd < s->num_cpl_bands; bnd++) {
+                /* named 'band_exp' so it does not shadow the outer 'exp' */
+                int band_exp = block->cpl_coord_exp[ch][bnd];
+                min_exp = FFMIN(band_exp, min_exp);
+                max_exp = FFMAX(band_exp, max_exp);
+            }
+            master_exp = ((max_exp - 15) + 2) / 3;
+            master_exp = FFMAX(master_exp, 0);
+            while (min_exp < master_exp * 3)
+                master_exp--;
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                block->cpl_coord_exp[ch][bnd] = av_clip(block->cpl_coord_exp[ch][bnd] -
+                                                        master_exp * 3, 0, 15);
+            }
+            block->cpl_master_exp[ch] = master_exp;
+
+            /* quantize mantissas */
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                int cpl_exp  = block->cpl_coord_exp[ch][bnd];
+                int cpl_mant = (fixed_cpl_coords[blk][ch][bnd] << (5 + cpl_exp + master_exp * 3)) >> 24;
+                if (cpl_exp == 15)
+                    cpl_mant >>= 1;
+                else
+                    cpl_mant -= 16;
+
+                block->cpl_coord_mant[ch][bnd] = cpl_mant;
+            }
+        }
+    }
+
+    if (CONFIG_EAC3_ENCODER && s->eac3)
+        ff_eac3_set_cpl_states(s);
+}
+#endif
+
+/**
+ * Install the MIPS implementations of the floating-point AC-3 encoder hooks.
+ *
+ * When HAVE_INLINE_ASM is set, the deinterleave, MDCT and channel-coupling
+ * function pointers of the context are replaced; otherwise the context is
+ * left unchanged.
+ *
+ * @param s AC-3 encoder context whose function pointers are set
+ */
+void ff_ac3_float_encode_init_mips(AC3EncodeContext *s)
+{
+#if HAVE_INLINE_ASM
+    s->apply_channel_coupling     = ff_ac3_float_apply_channel_coupling_mips;
+    s->apply_mdct                 = ff_ac3_float_apply_mdct_mips;
+    s->deinterleave_input_samples = ff_ac3_float_deinterleave_input_samples_mips;
+#endif
+}
-- 
1.7.3.4