[FFmpeg-devel] [PATCH] Convert vorbisdsp x86 functions from inline asm to yasm.

Thu Jan 17 17:25:18 CET 2013

From: "Ronald S. Bultje" <rsbultje at gmail.com>

---
 libavcodec/x86/Makefile         |  1 +
 libavcodec/x86/dsputil_mmx.c    |  3 --
 libavcodec/x86/dsputil_mmx.h    |  2 -
 libavcodec/x86/vorbisdsp.asm    | 84 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/vorbisdsp_init.c | 73 +++--------------------------------
 5 files changed, 91 insertions(+), 72 deletions(-)
 create mode 100644 libavcodec/x86/vorbisdsp.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6069968..0bade86 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -62,6 +62,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                           x86/rv40dsp.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
+YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp56dsp.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 1a18bb2..e7efae5 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -38,9 +38,6 @@
 DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 
-DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
-    { 0x8000000080000000ULL, 0x8000000080000000ULL };
-
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index a142406..49688dc 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -31,8 +31,6 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
 extern const uint64_t ff_bone;
 extern const uint64_t ff_wtwo;
 
-extern const uint64_t ff_pdw_80000000[2];
-
 extern const xmm_reg  ff_pw_3;
 extern const xmm_reg  ff_pw_4;
 extern const xmm_reg  ff_pw_5;
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
new file mode 100644
index 0000000..e2168ef
--- /dev/null
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -0,0 +1,84 @@
+;******************************************************************************
+;* Vorbis x86 optimizations
+;* Copyright (C) 2006 Loren Merritt <lorenm at u.washington.edu>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pdw_80000000: times 4 dd 0x80000000
+
+SECTION .text
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
+    pxor                     m7, m7     ; m7 = 0.0, reference for the compares
+    lea                    magq, [magq+block_sizeq*4] ; point past the end of mag
+    lea                    angq, [angq+block_sizeq*4] ; point past the end of ang
+    neg             block_sizeq         ; index runs from -block_size up to 0
+.loop:
+    mova                     m0, [magq+block_sizeq*4] ; m = 2 floats of mag
+    mova                     m1, [angq+block_sizeq*4] ; a = 2 floats of ang
+    mova                     m2, m0
+    mova                     m3, m1
+    pfcmpge                  m2, m7     ; mask: m >= 0.0
+    pfcmpge                  m3, m7     ; mask: a >= 0.0
+    pslld                    m2, 31     ; keep only the sign bit
+    pxor                     m1, m2     ; a' = (m >= 0.0) ? -a : a
+    mova                     m4, m3
+    pand                     m3, m1     ; m3 = (a >= 0.0) ? a' : 0
+    pandn                    m4, m1     ; m4 = (a >= 0.0) ? 0 : a'
+    pfadd                    m3, m0     ; new ang = m + m3
+    pfsub                    m0, m4     ; new mag = m - m4
+    mova   [angq+block_sizeq*4], m3
+    mova   [magq+block_sizeq*4], m0
+    add             block_sizeq, 2      ; 2 floats per iteration
+    jl .loop                            ; body runs at least once; presumably block_size > 0
+    femms                               ; leave MMX/3DNow! state before returning
+    RET
+%endif
+
+INIT_XMM sse
+cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
+    movsxdifnidn    block_sizeq, block_sized ; x86inc helper; presumably sign-extends the int arg where needed
+    mova                     m5, [pdw_80000000] ; m5 = sign-bit mask in all 4 lanes
+    lea                    magq, [magq+block_sizeq*4] ; point past the end of mag
+    lea                    angq, [angq+block_sizeq*4] ; point past the end of ang
+    neg             block_sizeq         ; index runs from -block_size up to 0
+.loop:
+    mova                     m0, [magq+block_sizeq*4] ; m = 4 floats of mag
+    mova                     m1, [angq+block_sizeq*4] ; a = 4 floats of ang
+    xorps                    m2, m2
+    xorps                    m3, m3
+    cmpleps                  m2, m0     ; mask: 0.0 <= m
+    cmpleps                  m3, m1     ; mask: 0.0 <= a
+    andps                    m2, m5     ; keep only the sign bit
+    xorps                    m1, m2     ; a' = (m >= 0.0) ? -a : a
+    mova                     m4, m3
+    andps                    m3, m1     ; m3 = (a >= 0.0) ? a' : 0
+    andnps                   m4, m1     ; m4 = (a >= 0.0) ? 0 : a'
+    addps                    m3, m0     ; new ang = m + m3
+    subps                    m0, m4     ; new mag = m - m4
+    mova   [angq+block_sizeq*4], m3
+    mova   [magq+block_sizeq*4], m0
+    add             block_sizeq, 4      ; 4 floats per iteration
+    jl .loop                            ; body runs at least once; presumably block_size > 0
+    RET
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index 5243095..1272636 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -21,81 +21,20 @@
 #include "config.h"
 #include "libavutil/cpu.h"
 #include "libavcodec/vorbisdsp.h"
-#include "dsputil_mmx.h" // for ff_pdw_80000000
 
-#if HAVE_INLINE_ASM
-#if ARCH_X86_32
-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
-{
-    int i;
-    __asm__ volatile ("pxor %%mm7, %%mm7":);
-    for (i = 0; i < blocksize; i += 2) {
-        __asm__ volatile (
-            "movq       %0, %%mm0   \n\t"
-            "movq       %1, %%mm1   \n\t"
-            "movq    %%mm0, %%mm2   \n\t"
-            "movq    %%mm1, %%mm3   \n\t"
-            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
-            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
-            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
-            "pxor    %%mm2, %%mm1   \n\t"
-            "movq    %%mm3, %%mm4   \n\t"
-            "pand    %%mm1, %%mm3   \n\t"
-            "pandn   %%mm1, %%mm4   \n\t"
-            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movq    %%mm3, %1      \n\t"
-            "movq    %%mm0, %0      \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-    __asm__ volatile ("femms");
-}
-#endif
-
-static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
-{
-    int i;
-
-    __asm__ volatile (
-        "movaps  %0, %%xmm5 \n\t"
-        :: "m"(ff_pdw_80000000[0])
-    );
-    for (i = 0; i < blocksize; i += 4) {
-        __asm__ volatile (
-            "movaps      %0, %%xmm0 \n\t"
-            "movaps      %1, %%xmm1 \n\t"
-            "xorps   %%xmm2, %%xmm2 \n\t"
-            "xorps   %%xmm3, %%xmm3 \n\t"
-            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
-            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
-            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
-            "xorps   %%xmm2, %%xmm1 \n\t"
-            "movaps  %%xmm3, %%xmm4 \n\t"
-            "andps   %%xmm1, %%xmm3 \n\t"
-            "andnps  %%xmm1, %%xmm4 \n\t"
-            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movaps  %%xmm3, %1     \n\t"
-            "movaps  %%xmm0, %0     \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-}
-#endif
+void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize);
+void ff_vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize);
 
 void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_YASM /* the implementations now come from x86/vorbisdsp.asm */
     int mm_flags = av_get_cpu_flags();
 
 #if ARCH_X86_32
     if (mm_flags & AV_CPU_FLAG_3DNOW)
-        dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
 #endif /* ARCH_X86_32 */
     if (mm_flags & AV_CPU_FLAG_SSE)
-        dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-#endif /* HAVE_INLINE_ASM */
+        dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; /* assigned last, so it replaces the 3DNow! pointer when both flags are set */
+#endif /* HAVE_YASM */
 }
-- 
1.7.11.3