[FFmpeg-cvslog] Merge commit '7abdd026df6a9a52d07d8174505b33cc89db7bf6'

James Almer git at videolan.org
Wed Sep 27 00:49:11 EEST 2017


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Sep 26 18:48:06 2017 -0300 | [0c005fa86f01df75be8c9cacad7530978af80900] | committer: James Almer

Merge commit '7abdd026df6a9a52d07d8174505b33cc89db7bf6'

* commit '7abdd026df6a9a52d07d8174505b33cc89db7bf6':
  asm: Consistently uppercase SECTION markers

Merged-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0c005fa86f01df75be8c9cacad7530978af80900
---

 libavcodec/x86/dirac_dwt.asm            | 2 +-
 libavcodec/x86/diracdsp.asm             | 2 +-
 libavcodec/x86/dnxhdenc.asm             | 2 +-
 libavcodec/x86/huffyuvencdsp.asm        | 2 +-
 libavcodec/x86/lossless_videoencdsp.asm | 2 +-
 libavcodec/x86/vc1dsp_loopfilter.asm    | 2 +-
 libavcodec/x86/vc1dsp_mc.asm            | 2 +-
 libavutil/x86/x86inc.asm                | 4 ++--
 8 files changed, 9 insertions(+), 9 deletions(-)
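
Note on the change: NASM and Yasm treat directive keywords case-insensitively, so "section .text" and "SECTION .text" assemble identically; the merge below only brings the handwritten files in line with the uppercase spelling that x86inc.asm itself uses (SECTION_RODATA, SECTION .rodata align=..., and so on). A minimal, purely illustrative sketch of the convention after this merge (the function name, the pw_42 constant and the byte-count argument are hypothetical, not taken from any FFmpeg file):

    %include "libavutil/x86/x86util.asm"

    SECTION_RODATA                 ; aligned read-only data (falls back to .text on aout/coff)
    pw_42: times 8 dw 42           ; hypothetical word constant

    SECTION .text                  ; uppercase, matching the x86inc.asm style

    INIT_XMM sse2
    ; void ff_example_add_w_sse2(int16_t *dst, intptr_t len)
    ; len is a byte count, a positive multiple of mmsize; dst is assumed 16-byte aligned
    cglobal example_add_w, 2,2,2, dst, len
        mova      m1, [pw_42]
        add       dstq, lenq
        neg       lenq
    .loop:
        mova      m0, [dstq+lenq]
        paddw     m0, m1
        mova      [dstq+lenq], m0
        add       lenq, mmsize
        jl        .loop
        REP_RET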

diff --git a/libavcodec/x86/dirac_dwt.asm b/libavcodec/x86/dirac_dwt.asm
index 89806899a2..22a5c2bbbb 100644
--- a/libavcodec/x86/dirac_dwt.asm
+++ b/libavcodec/x86/dirac_dwt.asm
@@ -29,7 +29,7 @@ cextern pw_2
 cextern pw_8
 cextern pw_16
 
-section .text
+SECTION .text
 
 ; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
 %macro COMPOSE_53iL0 4
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 6b3f780e41..cc8a26fca5 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -30,7 +30,7 @@ cextern pw_16
 cextern pw_32
 cextern pb_80
 
-section .text
+SECTION .text
 
 %macro UNPACK_ADD 6
     mov%5   %1, %3
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index 9dd6d51ee6..b4f759552e 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -22,7 +22,7 @@
 
 %include "libavutil/x86/x86util.asm"
 
-section .text
+SECTION .text
 
 ; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
 ;                              ptrdiff_t line_size)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 1228aa8355..eeef81ab8e 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -25,7 +25,7 @@
 
 %include "libavutil/x86/x86util.asm"
 
-section .text
+SECTION .text
 
 ; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                    unsigned mask, int w);
diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm
index 63fd72174a..3cb7dce07f 100644
--- a/libavcodec/x86/lossless_videoencdsp.asm
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@ -25,7 +25,7 @@
 
 %include "libavutil/x86/x86util.asm"
 
-section .text
+SECTION .text
 
 ; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                    intptr_t w);
diff --git a/libavcodec/x86/vc1dsp_loopfilter.asm b/libavcodec/x86/vc1dsp_loopfilter.asm
index 1838f6f235..fd33bd13dc 100644
--- a/libavcodec/x86/vc1dsp_loopfilter.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@ -24,7 +24,7 @@
 cextern pw_4
 cextern pw_5
 
-section .text
+SECTION .text
 
 ; dst_low, dst_high (src), zero
 ; zero-extends one vector from 8 to 16 bits
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 2850ca861d..0e6d87dd8b 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -24,7 +24,7 @@
 cextern pw_9
 cextern pw_128
 
-section .text
+SECTION .text
 
 %if HAVE_MMX_INLINE
 
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c4ec29bd9d..6a054a3e09 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -87,9 +87,9 @@
 ; keep supporting OS/2.
 %macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,aout
-        section .text
+        SECTION .text
     %elifidn __OUTPUT_FORMAT__,coff
-        section .text
+        SECTION .text
     %else
         SECTION .rodata align=%1
     %endif


======================================================================

diff --cc libavcodec/x86/dirac_dwt.asm
index 89806899a2,0000000000..22a5c2bbbb
mode 100644,000000..100644
--- a/libavcodec/x86/dirac_dwt.asm
+++ b/libavcodec/x86/dirac_dwt.asm
@@@ -1,307 -1,0 +1,307 @@@
 +;******************************************************************************
 +;* x86 optimized discrete wavelet trasnform
 +;* Copyright (c) 2010 David Conrad
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
 +SECTION_RODATA
 +pw_1991: times 4 dw 9,-1
 +
 +cextern pw_1
 +cextern pw_2
 +cextern pw_8
 +cextern pw_16
 +
- section .text
++SECTION .text
 +
 +; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
 +%macro COMPOSE_53iL0 4
 +    paddw   %2, %3
 +    paddw   %2, %4
 +    psraw   %2, 2
 +    psubw   %1, %2
 +%endm
 +
 +; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
 +; if %4 is supplied, %1 is loaded unaligned from there
 +; m2: clobbered  m3: pw_8  m4: pw_1991
 +%macro COMPOSE_DD97iH0 3-4
 +    paddw   m0, %3
 +    paddw   m1, %2
 +    psubw   m0, m3
 +    mova    m2, m1
 +    punpcklwd m1, m0
 +    punpckhwd m2, m0
 +    pmaddwd m1, m4
 +    pmaddwd m2, m4
 +%if %0 > 3
 +    movu    %1, %4
 +%endif
 +    psrad   m1, 4
 +    psrad   m2, 4
 +    packssdw m1, m2
 +    paddw   m1, %1
 +%endm
 +
 +%macro COMPOSE_VERTICAL 1
 +; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 +;                                  int width)
 +cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
 +    mova    m2, [pw_2]
 +%if ARCH_X86_64
 +    mov     widthd, widthd
 +%endif
 +.loop:
 +    sub     widthq, mmsize/2
 +    mova    m1, [b0q+2*widthq]
 +    mova    m0, [b1q+2*widthq]
 +    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
 +    mova    [b1q+2*widthq], m0
 +    jg      .loop
 +    REP_RET
 +
 +; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 +;                                  int width)
 +cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
 +    mova    m1, [pw_1]
 +%if ARCH_X86_64
 +    mov     widthd, widthd
 +%endif
 +.loop:
 +    sub     widthq, mmsize/2
 +    mova    m0, [b0q+2*widthq]
 +    paddw   m0, [b2q+2*widthq]
 +    paddw   m0, m1
 +    psraw   m0, 1
 +    paddw   m0, [b1q+2*widthq]
 +    mova    [b1q+2*widthq], m0
 +    jg      .loop
 +    REP_RET
 +
 +; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 +;                               IDWTELEM *b3, IDWTELEM *b4, int width)
 +cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
 +    mova    m3, [pw_8]
 +    mova    m4, [pw_1991]
 +%if ARCH_X86_64
 +    mov     widthd, widthd
 +%endif
 +.loop:
 +    sub     widthq, mmsize/2
 +    mova    m0, [b0q+2*widthq]
 +    mova    m1, [b1q+2*widthq]
 +    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
 +    mova    [b2q+2*widthq], m1
 +    jg      .loop
 +    REP_RET
 +
 +; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
 +;                                IDWTELEM *b3, IDWTELEM *b4, int width)
 +cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
 +    mova    m3, [pw_16]
 +    mova    m4, [pw_1991]
 +%if ARCH_X86_64
 +    mov     widthd, widthd
 +%endif
 +.loop:
 +    sub     widthq, mmsize/2
 +    mova    m0, [b0q+2*widthq]
 +    mova    m1, [b1q+2*widthq]
 +    mova    m5, [b2q+2*widthq]
 +    paddw   m0, [b4q+2*widthq]
 +    paddw   m1, [b3q+2*widthq]
 +    psubw   m0, m3
 +    mova    m2, m1
 +    punpcklwd m1, m0
 +    punpckhwd m2, m0
 +    pmaddwd m1, m4
 +    pmaddwd m2, m4
 +    psrad   m1, 5
 +    psrad   m2, 5
 +    packssdw m1, m2
 +    psubw   m5, m1
 +    mova    [b2q+2*widthq], m5
 +    jg      .loop
 +    REP_RET
 +
 +; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
 +cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
 +    mova    m3, [pw_1]
 +%if ARCH_X86_64
 +    mov     widthd, widthd
 +%endif
 +.loop:
 +    sub     widthq, mmsize/2
 +    mova    m1, [b1q+2*widthq]
 +    mova    m0, [b0q+2*widthq]
 +    mova    m2, m1
 +    paddw   m1, m3
 +    psraw   m1, 1
 +    psubw   m0, m1
 +    mova    [b0q+2*widthq], m0
 +    paddw   m2, m0
 +    mova    [b1q+2*widthq], m2
 +    jg      .loop
 +    REP_RET
 +%endmacro
 +
 +; extend the left and right edges of the tmp array by %1 and %2 respectively
 +%macro EDGE_EXTENSION 3
 +    mov     %3, [tmpq]
 +%assign %%i 1
 +%rep %1
 +    mov     [tmpq-2*%%i], %3
 +    %assign %%i %%i+1
 +%endrep
 +    mov     %3, [tmpq+2*w2q-2]
 +%assign %%i 0
 +%rep %2
 +    mov     [tmpq+2*w2q+2*%%i], %3
 +    %assign %%i %%i+1
 +%endrep
 +%endmacro
 +
 +
 +%macro HAAR_HORIZONTAL 2
 +; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
 +cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
 +    mov    w2d, wd
 +    xor     xq, xq
 +    shr    w2d, 1
 +    lea  b_w2q, [bq+wq]
 +    mova    m3, [pw_1]
 +.lowpass_loop:
 +    movu    m1, [b_w2q + 2*xq]
 +    mova    m0, [bq    + 2*xq]
 +    paddw   m1, m3
 +    psraw   m1, 1
 +    psubw   m0, m1
 +    mova    [tmpq + 2*xq], m0
 +    add     xq, mmsize/2
 +    cmp     xq, w2q
 +    jl      .lowpass_loop
 +
 +    xor     xq, xq
 +    and    w2q, ~(mmsize/2 - 1)
 +    cmp    w2q, mmsize/2
 +    jl      .end
 +
 +.highpass_loop:
 +    movu    m1, [b_w2q + 2*xq]
 +    mova    m0, [tmpq  + 2*xq]
 +    paddw   m1, m0
 +
 +    ; shift and interleave
 +%if %2 == 1
 +    paddw   m0, m3
 +    paddw   m1, m3
 +    psraw   m0, 1
 +    psraw   m1, 1
 +%endif
 +    mova    m2, m0
 +    punpcklwd m0, m1
 +    punpckhwd m2, m1
 +    mova    [bq+4*xq], m0
 +    mova    [bq+4*xq+mmsize], m2
 +
 +    add     xq, mmsize/2
 +    cmp     xq, w2q
 +    jl      .highpass_loop
 +.end:
 +    REP_RET
 +%endmacro
 +
 +
 +INIT_XMM
 +; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
 +cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
 +    mov    w2d, wd
 +    xor     xd, xd
 +    shr    w2d, 1
 +    lea  b_w2q, [bq+wq]
 +    movu    m4, [bq+wq]
 +    mova    m7, [pw_2]
 +    pslldq  m4, 14
 +.lowpass_loop:
 +    movu    m1, [b_w2q + 2*xq]
 +    mova    m0, [bq    + 2*xq]
 +    mova    m2, m1
 +    palignr m1, m4, 14
 +    mova    m4, m2
 +    COMPOSE_53iL0 m0, m1, m2, m7
 +    mova    [tmpq + 2*xq], m0
 +    add     xd, mmsize/2
 +    cmp     xd, w2d
 +    jl      .lowpass_loop
 +
 +    EDGE_EXTENSION 1, 2, xw
 +    ; leave the last up to 7 (sse) or 3 (mmx) values for C
 +    xor     xd, xd
 +    and    w2d, ~(mmsize/2 - 1)
 +    cmp    w2d, mmsize/2
 +    jl      .end
 +
 +    mova    m7, [tmpq-mmsize]
 +    mova    m0, [tmpq]
 +    mova    m5, [pw_1]
 +    mova    m3, [pw_8]
 +    mova    m4, [pw_1991]
 +.highpass_loop:
 +    mova    m6, m0
 +    palignr m0, m7, 14
 +    mova    m7, [tmpq + 2*xq + 16]
 +    mova    m1, m7
 +    mova    m2, m7
 +    palignr m1, m6, 2
 +    palignr m2, m6, 4
 +    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
 +    mova    m0, m7
 +    mova    m7, m6
 +
 +    ; shift and interleave
 +    paddw   m6, m5
 +    paddw   m1, m5
 +    psraw   m6, 1
 +    psraw   m1, 1
 +    mova    m2, m6
 +    punpcklwd m6, m1
 +    punpckhwd m2, m1
 +    mova    [bq+4*xq], m6
 +    mova    [bq+4*xq+mmsize], m2
 +
 +    add     xd, mmsize/2
 +    cmp     xd, w2d
 +    jl      .highpass_loop
 +.end:
 +    REP_RET
 +
 +
 +%if ARCH_X86_64 == 0
 +INIT_MMX
 +COMPOSE_VERTICAL mmx
 +HAAR_HORIZONTAL mmx, 0
 +HAAR_HORIZONTAL mmx, 1
 +%endif
 +
 +;;INIT_XMM
 +INIT_XMM
 +COMPOSE_VERTICAL sse2
 +HAAR_HORIZONTAL sse2, 0
 +HAAR_HORIZONTAL sse2, 1
diff --cc libavcodec/x86/diracdsp.asm
index 6b3f780e41,0000000000..cc8a26fca5
mode 100644,000000..100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@@ -1,347 -1,0 +1,347 @@@
 +;******************************************************************************
 +;* Copyright (c) 2010 David Conrad
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
 +SECTION_RODATA
 +pw_7: times 8 dw 7
 +convert_to_unsigned_10bit: times 4 dd 0x200
 +clip_10bit:                times 8 dw 0x3ff
 +
 +cextern pw_3
 +cextern pw_16
 +cextern pw_32
 +cextern pb_80
 +
- section .text
++SECTION .text
 +
 +%macro UNPACK_ADD 6
 +    mov%5   %1, %3
 +    mov%6   m5, %4
 +    mova    m4, %1
 +    mova    %2, m5
 +    punpcklbw %1, m7
 +    punpcklbw m5, m7
 +    punpckhbw m4, m7
 +    punpckhbw %2, m7
 +    paddw   %1, m5
 +    paddw   %2, m4
 +%endmacro
 +
 +%macro HPEL_FILTER 1
 +; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
 +cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
 +    mov     src0q, srcq
 +    lea     stridex3q, [3*strideq]
 +    sub     src0q, stridex3q
 +    pxor    m7, m7
 +.loop:
 +    ; 7*(src[0] + src[1])
 +    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
 +    pmullw  m0, [pw_7]
 +    pmullw  m1, [pw_7]
 +
 +    ; 3*( ... + src[-2] + src[3])
 +    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
 +    paddw   m0, m2
 +    paddw   m1, m3
 +    pmullw  m0, [pw_3]
 +    pmullw  m1, [pw_3]
 +
 +    ; ... - 7*(src[-1] + src[2])
 +    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
 +    pmullw  m2, [pw_7]
 +    pmullw  m3, [pw_7]
 +    psubw   m0, m2
 +    psubw   m1, m3
 +
 +    ; ... - (src[-3] + src[4])
 +    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
 +    psubw   m0, m2
 +    psubw   m1, m3
 +
 +    paddw   m0, [pw_16]
 +    paddw   m1, [pw_16]
 +    psraw   m0, 5
 +    psraw   m1, 5
 +    packuswb m0, m1
 +    mova    [dstq], m0
 +    add     dstq, mmsize
 +    add     srcq, mmsize
 +    add     src0q, mmsize
 +    sub     widthd, mmsize
 +    jg      .loop
 +    RET
 +
 +; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
 +cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
 +    dec     widthd
 +    pxor    m7, m7
 +    and     widthd, ~(mmsize-1)
 +.loop:
 +    ; 7*(src[0] + src[1])
 +    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
 +    pmullw  m0, [pw_7]
 +    pmullw  m1, [pw_7]
 +
 +    ; 3*( ... + src[-2] + src[3])
 +    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
 +    paddw   m0, m2
 +    paddw   m1, m3
 +    pmullw  m0, [pw_3]
 +    pmullw  m1, [pw_3]
 +
 +    ; ... - 7*(src[-1] + src[2])
 +    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
 +    pmullw  m2, [pw_7]
 +    pmullw  m3, [pw_7]
 +    psubw   m0, m2
 +    psubw   m1, m3
 +
 +    ; ... - (src[-3] + src[4])
 +    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
 +    psubw   m0, m2
 +    psubw   m1, m3
 +
 +    paddw   m0, [pw_16]
 +    paddw   m1, [pw_16]
 +    psraw   m0, 5
 +    psraw   m1, 5
 +    packuswb m0, m1
 +    mova    [dstq + widthq], m0
 +    sub     widthd, mmsize
 +    jge     .loop
 +    RET
 +%endmacro
 +
 +%macro PUT_RECT 1
 +; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
 +cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
 +    mova    m0, [pb_80]
 +    add     wd, (mmsize-1)
 +    and     wd, ~(mmsize-1)
 +
 +%if ARCH_X86_64
 +    movsxd   dst_strideq, dst_strided
 +    movsxd   src_strideq, src_strided
 +    mov   r7d, r5m
 +    mov   r8d, wd
 +    %define wspill r8d
 +    %define hd r7d
 +%else
 +    mov    r4m, wd
 +    %define wspill r4m
 +    %define hd r5mp
 +%endif
 +
 +.loopy:
 +    lea     src2q, [srcq+src_strideq]
 +    lea     dst2q, [dstq+dst_strideq]
 +.loopx:
 +    sub      wd, mmsize
 +    mova     m1, [srcq +2*wq]
 +    mova     m2, [src2q+2*wq]
 +    packsswb m1, [srcq +2*wq+mmsize]
 +    packsswb m2, [src2q+2*wq+mmsize]
 +    paddb    m1, m0
 +    paddb    m2, m0
 +    mova    [dstq +wq], m1
 +    mova    [dst2q+wq], m2
 +    jg      .loopx
 +
 +    lea   srcq, [srcq+src_strideq*2]
 +    lea   dstq, [dstq+dst_strideq*2]
 +    sub     hd, 2
 +    mov     wd, wspill
 +    jg      .loopy
 +    RET
 +%endm
 +
 +%macro ADD_RECT 1
 +; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
 +cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
 +    mova    m0, [pw_32]
 +    add     wd, (mmsize-1)
 +    and     wd, ~(mmsize-1)
 +
 +%if ARCH_X86_64
 +    movsxd   strideq, strided
 +    movsxd   idwt_strideq, idwt_strided
 +    mov   r8d, wd
 +    %define wspill r8d
 +%else
 +    mov    r5m, wd
 +    %define wspill r5m
 +%endif
 +
 +.loop:
 +    sub     wd, mmsize
 +    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
 +    paddw   m1, m0
 +    psraw   m1, 6
 +    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
 +    paddw   m2, m0
 +    psraw   m2, 6
 +    paddw   m1, [idwtq+2*wq]
 +    paddw   m2, [idwtq+2*wq+mmsize]
 +    packuswb m1, m2
 +    mova    [dstq +wq], m1
 +    jg      .loop
 +
 +    lea   srcq, [srcq + 2*strideq]
 +    add   dstq, strideq
 +    lea  idwtq, [idwtq+ 2*idwt_strideq]
 +    sub     hd, 1
 +    mov     wd, wspill
 +    jg      .loop
 +    RET
 +%endm
 +
 +%macro ADD_OBMC 2
 +; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
 +cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
 +    pxor        m4, m4
 +.loop:
 +%assign i 0
 +%rep %1 / mmsize
 +    mova        m0, [srcq+i]
 +    mova        m1, m0
 +    punpcklbw   m0, m4
 +    punpckhbw   m1, m4
 +    mova        m2, [obmcq+i]
 +    mova        m3, m2
 +   punpcklbw   m2, m4
 +    punpckhbw   m3, m4
 +    pmullw      m0, m2
 +    pmullw      m1, m3
 +    movu        m2, [dstq+2*i]
 +    movu        m3, [dstq+2*i+mmsize]
 +    paddw       m0, m2
 +    paddw       m1, m3
 +    movu        [dstq+2*i], m0
 +    movu        [dstq+2*i+mmsize], m1
 +%assign i i+mmsize
 +%endrep
 +    lea         srcq, [srcq+strideq]
 +    lea         dstq, [dstq+2*strideq]
 +    add         obmcq, 32
 +    sub         yblend, 1
 +    jg          .loop
 +    RET
 +%endm
 +
 +INIT_MMX
 +%if ARCH_X86_64 == 0
 +PUT_RECT mmx
 +ADD_RECT mmx
 +
 +HPEL_FILTER mmx
 +ADD_OBMC 32, mmx
 +ADD_OBMC 16, mmx
 +%endif
 +ADD_OBMC 8, mmx
 +
 +INIT_XMM
 +PUT_RECT sse2
 +ADD_RECT sse2
 +
 +HPEL_FILTER sse2
 +ADD_OBMC 32, sse2
 +ADD_OBMC 16, sse2
 +
 +INIT_XMM sse4
 +
 +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
 +cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
 +    movd   m2, qfd
 +    movd   m3, qsd
 +    SPLATD m2
 +    SPLATD m3
 +    mov    r4, tot_hq
 +    mov    r3, dstq
 +
 +    .loop_v:
 +    mov    tot_hq, r4
 +    mov    dstq,   r3
 +
 +    .loop_h:
 +    movu   m0, [srcq]
 +
 +    pabsd  m1, m0
 +    pmulld m1, m2
 +    paddd  m1, m3
 +    psrld  m1,  2
 +    psignd m1, m0
 +
 +    movu   [dstq], m1
 +
 +    add    srcq, mmsize
 +    add    dstq, mmsize
 +    sub    tot_hd, 4
 +    jg     .loop_h
 +
 +    add    r3, strideq
 +    dec    tot_vd
 +    jg     .loop_v
 +
 +    RET
 +
 +INIT_XMM sse4
 +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
 +%if ARCH_X86_64
 +cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
 +%else
 +cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
 +    %define  hd  r5mp
 +%endif
 +    shl      wd, 2
 +    add    srcq, wq
 +    neg      wq
 +    mov     t2q, dstq
 +    mov     t1q, wq
 +    pxor     m2, m2
 +    mova     m3, [clip_10bit]
 +    mova     m4, [convert_to_unsigned_10bit]
 +
 +    .loop_h:
 +    mov    dstq, t2q
 +    mov      wq, t1q
 +
 +    .loop_w:
 +    movu     m0, [srcq+wq+0*mmsize]
 +    movu     m1, [srcq+wq+1*mmsize]
 +
 +    paddd    m0, m4
 +    paddd    m1, m4
 +    packusdw m0, m0, m1
 +    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine
 +
 +    movu     [dstq], m0
 +
 +    add      dstq, 1*mmsize
 +    add      wq,   2*mmsize
 +    jl       .loop_w
 +
 +    add    srcq, src_strideq
 +    add     t2q, dst_strideq
 +    sub      hd, 1
 +    jg       .loop_h
 +
 +    RET
diff --cc libavcodec/x86/huffyuvencdsp.asm
index 1228aa8355,0000000000..eeef81ab8e
mode 100644,000000..100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@@ -1,143 -1,0 +1,143 @@@
 +;************************************************************************
 +;* SIMD-optimized HuffYUV encoding functions
 +;* Copyright (c) 2000, 2001 Fabrice Bellard
 +;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
 +;*
 +;* MMX optimization by Nick Kurshev <nickols_k at mail.ru>
 +;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99 at gmail.com>
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
- section .text
++SECTION .text
 +
 +; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 +;                    unsigned mask, int w);
 +%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
 +    movd    m4, maskd
 +    SPLATW  m4, m4
 +    add     wd, wd
 +    test    wq, 2*mmsize - 1
 +    jz %%.tomainloop
 +    push  tmpq
 +%%.wordloop:
 +    sub     wq, 2
 +%ifidn %2, add
 +    mov   tmpw, [srcq+wq]
 +    add   tmpw, [dstq+wq]
 +%else
 +    mov   tmpw, [src1q+wq]
 +    sub   tmpw, [src2q+wq]
 +%endif
 +    and   tmpw, maskw
 +    mov     [dstq+wq], tmpw
 +    test    wq, 2*mmsize - 1
 +    jnz %%.wordloop
 +    pop   tmpq
 +%%.tomainloop:
 +%ifidn %2, add
 +    add     srcq, wq
 +%else
 +    add     src1q, wq
 +    add     src2q, wq
 +%endif
 +    add     dstq, wq
 +    neg     wq
 +    jz      %%.end
 +%%.loop:
 +%ifidn %2, add
 +    mov%1   m0, [srcq+wq]
 +    mov%1   m1, [dstq+wq]
 +    mov%1   m2, [srcq+wq+mmsize]
 +    mov%1   m3, [dstq+wq+mmsize]
 +%else
 +    mov%1   m0, [src1q+wq]
 +    mov%1   m1, [src2q+wq]
 +    mov%1   m2, [src1q+wq+mmsize]
 +    mov%1   m3, [src2q+wq+mmsize]
 +%endif
 +    p%2w    m0, m1
 +    p%2w    m2, m3
 +    pand    m0, m4
 +    pand    m2, m4
 +    mov%1   [dstq+wq]       , m0
 +    mov%1   [dstq+wq+mmsize], m2
 +    add     wq, 2*mmsize
 +    jl %%.loop
 +%%.end:
 +    RET
 +%endmacro
 +
 +%if ARCH_X86_32
 +INIT_MMX mmx
 +cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
 +    INT16_LOOP a, sub
 +%endif
 +
 +INIT_XMM sse2
 +cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
 +    test src1q, mmsize-1
 +    jnz .unaligned
 +    test src2q, mmsize-1
 +    jnz .unaligned
 +    test dstq, mmsize-1
 +    jnz .unaligned
 +    INT16_LOOP a, sub
 +.unaligned:
 +    INT16_LOOP u, sub
 +
 +INIT_MMX mmxext
 +cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
 +    add      wd, wd
 +    movd    mm7, maskd
 +    SPLATW  mm7, mm7
 +    movq    mm0, [src1q]
 +    movq    mm2, [src2q]
 +    psllq   mm0, 16
 +    psllq   mm2, 16
 +    movd    mm6, [left_topq]
 +    por     mm0, mm6
 +    movd    mm6, [leftq]
 +    por     mm2, mm6
 +    xor     maskq, maskq
 +.loop:
 +    movq    mm1, [src1q + maskq]
 +    movq    mm3, [src2q + maskq]
 +    movq    mm4, mm2
 +    psubw   mm2, mm0
 +    paddw   mm2, mm1
 +    pand    mm2, mm7
 +    movq    mm5, mm4
 +    pmaxsw  mm4, mm1
 +    pminsw  mm1, mm5
 +    pminsw  mm4, mm2
 +    pmaxsw  mm4, mm1
 +    psubw   mm3, mm4
 +    pand    mm3, mm7
 +    movq    [dstq + maskq], mm3
 +    add     maskq, 8
 +    movq    mm0, [src1q + maskq - 2]
 +    movq    mm2, [src2q + maskq - 2]
 +    cmp     maskq, wq
 +        jb .loop
 +    movzx maskd, word [src1q + wq - 2]
 +    mov [left_topq], maskd
 +    movzx maskd, word [src2q + wq - 2]
 +    mov [leftq], maskd
 +    RET
diff --cc libavcodec/x86/lossless_videoencdsp.asm
index 63fd72174a,0000000000..3cb7dce07f
mode 100644,000000..100644
--- a/libavcodec/x86/lossless_videoencdsp.asm
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@@ -1,150 -1,0 +1,150 @@@
 +;************************************************************************
 +;* SIMD-optimized lossless video encoding functions
 +;* Copyright (c) 2000, 2001 Fabrice Bellard
 +;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
 +;*
 +;* MMX optimization by Nick Kurshev <nickols_k at mail.ru>
 +;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99 at gmail.com>
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
- section .text
++SECTION .text
 +
 +; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 +;                    intptr_t w);
 +%macro DIFF_BYTES_PROLOGUE 0
 +%if ARCH_X86_32
 +cglobal diff_bytes, 3,5,2, dst, src1, src2
 +%define wq r4q
 +    DECLARE_REG_TMP 3
 +    mov               wq, r3mp
 +%else
 +cglobal diff_bytes, 4,5,2, dst, src1, src2, w
 +    DECLARE_REG_TMP 4
 +%endif ; ARCH_X86_32
 +%define i t0q
 +%endmacro
 +
 +; label to jump to if w < regsize
 +%macro DIFF_BYTES_LOOP_PREP 1
 +    mov                i, wq
 +    and                i, -2 * regsize
 +        jz            %1
 +    add             dstq, i
 +    add            src1q, i
 +    add            src2q, i
 +    neg                i
 +%endmacro
 +
 +; mov type used for src1q, dstq, first reg, second reg
 +%macro DIFF_BYTES_LOOP_CORE 4
 +%if mmsize != 16
 +    mov%1             %3, [src1q + i]
 +    mov%1             %4, [src1q + i + regsize]
 +    psubb             %3, [src2q + i]
 +    psubb             %4, [src2q + i + regsize]
 +    mov%2           [dstq + i], %3
 +    mov%2 [regsize + dstq + i], %4
 +%else
 +    ; SSE enforces alignment of psubb operand
 +    mov%1             %3, [src1q + i]
 +    movu              %4, [src2q + i]
 +    psubb             %3, %4
 +    mov%2     [dstq + i], %3
 +    mov%1             %3, [src1q + i + regsize]
 +    movu              %4, [src2q + i + regsize]
 +    psubb             %3, %4
 +    mov%2 [regsize + dstq + i], %3
 +%endif
 +%endmacro
 +
 +%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
 +    %define regsize mmsize
 +.loop_%1%2:
 +    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
 +    add                i, 2 * regsize
 +        jl    .loop_%1%2
 +.skip_main_%1%2:
 +    and               wq, 2 * regsize - 1
 +        jz     .end_%1%2
 +%if mmsize > 16
 +    ; fall back to narrower xmm
 +    %define regsize mmsize / 2
 +    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
 +.loop2_%1%2:
 +    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
 +    add                i, 2 * regsize
 +        jl   .loop2_%1%2
 +.setup_loop_gpr_%1%2:
 +    and               wq, 2 * regsize - 1
 +        jz     .end_%1%2
 +%endif
 +    add             dstq, wq
 +    add            src1q, wq
 +    add            src2q, wq
 +    neg               wq
 +.loop_gpr_%1%2:
 +    mov              t0b, [src1q + wq]
 +    sub              t0b, [src2q + wq]
 +    mov      [dstq + wq], t0b
 +    inc               wq
 +        jl .loop_gpr_%1%2
 +.end_%1%2:
 +    REP_RET
 +%endmacro
 +
 +%if ARCH_X86_32
 +INIT_MMX mmx
 +DIFF_BYTES_PROLOGUE
 +    %define regsize mmsize
 +    DIFF_BYTES_LOOP_PREP .skip_main_aa
 +    DIFF_BYTES_BODY    a, a
 +%undef i
 +%endif
 +
 +INIT_XMM sse2
 +DIFF_BYTES_PROLOGUE
 +    %define regsize mmsize
 +    DIFF_BYTES_LOOP_PREP .skip_main_aa
 +    test            dstq, regsize - 1
 +        jnz     .loop_uu
 +    test           src1q, regsize - 1
 +        jnz     .loop_ua
 +    DIFF_BYTES_BODY    a, a
 +    DIFF_BYTES_BODY    u, a
 +    DIFF_BYTES_BODY    u, u
 +%undef i
 +
 +%if HAVE_AVX2_EXTERNAL
 +INIT_YMM avx2
 +DIFF_BYTES_PROLOGUE
 +    %define regsize mmsize
 +    ; Directly using unaligned SSE2 version is marginally faster than
 +    ; branching based on arguments.
 +    DIFF_BYTES_LOOP_PREP .skip_main_uu
 +    test            dstq, regsize - 1
 +        jnz     .loop_uu
 +    test           src1q, regsize - 1
 +        jnz     .loop_ua
 +    DIFF_BYTES_BODY    a, a
 +    DIFF_BYTES_BODY    u, a
 +    DIFF_BYTES_BODY    u, u
 +%undef i
 +%endif
diff --cc libavcodec/x86/vc1dsp_loopfilter.asm
index 1838f6f235,0000000000..fd33bd13dc
mode 100644,000000..100644
--- a/libavcodec/x86/vc1dsp_loopfilter.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@@ -1,317 -1,0 +1,317 @@@
 +;******************************************************************************
 +;* VC1 loopfilter optimizations
 +;* Copyright (c) 2009 David Conrad
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
 +cextern pw_4
 +cextern pw_5
 +
- section .text
++SECTION .text
 +
 +; dst_low, dst_high (src), zero
 +; zero-extends one vector from 8 to 16 bits
 +%macro UNPACK_8TO16 4
 +    mova      m%2, m%3
 +    punpckh%1 m%3, m%4
 +    punpckl%1 m%2, m%4
 +%endmacro
 +
 +%macro STORE_4_WORDS 6
 +%if cpuflag(sse4)
 +    pextrw %1, %5, %6+0
 +    pextrw %2, %5, %6+1
 +    pextrw %3, %5, %6+2
 +    pextrw %4, %5, %6+3
 +%else
 +    movd  %6d, %5
 +%if mmsize==16
 +    psrldq %5, 4
 +%else
 +    psrlq  %5, 32
 +%endif
 +    mov    %1, %6w
 +    shr    %6, 16
 +    mov    %2, %6w
 +    movd  %6d, %5
 +    mov    %3, %6w
 +    shr    %6, 16
 +    mov    %4, %6w
 +%endif
 +%endmacro
 +
 +; in:  p1 p0 q0 q1, clobbers p0
 +; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
 +%macro VC1_LOOP_FILTER_A0 4
 +    psubw  %1, %4
 +    psubw  %2, %3
 +    paddw  %1, %1
 +    pmullw %2, [pw_5]
 +    psubw  %1, %2
 +    paddw  %1, [pw_4]
 +    psraw  %1, 3
 +%endmacro
 +
 +; in: p0 q0 a0 a1 a2
 +;     m0 m1 m7 m6 m5
 +; %1: size
 +; out: m0=p0' m1=q0'
 +%macro VC1_FILTER 1
 +    PABSW   m4, m7
 +    PABSW   m3, m6
 +    PABSW   m2, m5
 +    mova    m6, m4
 +    pminsw  m3, m2
 +    pcmpgtw m6, m3  ; if (a2 < a0 || a1 < a0)
 +    psubw   m3, m4
 +    pmullw  m3, [pw_5]   ; 5*(a3 - a0)
 +    PABSW   m2, m3
 +    psraw   m2, 3   ; abs(d/8)
 +    pxor    m7, m3  ; d_sign ^= a0_sign
 +
 +    pxor    m5, m5
 +    movd    m3, r2d
 +%if %1 > 4
 +    punpcklbw m3, m3
 +%endif
 +    punpcklbw m3, m5
 +    pcmpgtw m3, m4  ; if (a0 < pq)
 +    pand    m6, m3
 +
 +    mova    m3, m0
 +    psubw   m3, m1
 +    PABSW   m4, m3
 +    psraw   m4, 1
 +    pxor    m3, m7  ; d_sign ^ clip_sign
 +    psraw   m3, 15
 +    pminsw  m2, m4  ; min(d, clip)
 +    pcmpgtw m4, m5
 +    pand    m6, m4  ; filt3 (C return value)
 +
 +; each set of 4 pixels is not filtered if the 3rd is not
 +%if mmsize==16
 +    pshuflw m4, m6, 0xaa
 +%if %1 > 4
 +    pshufhw m4, m4, 0xaa
 +%endif
 +%else
 +    pshufw  m4, m6, 0xaa
 +%endif
 +    pandn   m3, m4
 +    pand    m2, m6
 +    pand    m3, m2  ; d final
 +
 +    psraw   m7, 15
 +    pxor    m3, m7
 +    psubw   m3, m7
 +    psubw   m0, m3
 +    paddw   m1, m3
 +    packuswb m0, m0
 +    packuswb m1, m1
 +%endmacro
 +
 +; 1st param: size of filter
 +; 2nd param: mov suffix equivalent to the filter size
 +%macro VC1_V_LOOP_FILTER 2
 +    pxor      m5, m5
 +    mov%2     m6, [r4]
 +    mov%2     m4, [r4+r1]
 +    mov%2     m7, [r4+2*r1]
 +    mov%2     m0, [r4+r3]
 +    punpcklbw m6, m5
 +    punpcklbw m4, m5
 +    punpcklbw m7, m5
 +    punpcklbw m0, m5
 +
 +    VC1_LOOP_FILTER_A0 m6, m4, m7, m0
 +    mov%2     m1, [r0]
 +    mov%2     m2, [r0+r1]
 +    punpcklbw m1, m5
 +    punpcklbw m2, m5
 +    mova      m4, m0
 +    VC1_LOOP_FILTER_A0 m7, m4, m1, m2
 +    mov%2     m3, [r0+2*r1]
 +    mov%2     m4, [r0+r3]
 +    punpcklbw m3, m5
 +    punpcklbw m4, m5
 +    mova      m5, m1
 +    VC1_LOOP_FILTER_A0 m5, m2, m3, m4
 +
 +    VC1_FILTER %1
 +    mov%2 [r4+r3], m0
 +    mov%2 [r0],    m1
 +%endmacro
 +
 +; 1st param: size of filter
 +;     NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
 +; 2nd (optional) param: temp register to use for storing words
 +%macro VC1_H_LOOP_FILTER 1-2
 +%if %1 == 4
 +    movq      m0, [r0     -4]
 +    movq      m1, [r0+  r1-4]
 +    movq      m2, [r0+2*r1-4]
 +    movq      m3, [r0+  r3-4]
 +    TRANSPOSE4x4B 0, 1, 2, 3, 4
 +%else
 +    movq      m0, [r0     -4]
 +    movq      m4, [r0+  r1-4]
 +    movq      m1, [r0+2*r1-4]
 +    movq      m5, [r0+  r3-4]
 +    movq      m2, [r4     -4]
 +    movq      m6, [r4+  r1-4]
 +    movq      m3, [r4+2*r1-4]
 +    movq      m7, [r4+  r3-4]
 +    punpcklbw m0, m4
 +    punpcklbw m1, m5
 +    punpcklbw m2, m6
 +    punpcklbw m3, m7
 +    TRANSPOSE4x4W 0, 1, 2, 3, 4
 +%endif
 +    pxor      m5, m5
 +
 +    UNPACK_8TO16 bw, 6, 0, 5
 +    UNPACK_8TO16 bw, 7, 1, 5
 +    VC1_LOOP_FILTER_A0 m6, m0, m7, m1
 +    UNPACK_8TO16 bw, 4, 2, 5
 +    mova    m0, m1                      ; m0 = p0
 +    VC1_LOOP_FILTER_A0 m7, m1, m4, m2
 +    UNPACK_8TO16 bw, 1, 3, 5
 +    mova    m5, m4
 +    VC1_LOOP_FILTER_A0 m5, m2, m1, m3
 +    SWAP 1, 4                           ; m1 = q0
 +
 +    VC1_FILTER %1
 +    punpcklbw m0, m1
 +%if %0 > 1
 +    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
 +%if %1 > 4
 +    psrldq m0, 4
 +    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
 +%endif
 +%else
 +    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
 +    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
 +%endif
 +%endmacro
 +
 +
 +%macro START_V_FILTER 0
 +    mov  r4, r0
 +    lea  r3, [4*r1]
 +    sub  r4, r3
 +    lea  r3, [r1+2*r1]
 +    imul r2, 0x01010101
 +%endmacro
 +
 +%macro START_H_FILTER 1
 +    lea  r3, [r1+2*r1]
 +%if %1 > 4
 +    lea  r4, [r0+4*r1]
 +%endif
 +    imul r2, 0x01010101
 +%endmacro
 +
 +%macro VC1_LF 0
 +cglobal vc1_v_loop_filter_internal
 +    VC1_V_LOOP_FILTER 4, d
 +    ret
 +
 +cglobal vc1_h_loop_filter_internal
 +    VC1_H_LOOP_FILTER 4, r4
 +    ret
 +
 +; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
 +cglobal vc1_v_loop_filter4, 3,5,0
 +    START_V_FILTER
 +    call vc1_v_loop_filter_internal
 +    RET
 +
 +; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
 +cglobal vc1_h_loop_filter4, 3,5,0
 +    START_H_FILTER 4
 +    call vc1_h_loop_filter_internal
 +    RET
 +
 +; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
 +cglobal vc1_v_loop_filter8, 3,5,0
 +    START_V_FILTER
 +    call vc1_v_loop_filter_internal
 +    add  r4, 4
 +    add  r0, 4
 +    call vc1_v_loop_filter_internal
 +    RET
 +
 +; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
 +cglobal vc1_h_loop_filter8, 3,5,0
 +    START_H_FILTER 4
 +    call vc1_h_loop_filter_internal
 +    lea  r0, [r0+4*r1]
 +    call vc1_h_loop_filter_internal
 +    RET
 +%endmacro
 +
 +INIT_MMX mmxext
 +VC1_LF
 +
 +INIT_XMM sse2
 +; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
 +cglobal vc1_v_loop_filter8, 3,5,8
 +    START_V_FILTER
 +    VC1_V_LOOP_FILTER 8, q
 +    RET
 +
 +; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
 +cglobal vc1_h_loop_filter8, 3,6,8
 +    START_H_FILTER 8
 +    VC1_H_LOOP_FILTER 8, r5
 +    RET
 +
 +INIT_MMX ssse3
 +; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
 +cglobal vc1_v_loop_filter4, 3,5,0
 +    START_V_FILTER
 +    VC1_V_LOOP_FILTER 4, d
 +    RET
 +
 +; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
 +cglobal vc1_h_loop_filter4, 3,5,0
 +    START_H_FILTER 4
 +    VC1_H_LOOP_FILTER 4, r4
 +    RET
 +
 +INIT_XMM ssse3
 +; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
 +cglobal vc1_v_loop_filter8, 3,5,8
 +    START_V_FILTER
 +    VC1_V_LOOP_FILTER 8, q
 +    RET
 +
 +; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
 +cglobal vc1_h_loop_filter8, 3,6,8
 +    START_H_FILTER 8
 +    VC1_H_LOOP_FILTER 8, r5
 +    RET
 +
 +INIT_XMM sse4
 +; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
 +cglobal vc1_h_loop_filter8, 3,5,8
 +    START_H_FILTER 8
 +    VC1_H_LOOP_FILTER 8
 +    RET
diff --cc libavcodec/x86/vc1dsp_mc.asm
index 2850ca861d,0000000000..0e6d87dd8b
mode 100644,000000..100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@@ -1,292 -1,0 +1,292 @@@
 +;******************************************************************************
 +;* VC1 motion compensation optimizations
 +;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
 +;*
 +;* This file is part of FFmpeg.
 +;*
 +;* FFmpeg is free software; you can redistribute it and/or
 +;* modify it under the terms of the GNU Lesser General Public
 +;* License as published by the Free Software Foundation; either
 +;* version 2.1 of the License, or (at your option) any later version.
 +;*
 +;* FFmpeg is distributed in the hope that it will be useful,
 +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +;* Lesser General Public License for more details.
 +;*
 +;* You should have received a copy of the GNU Lesser General Public
 +;* License along with FFmpeg; if not, write to the Free Software
 +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 +;******************************************************************************
 +
 +%include "libavutil/x86/x86util.asm"
 +
 +cextern pw_9
 +cextern pw_128
 +
- section .text
++SECTION .text
 +
 +%if HAVE_MMX_INLINE
 +
 +; XXX some of these macros are not used right now, but they will in the future
 +;     when more functions are ported.
 +
 +%macro OP_PUT 2 ; dst, src
 +%endmacro
 +
 +%macro OP_AVG 2 ; dst, src
 +    pavgb           %1, %2
 +%endmacro
 +
 +%macro NORMALIZE_MMX 1 ; shift
 +    paddw           m3, m7 ; +bias-r
 +    paddw           m4, m7 ; +bias-r
 +    psraw           m3, %1
 +    psraw           m4, %1
 +%endmacro
 +
 +%macro TRANSFER_DO_PACK 2 ; op, dst
 +    packuswb        m3, m4
 +    %1              m3, [%2]
 +    mova          [%2], m3
 +%endmacro
 +
 +%macro TRANSFER_DONT_PACK 2 ; op, dst
 +    %1              m3, [%2]
 +    %1              m3, [%2 + mmsize]
 +    mova          [%2], m3
 +    mova [mmsize + %2], m4
 +%endmacro
 +
 +; see MSPEL_FILTER13_CORE for use as UNPACK macro
 +%macro DO_UNPACK 1 ; reg
 +    punpcklbw       %1, m0
 +%endmacro
 +%macro DONT_UNPACK 1 ; reg
 +%endmacro
 +
 +; Compute the rounder 32-r or 8-r and unpacks it to m7
 +%macro LOAD_ROUNDER_MMX 1 ; round
 +    movd      m7, %1
 +    punpcklwd m7, m7
 +    punpckldq m7, m7
 +%endmacro
 +
 +%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
 +    paddw          m%3, m%4
 +    movh           m%2, [srcq + stride_neg2]
 +    pmullw         m%3, m6
 +    punpcklbw      m%2, m0
 +    movh           m%5, [srcq + strideq]
 +    psubw          m%3, m%2
 +    punpcklbw      m%5, m0
 +    paddw          m%3, m7
 +    psubw          m%3, m%5
 +    psraw          m%3, shift
 +    movu   [dstq + %1], m%3
 +    add           srcq, strideq
 +%endmacro
 +
 +INIT_MMX mmx
 +; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
 +;                                    x86_reg stride, int rnd, int64_t shift)
 +; Sacrificing m6 makes it possible to pipeline loads from src
 +%if ARCH_X86_32
 +cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
 +    DECLARE_REG_TMP     3, 4, 5
 +    %define rnd r3mp
 +    %define shift qword r4m
 +%else ; X86_64
 +cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
 +    DECLARE_REG_TMP     4, 5, 6
 +    %define   rnd r3d
 +    ; We need shift either in memory or in a mm reg as it's used in psraw
 +    ; On WIN64, the arg is already on the stack
 +    ; On UNIX64, m5 doesn't seem to be used
 +%if WIN64
 +    %define shift r4mp
 +%else ; UNIX64
 +    %define shift m5
 +    mova shift, r4q
 +%endif ; WIN64
 +%endif ; X86_32
 +%define stride_neg2 t0q
 +%define stride_9minus4 t1q
 +%define i t2q
 +    mov       stride_neg2, strideq
 +    neg       stride_neg2
 +    add       stride_neg2, stride_neg2
 +    lea    stride_9minus4, [strideq * 9 - 4]
 +    mov                 i, 3
 +    LOAD_ROUNDER_MMX  rnd
 +    mova               m6, [pw_9]
 +    pxor               m0, m0
 +.loop:
 +    movh               m2, [srcq]
 +    add              srcq, strideq
 +    movh               m3, [srcq]
 +    punpcklbw          m2, m0
 +    punpcklbw          m3, m0
 +    SHIFT2_LINE         0, 1, 2, 3, 4
 +    SHIFT2_LINE        24, 2, 3, 4, 1
 +    SHIFT2_LINE        48, 3, 4, 1, 2
 +    SHIFT2_LINE        72, 4, 1, 2, 3
 +    SHIFT2_LINE        96, 1, 2, 3, 4
 +    SHIFT2_LINE       120, 2, 3, 4, 1
 +    SHIFT2_LINE       144, 3, 4, 1, 2
 +    SHIFT2_LINE       168, 4, 1, 2, 3
 +    sub              srcq, stride_9minus4
 +    add              dstq, 8
 +    dec                 i
 +        jnz         .loop
 +    REP_RET
 +%undef rnd
 +%undef shift
 +%undef stride_neg2
 +%undef stride_9minus4
 +%undef i
 +
 +; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
 +;                                  const int16_t *src, int rnd);
 +; Data is already unpacked, so some operations can directly be made from
 +; memory.
 +%macro HOR_16B_SHIFT2 2 ; op, opname
 +cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
 +    mov                hq, 8
 +    sub              srcq, 2
 +    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
 +    LOAD_ROUNDER_MMX rndd
 +    mova               m5, [pw_9]
 +    mova               m6, [pw_128]
 +    pxor               m0, m0
 +
 +.loop:
 +    mova               m1, [srcq + 2 * 0]
 +    mova               m2, [srcq + 2 * 0 + mmsize]
 +    mova               m3, [srcq + 2 * 1]
 +    mova               m4, [srcq + 2 * 1 + mmsize]
 +    paddw              m3, [srcq + 2 * 2]
 +    paddw              m4, [srcq + 2 * 2 + mmsize]
 +    paddw              m1, [srcq + 2 * 3]
 +    paddw              m2, [srcq + 2 * 3 + mmsize]
 +    pmullw             m3, m5
 +    pmullw             m4, m5
 +    psubw              m3, m1
 +    psubw              m4, m2
 +    NORMALIZE_MMX      7
 +    ; remove bias
 +    paddw              m3, m6
 +    paddw              m4, m6
 +    TRANSFER_DO_PACK   %1, dstq
 +    add              srcq, 24
 +    add              dstq, strideq
 +    dec                hq
 +        jnz         .loop
 +
 +    RET
 +%endmacro
 +
 +INIT_MMX mmx
 +HOR_16B_SHIFT2 OP_PUT, put
 +
 +INIT_MMX mmxext
 +HOR_16B_SHIFT2 OP_AVG, avg
 +%endif ; HAVE_MMX_INLINE
 +
 +%macro INV_TRANS_INIT 0
 +    movsxdifnidn linesizeq, linesized
 +    movd       m0, blockd
 +    SPLATW     m0, m0
 +    pxor       m1, m1
 +    psubw      m1, m0
 +    packuswb   m0, m0
 +    packuswb   m1, m1
 +
 +    DEFINE_ARGS dest, linesize, linesize3
 +    lea    linesize3q, [linesizeq*3]
 +%endmacro
 +
 +%macro INV_TRANS_PROCESS 1
 +    mov%1                  m2, [destq+linesizeq*0]
 +    mov%1                  m3, [destq+linesizeq*1]
 +    mov%1                  m4, [destq+linesizeq*2]
 +    mov%1                  m5, [destq+linesize3q]
 +    paddusb                m2, m0
 +    paddusb                m3, m0
 +    paddusb                m4, m0
 +    paddusb                m5, m0
 +    psubusb                m2, m1
 +    psubusb                m3, m1
 +    psubusb                m4, m1
 +    psubusb                m5, m1
 +    mov%1 [linesizeq*0+destq], m2
 +    mov%1 [linesizeq*1+destq], m3
 +    mov%1 [linesizeq*2+destq], m4
 +    mov%1 [linesize3q +destq], m5
 +%endmacro
 +
 +; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
 +    movsx         r3d, WORD [blockq]
 +    mov        blockd, r3d             ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
 +    sar        blockd, 3               ; >> 3
 +    mov           r3d, blockd          ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
 +    sar        blockd, 7               ; >> 7
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS h
 +    RET
 +
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
 +    movsx         r3d, WORD [blockq]
 +    mov        blockd, r3d             ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
 +    sar        blockd, 3               ; >> 3
 +    shl        blockd, 2               ;  4 * dc
 +    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
 +    sar        blockd, 7               ; >> 7
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS h
 +    lea         destq, [destq+linesizeq*4]
 +    INV_TRANS_PROCESS h
 +    RET
 +
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
 +    movsx      blockd, WORD [blockq]   ; dc
 +    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
 +    sar        blockd, 1               ; >> 1
 +    mov           r3d, blockd          ; dc
 +    shl        blockd, 4               ; 16 * dc
 +    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
 +    sar        blockd, 7               ; >> 7
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS a
 +    RET
 +
 +INIT_MMX mmxext
 +cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
 +    movsx      blockd, WORD [blockq]   ; dc
 +    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
 +    sar        blockd, 1               ; >> 1
 +    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
 +    sar        blockd, 5               ; >> 5
 +
 +    INV_TRANS_INIT
 +
 +    INV_TRANS_PROCESS a
 +    lea         destq, [destq+linesizeq*4]
 +    INV_TRANS_PROCESS a
 +    RET
diff --cc libavutil/x86/x86inc.asm
index c4ec29bd9d,e04dbfedf3..6a054a3e09
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@@ -87,9 -87,7 +87,9 @@@
  ; keep supporting OS/2.
  %macro SECTION_RODATA 0-1 16
      %ifidn __OUTPUT_FORMAT__,aout
-         section .text
+         SECTION .text
 +    %elifidn __OUTPUT_FORMAT__,coff
-         section .text
++        SECTION .text
      %else
          SECTION .rodata align=%1
      %endif
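
Context for the final hunk: SECTION_RODATA already falls back to the text section on the aout and coff output formats (the coff branch is kept for OS/2, per the comment above the macro) and emits an aligned .rodata everywhere else, so the change here only affects the case of the fallback directive. A short illustrative sketch of the expansion, assuming the default 16-byte alignment:

    ; SECTION_RODATA   ->  SECTION .text              (aout, coff)
    ;                  ->  SECTION .rodata align=16   (other output formats)
    SECTION_RODATA 32  ; an optional argument requests a larger alignment where supported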


