[FFmpeg-devel] [PATCH] mdct15: simplify the fft15 x86 SIMD

Rostislav Pehlivanov atomnuker at gmail.com
Mon May 7 01:19:04 EEST 2018


Saves one GPR and two instructions, and simplifies the macros a bit.

Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
---
 libavcodec/x86/mdct15.asm | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
index 0309112538..2a2cdbd21b 100644
--- a/libavcodec/x86/mdct15.asm
+++ b/libavcodec/x86/mdct15.asm
@@ -76,7 +76,7 @@ SECTION .text
     addps       m%3,  m%3,  m0          ; Finally offset with DCs
 %endmacro
 
-%macro BUTTERFLIES_DC 2 ; %1 - exptab_offset, %2 - out
+%macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
     mulps xm0,  xm9, [exptabq + %1 + 16*0]
     mulps xm1, xm10, [exptabq + %1 + 16*1]
 
@@ -86,10 +86,10 @@ SECTION .text
     addps   xm0,  xm1
     addps   xm0,  xm8
 
-    movsd [%2q], xm0
+    movsd [outq], xm0
 %endmacro
 
-%macro BUTTERFLIES_AC 2 ; exptab, exptab_offset, src1, src2, src3, out (uses m0-m3)
+%macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
     mulps  m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
     mulps  m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
     mulps  m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
@@ -104,15 +104,14 @@ SECTION .text
 
     vextractf128 xm1, m0, 1
 
-    movlps [%2q + strideq*1], xm0
-    movhps [%2q + strideq*2], xm0
-    movlps [%2q +  stride3q], xm1
-    movhps [%2q + strideq*4], xm1
+    movlps [outq + strideq*1], xm0
+    movhps [outq + strideq*2], xm0
+    movlps [outq +  stride3q], xm1
+    movhps [outq + strideq*4], xm1
 %endmacro
 
 INIT_YMM avx
-cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
-%define out0q inq
+cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
     shl strideq, 3
 
     movaps xm5, [exptabq + 480 + 16*0]
@@ -123,22 +122,20 @@ cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
     FFT5  8,  xm9, 12
     FFT5 16, xm10, 13
 
+%define stride3q inq
     lea stride3q, [strideq + strideq*2]
     lea stride5q, [strideq + strideq*4]
 
-    mov out0q, outq
+    BUTTERFLIES_DC (8*6 + 4*0)*2*4
+    BUTTERFLIES_AC (8*0 + 0*0)*2*4
 
-    BUTTERFLIES_DC (8*6 + 4*0)*2*4, out0
-    lea outq, [out0q + stride5q*1]
-    BUTTERFLIES_DC (8*6 + 4*1)*2*4, out
-    lea outq, [out0q + stride5q*2]
-    BUTTERFLIES_DC (8*6 + 4*2)*2*4, out
+    add outq, stride5q
+    BUTTERFLIES_DC (8*6 + 4*1)*2*4
+    BUTTERFLIES_AC (8*2 + 0*0)*2*4
 
-    BUTTERFLIES_AC (8*0)*2*4, out0
-    lea outq, [out0q + stride5q*1]
-    BUTTERFLIES_AC (8*2)*2*4, out
-    lea outq, [out0q + stride5q*2]
-    BUTTERFLIES_AC (8*4)*2*4, out
+    add outq, stride5q
+    BUTTERFLIES_DC (8*6 + 4*2)*2*4
+    BUTTERFLIES_AC (8*4 + 0*0)*2*4
 
     RET
 
-- 
2.17.0



More information about the ffmpeg-devel mailing list