[FFmpeg-devel] [PATCH] x86/dsputil: fix VECTOR_CLIP_INT32 macro
James Almer
jamrial at gmail.com
Fri May 23 09:05:43 CEST 2014
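The unrolled %rep loop multiplied each offset by %%i instead of adding
%%i to it, and incremented %%i by 1 instead of by the number of
registers handled per iteration. As a result, every iteration after the
first read from and wrote to the wrong offsets.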
Disassembly of ff_vector_clip_int32_sse2 before (left column) and
after (right column) this patch:
movdqa (%rdx),%xmm0 | movdqa (%rdx),%xmm0
movdqa 0x10(%rdx),%xmm1 | movdqa 0x10(%rdx),%xmm1
movdqa 0x20(%rdx),%xmm2 | movdqa 0x20(%rdx),%xmm2
movdqa 0x30(%rdx),%xmm3 | movdqa 0x30(%rdx),%xmm3
[...] |
movdqa %xmm0,(%rcx) | movdqa %xmm0,(%rcx)
movdqa %xmm1,0x10(%rcx) | movdqa %xmm1,0x10(%rcx)
movdqa %xmm2,0x20(%rcx) | movdqa %xmm2,0x20(%rcx)
movdqa %xmm3,0x30(%rcx) | movdqa %xmm3,0x30(%rcx)
movdqa (%rdx),%xmm0 | movdqa 0x40(%rdx),%xmm0
movdqa 0x20(%rdx),%xmm1 | movdqa 0x50(%rdx),%xmm1
movdqa 0x40(%rdx),%xmm2 | movdqa 0x60(%rdx),%xmm2
movdqa 0x60(%rdx),%xmm3 | movdqa 0x70(%rdx),%xmm3
[...] |
movdqa %xmm0,(%rcx) | movdqa %xmm0,0x40(%rcx)
movdqa %xmm1,0x20(%rcx) | movdqa %xmm1,0x50(%rcx)
movdqa %xmm2,0x40(%rcx) | movdqa %xmm2,0x60(%rcx)
movdqa %xmm3,0x60(%rcx) | movdqa %xmm3,0x70(%rcx)
add $0x80,%rdx | add $0x80,%rdx
add $0x80,%rcx | add $0x80,%rcx
Other versions were unaffected: with a single %rep iteration %%i is
always 1, so the old offsets happened to be correct.
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/dsputil.asm | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 8ebc9a0..bba60e5 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -351,17 +351,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
SPLATD m4
SPLATD m5
.loop:
-%assign %%i 1
+%assign %%i 0
%rep %2
- mova m0, [srcq+mmsize*0*%%i]
- mova m1, [srcq+mmsize*1*%%i]
- mova m2, [srcq+mmsize*2*%%i]
- mova m3, [srcq+mmsize*3*%%i]
+ mova m0, [srcq+mmsize*(0+%%i)]
+ mova m1, [srcq+mmsize*(1+%%i)]
+ mova m2, [srcq+mmsize*(2+%%i)]
+ mova m3, [srcq+mmsize*(3+%%i)]
%if %3
- mova m7, [srcq+mmsize*4*%%i]
- mova m8, [srcq+mmsize*5*%%i]
- mova m9, [srcq+mmsize*6*%%i]
- mova m10, [srcq+mmsize*7*%%i]
+ mova m7, [srcq+mmsize*(4+%%i)]
+ mova m8, [srcq+mmsize*(5+%%i)]
+ mova m9, [srcq+mmsize*(6+%%i)]
+ mova m10, [srcq+mmsize*(7+%%i)]
%endif
CLIPD m0, m4, m5, m6
CLIPD m1, m4, m5, m6
@@ -373,17 +373,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
CLIPD m9, m4, m5, m6
CLIPD m10, m4, m5, m6
%endif
- mova [dstq+mmsize*0*%%i], m0
- mova [dstq+mmsize*1*%%i], m1
- mova [dstq+mmsize*2*%%i], m2
- mova [dstq+mmsize*3*%%i], m3
+ mova [dstq+mmsize*(0+%%i)], m0
+ mova [dstq+mmsize*(1+%%i)], m1
+ mova [dstq+mmsize*(2+%%i)], m2
+ mova [dstq+mmsize*(3+%%i)], m3
%if %3
- mova [dstq+mmsize*4*%%i], m7
- mova [dstq+mmsize*5*%%i], m8
- mova [dstq+mmsize*6*%%i], m9
- mova [dstq+mmsize*7*%%i], m10
+ mova [dstq+mmsize*(4+%%i)], m7
+ mova [dstq+mmsize*(5+%%i)], m8
+ mova [dstq+mmsize*(6+%%i)], m9
+ mova [dstq+mmsize*(7+%%i)], m10
%endif
-%assign %%i %%i+1
+%assign %%i %%i+4*(%3+1)
%endrep
add srcq, mmsize*4*(%2+%3)
add dstq, mmsize*4*(%2+%3)
--
1.8.5.5
More information about the ffmpeg-devel
mailing list