[Ffmpeg-cvslog] r5976 - trunk/libavcodec/i386/dsputil_mmx.c
michael
subversion
Thu Aug 10 22:24:59 CEST 2006
Author: michael
Date: Thu Aug 10 22:24:58 2006
New Revision: 5976
Modified:
trunk/libavcodec/i386/dsputil_mmx.c
Log:
Convert vector_fmul_reverse_sse2 and vector_fmul_add_add_sse2 to SSE.
Please complain if they are slower on SSE2 CPUs.
Modified: trunk/libavcodec/i386/dsputil_mmx.c
==============================================================================
--- trunk/libavcodec/i386/dsputil_mmx.c (original)
+++ trunk/libavcodec/i386/dsputil_mmx.c Thu Aug 10 22:24:58 2006
@@ -2820,12 +2820,14 @@
);
asm volatile("femms");
}
-static void vector_fmul_reverse_sse2(float *dst, const float *src0, const float *src1, int len){
+static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
long i = len*4-32;
asm volatile(
"1: \n\t"
- "pshufd $0x1b, 16(%1), %%xmm0 \n\t"
- "pshufd $0x1b, (%1), %%xmm1 \n\t"
+ "movaps 16(%1), %%xmm0 \n\t"
+ "movaps (%1), %%xmm1 \n\t"
+ "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
"mulps (%3,%0), %%xmm0 \n\t"
"mulps 16(%3,%0), %%xmm1 \n\t"
"movaps %%xmm0, (%2,%0) \n\t"
@@ -2882,7 +2884,7 @@
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
asm volatile("femms");
}
-static void vector_fmul_add_add_sse2(float *dst, const float *src0, const float *src1,
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, float src3, int len, int step){
long i;
if(step == 2 && src3 == 0){
@@ -2896,20 +2898,20 @@
"mulps 16(%3,%0), %%xmm1 \n\t"
"addps (%4,%0), %%xmm0 \n\t"
"addps 16(%4,%0), %%xmm1 \n\t"
- "movd %%xmm0, (%1) \n\t"
- "movd %%xmm1, 32(%1) \n\t"
- "psrldq $4, %%xmm0 \n\t"
- "psrldq $4, %%xmm1 \n\t"
- "movd %%xmm0, 8(%1) \n\t"
- "movd %%xmm1, 40(%1) \n\t"
- "psrldq $4, %%xmm0 \n\t"
- "psrldq $4, %%xmm1 \n\t"
- "movd %%xmm0, 16(%1) \n\t"
- "movd %%xmm1, 48(%1) \n\t"
- "psrldq $4, %%xmm0 \n\t"
- "psrldq $4, %%xmm1 \n\t"
- "movd %%xmm0, 24(%1) \n\t"
- "movd %%xmm1, 56(%1) \n\t"
+ "movss %%xmm0, (%1) \n\t"
+ "movss %%xmm1, 32(%1) \n\t"
+ "movhlps %%xmm0, %%xmm2 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movss %%xmm2, 16(%1) \n\t"
+ "movss %%xmm3, 48(%1) \n\t"
+ "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
+ "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
+ "movss %%xmm0, 8(%1) \n\t"
+ "movss %%xmm1, 40(%1) \n\t"
+ "movhlps %%xmm0, %%xmm2 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movss %%xmm2, 24(%1) \n\t"
+ "movss %%xmm3, 56(%1) \n\t"
"sub $64, %1 \n\t"
"sub $32, %0 \n\t"
"jge 1b \n\t"
@@ -3403,10 +3405,8 @@
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->vector_fmul = vector_fmul_sse;
c->float_to_int16 = float_to_int16_sse;
- }
- if(mm_flags & MM_SSE2){
- c->vector_fmul_reverse = vector_fmul_reverse_sse2;
- c->vector_fmul_add_add = vector_fmul_add_add_sse2;
+ c->vector_fmul_reverse = vector_fmul_reverse_sse;
+ c->vector_fmul_add_add = vector_fmul_add_add_sse;
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse2
More information about the ffmpeg-cvslog
mailing list