[FFmpeg-cvslog] swr: implement stereo S16/S32/FLT->S16/S32/FLT planar->packed in SSE/SSE2
Michael Niedermayer
git at videolan.org
Sat May 5 19:07:16 CEST 2012
ffmpeg | branch: master | Michael Niedermayer <michaelni at gmx.at> | Sat May 5 15:31:06 2012 +0200| [47055b8913c96a1c41a3bbdf30205255c8453f25] | committer: Michael Niedermayer
swr: implement stereo S16/S32/FLT->S16/S32/FLT planar->packed in SSE/SSE2
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=47055b8913c96a1c41a3bbdf30205255c8453f25
---
libswresample/x86/audio_convert.asm | 148 +++++++++++++++++++++++++++++++++++
libswresample/x86/swresample_x86.c | 34 ++++++++
2 files changed, 182 insertions(+), 0 deletions(-)
diff --git a/libswresample/x86/audio_convert.asm b/libswresample/x86/audio_convert.asm
index 3172352..e520965 100644
--- a/libswresample/x86/audio_convert.asm
+++ b/libswresample/x86/audio_convert.asm
@@ -227,6 +227,135 @@ int32_to_int16_u_int %+ SUFFIX
REP_RET
%endmacro
+; PACK_2CH to, from, a/u, log2_outsize, log2_insize[, convert[, init]]: interleave two planar channels into one packed buffer; optional %6 = per-iteration conversion macro, %7 = setup macro run once before the loop
+%macro PACK_2CH 5-7
+cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2 ; 6 vector regs: the int16<->float *_INIT macros keep their scale factor in m5
+ mov src2q , [srcq+gprsize] ; src2 = src[1], the second channel plane
+ mov srcq , [srcq] ; src = src[0], the first channel plane
+ mov dstq , [dstq] ; dst = dst[0], the packed output buffer
+%ifidn %3, a
+ test dstq, mmsize-1 ; aligned variant: any pointer that is not mmsize-aligned diverts to the unaligned variant's body
+ jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+ test srcq, mmsize-1
+ jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+ test src2q, mmsize-1
+ jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+%else
+pack_2ch_%2_to_%1_u_int %+ SUFFIX
+%endif
+ lea srcq , [srcq + (1<<%5)*lenq] ; advance all three pointers to the end of their buffers ...
+ lea src2q, [src2q + (1<<%5)*lenq]
+ lea dstq , [dstq + (2<<%4)*lenq] ; ... (output holds two samples per unit of len)
+ neg lenq ; ... and let lenq count up from -len towards zero
+ %7
+.next:
+ mov%3 m0, [ srcq +(1<<%5)*lenq] ; one vector of first-channel samples
+ mova m1, m0
+ mov%3 m2, [ src2q+(1<<%5)*lenq] ; one vector of second-channel samples
+%if %5 == 1
+ punpcklwd m0, m2 ; 16-bit input: interleave words, low half into m0 ...
+ punpckhwd m1, m2 ; ... high half into m1
+%else
+ punpckldq m0, m2 ; 32-bit input: interleave dwords, low half into m0 ...
+ punpckhdq m1, m2 ; ... high half into m1
+%endif
+%if %4 < %5
+ mov%3 m2, [mmsize + srcq +(1<<%5)*lenq] ; output narrower than input: interleave a second input vector into m2/m3 for %6 to consume
+ mova m3, m2
+ mov%3 m4, [mmsize + src2q+(1<<%5)*lenq]
+ punpckldq m2, m4
+ punpckhdq m3, m4
+%endif
+ %6
+ mov%3 [ dstq+(2<<%4)*lenq], m0
+ mov%3 [ mmsize + dstq+(2<<%4)*lenq], m1
+%if %4 > %5
+ mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2 ; output wider than input: %6 produced four result vectors
+ mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
+ add lenq, 4*mmsize/(2<<%4) ; step = bytes stored / bytes per stereo output frame
+%else
+ add lenq, 2*mmsize/(2<<%4)
+%endif
+ jl .next
+ REP_RET
+%endmacro
+
+%macro INT16_TO_INT32_N 0 ; widen the int16 samples in m0/m1 to int32 in m0..m3, each sample placed in the upper 16 bits of its dword
+ pxor m2, m2
+ pxor m3, m3
+ punpcklwd m2, m1 ; m2 = low words of m1 interleaved above zero words, i.e. sample << 16
+ punpckhwd m3, m1 ; m3 = the same for the high words of m1
+ SWAP 4,0 ; exchange the names m4 and m0: the input that was in m0 is now read as m4, so m0 can receive output
+ pxor m0, m0
+ pxor m1, m1
+ punpcklwd m0, m4 ; m0/m1 = widened low/high words of the original m0
+ punpckhwd m1, m4
+%endmacro
+
+%macro INT32_TO_INT16_N 0 ; narrow the int32 samples in m0..m3 to int16 in m0/m1, keeping the upper 16 bits of each sample
+ psrad m0, 16 ; arithmetic shift: the top 16 bits, sign included, move to the bottom of each dword
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ packssdw m0, m1 ; pack dwords to words with signed saturation
+ packssdw m2, m3
+ SWAP 1,2 ; exchange the names m1 and m2 so the second packed vector is m1, which PACK_2CH stores after m0
+%endmacro
+
+%macro INT32_TO_FLOAT_INIT 0 ; one-time setup for INT32_TO_FLOAT_N: load its scale factor into m3
+ mova m3, [flt2pm31] ; constant defined outside this hunk -- presumably 2^-31 in every lane, TODO confirm
+%endmacro
+%macro INT32_TO_FLOAT_N 0 ; convert the int32 samples in m0/m1 to float and multiply by the factor in m3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ mulps m0, m0, m3
+ mulps m1, m1, m3
+%endmacro
+
+%macro FLOAT_TO_INT32_INIT 0 ; one-time setup for FLOAT_TO_INT32_N: load the value used as both scale factor and overflow threshold into m3
+ mova m3, [flt2p31] ; constant defined outside this hunk -- presumably 2^31 in every lane, TODO confirm
+%endmacro
+%macro FLOAT_TO_INT32_N 0 ; scale the floats in m0/m1 by m3 and convert to int32, correcting lanes whose scaled value reaches m3
+ mulps m0, m3
+ mulps m1, m3
+ cvtps2dq m2, m0 ; lanes that do not fit in int32 come back as 0x80000000
+ cvtps2dq m4, m1
+ cmpnltps m0, m3 ; all-ones (-1) in every lane whose scaled value is not less than m3
+ cmpnltps m1, m3
+ paddd m0, m2 ; converted value plus mask: flagged lanes become 0x80000000 + (-1) = 0x7fffffff
+ paddd m1, m4
+%endmacro
+
+%macro INT16_TO_FLOAT_INIT 0 ; one-time setup for INT16_TO_FLOAT_N: load its scale factor into m5
+ mova m5, [flt2pm31] ; constant defined outside this hunk -- presumably 2^-31 in every lane, TODO confirm
+%endmacro
+%macro INT16_TO_FLOAT_N 0 ; convert the int16 samples in m0/m1 to float in m0..m3
+ INT16_TO_INT32_N
+ cvtdq2ps m0, m0 ; the widened samples sit in the upper 16 bits of each dword, hence the same scale constant as the int32 path
+ cvtdq2ps m1, m1
+ cvtdq2ps m2, m2
+ cvtdq2ps m3, m3
+ mulps m0, m0, m5
+ mulps m1, m1, m5
+ mulps m2, m2, m5
+ mulps m3, m3, m5
+%endmacro
+
+%macro FLOAT_TO_INT16_INIT 0 ; one-time setup for FLOAT_TO_INT16_N: load its scale factor into m5
+ mova m5, [flt2p15] ; constant defined outside this hunk -- presumably 2^15 in every lane, TODO confirm
+%endmacro
+%macro FLOAT_TO_INT16_N 0 ; scale the floats in m0..m3 by m5 and convert them to int16 in m0/m1
+ mulps m0, m5
+ mulps m1, m5
+ mulps m2, m5
+ mulps m3, m5
+ cvtps2dq m0, m0
+ cvtps2dq m1, m1
+ packssdw m0, m1 ; first output vector: pack dwords to words with signed saturation
+ cvtps2dq m1, m2 ; m1 has been consumed by the pack above, so it is reused for the second output vector
+ cvtps2dq m3, m3
+ packssdw m1, m3
+%endmacro
INIT_MMX mmx
INT16_TO_INT32 u
@@ -240,6 +369,15 @@ INT16_TO_INT32 a
INT32_TO_INT16 u
INT32_TO_INT16 a
+PACK_2CH int16, int16, u, 1, 1 ; args: to, from, u/a variant, log2(output sample bytes), log2(input sample bytes)[, conversion macro]
+PACK_2CH int16, int16, a, 1, 1 ; each a variant follows its u variant: it jumps to a label emitted by the u expansion when a pointer is misaligned
+PACK_2CH int32, int32, u, 2, 2
+PACK_2CH int32, int32, a, 2, 2
+PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N
+PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N
+PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N
+PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N
+
INIT_XMM sse2
INT32_TO_FLOAT u
INT32_TO_FLOAT a
@@ -250,6 +388,16 @@ FLOAT_TO_INT32 a
FLOAT_TO_INT16 u
FLOAT_TO_INT16 a
+PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT ; args: to, from, u/a variant, log2(output sample bytes), log2(input sample bytes), per-iteration conversion macro, one-time setup macro
+PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT ; each a variant follows its u variant: it jumps to a label emitted by the u expansion when a pointer is misaligned
+PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+
+
%if HAVE_AVX
INIT_YMM avx
INT32_TO_FLOAT u
diff --git a/libswresample/x86/swresample_x86.c b/libswresample/x86/swresample_x86.c
index dc6ade5..9370221 100644
--- a/libswresample/x86/swresample_x86.c
+++ b/libswresample/x86/swresample_x86.c
@@ -35,6 +35,16 @@ void ff_float_to_int16_a_sse2(uint8_t **dst, const uint8_t **src, int len);
void ff_int32_to_float_a_avx(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_int16_a_sse(uint8_t **dst, const uint8_t **src, int len); /* stereo planar->packed kernels generated by PACK_2CH in audio_convert.asm; named <input>_to_<output>; they read src[0], src[1] and write dst[0] */
+void ff_pack_2ch_int32_to_int32_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_int32_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int32_to_int16_a_sse(uint8_t **dst, const uint8_t **src, int len);
+
+void ff_pack_2ch_int32_to_float_a_sse2(uint8_t **dst, const uint8_t **src, int len); /* converting variants; the _a entry points divert to the unaligned code path when a pointer is misaligned */
+void ff_pack_2ch_float_to_int32_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_float_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_float_to_int16_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+
void swri_audio_convert_init_x86(struct AudioConvert *ac,
enum AVSampleFormat out_fmt,
enum AVSampleFormat in_fmt,
@@ -56,6 +66,19 @@ void swri_audio_convert_init_x86(struct AudioConvert *ac,
MULTI_CAPS_FUNC(AV_CPU_FLAG_MMX, mmx)
MULTI_CAPS_FUNC(AV_CPU_FLAG_SSE, sse)
+ if(mm_flags & AV_CPU_FLAG_SSE2) { /* the *_sse pack_2ch kernels are built from integer punpck/psrad/packssdw operations, which are SSE2 instructions on XMM registers; checking only SSE could select them on a CPU that faults on them */
+ if(channels == 2) { /* the pack_2ch kernels handle exactly two planes */
+ if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) /* same 32-bit sample type on both sides: pure interleave, so the int32 kernel serves float too */
+ ac->simd_f = ff_pack_2ch_int32_to_int32_a_sse;
+ if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S16P)
+ ac->simd_f = ff_pack_2ch_int16_to_int16_a_sse;
+ if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S16P)
+ ac->simd_f = ff_pack_2ch_int16_to_int32_a_sse;
+ if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S32P)
+ ac->simd_f = ff_pack_2ch_int32_to_int16_a_sse;
+ }
+ }
+
if(mm_flags & AV_CPU_FLAG_SSE2) {
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
ac->simd_f = ff_int32_to_float_a_sse2;
@@ -65,6 +88,17 @@ MULTI_CAPS_FUNC(AV_CPU_FLAG_SSE, sse)
ac->simd_f = ff_float_to_int32_a_sse2;
if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP)
ac->simd_f = ff_float_to_int16_a_sse2;
+
+ if(channels == 2) { /* stereo planar input to packed output with a sample-format conversion: use the converting pack_2ch kernels */
+ if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P)
+ ac->simd_f = ff_pack_2ch_int32_to_float_a_sse2;
+ if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP)
+ ac->simd_f = ff_pack_2ch_float_to_int32_a_sse2;
+ if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S16P)
+ ac->simd_f = ff_pack_2ch_int16_to_float_a_sse2;
+ if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLTP)
+ ac->simd_f = ff_pack_2ch_float_to_int16_a_sse2;
+ }
}
if(HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
More information about the ffmpeg-cvslog
mailing list