[FFmpeg-devel] [PATCH] avcodec: split off synth_filter
James Almer
jamrial at gmail.com
Sun Jan 24 21:53:19 CET 2016
Signed-off-by: James Almer <jamrial at gmail.com>
---
This should make replacing the current dca decoder for foo86's easier, without
having to disable/reenable the compilation of synth_filter files in between
removal/addition commits.
aarch64 and arm changes untested.
configure | 3 +-
libavcodec/Makefile | 3 +-
libavcodec/aarch64/Makefile | 5 +-
libavcodec/aarch64/dcadsp_init.c | 21 --
.../aarch64/{dcadsp_init.c => synth_filter_init.c} | 16 +-
libavcodec/arm/Makefile | 9 +-
libavcodec/arm/dcadsp_init_arm.c | 22 --
.../dcadsp_init.c => arm/synth_filter_init_arm.c} | 33 +--
libavcodec/x86/Makefile | 2 +
libavcodec/x86/dcadsp.asm | 222 ---------------------
libavcodec/x86/dcadsp_init.c | 51 -----
libavcodec/x86/{dcadsp.asm => synth_filter.asm} | 99 ---------
.../x86/{dcadsp_init.c => synth_filter_init.c} | 21 +-
13 files changed, 26 insertions(+), 481 deletions(-)
copy libavcodec/aarch64/{dcadsp_init.c => synth_filter_init.c} (78%)
copy libavcodec/{aarch64/dcadsp_init.c => arm/synth_filter_init_arm.c} (66%)
copy libavcodec/x86/{dcadsp.asm => synth_filter.asm} (76%)
copy libavcodec/x86/{dcadsp_init.c => synth_filter_init.c} (83%)
diff --git a/configure b/configure
index 8f4642b..1719a5b 100755
--- a/configure
+++ b/configure
@@ -2039,6 +2039,7 @@ CONFIG_EXTRA="
sinewin
snappy
startcode
+ synth_filter
texturedsp
texturedspenc
tpeldsp
@@ -2276,7 +2277,7 @@ comfortnoise_encoder_select="lpc"
cook_decoder_select="audiodsp mdct sinewin"
cscd_decoder_select="lzo"
cscd_decoder_suggest="zlib"
-dca_decoder_select="fmtconvert mdct"
+dca_decoder_select="fmtconvert mdct synth_filter"
dds_decoder_select="texturedsp"
dirac_decoder_select="dirac_parse dwt golomb videodsp mpegvideoenc"
dnxhd_decoder_select="blockdsp idctdsp"
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index b9ffdb9..3a6a453 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -115,6 +115,7 @@ OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o
OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o
OBJS-$(CONFIG_SNAPPY) += snappy.o
OBJS-$(CONFIG_STARTCODE) += startcode.o
+OBJS-$(CONFIG_SYNTH_FILTER) += synth_filter.o
OBJS-$(CONFIG_TEXTUREDSP) += texturedsp.o
OBJS-$(CONFIG_TEXTUREDSPENC) += texturedspenc.o
OBJS-$(CONFIG_TPELDSP) += tpeldsp.o
@@ -223,7 +224,7 @@ OBJS-$(CONFIG_CSCD_DECODER) += cscd.o
OBJS-$(CONFIG_CYUV_DECODER) += cyuv.o
OBJS-$(CONFIG_DCA_DECODER) += dcadec.o dca.o dcadsp.o \
dcadata.o dca_exss.o \
- dca_xll.o synth_filter.o
+ dca_xll.o
OBJS-$(CONFIG_DCA_ENCODER) += dcaenc.o dca.o dcadata.o
OBJS-$(CONFIG_DDS_DECODER) += dds.o
OBJS-$(CONFIG_DIRAC_DECODER) += diracdec.o dirac.o diracdsp.o \
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 022ed84..2df6325 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
+OBJS-$(CONFIG_SYNTH_FILTER) += aarch64/synth_filter_init.o
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
@@ -17,8 +18,7 @@ OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \
- aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
@@ -31,5 +31,6 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+NEON-OBJS-$(CONFIG_SYNTH_FILTER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c
index 78642a5..4440e4b 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -24,23 +24,10 @@
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/dcadsp.h"
-#include "libavcodec/fft.h"
-
-#include "asm-offsets.h"
-
-#if HAVE_NEON || HAVE_VFP
-AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
-#endif
void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-void ff_synth_filter_float_neon(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -50,11 +37,3 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
}
}
-
-av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_neon;
-}
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c
similarity index 78%
copy from libavcodec/aarch64/dcadsp_init.c
copy to libavcodec/aarch64/synth_filter_init.c
index 78642a5..df51ae8 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/synth_filter_init.c
@@ -23,8 +23,7 @@
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
-#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
#include "asm-offsets.h"
@@ -32,25 +31,12 @@
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index cdd35b0..f18e800 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -29,6 +29,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o
OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o
+OBJS-$(CONFIG_SYNTH_FILTER) += arm/synth_filter_init_arm.o
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
@@ -85,10 +86,10 @@ ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
+VFP-OBJS-$(CONFIG_SYNTH_FILTER) += arm/synth_filter_vfp.o
# decoders/encoders
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
- arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o
# NEON optimizations
@@ -119,6 +120,7 @@ NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
arm/mdct_fixed_neon.o
NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
+NEON-OBJS-$(CONFIG_SYNTH_FILTER) += arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
arm/vp8dsp_neon.o
@@ -127,8 +129,7 @@ NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
- arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
arm/hevcdsp_idct_neon.o \
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 0f2e4c4..febb444 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -37,18 +37,6 @@ void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
const float window[512], float *samples_out,
float raXin[32], float scale);
-void ff_synth_filter_float_vfp(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
-void ff_synth_filter_float_neon(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -63,13 +51,3 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
}
}
-
-av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_vfp;
- if (have_neon(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_neon;
-}
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/arm/synth_filter_init_arm.c
similarity index 66%
copy from libavcodec/aarch64/dcadsp_init.c
copy to libavcodec/arm/synth_filter_init_arm.c
index 78642a5..3dae5b9 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/arm/synth_filter_init_arm.c
@@ -20,20 +20,15 @@
#include "config.h"
-#include "libavutil/aarch64/cpu.h"
+#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
-#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
-#include "asm-offsets.h"
-
-#if HAVE_NEON || HAVE_VFP
-AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
-#endif
-
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
+void ff_synth_filter_float_vfp(FFTContext *imdct,
+ float *synth_buf_ptr, int *synth_buf_offset,
+ float synth_buf2[32], const float window[512],
+ float out[32], const float in[32],
+ float scale);
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
@@ -41,20 +36,12 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
-av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_vfp_vm(cpu_flags))
+ s->synth_filter_float = ff_synth_filter_float_vfp;
if (have_neon(cpu_flags))
s->synth_filter_float = ff_synth_filter_float_neon;
}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 0d09fe6..90ff29d 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -31,6 +31,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o \
OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp_init.o
OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp_init.o
OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp_init.o
+OBJS-$(CONFIG_SYNTH_FILTER) += x86/synth_filter_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o
@@ -119,6 +120,7 @@ YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
x86/fpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_RV34DSP) += x86/rv34dsp.o
+YASM-OBJS-$(CONFIG_SYNTH_FILTER) += x86/synth_filter.o
YASM-OBJS-$(CONFIG_IDCTDSP) += x86/simple_idct10.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 502b70a..55e73bc 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -121,225 +121,3 @@ DCA_LFE_FIR 1
INIT_XMM fma3
DCA_LFE_FIR 0
%endif
-
-%macro SETZERO 1
-%if cpuflag(sse2) && notcpuflag(avx)
- pxor %1, %1
-%else
- xorps %1, %1, %1
-%endif
-%endmacro
-
-%macro SHUF 3
-%if cpuflag(avx)
- mova %3, [%2 - 16]
- vperm2f128 %1, %3, %3, 1
- vshufps %1, %1, %1, q0123
-%elif cpuflag(sse2)
- pshufd %1, [%2], q0123
-%else
- mova %1, [%2]
- shufps %1, %1, q0123
-%endif
-%endmacro
-
-%macro INNER_LOOP 1
- ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
- ;~ a += window[i + j] * (-synth_buf[15 - i + j])
- ;~ b += window[i + j + 16] * (synth_buf[i + j])
- SHUF m5, ptr2 + j + (15 - 3) * 4, m6
- mova m6, [ptr1 + j]
-%if ARCH_X86_64
- SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
- mova m12, [ptr1 + j + mmsize]
-%endif
-%if cpuflag(fma3)
- fmaddps m2, m6, [win + %1 + j + 16 * 4], m2
- fnmaddps m1, m5, [win + %1 + j], m1
-%if ARCH_X86_64
- fmaddps m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
- fnmaddps m7, m11, [win + %1 + j + mmsize], m7
-%endif
-%else ; non-FMA
- mulps m6, m6, [win + %1 + j + 16 * 4]
- mulps m5, m5, [win + %1 + j]
-%if ARCH_X86_64
- mulps m12, m12, [win + %1 + j + mmsize + 16 * 4]
- mulps m11, m11, [win + %1 + j + mmsize]
-%endif
- addps m2, m2, m6
- subps m1, m1, m5
-%if ARCH_X86_64
- addps m8, m8, m12
- subps m7, m7, m11
-%endif
-%endif ; cpuflag(fma3)
- ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
- ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
- SHUF m6, ptr2 + j + (31 - 3) * 4, m5
- mova m5, [ptr1 + j + 16 * 4]
-%if ARCH_X86_64
- SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
- mova m11, [ptr1 + j + mmsize + 16 * 4]
-%endif
-%if cpuflag(fma3)
- fmaddps m3, m5, [win + %1 + j + 32 * 4], m3
- fmaddps m4, m6, [win + %1 + j + 48 * 4], m4
-%if ARCH_X86_64
- fmaddps m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
- fmaddps m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
-%endif
-%else ; non-FMA
- mulps m5, m5, [win + %1 + j + 32 * 4]
- mulps m6, m6, [win + %1 + j + 48 * 4]
-%if ARCH_X86_64
- mulps m11, m11, [win + %1 + j + mmsize + 32 * 4]
- mulps m12, m12, [win + %1 + j + mmsize + 48 * 4]
-%endif
- addps m3, m3, m5
- addps m4, m4, m6
-%if ARCH_X86_64
- addps m9, m9, m11
- addps m10, m10, m12
-%endif
-%endif ; cpuflag(fma3)
- sub j, 64 * 4
-%endmacro
-
-; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
-; const float window[512], float out[32],
-; intptr_t offset, float scale)
-%macro SYNTH_FILTER 0
-cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
- synth_buf, synth_buf2, window, out, off, scale
-%define scale m0
-%if ARCH_X86_32 || WIN64
-%if cpuflag(sse2) && notcpuflag(avx)
- movd scale, scalem
- SPLATD m0
-%else
- VBROADCASTSS m0, scalem
-%endif
-; Make sure offset is in a register and not on the stack
-%define OFFQ r4q
-%else
- SPLATD xmm0
-%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
-%endif
-%define OFFQ offq
-%endif
- ; prepare inner counter limit 1
- mov r5q, 480
- sub r5q, offmp
- and r5q, -64
- shl r5q, 2
-%if ARCH_X86_32 || notcpuflag(avx)
- mov OFFQ, r5q
-%define i r5q
- mov i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
-%else
-%define i 0
-%define OFFQ r5q
-%endif
-
-%define buf2 synth_buf2q
-%if ARCH_X86_32
- mov buf2, synth_buf2mp
-%endif
-.mainloop:
- ; m1 = a m2 = b m3 = c m4 = d
- SETZERO m3
- SETZERO m4
- mova m1, [buf2 + i]
- mova m2, [buf2 + i + 16 * 4]
-%if ARCH_X86_32
-%define ptr1 r0q
-%define ptr2 r1q
-%define win r2q
-%define j r3q
- mov win, windowm
- mov ptr1, synth_bufm
-%if ARCH_X86_32 || notcpuflag(avx)
- add win, i
- add ptr1, i
-%endif
-%else ; ARCH_X86_64
-%define ptr1 r6q
-%define ptr2 r7q ; must be loaded
-%define win r8q
-%define j r9q
- SETZERO m9
- SETZERO m10
- mova m7, [buf2 + i + mmsize]
- mova m8, [buf2 + i + mmsize + 16 * 4]
- lea win, [windowq + i]
- lea ptr1, [synth_bufq + i]
-%endif
- mov ptr2, synth_bufmp
- ; prepare the inner loop counter
- mov j, OFFQ
-%if ARCH_X86_32 || notcpuflag(avx)
- sub ptr2, i
-%endif
-.loop1:
- INNER_LOOP 0
- jge .loop1
-
- mov j, 448 * 4
- sub j, OFFQ
- jz .end
- sub ptr1, j
- sub ptr2, j
- add win, OFFQ ; now at j-64, so define OFFSET
- sub j, 64 * 4
-.loop2:
- INNER_LOOP 64 * 4
- jge .loop2
-
-.end:
-%if ARCH_X86_32
- mov buf2, synth_buf2m ; needed for next iteration anyway
- mov outq, outmp ; j, which will be set again during it
-%endif
- ;~ out[i] = a * scale;
- ;~ out[i + 16] = b * scale;
- mulps m1, m1, scale
- mulps m2, m2, scale
-%if ARCH_X86_64
- mulps m7, m7, scale
- mulps m8, m8, scale
-%endif
- ;~ synth_buf2[i] = c;
- ;~ synth_buf2[i + 16] = d;
- mova [buf2 + i + 0 * 4], m3
- mova [buf2 + i + 16 * 4], m4
-%if ARCH_X86_64
- mova [buf2 + i + 0 * 4 + mmsize], m9
- mova [buf2 + i + 16 * 4 + mmsize], m10
-%endif
- ;~ out[i] = a;
- ;~ out[i + 16] = a;
- mova [outq + i + 0 * 4], m1
- mova [outq + i + 16 * 4], m2
-%if ARCH_X86_64
- mova [outq + i + 0 * 4 + mmsize], m7
- mova [outq + i + 16 * 4 + mmsize], m8
-%endif
-%if ARCH_X86_32 || notcpuflag(avx)
- sub i, (ARCH_X86_64 + 1) * mmsize
- jge .mainloop
-%endif
- RET
-%endmacro
-
-%if ARCH_X86_32
-INIT_XMM sse
-SYNTH_FILTER
-%endif
-INIT_XMM sse2
-SYNTH_FILTER
-INIT_YMM avx
-SYNTH_FILTER
-INIT_YMM fma3
-SYNTH_FILTER
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 1321dda..c27c045 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -40,54 +40,3 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
s->lfe_fir[0] = ff_dca_lfe_fir0_fma3;
}
}
-
-
-#define SYNTH_FILTER_FUNC(opt) \
-void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
- const float window[512], \
- float out[32], intptr_t offset, float scale); \
-static void synth_filter_##opt(FFTContext *imdct, \
- float *synth_buf_ptr, int *synth_buf_offset, \
- float synth_buf2[32], const float window[512], \
- float out[32], const float in[32], float scale) \
-{ \
- float *synth_buf= synth_buf_ptr + *synth_buf_offset; \
- \
- imdct->imdct_half(imdct, synth_buf, in); \
- \
- ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window, \
- out, *synth_buf_offset, scale); \
- \
- *synth_buf_offset = (*synth_buf_offset - 32) & 511; \
-} \
-
-#if HAVE_YASM
-#if ARCH_X86_32
-SYNTH_FILTER_FUNC(sse)
-#endif
-SYNTH_FILTER_FUNC(sse2)
-SYNTH_FILTER_FUNC(avx)
-SYNTH_FILTER_FUNC(fma3)
-#endif /* HAVE_YASM */
-
-av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
-{
-#if HAVE_YASM
- int cpu_flags = av_get_cpu_flags();
-
-#if ARCH_X86_32
- if (EXTERNAL_SSE(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse;
- }
-#endif
- if (EXTERNAL_SSE2(cpu_flags)) {
- s->synth_filter_float = synth_filter_sse2;
- }
- if (EXTERNAL_AVX_FAST(cpu_flags)) {
- s->synth_filter_float = synth_filter_avx;
- }
- if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
- s->synth_filter_float = synth_filter_fma3;
- }
-#endif /* HAVE_YASM */
-}
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/synth_filter.asm
similarity index 76%
copy from libavcodec/x86/dcadsp.asm
copy to libavcodec/x86/synth_filter.asm
index 502b70a..bc1a48f 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/synth_filter.asm
@@ -21,107 +21,8 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
-pf_inv16: times 4 dd 0x3D800000 ; 1/16
-
SECTION .text
-; %1=v0/v1 %2=in1 %3=in2
-%macro FIR_LOOP 2-3
-.loop%1:
-%define va m1
-%define vb m2
-%if %1
-%define OFFSET 0
-%else
-%define OFFSET NUM_COEF*count
-%endif
-; for v0, incrementing and for v1, decrementing
- mova va, [cf0q + OFFSET]
- mova vb, [cf0q + OFFSET + 4*NUM_COEF]
-%if %0 == 3
- mova m4, [cf0q + OFFSET + mmsize]
- mova m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
-%endif
- mulps va, %2
- mulps vb, %2
-%if %0 == 3
-%if cpuflag(fma3)
- fmaddps va, m4, %3, va
- fmaddps vb, m0, %3, vb
-%else
- mulps m4, %3
- mulps m0, %3
- addps va, m4
- addps vb, m0
-%endif
-%endif
- ; va = va1 va2 va3 va4
- ; vb = vb1 vb2 vb3 vb4
-%if %1
- SWAP va, vb
-%endif
- mova m4, va
- unpcklps va, vb ; va3 vb3 va4 vb4
- unpckhps m4, vb ; va1 vb1 va2 vb2
- addps m4, va ; va1+3 vb1+3 va2+4 vb2+4
- movhlps vb, m4 ; va1+3 vb1+3
- addps vb, m4 ; va0..4 vb0..4
- movlps [outq + count], vb
-%if %1
- sub cf0q, 8*NUM_COEF
-%endif
- add count, 8
- jl .loop%1
-%endmacro
-
-; void dca_lfe_fir(float *out, float *in, float *coefs)
-%macro DCA_LFE_FIR 1
-cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
-%define IN1 m3
-%define IN2 m5
-%define count inq
-%define NUM_COEF 4*(2-%1)
-%define NUM_OUT 32*(%1+1)
-
- movu IN1, [inq + 4 - 1*mmsize]
- shufps IN1, IN1, q0123
-%if %1 == 0
- movu IN2, [inq + 4 - 2*mmsize]
- shufps IN2, IN2, q0123
-%endif
-
- mov count, -4*NUM_OUT
- add cf0q, 4*NUM_COEF*NUM_OUT
- add outq, 4*NUM_OUT
- ; compute v0 first
-%if %1 == 0
- FIR_LOOP 0, IN1, IN2
-%else
- FIR_LOOP 0, IN1
-%endif
- shufps IN1, IN1, q0123
- mov count, -4*NUM_OUT
- ; cf1 already correctly positioned
- add outq, 4*NUM_OUT ; outq now at out2
- sub cf0q, 8*NUM_COEF
-%if %1 == 0
- shufps IN2, IN2, q0123
- FIR_LOOP 1, IN2, IN1
-%else
- FIR_LOOP 1, IN1
-%endif
- RET
-%endmacro
-
-INIT_XMM sse
-DCA_LFE_FIR 0
-DCA_LFE_FIR 1
-%if HAVE_FMA3_EXTERNAL
-INIT_XMM fma3
-DCA_LFE_FIR 0
-%endif
-
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
pxor %1, %1
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/synth_filter_init.c
similarity index 83%
copy from libavcodec/x86/dcadsp_init.c
copy to libavcodec/x86/synth_filter_init.c
index 1321dda..0649ea2 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/synth_filter_init.c
@@ -21,26 +21,7 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs);
-
-av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (EXTERNAL_SSE(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
- s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
- }
-
- if (EXTERNAL_FMA3(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_fma3;
- }
-}
-
+#include "libavcodec/synth_filter.h"
#define SYNTH_FILTER_FUNC(opt) \
void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32], \
--
2.7.0
More information about the ffmpeg-devel
mailing list