[FFmpeg-devel] [PATCH] x86/dsputilenc: port sum_abs_dctelem functions to yasm
James Almer
jamrial at gmail.com
Sat May 24 05:36:35 CEST 2014
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/dsputilenc.asm | 78 ++++++++++++++++++++++++
libavcodec/x86/dsputilenc_mmx.c | 130 ++++------------------------------------
2 files changed, 89 insertions(+), 119 deletions(-)
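
For reference, a rough scalar sketch of what the ported functions compute
(the helper name below is made up for illustration; unlike this version, the
SIMD implementations saturate their partial sums at 0xffff and mask the
result to 16 bits, see the FIXME comment in the asm):

    #include <stdint.h>
    #include <stdlib.h>

    /* reference-only sketch, not part of the patch */
    static int sum_abs_dctelem_ref(const int16_t *block)
    {
        int sum = 0, i;
        for (i = 0; i < 64; i++)
            sum += abs(block[i]);
        return sum;
    }
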
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 1f496ad..39af81f 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -487,3 +487,81 @@ cglobal pix_norm1, 2, 4
movd eax, m1
RET
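+; HADDUSW %1, %2: horizontally add the unsigned words in %1 with saturation,
+; leaving the sum in the low word of %1; %2 is clobbered as scratch.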
+%macro HADDUSW 2
+%if mmsize >= 16
+ movhlps %2, %1
+ paddusw %1, %2
+ pshuflw %2, %1, q0032
+ paddusw %1, %2
+ pshuflw %2, %1, 1
+ paddusw %1, %2
+%elif cpuflag(mmxext)
+ pshufw %2, %1, q0032
+ paddusw %1, %2
+ pshufw %2, %1, 1
+ paddusw %1, %2
+%else ; mmx
+ movq %2, %1
+ psrlq %1, 32
+ paddusw %1, %2
+ movq %2, %1
+ psrlq %1, 16
+ paddusw %1, %2
+%endif
+%endmacro
+
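+; DCT_SAD4 %1: add the absolute values of the coefficients loaded from
+; blockq + %1 + {0,16,32,48} to the m0/m1 accumulators, using m6 as scratch.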
+%macro DCT_SAD4 1
+ mova m2, [blockq+%1+0 ]
+ mova m3, [blockq+%1+16]
+ mova m4, [blockq+%1+32]
+ mova m5, [blockq+%1+48]
+ ABS1 m2, m6
+ paddusw m0, m2
+ ABS1 m3, m6
+ paddusw m1, m3
+ ABS1 m4, m6
+ paddusw m0, m4
+ ABS1 m5, m6
+ paddusw m1, m5
+%endmacro
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+
+%macro SUM_ABS_DCTELEM 1
+cglobal sum_abs_dctelem, 1, 1, %1, block
+ pxor m0, m0
+ pxor m1, m1
+ DCT_SAD4 0
+%if mmsize == 8
+ DCT_SAD4 8
+%endif
+ DCT_SAD4 64
+%if mmsize == 8
+ DCT_SAD4 72
+%endif
+ paddusw m0, m1
+ ; FIXME: HADDUSW saturates at 64k, while an 8x8 hadamard or dct block can get
+ ; up to about 100k on extreme inputs. But that's very unlikely to occur in
+ ; natural video, and it's even more unlikely to not have any alternative
+ ; mvs/modes with lower cost.
+ HADDUSW m0, m1
+ movd eax, m0
+ and eax, 0xffff
+ RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 1100fb0..e63d510 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -38,6 +38,10 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
#if HAVE_INLINE_ASM
@@ -759,118 +763,6 @@ static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
*left = src2[w - 1];
}
-#define MMABS_MMX(a,z) \
- "pxor " #z ", " #z " \n\t" \
- "pcmpgtw " #a ", " #z " \n\t" \
- "pxor " #z ", " #a " \n\t" \
- "psubw " #z ", " #a " \n\t"
-
-#define MMABS_MMXEXT(a, z) \
- "pxor " #z ", " #z " \n\t" \
- "psubw " #a ", " #z " \n\t" \
- "pmaxsw " #z ", " #a " \n\t"
-
-#define MMABS_SSSE3(a,z) \
- "pabsw " #a ", " #a " \n\t"
-
-#define MMABS_SUM(a,z, sum) \
- MMABS(a,z) \
- "paddusw " #a ", " #sum " \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst) \
- "movq " #a ", " #t " \n\t" \
- "psrlq $32, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movq " #a ", " #t " \n\t" \
- "psrlq $16, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_MMXEXT(a, t, dst) \
- "pshufw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshufw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_SSE2(a, t, dst) \
- "movhlps " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define DCT_SAD4(m, mm, o) \
- "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
- "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
- "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
- "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
- MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
- MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \
-
-#define DCT_SAD_MMX \
- "pxor %%mm0, %%mm0 \n\t" \
- "pxor %%mm1, %%mm1 \n\t" \
- DCT_SAD4(q, %%mm, 0) \
- DCT_SAD4(q, %%mm, 8) \
- DCT_SAD4(q, %%mm, 64) \
- DCT_SAD4(q, %%mm, 72) \
- "paddusw %%mm1, %%mm0 \n\t" \
- HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2 \
- "pxor %%xmm0, %%xmm0 \n\t" \
- "pxor %%xmm1, %%xmm1 \n\t" \
- DCT_SAD4(dqa, %%xmm, 0) \
- DCT_SAD4(dqa, %%xmm, 64) \
- "paddusw %%xmm1, %%xmm0 \n\t" \
- HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu) \
-static int sum_abs_dctelem_ ## cpu(int16_t *block) \
-{ \
- int sum; \
- __asm__ volatile ( \
- DCT_SAD \
- :"=r"(sum) \
- :"r"(block)); \
- return sum & 0xFFFF; \
-}
-
-#define DCT_SAD DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z) MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z) MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z) MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
int size)
{
@@ -1012,8 +904,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->fdct = ff_fdct_mmx;
c->diff_bytes = diff_bytes_mmx;
- c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
c->vsad[4] = vsad_intra16_mmx;
@@ -1041,7 +931,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_mmxext;
- c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
c->vsad[4] = vsad_intra16_mmxext;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
@@ -1055,8 +944,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
if (!high_bit_depth &&
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_sse2;
-
- c->sum_abs_dctelem = sum_abs_dctelem_sse2;
}
#if HAVE_SSSE3_INLINE
@@ -1065,7 +952,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->try_8x8basis = try_8x8basis_ssse3;
}
c->add_8x8basis = add_8x8basis_ssse3;
- c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
}
#endif
#endif /* HAVE_INLINE_ASM */
@@ -1073,15 +959,18 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
@@ -1089,9 +978,12 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif
}
- if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+ if (EXTERNAL_SSSE3(cpu_flags)) {
+ c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
}
ff_dsputil_init_pix_mmx(c, avctx);
--
1.8.5.5