[FFmpeg-cvslog] x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format
Timothy Gu
git at videolan.org
Sun Feb 14 20:11:31 CET 2016
ffmpeg | branch: master | Timothy Gu <timothygu99 at gmail.com> | Sun Feb 14 04:22:48 2016 +0000| [bcc223523e68a52050dc3f7d0e6a07c82f6f2bff] | committer: Timothy Gu
x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format
Reviewed-by: Christophe Gisquet <christophe.gisquet at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bcc223523e68a52050dc3f7d0e6a07c82f6f2bff
---
libavcodec/x86/vc1dsp.asm | 90 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/vc1dsp_mmx.c | 61 ++++-------------------------
2 files changed, 98 insertions(+), 53 deletions(-)
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 91a1991..eee42c2 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -25,6 +25,7 @@
cextern pw_4
cextern pw_5
cextern pw_9
+cextern pw_128
section .text
@@ -319,6 +320,44 @@ cglobal vc1_h_loop_filter8, 3,5,8
RET
%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will in the future
+; when more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+ pavgb %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+ paddw m3, m7 ; +bias-r
+ paddw m4, m7 ; +bias-r
+ psraw m3, %1
+ psraw m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+ packuswb m3, m4
+ %1 m3, [%2]
+ mova [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+ %1 m3, [%2]
+ %1 m3, [%2 + mmsize]
+ mova [%2], m3
+ mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+ punpcklbw %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
; Compute the rounder 32-r or 8-r and unpacks it to m7
%macro LOAD_ROUNDER_MMX 1 ; round
movd m7, %1
@@ -394,6 +433,57 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
dec i
jnz .loop
REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+; const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can directly be made from
+; memory.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+ mov hq, 8
+ sub srcq, 2
+ sub rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+ LOAD_ROUNDER_MMX rndq
+ mova m5, [pw_9]
+ mova m6, [pw_128]
+ pxor m0, m0
+
+.loop:
+ mova m1, [srcq + 2 * 0]
+ mova m2, [srcq + 2 * 0 + mmsize]
+ mova m3, [srcq + 2 * 1]
+ mova m4, [srcq + 2 * 1 + mmsize]
+ paddw m3, [srcq + 2 * 2]
+ paddw m4, [srcq + 2 * 2 + mmsize]
+ paddw m1, [srcq + 2 * 3]
+ paddw m2, [srcq + 2 * 3 + mmsize]
+ pmullw m3, m5
+ pmullw m4, m5
+ psubw m3, m1
+ psubw m4, m2
+ NORMALIZE_MMX 7
+ ; remove bias
+ paddw m3, m6
+ paddw m4, m6
+ TRANSFER_DO_PACK %1, dstq
+ add srcq, 24
+ add dstq, strideq
+ dec hq
+ jnz .loop
+
+ RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
%endif ; HAVE_MMX_INLINE
%macro INV_TRANS_INIT 0
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index ff13d9b..8325648 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -38,6 +38,10 @@
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -71,55 +75,6 @@ void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
"punpckldq %%mm7, %%mm7 \n\t"
/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
- const int16_t *src, int rnd)\
-{\
- int h = 8;\
-\
- src -= 1;\
- rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
- __asm__ volatile(\
- LOAD_ROUNDER_MMX("%4")\
- "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
- "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
- "1: \n\t"\
- "movq 2*0+0(%1), %%mm1 \n\t"\
- "movq 2*0+8(%1), %%mm2 \n\t"\
- "movq 2*1+0(%1), %%mm3 \n\t"\
- "movq 2*1+8(%1), %%mm4 \n\t"\
- "paddw 2*3+0(%1), %%mm1 \n\t"\
- "paddw 2*3+8(%1), %%mm2 \n\t"\
- "paddw 2*2+0(%1), %%mm3 \n\t"\
- "paddw 2*2+8(%1), %%mm4 \n\t"\
- "pmullw %%mm5, %%mm3 \n\t"\
- "pmullw %%mm5, %%mm4 \n\t"\
- "psubw %%mm1, %%mm3 \n\t"\
- "psubw %%mm2, %%mm4 \n\t"\
- NORMALIZE_MMX("$7")\
- /* Remove bias */\
- "paddw %%mm6, %%mm3 \n\t"\
- "paddw %%mm6, %%mm4 \n\t"\
- TRANSFER_DO_PACK(OP)\
- "add $24, %1 \n\t"\
- "add %3, %2 \n\t"\
- "decl %0 \n\t"\
- "jnz 1b \n\t"\
- : "+r"(h), "+r" (src), "+r" (dst)\
- : "r"(stride), "m"(rnd)\
- NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
- : "memory"\
- );\
-}
-
-VC1_HOR_16b_SHIFT2(OP_PUT, put_)
-VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
-
-
-/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrify mm6 for *9 factor.
*/
@@ -380,14 +335,14 @@ typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
* @param hmode Vertical filter.
* @param rnd Rounding bias.
*/
-#define VC1_MSPEL_MC(OP)\
+#define VC1_MSPEL_MC(OP, INSTR)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
{ NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
- { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
+ { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
@@ -428,8 +383,8 @@ static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}
-VC1_MSPEL_MC(put_)
-VC1_MSPEL_MC(avg_)
+VC1_MSPEL_MC(put_, mmx)
+VC1_MSPEL_MC(avg_, mmxext)
/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
More information about the ffmpeg-cvslog
mailing list