[FFmpeg-devel] [PATCH] M68K: Optimized MUL64/MULH/MULL functions for 68060

Sun Aug 2 02:39:35 CEST 2009

> int t1, t2, t3, t4;
> asm("..." : "=&d"(t1), "=&d"(t2), "=&d"(t3), "=&d"(t4));

Sorry, but I still don't understand. Should I remove the input/clobber registers from
the MUL64 function and replace the output registers with your t1, t2... variables?

When I replace only the clobber registers with your code, it does not compile.

Help?

/**
 * Signed 32x32 -> 64-bit multiply assembled from four 16x16 mulu.w products.
 *
 * Both factors are reduced to their magnitudes, the magnitudes are multiplied
 * as unsigned 16-bit halves, and the 64-bit result is negated at the end when
 * the signs of a and b differ.
 *
 * The four scratch registers are requested from the compiler as early-clobber
 * outputs (%2..%5) instead of being hard-coded as d2-d5 with a clobber list,
 * so the register allocator can use whichever data registers are free at each
 * inline site.
 *
 * @param a first factor
 * @param b second factor
 * @return the 64-bit product of a and b
 */
static inline av_const int64_t MUL64(int a, int b)
{
    /* hl[1] receives the low 32 bits (%0), hl[0] the high 32 bits (%1). */
    union { uint64_t x; unsigned hl[2]; } x;
    /* Scratch values; only their register allocation matters to the caller. */
    unsigned t2, t3, t4, t5;
    __asm__(
    /* %5 = a (for the sign), %4 = a; make %0 and %4 hold |a| */
    "move.l %0, %5      \n\t"
    "move.l %0, %4      \n\t"
    "bge.b  0f          \n\t"
    "neg.l  %0          \n\t"
    "neg.l  %4          \n\t"
    "0:                 \n\t"
    /* bit 31 of %5 = sign(a) ^ sign(b); make %1 and %3 hold |b| */
    "eor.l  %1, %5      \n\t"
    "move.l %1, %3      \n\t"
    "bge.b  1f          \n\t"
    "neg.l  %1          \n\t"
    "neg.l  %3          \n\t"
    "1:                 \n\t"
    /* low word of %5 becomes the shift count 16; bit 31 keeps the sign */
    "move.w #16, %5     \n\t"
    "move.l %0, %2      \n\t"
    "mulu.w %1, %0      \n\t"   /* %0 = lo(a) * lo(b) */
    "lsr.l  %5, %3      \n\t"   /* %3 = hi(b) */
    "lsr.l  %5, %4      \n\t"   /* %4 = hi(a) */
    "mulu.w %3, %2      \n\t"   /* %2 = lo(a) * hi(b) */
    "mulu.w %4, %1      \n\t"   /* %1 = hi(a) * lo(b) */
    "mulu.w %4, %3      \n\t"   /* %3 = hi(a) * hi(b) */
    /* fold the two cross products into the low and high halves */
    "move.l %2, %4      \n\t"
    "lsr.l  %5, %2      \n\t"
    "add.w  %1, %4      \n\t"   /* sum of the cross products' low words */
    "addx.l %2, %3      \n\t"   /* carry of that sum goes into the high half */
    "lsl.l  %5, %4      \n\t"
    "lsr.l  %5, %1      \n\t"
    "add.l  %4, %0      \n\t"
    "addx.l %3, %1      \n\t"
    /* negate the 64-bit result when the operand signs differed */
    "tst.l  %5          \n\t"
    "bpl.b  2f          \n\t"
    "neg.l  %0          \n\t"
    "negx.l %1          \n\t"
    "2:                 \n\t"
    :"=&d"(x.hl[1]), "=&d"(x.hl[0]),
     "=&d"(t2), "=&d"(t3), "=&d"(t4), "=&d"(t5)
    :"0"(a), "1"(b)
    :"cc");
    return x.x;
}

> > Now the output asm code looks perfect without any unneeded
> > instructions.
> 
> That's because you're looking at this function in isolation.  When
> inlined in a larger function, those registers may well already be in
> use with some others free.

Here is the asm output from one function (mpegaudiodec.c). If there is a need
for another function, just ask.

_ff_mpa_synth_filter:
	lea (-304,sp),sp
	movem.l #16190,-(sp)
	move.l 360(sp),a4
	move.l 368(sp),a6
	move.l 356(sp),a0
	move.l (a0),328(sp)
	move.l 328(sp),d0
	lsl.l #2,d0
	add.l 352(sp),d0
	move.l d0,320(sp)
	move.l 376(sp),-(sp)
	move.l d0,-(sp)
	jsr _dct32
	pea 128.w
	move.l 332(sp),-(sp)
	move.l 336(sp),a0
	pea 2048(a0)
	jsr _memcpy
	moveq #62,d0
	muls.l 392(sp),d0
	add.l a6,d0
	move.l d0,356(sp)
	lea (124,a4),a5
	move.l 384(sp),a0
	move.l (a0),196(sp)
	smi d0
	extb.l d0
	move.l d0,192(sp)
	move.w #64,a0
	add.l 340(sp),a0
	move.l (a4),d1
	move.l (a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,72(sp)
	move.l d1,76(sp)
	move.l 256(a4),d1
	move.l 256(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,80(sp)
	move.l d1,84(sp)
	move.l 512(a4),d1
	move.l 512(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,88(sp)
	move.l d1,92(sp)
	move.l 768(a4),d1
	move.l 768(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,96(sp)
	move.l d1,100(sp)
	move.l 1024(a4),d1
	move.l 1024(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,104(sp)
	move.l d1,108(sp)
	move.l 1280(a4),d1
	move.l 1280(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,112(sp)
	move.l d1,116(sp)
	move.l 1536(a4),d1
	move.l 1536(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,120(sp)
	move.l d1,124(sp)
	move.l 1792(a4),d1
	move.l 1792(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,128(sp)
	move.l d1,132(sp)
	move.l 340(sp),a0
	lea (192,a0),a0
	move.l 128(a4),d1
	move.l (a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,136(sp)
	move.l d1,140(sp)
	move.l 384(a4),d1
	move.l 256(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,144(sp)
	move.l d1,148(sp)
	move.l 640(a4),d1
	move.l 512(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0, d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	move.l d0,152(sp)
	move.l d1,156(sp)
	move.l 896(a4),d1
	move.l 768(a0),d0
#APP
;# 76 "libavcodec/m68k/mathops.h" 1
	| MUL64

> >> > #define MULL(a,b,s)	(MUL64(a, b) >> s)
> 
> That's quite a lot for a right shift.  We also happen to know the
> shift is always a constant and less than 32.  GCC will of course
> theoretically have this information when the function is inlined, so
> we should be looking at code generated by such a call, not this
> function compiled standalone.

_decode_init:
	lea (-1552,sp),sp
	fmovem #4,-(sp)
	movem.l #16190,-(sp)
	move.l 1612(sp),a2
	move.l 136(a2),a3
	move.l a2,23184(a3)
	moveq #1,d0
	move.l d0,72(a2)
	move.l 252(a2),23180(a3)
	moveq #3,d1
	cmp.l 600(a2),d1
	jeq L218
	move.l #_compute_antialias_integer,23168(a3)
	jra L219
L218:
	move.l #_compute_antialias_float,23168(a3)
L219:
	tst.l _init.5731
	jne L220
	tst.l 272(a2)
	jne L220
	lea _scale_factor_modshift,a0
	clr.l d0
L221:
	move.l d0,d2
	moveq #3,d3
	divsl.l d3,d1:d2
	lsl.l #2,d2
	or.w d1,d2
	move.w d2,(a0)+
	addq.l #1,d0
	moveq #64,d4
	cmp.l d0,d4
	jne L221
	lea _scale_factor_mult,a4
	moveq #2,d6
L222:
	move.l d6,-(sp)
	move.l #8388608,-(sp)
	clr.l -(sp)
	jsr ___ashldi3
	lea (12,sp),sp
	moveq #1,d2
	lsl.l d6,d2
	move.l #-1,d3
	add.l d2,d3
	smi d2
	extb.l d2
	move.l d3,-(sp)
	move.l d2,-(sp)
	move.l d1,-(sp)
	move.l d0,-(sp)
	jsr ___divdi3
	lea (16,sp),sp
	move.l #16777216,d0
	move.l d1,d7
#APP
;# 120 "libavcodec/m68k/mathops.h" 1
	| MULL
	move.l d0, d5
	move.l d0, d4
	bge.b  0f
	neg.l  d0
	neg.l  d4
	0:
	eor.l  d7, d5
	move.l d7, d3
	bge.b  1f
	neg.l  d7
	neg.l  d3
	1:
	move.w #16, d5
	move.l d0, d2
	mulu.w d7, d0
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d7
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d7, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d7
	add.l  d4, d0
	addx.l d3, d7
	tst.l  d5
	bpl.b  2f
	neg.l  d0
	negx.l d7
	2:

#NO_APP
	move.l d7,d2
	moveq #9,d3
	lsl.l d3,d2
	moveq #23,d4
	lsr.l d4,d0
	or.l d0,d2
	move.l d2,(a4)
	move.l #13316085,d0
	move.l d1,d7
#APP
;# 120 "libavcodec/m68k/mathops.h" 1
	| MULL
	move.l d0, d5
	move.l d0, d4
	bge.b  0f
	neg.l  d0
	neg.l  d4
	0:
	eor.l  d7, d5
	move.l d7, d3
	bge.b  1f
	neg.l  d7
	neg.l  d3
	1:
	move.w #16, d5
	move.l d0, d2
	mulu.w d7, d0
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d7
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d7, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d7
	add.l  d4, d0
	addx.l d3, d7
	tst.l  d5
	bpl.b  2f
	neg.l  d0
	negx.l d7
	2:

#NO_APP
	move.l d7,d2
	moveq #9,d3
	lsl.l d3,d2
	moveq #23,d4
	lsr.l d4,d0
	or.l d0,d2
	move.l d2,4(a4)
	move.l #10568984,d0
#APP
;# 120 "libavcodec/m68k/mathops.h" 1
	| MULL
	move.l d0, d5
	move.l d0, d4
	bge.b  0f
	neg.l  d0
	neg.l  d4
	0:
	eor.l  d1, d5
	move.l d1, d3
	bge.b  1f
	neg.l  d1
	neg.l  d3
	1:
	move.w #16, d5
	move.l d0, d2
	mulu.w d1, d0
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d1
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d1, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d1
	add.l  d4, d0
	addx.l d3, d1
	tst.l  d5
	bpl.b  2f
	neg.l  d0
	negx.l d1
	2:

#NO_APP
	moveq #9,d5
	lsl.l d5,d1
	moveq #23,d7
	lsr.l d7,d0
	or.l d0,d1
	move.l d1,8(a4)
	addq.l #1,d6
	lea (12,a4),a4
	moveq #17,d0
	cmp.l d6,d0
	jne L222
	pea _window
	jsr _ff_mpa_synth_init
	lea _mpa_huff_tables,a4
	move.l #_huff_vlc_tables_sizes+4,72(sp)
	clr.l d2
	clr.l d3
	addq.l #4,sp
	move.l sp,d7
	add.l #1096,d7
	move.l #_memset,d6