[FFmpeg-devel] [PATCH] M68K: Optimized MUL64/MULH/MULL functions for 68060

Sun Aug 2 00:14:18 CEST 2009

> >     :"=d"(lo), "=d"(hi)
> 
> Those should be marked early-clobber (&).

Ok.

> >     :"0"(a), "1"(b)
> 
> Do these have to be the same regs?  Allowing different registers
> theoretically gives the compiler better room for optimal register
> allocation.  On the other hand, it gives the compiler more room to
> mess up.

It looks like GCC 4.4.1 generates better code with the matching-register constraints (two fewer move.l instructions):

"0"(a) & "1"(b) output:

#NO_APP
	.text
	.even
	.globl	_MUL64
_MUL64:
	movem.l #15360,-(sp)
	move.l 20(sp),d1
	move.l 24(sp),d0
#APP
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0,d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	movem.l (sp)+,#60
	rts

"d"(a) & "d"(b) output:

#NO_APP
	.text
	.even
	.globl	_MUL64
_MUL64:
	movem.l #16128,-(sp)
	move.l 32(sp),d1
	move.l 28(sp),d0
#APP
	move.l d6, d5
	move.l d6, d4
	bge.b  0f
	neg.l  d6
	neg.l  d4
	0:
	eor.l  d7, d5
	move.l d7, d3
	bge.b  1f
	neg.l  d7
	neg.l  d3
	1:
	move.w #16, d5
	move.l d6, d2
	mulu.w d7,d6
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d7
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d7, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d7
	add.l  d4, d6
	addx.l d3, d7
	tst.l  d5
	bpl.b  2f
	neg.l  d6
	negx.l d7
	2:

#NO_APP
	move.l d7,d0
	move.l d6,d1
	movem.l (sp)+,#252
	rts

> >     :"d2", "d3", "d4", "d5");
> 
> Avoid using hardcoded registers, and prefer explicitly declared temp
> variables.

Hmm, I don't know how to do that, or what code GCC would generate after such a change.
As it stands, the generated asm looks perfect, without any unneeded instructions.

> Out of interest, what does gcc do when left to its own devices?

Do you mean what the generated code looks like without the inline asm? In that case GCC calls the slow __muldi3 library routine.

> > #define MULL(a,b,s)	(MUL64(a, b) >> s)
> 
> Can gcc really be trusted with this?

/**
 * Multiply two ints using a 64-bit intermediate and return the product
 * shifted right by s bits, truncated to int.
 *
 * Declared static inline (like MUL64): a bare `inline` definition has
 * external linkage under C99 and later but emits no out-of-line body, so a
 * call that is not inlined would fail to link.
 *
 * @param a first factor
 * @param b second factor
 * @param s right-shift count applied to the 64-bit product; the caller must
 *          keep it below 64
 * @return (a * b) >> s converted to int
 */
static inline int MULL(int a, int b, unsigned s){
    int64_t product = (int64_t)a * (int64_t)b;
    return (int)(product >> s);
}

Here is output from original function:

#NO_APP
	.text
	.even
	.globl	_MULL
_MULL:
	movem.l #12320,-(sp)
	move.l 24(sp),a2
	move.l 16(sp),-(sp)
	smi d0
	extb.l d0
	move.l d0,-(sp)
	move.l 28(sp),-(sp)
	smi d1
	extb.l d1
	move.l d1,-(sp)
	jsr ___muldi3
	lea (16,sp),sp
	lea (-32,a2),a0
	tst.l a0
	jlt L2
	move.l a0,d3
	asr.l d3,d0
	movem.l (sp)+,#1036
	rts
L2:
	move.l d0,d2
	add.l d2,d2
	moveq #31,d0
	sub.l a2,d0
	lsl.l d0,d2
	move.l d1,d0
	move.l a2,d3
	lsr.l d3,d0
	or.l d2,d0
	movem.l (sp)+,#1036
	rts

---

/*
 * Signed 32x32 -> 64 bit multiply written with 16x16 -> 32 bit `mulu.w`
 * instructions only.
 *
 * Method: both factors are replaced by their magnitudes, the four 16-bit
 * partial products are formed and summed with the carries propagated through
 * the X flag, and the 64-bit result is negated at the end if the factors had
 * different signs.
 *
 * Operands:
 *   %0  starts as a (tied input "0"), ends as the low  32 bits -> x.hl[1]
 *   %1  starts as b (tied input "1"), ends as the high 32 bits -> x.hl[0]
 * so hl[0] is the most significant word of x.x, i.e. the union relies on
 * big-endian word order (as on m68k).
 *
 * Scratch registers d2-d5 are named directly in the template and listed as
 * clobbers.
 * NOTE(review): the registers are written without a '%' prefix, which
 * presumably relies on the assembler accepting unprefixed register names;
 * confirm for the targets this must build on.
 * NOTE(review): the instruction order matters: each addx.l/negx.l consumes
 * the X flag produced by the instruction directly before it.
 */
static inline int64_t MUL64(int a, int b)
{
    union { uint64_t x; unsigned hl[2]; } x;
    __asm__(
    /* d5 = a (sign tracking), d4 = a; the flags of the second move drive bge */
    "move.l %0, d5      \n\t"
    "move.l %0, d4      \n\t"
    "bge.b  0f          \n\t"
    /* a < 0: %0 = d4 = |a| */
    "neg.l  %0          \n\t"
    "neg.l  d4          \n\t"
    "0:                 \n\t"
    /* bit 31 of d5 = sign(a) ^ sign(b) = sign of the product */
    "eor.l  %1, d5      \n\t"
    "move.l %1, d3      \n\t"
    "bge.b  1f          \n\t"
    /* b < 0: %1 = d3 = |b| */
    "neg.l  %1          \n\t"
    "neg.l  d3          \n\t"
    "1:                 \n\t"
    /* Only the low word of d5 is overwritten: bit 31 keeps the product sign
     * while the low bits supply the shift count 16 used below.
     * NOTE(review): this depends on register-count shifts ignoring the upper
     * bits of the count register (count taken modulo 64 on m68k). */
    "move.w #16, d5     \n\t"
    "move.l %0, d2      \n\t"
    /* %0 = lo(|a|) * lo(|b|) */
    "mulu.w %1,%0       \n\t"
    /* d3 = hi(|b|), d4 = hi(|a|) */
    "lsr.l  d5, d3      \n\t"
    "lsr.l  d5, d4      \n\t"
    /* d2 = lo(|a|) * hi(|b|) */
    "mulu.w d3, d2      \n\t"
    /* %1 = hi(|a|) * lo(|b|) */
    "mulu.w d4, %1      \n\t"
    /* d3 = hi(|a|) * hi(|b|) */
    "mulu.w d4, d3      \n\t"
    /* d4 keeps a copy of the first cross product, d2 keeps its upper half */
    "move.l d2, d4      \n\t"
    "lsr.l  d5, d2      \n\t"
    /* add the low halves of both cross products; the 16-bit carry goes to X */
    "add.w  %1, d4      \n\t"
    /* d3 += upper half of first cross product + that carry */
    "addx.l d2, d3      \n\t"
    /* move the summed low halves into bits 16..31 */
    "lsl.l  d5, d4      \n\t"
    /* %1 = upper half of the second cross product */
    "lsr.l  d5, %1      \n\t"
    /* low result word; the carry out goes to X */
    "add.l  d4, %0      \n\t"
    /* high result word = remaining partial sums + carry */
    "addx.l d3, %1      \n\t"
    /* negate the 64-bit magnitude if the product sign bit in d5 is set */
    "tst.l  d5          \n\t"
    "bpl.b  2f          \n\t"
    "neg.l  %0          \n\t"
    "negx.l %1          \n\t"
    "2:                 \n\t"
    /* early-clobber outputs, tied to the inputs a and b below */
    :"=&d"(x.hl[1]), "=&d"(x.hl[0])
    :"0"(a), "1"(b)
    :"d2", "d3", "d4", "d5");
    return x.x;
}

/**
 * Multiply two ints through MUL64() and return the 64-bit product shifted
 * right by s bits, truncated to int.
 *
 * Declared static inline: a bare `inline` definition has external linkage
 * under C99 and later, emits no out-of-line body, and may not reference the
 * internal-linkage MUL64().
 *
 * @param a first factor
 * @param b second factor
 * @param s right-shift count applied to the 64-bit product; the caller must
 *          keep it below 64
 * @return MUL64(a, b) >> s converted to int
 */
static inline int MULL(int a, int b, unsigned s){
    return (int)(MUL64(a, b) >> s);
}

Here is output from asm-optimized function:

#NO_APP
	.text
	.even
	.globl	_MULL
_MULL:
	movem.l #15360,-(sp)
	move.l 28(sp),a0
	move.l 20(sp),d1
	move.l 24(sp),d0
#APP
	move.l d1, d5
	move.l d1, d4
	bge.b  0f
	neg.l  d1
	neg.l  d4
	0:
	eor.l  d0, d5
	move.l d0, d3
	bge.b  1f
	neg.l  d0
	neg.l  d3
	1:
	move.w #16, d5
	move.l d1, d2
	mulu.w d0,d1
	lsr.l  d5, d3
	lsr.l  d5, d4
	mulu.w d3, d2
	mulu.w d4, d0
	mulu.w d4, d3
	move.l d2, d4
	lsr.l  d5, d2
	add.w  d0, d4
	addx.l d2, d3
	lsl.l  d5, d4
	lsr.l  d5, d0
	add.l  d4, d1
	addx.l d3, d0
	tst.l  d5
	bpl.b  2f
	neg.l  d1
	negx.l d0
	2:

#NO_APP
	lea (-32,a0),a1
	tst.l a1
	jlt L2
	move.l a1,d1
	asr.l d1,d0
	movem.l (sp)+,#60
	rts
L2:
	move.l d0,d2
	add.l d2,d2
	moveq #31,d0
	sub.l a0,d0
	lsl.l d0,d2
	move.l d1,d0
	move.l a0,d3
	lsr.l d3,d0
	or.l d2,d0
	movem.l (sp)+,#60
	rts