[FFmpeg-devel] [PATCH] M68K: Optimized MUL64/MULH/MULL functions for 68060
ami_stuff
ami_stuff
Sun Aug 2 00:14:18 CEST 2009
> > :"=d"(lo), "=d"(hi)
>
> Those should be marked early-clobber (&).
Ok.
> > :"0"(a), "1"(b)
>
> Do these have to be the same regs? Allowing different registers
> theoretically gives the compiler better room for optimal register
> allocation. On the other hand, it gives the compiler more room to
> mess up.
It looks like GCC 4.4.1 generates better code with the matching-register constraints (two fewer move.l instructions):
"0"(a) & "1"(b) output:
#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #15360,-(sp)
move.l 20(sp),d1
move.l 24(sp),d0
#APP
move.l d1, d5
move.l d1, d4
bge.b 0f
neg.l d1
neg.l d4
0:
eor.l d0, d5
move.l d0, d3
bge.b 1f
neg.l d0
neg.l d3
1:
move.w #16, d5
move.l d1, d2
mulu.w d0,d1
lsr.l d5, d3
lsr.l d5, d4
mulu.w d3, d2
mulu.w d4, d0
mulu.w d4, d3
move.l d2, d4
lsr.l d5, d2
add.w d0, d4
addx.l d2, d3
lsl.l d5, d4
lsr.l d5, d0
add.l d4, d1
addx.l d3, d0
tst.l d5
bpl.b 2f
neg.l d1
negx.l d0
2:
#NO_APP
movem.l (sp)+,#60
rts
"d"(a) & "d"(b) output:
#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 32(sp),d1
move.l 28(sp),d0
#APP
move.l d6, d5
move.l d6, d4
bge.b 0f
neg.l d6
neg.l d4
0:
eor.l d7, d5
move.l d7, d3
bge.b 1f
neg.l d7
neg.l d3
1:
move.w #16, d5
move.l d6, d2
mulu.w d7,d6
lsr.l d5, d3
lsr.l d5, d4
mulu.w d3, d2
mulu.w d4, d7
mulu.w d4, d3
move.l d2, d4
lsr.l d5, d2
add.w d7, d4
addx.l d2, d3
lsl.l d5, d4
lsr.l d5, d7
add.l d4, d6
addx.l d3, d7
tst.l d5
bpl.b 2f
neg.l d6
negx.l d7
2:
#NO_APP
move.l d7,d0
move.l d6,d1
movem.l (sp)+,#252
rts
> > :"d2", "d3", "d4", "d5");
>
> Avoid using hardcoded registers, and prefer explicitly declared temp
> variables.
Hmm, I don't know how to do that, or what code GCC will generate after such a change.
Right now the output asm code looks perfect, without any unneeded instructions.
> Out of interest, what does gcc do when left to its own devices?
You mean what the output asm code looks like without the inline asm? In that case GCC calls the slow ___muldi3 library routine.
> > #define MULL(a,b,s) (MUL64(a, b) >> s)
>
> Can gcc really be trusted with this?
/*
 * Multiply a by b at 64-bit precision, shift the product right by s bits
 * and return the result truncated to int.
 *
 * s must be smaller than 64; a larger shift count is undefined behaviour
 * in C. Right-shifting a negative product is implementation-defined (GCC
 * performs an arithmetic shift).
 *
 * The plain prototype below is deliberate. Under C99/C11 rules a function
 * whose every file-scope declaration says "inline" is only an inline
 * definition: no external symbol is emitted and a call that is not inlined
 * fails to link. Declaring it once without "inline" makes this an ordinary
 * external definition, which is also what gnu89 mode produces.
 */
int MULL(int a, int b, unsigned s);

inline int MULL(int a, int b, unsigned s)
{
    return (int)(((int64_t)a * (int64_t)b) >> s);
}
Here is the output from the original function:
#NO_APP
.text
.even
.globl _MULL
_MULL:
movem.l #12320,-(sp)
move.l 24(sp),a2
move.l 16(sp),-(sp)
smi d0
extb.l d0
move.l d0,-(sp)
move.l 28(sp),-(sp)
smi d1
extb.l d1
move.l d1,-(sp)
jsr ___muldi3
lea (16,sp),sp
lea (-32,a2),a0
tst.l a0
jlt L2
move.l a0,d3
asr.l d3,d0
movem.l (sp)+,#1036
rts
L2:
move.l d0,d2
add.l d2,d2
moveq #31,d0
sub.l a2,d0
lsl.l d0,d2
move.l d1,d0
move.l a2,d3
lsr.l d3,d0
or.l d2,d0
movem.l (sp)+,#1036
rts
---
/*
 * Signed 32x32 -> 64 bit multiply built from four unsigned 16x16 -> 32
 * mulu.w products.
 *
 * Method: take |a| and |b|, split each into 16-bit halves (aH:aL, bH:bL),
 * form aL*bL, aL*bH, aH*bL and aH*bH, add the partial products with the
 * carries propagated through the X flag, then negate the 64-bit result if
 * the operands had different signs.
 *
 * Operands:
 *   %0  x.hl[1]  in: a   out: low  32 bits of the product
 *   %1  x.hl[0]  in: b   out: high 32 bits of the product
 * Both outputs are early-clobber data registers, tied to the inputs by the
 * "0" / "1" matching constraints.
 *
 * The union places the high half in hl[0], i.e. it relies on the
 * big-endian layout of a 64-bit integer on m68k.
 *
 * NOTE(review): d2-d5 are hard-coded scratch registers (listed as
 * clobbers) and are written without a '%' prefix, so this presumably only
 * assembles where bare register names are accepted -- confirm for the
 * intended toolchains. Explicit "=&d" temporaries would avoid both issues.
 * NOTE(review): every instruction here changes the condition codes but
 * "cc" is not in the clobber list -- confirm whether the targeted GCC
 * versions need it.
 */
static inline int64_t MUL64(int a, int b)
{
union { uint64_t x; unsigned hl[2]; } x;
__asm__(
"move.l %0, d5 \n\t" /* d5 = a; will hold the sign of the product */
"move.l %0, d4 \n\t" /* d4 = a; also sets the flags from a */
"bge.b 0f \n\t" /* a >= 0: nothing to negate */
"neg.l %0 \n\t" /* %0 = |a| (INT_MIN stays 0x80000000 = 2^31 unsigned) */
"neg.l d4 \n\t" /* d4 = |a| */
"0: \n\t"
"eor.l %1, d5 \n\t" /* d5 = a ^ b: bit 31 set when the signs differ */
"move.l %1, d3 \n\t" /* d3 = b; also sets the flags from b */
"bge.b 1f \n\t" /* b >= 0: nothing to negate */
"neg.l %1 \n\t" /* %1 = |b| */
"neg.l d3 \n\t" /* d3 = |b| */
"1: \n\t"
"move.w #16, d5 \n\t" /* low word of d5 = shift count 16; the word move leaves the sign in bit 31 intact */
"move.l %0, d2 \n\t" /* d2 = |a| */
"mulu.w %1,%0 \n\t" /* %0 = aL * bL */
"lsr.l d5, d3 \n\t" /* d3 = bH */
"lsr.l d5, d4 \n\t" /* d4 = aH */
"mulu.w d3, d2 \n\t" /* d2 = aL * bH */
"mulu.w d4, %1 \n\t" /* %1 = aH * bL */
"mulu.w d4, d3 \n\t" /* d3 = aH * bH */
"move.l d2, d4 \n\t" /* d4 = aL * bH (copy) */
"lsr.l d5, d2 \n\t" /* d2 = (aL * bH) >> 16 */
"add.w %1, d4 \n\t" /* low word of d4 = lo16(aL*bH) + lo16(aH*bL); X = carry */
"addx.l d2, d3 \n\t" /* d3 = aH*bH + ((aL*bH) >> 16) + X */
"lsl.l d5, d4 \n\t" /* move the summed low word into bits 31..16 of d4 */
"lsr.l d5, %1 \n\t" /* %1 = (aH * bL) >> 16 */
"add.l d4, %0 \n\t" /* %0 = low 32 bits of |a|*|b|; X = carry (overwrites X left by the shifts) */
"addx.l d3, %1 \n\t" /* %1 = high 32 bits of |a|*|b| */
"tst.l d5 \n\t" /* test the sign saved in bit 31 of d5 */
"bpl.b 2f \n\t" /* same signs: product is already correct */
"neg.l %0 \n\t" /* 64-bit negate: low half first ... */
"negx.l %1 \n\t" /* ... then high half with the borrow in X */
"2: \n\t"
:"=&d"(x.hl[1]), "=&d"(x.hl[0])
:"0"(a), "1"(b)
:"d2", "d3", "d4", "d5");
return x.x;
}
/*
 * Multiply a by b at 64-bit precision using MUL64(), shift the product
 * right by s bits and return the result truncated to int.
 *
 * s must be smaller than 64; a larger shift count is undefined behaviour
 * in C. Right-shifting a negative product is implementation-defined (GCC
 * performs an arithmetic shift).
 *
 * The plain prototype below is deliberate. Under C99/C11 rules a function
 * whose every file-scope declaration says "inline" is only an inline
 * definition: no external symbol is emitted and a call that is not inlined
 * fails to link. Declaring it once without "inline" makes this an ordinary
 * external definition, which is also what gnu89 mode produces.
 */
int MULL(int a, int b, unsigned s);

inline int MULL(int a, int b, unsigned s)
{
    return (int)(MUL64(a, b) >> s);
}
Here is the output from the asm-optimized function:
#NO_APP
.text
.even
.globl _MULL
_MULL:
movem.l #15360,-(sp)
move.l 28(sp),a0
move.l 20(sp),d1
move.l 24(sp),d0
#APP
move.l d1, d5
move.l d1, d4
bge.b 0f
neg.l d1
neg.l d4
0:
eor.l d0, d5
move.l d0, d3
bge.b 1f
neg.l d0
neg.l d3
1:
move.w #16, d5
move.l d1, d2
mulu.w d0,d1
lsr.l d5, d3
lsr.l d5, d4
mulu.w d3, d2
mulu.w d4, d0
mulu.w d4, d3
move.l d2, d4
lsr.l d5, d2
add.w d0, d4
addx.l d2, d3
lsl.l d5, d4
lsr.l d5, d0
add.l d4, d1
addx.l d3, d0
tst.l d5
bpl.b 2f
neg.l d1
negx.l d0
2:
#NO_APP
lea (-32,a0),a1
tst.l a1
jlt L2
move.l a1,d1
asr.l d1,d0
movem.l (sp)+,#60
rts
L2:
move.l d0,d2
add.l d2,d2
moveq #31,d0
sub.l a0,d0
lsl.l d0,d2
move.l d1,d0
move.l a0,d3
lsr.l d3,d0
or.l d2,d0
movem.l (sp)+,#60
rts
More information about the ffmpeg-devel
mailing list