[Ffmpeg-devel] [PATCH] put_mpeg4_qpel16_h_lowpass altivec implementation
Michael Niedermayer
michaelni
Mon Nov 20 02:43:17 CET 2006
Hi
On Sun, Nov 19, 2006 at 11:20:14PM +0000, Brian Foley wrote:
> Hi there,
>
> please find attached a first cut at some Altivec acceleration for the
> mpeg4_qpel functions. To get things started, I've translated
> put_mpeg4_qpel16_h_lowpass from the C version, as it was the most CPU
> intensive function that showed up when playing some 720p Xvid.
>
> A clip that benchmarked at 11.6s before now takes 9.0s on my 1.8GHz
> G5 iMac, and that 720p Xvid no longer drops frames in the places it
> used to before.
>
> It should be a safe enough patch to apply, as I've tested it fairly
> carefully with a large set of random inputs, focussing on things that
> could cause overflow/rounding errors. As far as I can tell, it gives
> exactly the same outputs as the C version in every case.
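For reference, a minimal sketch of that kind of randomized comparison; the
_c/_altivec entry points and the standalone harness are hypothetical (the real
C version is a static function generated inside dsputil.c), so treat this as an
illustration of the check, not something that links as is:

/* hedged sketch of a randomized bit-exactness check; the _c/_altivec
 * entry points below are hypothetical names */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

void put_mpeg4_qpel16_h_lowpass_c      (uint8_t *dst, uint8_t *src,
                                        int dstStride, int srcStride, int h);
void put_mpeg4_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src,
                                        int dstStride, int srcStride, int h);

int main(void)
{
    static uint8_t src[24 * 32];
    static uint8_t dst_c[16 * 16] __attribute__((aligned(16)));
    static uint8_t dst_v[16 * 16] __attribute__((aligned(16)));
    int t, i;

    srand(1);
    for (t = 0; t < 10000; t++) {
        for (i = 0; i < (int)sizeof(src); i++)
            src[i] = rand() & 0xff;                   /* random pixels      */
        /* unaligned src (offset 3), 16-byte aligned dst, 16 rows          */
        put_mpeg4_qpel16_h_lowpass_c      (dst_c, src + 3, 16, 32, 16);
        put_mpeg4_qpel16_h_lowpass_altivec(dst_v, src + 3, 16, 32, 16);
        assert(!memcmp(dst_c, dst_v, sizeof(dst_c))); /* must be bit exact  */
    }
    return 0;
}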
>
> Other obvious candidates to Altivec-ify are put_mpeg4_qpel16_v_lowpass,
> all the avg_mpeg4 equivalents, and the mpeg4_qpel8 variants. I'll try
> to get around to doing some of those soon if someone doesn't beat me to
> it :)
> Index: ppc/dsputil_altivec.c
> ===================================================================
> --- ppc/dsputil_altivec.c (revision 7124)
> +++ ppc/dsputil_altivec.c (working copy)
> @@ -1556,6 +1556,342 @@
> POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
> }
>
> +
> +static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> +{
> + int i;
> + for(i=0; i<h; i++)
> + {
> + ST32(dst , LD32(src ));
> + ST32(dst+4 , LD32(src+4 ));
> + ST32(dst+8 , LD32(src+8 ));
> + ST32(dst+12, LD32(src+12));
> + dst[16]= src[16];
> + dst+=dstStride;
> + src+=srcStride;
> + }
> +}
code duplication: move copy_block17 to a common header like dsputil.h or
dsputil_internal.h or whatever, don't copy and paste it
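For illustration, a rough sketch of what that could look like; the header name
and the assumption that LD32/ST32 are visible from it are mine, not part of
the patch:

/* dsputil_internal.h (hypothetical name) -- shared scalar helpers, so
 * dsputil.c and ppc/dsputil_altivec.c include one copy instead of two */
#ifndef DSPUTIL_INTERNAL_H
#define DSPUTIL_INTERNAL_H

#include "dsputil.h"   /* assumed to provide uint8_t, LD32 and ST32 */

static inline void copy_block17(uint8_t *dst, uint8_t *src,
                                int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}

#endif /* DSPUTIL_INTERNAL_H */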
> +
> +
> +static void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride,
> + int src_stride1, int src_stride2, int h)
> +{
> + register vector unsigned char src1v, src2v, dstv;
> + int i;
> + if (((int)dst & 0xf) || (dst_stride & 0xf)) {
> + /*
> + * If the dst or dst_stride are unaligned, just use the integer version
> + * instead. The speedup is probably not worth the complexity involved.
> + */
> + for(i=0; i<h; i++){
> + uint32_t a,b;
> + a= LD32(&src1[i*src_stride1 ]);
> + b= LD32(&src2[i*src_stride2 ]);
> + *((uint32_t*)&dst[i*dst_stride ]) = rnd_avg32(a, b);
> + a= LD32(&src1[i*src_stride1+ 4]);
> + b= LD32(&src2[i*src_stride2+ 4]);
> + *((uint32_t*)&dst[i*dst_stride+ 4]) = rnd_avg32(a, b);
> + a= LD32(&src1[i*src_stride1+ 8]);
> + b= LD32(&src2[i*src_stride2+ 8]);
> + *((uint32_t*)&dst[i*dst_stride+ 8]) = rnd_avg32(a, b);
> + a= LD32(&src1[i*src_stride1+12]);
> + b= LD32(&src2[i*src_stride2+12]);
> + *((uint32_t*)&dst[i*dst_stride+ 12]) = rnd_avg32(a, b);
> + }
code duplication
> + } else {
> + /*
> + * We can rely on writes to dst + N*dst_stride being aligned, but
> + * we'll have to allow for unaligned reads. This is fair enough:
> + * unaligned reads aren't too bad, but unaligned writes are horribly
> + * expensive.
> + */
> + for(i=0; i<h; i++) {
> + src1v = vec_perm(vec_ld(0, src1), vec_ld(15, src1), vec_lvsl(0, src1));
> + src2v = vec_perm(vec_ld(0, src2), vec_ld(15, src2), vec_lvsl(0, src2));
one of the two sources is aligned in many cases
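A hedged sketch of exploiting that, checking once per call whether src2 and
its stride happen to be 16-byte aligned; the structure is mine, not part of
the patch, and the unaligned branch would keep the loop above unchanged:

/* sketch only: when one of the sources (here src2) and its stride are
 * 16-byte aligned, the permute for that source is not needed */
if (!((long)src2 & 0xf) && !(src_stride2 & 0xf)) {
    for (i = 0; i < h; i++) {
        src1v = vec_perm(vec_ld(0, src1), vec_ld(15, src1), vec_lvsl(0, src1));
        src2v = vec_ld(0, src2);               /* aligned load, no permute */
        vec_st(vec_avg(src1v, src2v), 0, dst);
        src1 += src_stride1;
        src2 += src_stride2;
        dst  += dst_stride;
    }
} else {
    /* unaligned src2: keep the existing permuted-load loop from above */
}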
> +
> + /* Happily, Altivec's avg is exactly the (a+b+1)>>1 that we want */
> + dstv = vec_avg(src1v, src2v);
> + vec_st(dstv, 0, dst);
> + src1 += src_stride1;
> + src2 += src_stride2;
> + dst += dst_stride;
> + }
> + }
> +}
> +
> +/* Assume that dst % 16 == 0. If not the vec_st(...,0, dst) at the end will fail */
> +static void put_mpeg4_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
> + int i;
> + vector unsigned char srcAv, srcBv;
> + const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
> + const_vector unsigned char three = (const_vector unsigned char)vec_splat_u8(3);
> + const_vector unsigned char six = (const_vector unsigned char)vec_splat_u8(6);
> + const_vector unsigned char twenty = (const_vector unsigned char)vec_splat(vec_lvsr(0,(unsigned char *)0),4);
> + const_vector unsigned char p2 = (const_vector unsigned char)vec_lvsl(1, (unsigned char *)0);
> + const_vector unsigned char p3 = (const_vector unsigned char)AVV( 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14);
> + const_vector unsigned char p4 = (const_vector unsigned char)AVV( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,16);
> + const_vector unsigned char p5 = (const_vector unsigned char)AVV( 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13);
> + const_vector unsigned char p6 = (const_vector unsigned char)AVV( 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,16,15);
> + const_vector unsigned char p7 = (const_vector unsigned char)AVV( 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12);
> + const_vector unsigned char p8 = (const_vector unsigned char)AVV( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,16,15,14);
> +
> + vector unsigned short five = vec_splat_u8(5); /* Used for vec_sra on uint_16s. the 0x05s in the even bytes are ignored */
> + vector unsigned short sixteen = (const_vector unsigned short)AVV(16, 16, 16, 16, 16, 16, 16, 16);
> +
> + vector unsigned char v2, v3, v4, v5, v6, v7, v8;
> + vector unsigned char v1h, v2h, v3h, v4h, v5h, v6h;
> + vector unsigned char v1l, v2l, v3l, v4l, v5l, v6l;
> + vector unsigned short v7h, v8h, v7l, v8l;
> +
> + vector unsigned short ph, pl, mh, ml, tmph, tmpl;
> +
> + for(i=0; i<h; i++)
> + {
> + /* Read unaligned: srcAv is src[0]--src[15] srcBv contains src[16]--src[31] */
> + srcAv = vec_perm(vec_ld( 0, src) , vec_ld(15, src), vec_lvsl( 0, src));
> + srcBv = vec_perm(vec_ld(16, src) , vec_ld(31, src), vec_lvsl(16, src));
src, too, is aligned in several cases where this function is called
also, isn't vec_lvsl( 0, src) == vec_lvsl(16, src)?
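It is: vec_lvsl(n, p) depends only on (p + n) & 0xf, so adding 16 does not
change it and one permute vector suffices. A minimal sketch of computing it
once per row (and it becomes loop invariant whenever srcStride is a multiple
of 16):

/* one permute vector covers both loads, since the low four bits of the
 * effective address are the same for offsets 0 and 16 */
vector unsigned char perm = vec_lvsl(0, src);
srcAv = vec_perm(vec_ld( 0, src), vec_ld(15, src), perm);
srcBv = vec_perm(vec_ld(16, src), vec_ld(31, src), perm);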
> +
> + /* v1 (= srcAv) ... v8 are the src[...] columns in the C version of this code */
> + v2 = vec_perm(srcAv, srcBv, p2);
> + v3 = vec_perm(srcAv, srcBv, p3);
> + v4 = vec_perm(srcAv, srcBv, p4);
> + v5 = vec_perm(srcAv, srcBv, p5);
> + v6 = vec_perm(srcAv, srcBv, p6);
> + v7 = vec_perm(srcAv, srcBv, p7);
> + v8 = vec_perm(srcAv, srcBv, p8);
> + /*
> + * Split the 16 * uint_8 vectors into pairs of 8 * uint_16 vectors; we're
> + * going to do arithmetic that overflows a uint_8...
> + */
> + v1h = vec_mergeh(zero, srcAv); v1l = vec_mergel(zero, srcAv);
> + v2h = vec_mergeh(zero, v2); v2l = vec_mergel(zero, v2);
> + v3h = vec_mergeh(zero, v3); v3l = vec_mergel(zero, v3);
> + v4h = vec_mergeh(zero, v4); v4l = vec_mergel(zero, v4);
> + v5h = vec_mergeh(zero, v5); v5l = vec_mergel(zero, v5);
> + v6h = vec_mergeh(zero, v6); v6l = vec_mergel(zero, v6);
> + v7h = vec_mergeh(zero, v7); v7l = vec_mergel(zero, v7);
> + v8h = vec_mergeh(zero, v8); v8l = vec_mergel(zero, v8);
> +
> + /*
> + * Individually multiply each of the columns; then add together all the
> + * positive components, then all the negative components; finally subtract
> + * the negative from the positive. Since we're using saturating
> + * arithmetic, we don't need to worry about m > p
> + */
> +
> + /* Positive columns */
> + ph = vec_add(
> + vec_add(vec_mulo(v1h, twenty), vec_mulo(v2h, twenty)),
> + vec_add(vec_mulo(v5h, three), vec_mulo(v6h, three))
> + );
> + pl = vec_adds(
> + vec_adds(vec_mulo(v1l, twenty), vec_mulo(v2l, twenty)),
> + vec_adds(vec_mulo(v5l, three), vec_mulo(v6l, three)));
> +
> + /* Negative columns */
> + mh = vec_adds(
> + vec_add(vec_mulo(v3h, six), vec_mulo(v4h, six)),
> + vec_add(v7h, v8h)
> + );
> + ml = vec_adds(
> + vec_adds(vec_mulo(v3l, six), vec_mulo(v4l, six)),
> + vec_adds(v7l, v8l)
> + );
> +
> + /* Add the positive and negative components */
> + tmph = vec_subs(ph, mh);
> + tmpl = vec_subs(pl, ml);
> +
> + /*
> + * Finally do cm[a+16 >>5] and pack 16 uint_16s into 16 uint_8s.
> + * We don't need to worry about 16-bit overflow/underflow since
> + * the saturating arithmetic above did it for us.
> + * We do however need to worry about turning uint_16s >= 0x100
> + * into 0xff. Happily, vec_packsu does exactly this automatically.
> + */
> + tmph = vec_sra(vec_add(tmph, sixteen), five);
> + tmpl = vec_sra(vec_add(tmpl, sixteen), five);
> +
> + /* This store requires a 16-byte aligned dst! */
> + vec_st(vec_packsu(tmph, tmpl), 0, dst);
16 add
12 mul
2 shift
1 pack
maybe the following is faster (it replaces the 12 muls with 10 extra adds),
though maybe it's not, I don't know PPC well enough ...
a b c d e f g h
-1 3 -6 20 20 -6 3 -1
d+=e;
c+=f;
b+=g;
a+=h;
d+=d; (2 2)
d+=d; (4 4)
c-=d; (1 -4 -4 1)
b-=c+c; (1 -2 8 8 -2 1)
b+= b+b; (3 -6 24 24 -6 3)
b-= a; (-1 3 -6 24 24 -6 3 -1)
b-= d; (-1 3 -6 20 20 -6 3 -1)
b+= 16;
b>>= 5;
pack(b)
26 add
2 shift
1 pack
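For what it's worth, a small scalar C check (not AltiVec) of the algebra
above; note that the intermediates in the add-only sequence go negative, so a
vector version would need signed 16-bit arithmetic rather than the unsigned
saturating ops used in the patch:

/* scalar sketch verifying that the add-only sequence computes the same
 * -1 3 -6 20 20 -6 3 -1 filter as the multiply version; a..h are the
 * eight source columns, as in the table above */
#include <assert.h>
#include <stdlib.h>

static int filter_mul(int a, int b, int c, int d, int e, int f, int g, int h)
{
    return (d + e) * 20 - (c + f) * 6 + (b + g) * 3 - (a + h);
}

static int filter_add_only(int a, int b, int c, int d, int e, int f, int g, int h)
{
    d += e;                     /* ( 1  1) on d e                     */
    c += f;
    b += g;
    a += h;
    d += d;                     /* ( 2  2)                            */
    d += d;                     /* ( 4  4)                            */
    c -= d;                     /* ( 1 -4 -4  1) on c d e f           */
    b -= c + c;                 /* ( 1 -2  8  8 -2  1)                */
    b += b + b;                 /* ( 3 -6 24 24 -6  3)                */
    b -= a;                     /* (-1  3 -6 24 24 -6  3 -1)          */
    b -= d;                     /* (-1  3 -6 20 20 -6  3 -1)          */
    return b;                   /* caller still adds 16, shifts, packs */
}

int main(void)
{
    int t, j, v[8];
    srand(1);
    for (t = 0; t < 100000; t++) {
        for (j = 0; j < 8; j++)
            v[j] = rand() & 0xff;
        assert(filter_mul     (v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) ==
               filter_add_only(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]));
    }
    return 0;
}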
> +
> + dst+=dstStride;
> + src+=srcStride;
> + }
> +}
> +
> +static void put_mpeg4_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
> + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
> + int i;
> + const int w=16;
> + for(i=0; i<w; i++)
> + {
> + const int src0= src[0*srcStride];
> + const int src1= src[1*srcStride];
> + const int src2= src[2*srcStride];
> + const int src3= src[3*srcStride];
> + const int src4= src[4*srcStride];
> + const int src5= src[5*srcStride];
> + const int src6= src[6*srcStride];
> + const int src7= src[7*srcStride];
> + const int src8= src[8*srcStride];
> + const int src9= src[9*srcStride];
> + const int src10= src[10*srcStride];
> + const int src11= src[11*srcStride];
> + const int src12= src[12*srcStride];
> + const int src13= src[13*srcStride];
> + const int src14= src[14*srcStride];
> + const int src15= src[15*srcStride];
> + const int src16= src[16*srcStride];
> +
> + #define op_put(a, b) a = cm[((b) + 16)>>5]
> + op_put(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));
> + op_put(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));
> + op_put(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));
> + op_put(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));
> + op_put(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));
> + op_put(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));
> + op_put(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));
> + op_put(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));
> + op_put(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));
> + op_put(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));
> + op_put(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));
> + op_put(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));
> + op_put(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));
> + op_put(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));
> + op_put(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));
> + op_put(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));
> + #undef op_put
> + dst++;
> + src++;
this code too is more or less a duplicate of the C version
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In the past you could go to a library and read, borrow or copy any book
Today you'd get arrested for merely telling someone where the library is