[FFmpeg-devel] [patch 1/3]Fix bug for POWER LE:/libavcodec/ppc/me_cmp.c

Tue Nov 4 12:22:49 CET 2014

On Tue, Nov 04, 2014 at 03:05:24PM +0800, rongyan wrote:
> Hi,
> There are 3 patches to fix bugs for POWER8 little endian. I will send 3 patches in 3 different email. This is the first, functions 
>     hadamard8_diff8x8_altivec()    hadamard8_diff16x8_altivec()
>     sad16_x2_altivec()
>     sad16_y2_altivec()
>     sad16_xy2_altivec()
>     sad16_altivec()
>     sad8_altivec()
>     sse16_altivec()
>     sse8_altivec()
> ‍ are fixed.
>  The fate test result after merge these 3 patches can be found on website by searching "ibmcrl", also attached in the below to facilitate the review. The passed test cases change from  1679/2182 to 2202/2235.
>  ‍
>  Thanks for review and any comments please feel free to let me know.
>   
>  Rong Yan
>   ------------------
>   The world has enough for everyone's need, but not enough for everyone's greed.

>  me_cmp.c |  497 ++++++++++++++++++++++++---------------------------------------
>  1 file changed, 196 insertions(+), 301 deletions(-)
> d18753b0bc6ac53409a3070a61795e079282b6ee  0001-libavcodec-ppc-me_cmp.c-fix-hadamard8_diff8x8_altive.patch
> From 40e4b6d2871772e17c86986cfeca16dcdb24ded8 Mon Sep 17 00:00:00 2001
> From: Rong Yan <rongyan236 at gmail.com>
> Date: Tue, 4 Nov 2014 06:23:11 +0000
> Subject: [PATCH 1/3] libavcodec/ppc/me_cmp.c : fix hadamard8_diff8x8_altivec()
>  hadamard8_diff16x8_altivec() sad16_x2_altivec() sad16_y2_altivec()
>  sad16_xy2_altivec() sad16_altivec() sad8_altivec() sse16_altivec()
>  sse8_altivec() for POWER LE
> 
> ---
>  libavcodec/ppc/me_cmp.c | 497 +++++++++++++++++++-----------------------------
>  1 file changed, 196 insertions(+), 301 deletions(-)
> 
> diff --git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c
> index 8ff8193..133c3bb 100644
> --- a/libavcodec/ppc/me_cmp.c
> +++ b/libavcodec/ppc/me_cmp.c
> @@ -35,64 +35,39 @@
>  #include "libavcodec/me_cmp.h"
>  
>  #if HAVE_ALTIVEC
> -#if HAVE_VSX
> -static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
> -                                                        int line_size, int h)
> -{
> -    int i, s = 0;
> -    const vector unsigned char zero =
> -        (const vector unsigned char) vec_splat_u8(0);
> -    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
> -    vector signed int sumdiffs;
> -
> -    for (i = 0; i < h; i++) {
> -        /* Read unaligned pixels into our vectors. The vectors are as follows:
> -         * pix1v: pix1[0] - pix1[15]
> -         * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
> -        vector unsigned char pix1v  = vec_vsx_ld(0,  pix1);
> -        vector unsigned char pix2v  = vec_vsx_ld(0,  pix2);
> -        vector unsigned char pix2iv = vec_vsx_ld(1,  pix2);
> -
> -        /* Calculate the average vector. */
> -        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
> -
> -        /* Calculate a sum of abs differences vector. */
> -        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
> -                                          vec_min(pix1v, avgv));
> -
> -        /* Add each 4 pixel group together and put 4 results into sad. */
> -        sad = vec_sum4s(t5, sad);
> -
> -        pix1 += line_size;
> -        pix2 += line_size;
> -    }
> -    /* Sum up the four partial sums, and put the result into s. */
> -    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
> -    sumdiffs = vec_splat(sumdiffs, 3);
> -    vec_vsx_st(sumdiffs, 0, &s);
> -    return s;
> -}
> -#else
>  static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>                              int line_size, int h)
>  {
>      int i, s = 0;
> +#if HAVE_BIGENDIAN
>      const vector unsigned char zero =
>          (const vector unsigned char) vec_splat_u8(0);
>      vector unsigned char perm1 = vec_lvsl(0, pix2);
>      vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
>      vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
>      vector signed int sumdiffs;
> +#else /* HAVE_BIGENDIAN */
> +    const vector unsigned char __attribute__((aligned(16))) zero =
> +        (const vector unsigned char) vec_splat_u8(0);
> +    vector unsigned int __attribute__((aligned(16))) sad = (vector unsigned int) vec_splat_u32(0);
> +    vector signed int __attribute__((aligned(16))) sumdiffs;
> +#endif /* HAVE_BIGENDIAN */

this is much better!

but please use

#if HAVE_BIGENDIAN
#define DECLARE_ALIGNED_LE(n,t,v) t v
#else
#define DECLARE_ALIGNED_LE(n,t,v) DECLARE_ALIGNED(n,t,v)
#endif

...

    DECLARE_ALIGNED_LE(16, const vector unsigned char, zero) =
        (const vector unsigned char) vec_splat_u8(0);
#if HAVE_BIGENDIAN
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
#endif /* HAVE_BIGENDIAN */
    DECLARE_ALIGNED_LE(16, vector unsigned int, sad) = (vector unsigned int) vec_splat_u32(0);
    DECLARE_ALIGNED_LE(16, vector signed int, sumdiffs);

>  
>      for (i = 0; i < h; i++) {
>          /* Read unaligned pixels into our vectors. The vectors are as follows:
>           * pix1v: pix1[0] - pix1[15]
>           * pix2v: pix2[0] - pix2[15]      pix2iv: pix2[1] - pix2[16] */
> +#if HAVE_BIGENDIAN
>          vector unsigned char pix1v  = vec_ld(0,  pix1);
>          vector unsigned char pix2l  = vec_ld(0,  pix2);
>          vector unsigned char pix2r  = vec_ld(16, pix2);
>          vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
>          vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
> +#else /* HAVE_BIGENDIAN */
> +        vector unsigned char pix1v  = vec_vsx_ld(0,  pix1);
> +        vector unsigned char pix2v  = vec_vsx_ld(0,  pix2);
> +        vector unsigned char pix2iv = vec_vsx_ld(1,  pix2);
> +#endif /* HAVE_BIGENDIAN */
>  
>          /* Calculate the average vector. */
>          vector unsigned char avgv = vec_avg(pix2v, pix2iv);

please use:

#if HAVE_BIGENDIAN
#define VEC_LD(a, b) vec_ld(a, b)
#else
#define VEC_LD(a, b) vec_vsx_ld(a, b)
#endif

...

        vector unsigned char pix1v  = VEC_LD(0,  pix1);
        vector unsigned char pix2l  = VEC_LD(0,  pix2);
        vector unsigned char pix2r  = VEC_LD(16, pix2);
#if HAVE_BIGENDIAN
        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
#endif /* HAVE_BIGENDIAN */

[...]

> +#if HAVE_BIGENDIAN
>          pix1v  = vec_ld(0, pix1);
>  
>          pix2l  = vec_ld(0, pix3);
> @@ -373,6 +262,16 @@ static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
>          pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
>          pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
>          pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
> +#else /* HAVE_BIGENDIAN */
> +        pix1v  = vec_vsx_ld(0, pix1);
> +        pix3v  = vec_vsx_ld(0, pix3);
> +        pix3iv = vec_vsx_ld(1, pix3);
> +
> +        pix3hv  = (vector unsigned short) vec_mergeh(pix3v, zero);
> +        pix3lv  = (vector unsigned short) vec_mergel(pix3v, zero);
> +        pix3ihv = (vector unsigned short) vec_mergeh(pix3iv, zero);
> +        pix3ilv = (vector unsigned short) vec_mergel(pix3iv, zero);
> +#endif /* HAVE_BIGENDIAN */

please use:

#if HAVE_BIGENDIAN
#define VEC_MERGEH(a, b) vec_mergeh(a,b)
#define VEC_MERGEL(a, b) vec_mergel(a,b)
#else
#define VEC_MERGEH(a, b) vec_mergeh(b,a)
#define VEC_MERGEL(a, b) vec_mergel(b,a)
#endif

...

       pix3hv  = (vector unsigned short) VEC_MERGEH(pix3v, zero);
       pix3lv  = (vector unsigned short) VEC_MERGEL(pix3v, zero);
       pix3ihv = (vector unsigned short) VEC_MERGEH(pix3iv, zero);
       pix3ilv = (vector unsigned short) VEC_MERGEL(pix3iv, zero);

[...]

> +#if HAVE_BIGENDIAN
>      vec_ste(sumsqr, 0, &s);
> +#else /* HAVE_BIGENDIAN */
> +    vec_vsx_st(sumsqr, 0, &s);
> +#endif /* HAVE_BIGENDIAN */

please use:

#if HAVE_BIGENDIAN
#define VEC_STE(a,b,c) vec_ste(a,b,c)
#else
#define VEC_STE(a,b,c) vec_vsx_st(a,b,c)
#endif

...

VEC_STE(sumsqr, 0, &s);

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Opposition brings concord. Out of discord comes the fairest harmony.
-- Heraclitus
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20141104/8c1dc291/attachment.asc>