[FFmpeg-devel] [PATCH 1/5] avutil: add pixelutils API

Sun Aug 3 00:36:19 CEST 2014

On Sat, Aug 02, 2014 at 11:34:07PM +0200, Clément Bœsch wrote:
> ---
>  configure                       |   2 +
>  doc/APIchanges                  |   3 +
>  libavutil/Makefile              |   3 +
>  libavutil/pixelutils.c          | 142 ++++++++++++++++++++++++++++++++++++
>  libavutil/pixelutils.h          |  52 ++++++++++++++
>  libavutil/version.h             |   2 +-
>  libavutil/x86/Makefile          |   4 ++
>  libavutil/x86/pixelutils.asm    | 155 ++++++++++++++++++++++++++++++++++++++++
>  libavutil/x86/pixelutils.h      |  26 +++++++
>  libavutil/x86/pixelutils_init.c |  58 +++++++++++++++
>  tests/fate/libavutil.mak        |   5 ++
>  tests/ref/fate/pixelutils       |  15 ++++
>  12 files changed, 466 insertions(+), 1 deletion(-)
>  create mode 100644 libavutil/pixelutils.c
>  create mode 100644 libavutil/pixelutils.h
>  create mode 100644 libavutil/x86/pixelutils.asm
>  create mode 100644 libavutil/x86/pixelutils.h
>  create mode 100644 libavutil/x86/pixelutils_init.c
>  create mode 100644 tests/ref/fate/pixelutils
> 
> diff --git a/configure b/configure
> index 9c3af50..57edd1d 100755
> --- a/configure
> +++ b/configure
> @@ -144,6 +144,7 @@ Component options:
>    --disable-mdct           disable MDCT code
>    --disable-rdft           disable RDFT code
>    --disable-fft            disable FFT code
> +  --disable-pixelutils     disable pixel utils in libavutil
>  
>  Hardware accelerators:
>    --disable-dxva2          disable DXVA2 code [autodetect]
> @@ -1451,6 +1452,7 @@ SUBSYSTEM_LIST="
>      lsp
>      lzo
>      mdct
> +    pixelutils
>      network
>      rdft
>  "
> diff --git a/doc/APIchanges b/doc/APIchanges
> index abca377..69ca682 100644
> --- a/doc/APIchanges
> +++ b/doc/APIchanges
> @@ -15,6 +15,9 @@ libavutil:     2012-10-22
>  
>  API changes, most recent first:
>  
> +2014-08-02 - xxxxxxx - lavu 52.95.100 - pixelutils.h
> +  Add pixelutils API with SAD functions
> +
>  2014-07-30 - ba3e331 - lavu 52.94.100 - frame.h
>    Add av_frame_side_data_name()
>  
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 91751dc..d57a741 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -44,6 +44,7 @@ HEADERS = adler32.h                                                     \
>            opt.h                                                         \
>            parseutils.h                                                  \
>            pixdesc.h                                                     \
> +          pixelutils.h                                                  \
>            pixfmt.h                                                      \
>            random_seed.h                                                 \
>            replaygain.h                                                  \
> @@ -113,6 +114,7 @@ OBJS = adler32.o                                                        \
>         opt.o                                                            \
>         parseutils.o                                                     \
>         pixdesc.o                                                        \
> +       pixelutils.o                                                     \
>         random_seed.o                                                    \
>         rational.o                                                       \
>         rc4.o                                                            \
> @@ -170,6 +172,7 @@ TESTPROGS = adler32                                                     \
>              pca                                                         \
>              parseutils                                                  \
>              pixdesc                                                     \
> +            pixelutils                                                  \
>              random_seed                                                 \
>              rational                                                    \
>              ripemd                                                      \
> diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
> new file mode 100644
> index 0000000..278aa80
> --- /dev/null
> +++ b/libavutil/pixelutils.c
> @@ -0,0 +1,142 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +#include "common.h"
> +#include "pixelutils.h"
> +
> +#if CONFIG_PIXELUTILS
> +
> +#include "x86/pixelutils.h"
> +
> +static av_always_inline int sad_wxh(const uint8_t *src1, ptrdiff_t stride1,
> +                                    const uint8_t *src2, ptrdiff_t stride2,
> +                                    int w, int h)
> +{
> +    int x, y, sum = 0;
> +
> +    for (y = 0; y < h; y++) {
> +        for (x = 0; x < w; x++)
> +            sum += abs(src1[x] - src2[x]);
> +        src1 += stride1;
> +        src2 += stride2;
> +    }
> +    return sum;
> +}
> +
> +#define DECLARE_BLOCK_FUNCTIONS(size)                                               \
> +static int block_sad_##size##x##size##_c(const uint8_t *src1, ptrdiff_t stride1,    \
> +                                         const uint8_t *src2, ptrdiff_t stride2)    \
> +{                                                                                   \
> +    return sad_wxh(src1, stride1, src2, stride2, size, size);                       \
> +}
> +
> +DECLARE_BLOCK_FUNCTIONS(2)
> +DECLARE_BLOCK_FUNCTIONS(4)
> +DECLARE_BLOCK_FUNCTIONS(8)
> +DECLARE_BLOCK_FUNCTIONS(16)
> +
> +static const av_pixelutils_sad_fn sad_c[] = {
> +    block_sad_2x2_c,
> +    block_sad_4x4_c,
> +    block_sad_8x8_c,
> +    block_sad_16x16_c,
> +};
> +
> +#endif /* CONFIG_PIXELUTILS */
> +
> +av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligned, void *log_ctx)
> +{
> +#if !CONFIG_PIXELUTILS
> +    av_log(log_ctx, AV_LOG_ERROR, "pixelutils support is required "
> +           "but libavutil is not compiled with it\n");
> +    return NULL;
> +#else
> +    av_pixelutils_sad_fn sad[FF_ARRAY_ELEMS(sad_c)];
> +
> +    memcpy(sad, sad_c, sizeof(sad));
> +
> +    if (w_bits < 1 || w_bits > FF_ARRAY_ELEMS(sad) ||
> +        h_bits < 1 || h_bits > FF_ARRAY_ELEMS(sad))
> +        return NULL;
> +    if (w_bits != h_bits) // only squared sad for now
> +        return NULL;
> +
> +#if ARCH_X86
> +    ff_pixelutils_sad_init_x86(sad, aligned);
> +#endif
> +
> +    return sad[w_bits - 1];
> +#endif
> +}
> +
> +#ifdef TEST
> +#define W1 320
> +#define H1 240
> +#define W2 640
> +#define H2 480
> +int main(void)
> +{
> +    int i, a, ret = 0;
> +    DECLARE_ALIGNED(32, uint32_t, buf1)[W1*H1];
> +    DECLARE_ALIGNED(32, uint32_t, buf2)[W2*H2];
> +    uint32_t state = 0;
> +
> +    for (i = 0; i < W1*H1; i++) {
> +        buf1[i] = state;
> +        state = state * 1664525 + 1013904223;
> +    }
> +
> +    for (i = 0; i < W2*H2; i++) {
> +        buf2[i] = state;
> +        state = state * 1664525 + 1013904223;
> +    }

the code should additionally be tested with the maximal and minimal
difference cases


[...]
> +;-------------------------------------------------------------------------------
> +; int ff_pixelutils_sad_[au]_16x16_sse(const uint8_t *src1, ptrdiff_t stride1,
> +;                                      const uint8_t *src2, ptrdiff_t stride2);
> +;-------------------------------------------------------------------------------
> +%macro SAD_XMM_16x16 1
> +INIT_XMM sse2
> +cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
> +    pxor        m2, m2
> +%rep 8
> +    mov%1       m0, [src2q]
> +    mov%1       m1, [src2q + stride2q]
> +    psadbw      m0, [src1q]
> +    psadbw      m1, [src1q + stride1q]
> +    paddw       m2, m0
> +    paddw       m2, m1
> +    lea         src1q, [src1q + 2*stride1q]
> +    lea         src2q, [src2q + 2*stride2q]
> +%endrep
> +    movhlps     m0, m2
> +    paddw       m2, m0
> +    movd        eax, m2
> +    RET
> +%endmacro

there are various improvements possible, though these should be in
a separate patch and not in the gcc->yasm one, but
the pxor can be avoided by lifting the first iteration out and
using m2 as destination

it might be faster to use 2 accumulator registers, as that way both
could execute with no dependencies on the other

as you unroll the loop, addressing can be done with fewer instructions

LGTM otherwise

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Democracy is the form of government in which you can choose your dictator
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140803/eb3db603/attachment.asc>