[FFmpeg-devel] [PATCH 1/5] avutil: add pixelutils API
Michael Niedermayer
michaelni at gmx.at
Sun Aug 3 00:36:19 CEST 2014
On Sat, Aug 02, 2014 at 11:34:07PM +0200, Clément Bœsch wrote:
> ---
> configure | 2 +
> doc/APIchanges | 3 +
> libavutil/Makefile | 3 +
> libavutil/pixelutils.c | 142 ++++++++++++++++++++++++++++++++++++
> libavutil/pixelutils.h | 52 ++++++++++++++
> libavutil/version.h | 2 +-
> libavutil/x86/Makefile | 4 ++
> libavutil/x86/pixelutils.asm | 155 ++++++++++++++++++++++++++++++++++++++++
> libavutil/x86/pixelutils.h | 26 +++++++
> libavutil/x86/pixelutils_init.c | 58 +++++++++++++++
> tests/fate/libavutil.mak | 5 ++
> tests/ref/fate/pixelutils | 15 ++++
> 12 files changed, 466 insertions(+), 1 deletion(-)
> create mode 100644 libavutil/pixelutils.c
> create mode 100644 libavutil/pixelutils.h
> create mode 100644 libavutil/x86/pixelutils.asm
> create mode 100644 libavutil/x86/pixelutils.h
> create mode 100644 libavutil/x86/pixelutils_init.c
> create mode 100644 tests/ref/fate/pixelutils
>
> diff --git a/configure b/configure
> index 9c3af50..57edd1d 100755
> --- a/configure
> +++ b/configure
> @@ -144,6 +144,7 @@ Component options:
> --disable-mdct disable MDCT code
> --disable-rdft disable RDFT code
> --disable-fft disable FFT code
> + --disable-pixelutils disable pixel utils in libavutil
>
> Hardware accelerators:
> --disable-dxva2 disable DXVA2 code [autodetect]
> @@ -1451,6 +1452,7 @@ SUBSYSTEM_LIST="
> lsp
> lzo
> mdct
> + pixelutils
> network
> rdft
> "
> diff --git a/doc/APIchanges b/doc/APIchanges
> index abca377..69ca682 100644
> --- a/doc/APIchanges
> +++ b/doc/APIchanges
> @@ -15,6 +15,9 @@ libavutil: 2012-10-22
>
> API changes, most recent first:
>
> +2014-08-02 - xxxxxxx - lavu 52.95.100 - pixelutils.h
> + Add pixelutils API with SAD functions
> +
> 2014-07-30 - ba3e331 - lavu 52.94.100 - frame.h
> Add av_frame_side_data_name()
>
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 91751dc..d57a741 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -44,6 +44,7 @@ HEADERS = adler32.h \
> opt.h \
> parseutils.h \
> pixdesc.h \
> + pixelutils.h \
> pixfmt.h \
> random_seed.h \
> replaygain.h \
> @@ -113,6 +114,7 @@ OBJS = adler32.o \
> opt.o \
> parseutils.o \
> pixdesc.o \
> + pixelutils.o \
> random_seed.o \
> rational.o \
> rc4.o \
> @@ -170,6 +172,7 @@ TESTPROGS = adler32 \
> pca \
> parseutils \
> pixdesc \
> + pixelutils \
> random_seed \
> rational \
> ripemd \
> diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
> new file mode 100644
> index 0000000..278aa80
> --- /dev/null
> +++ b/libavutil/pixelutils.c
> @@ -0,0 +1,142 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +#include "common.h"
> +#include "pixelutils.h"
> +
> +#if CONFIG_PIXELUTILS
> +
> +#include "x86/pixelutils.h"
> +
> +static av_always_inline int sad_wxh(const uint8_t *src1, ptrdiff_t stride1,
> + const uint8_t *src2, ptrdiff_t stride2,
> + int w, int h)
> +{
> + int x, y, sum = 0;
> +
> + for (y = 0; y < h; y++) {
> + for (x = 0; x < w; x++)
> + sum += abs(src1[x] - src2[x]);
> + src1 += stride1;
> + src2 += stride2;
> + }
> + return sum;
> +}
> +
> +#define DECLARE_BLOCK_FUNCTIONS(size) \
> +static int block_sad_##size##x##size##_c(const uint8_t *src1, ptrdiff_t stride1, \
> + const uint8_t *src2, ptrdiff_t stride2) \
> +{ \
> + return sad_wxh(src1, stride1, src2, stride2, size, size); \
> +}
> +
> +DECLARE_BLOCK_FUNCTIONS(2)
> +DECLARE_BLOCK_FUNCTIONS(4)
> +DECLARE_BLOCK_FUNCTIONS(8)
> +DECLARE_BLOCK_FUNCTIONS(16)
> +
> +static const av_pixelutils_sad_fn sad_c[] = {
> + block_sad_2x2_c,
> + block_sad_4x4_c,
> + block_sad_8x8_c,
> + block_sad_16x16_c,
> +};
> +
> +#endif /* CONFIG_PIXELUTILS */
> +
> +av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligned, void *log_ctx)
> +{
> +#if !CONFIG_PIXELUTILS
> + av_log(log_ctx, AV_LOG_ERROR, "pixelutils support is required "
> + "but libavutil is not compiled with it\n");
> + return NULL;
> +#else
> + av_pixelutils_sad_fn sad[FF_ARRAY_ELEMS(sad_c)];
> +
> + memcpy(sad, sad_c, sizeof(sad));
> +
> + if (w_bits < 1 || w_bits > FF_ARRAY_ELEMS(sad) ||
> + h_bits < 1 || h_bits > FF_ARRAY_ELEMS(sad))
> + return NULL;
> + if (w_bits != h_bits) // only squared sad for now
> + return NULL;
> +
> +#if ARCH_X86
> + ff_pixelutils_sad_init_x86(sad, aligned);
> +#endif
> +
> + return sad[w_bits - 1];
> +#endif
> +}
> +
> +#ifdef TEST
> +#define W1 320
> +#define H1 240
> +#define W2 640
> +#define H2 480
> +int main(void)
> +{
> + int i, a, ret = 0;
> + DECLARE_ALIGNED(32, uint32_t, buf1)[W1*H1];
> + DECLARE_ALIGNED(32, uint32_t, buf2)[W2*H2];
> + uint32_t state = 0;
> +
> + for (i = 0; i < W1*H1; i++) {
> + buf1[i] = state;
> + state = state * 1664525 + 1013904223;
> + }
> +
> + for (i = 0; i < W2*H2; i++) {
> + buf2[i] = state;
> + state = state * 1664525 + 1013904223;
> + }
The code should additionally be tested with the maximal and minimal
difference cases (e.g. all-0 against all-255 buffers, and identical buffers).
[...]
> +;-------------------------------------------------------------------------------
> +; int ff_pixelutils_sad_[au]_16x16_sse(const uint8_t *src1, ptrdiff_t stride1,
> +; const uint8_t *src2, ptrdiff_t stride2);
> +;-------------------------------------------------------------------------------
> +%macro SAD_XMM_16x16 1
> +INIT_XMM sse2
> +cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
> + pxor m2, m2
> +%rep 8
> + mov%1 m0, [src2q]
> + mov%1 m1, [src2q + stride2q]
> + psadbw m0, [src1q]
> + psadbw m1, [src1q + stride1q]
> + paddw m2, m0
> + paddw m2, m1
> + lea src1q, [src1q + 2*stride1q]
> + lea src2q, [src2q + 2*stride2q]
> +%endrep
> + movhlps m0, m2
> + paddw m2, m0
> + movd eax, m2
> + RET
> +%endmacro
There are various improvements possible, though these should be in
a separate patch and not in the gcc->yasm conversion itself:
the pxor can be avoided by lifting the first iteration out and
using m2 as destination
it might be faster to use 2 accumulator registers, as that way both
could execute with no dependencies on each other
as you unroll the loop, addressing can be done with fewer instructions
LGTM otherwise
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Democracy is the form of government in which you can choose your dictator
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140803/eb3db603/attachment.asc>
More information about the ffmpeg-devel
mailing list