[FFmpeg-devel] [RFC] DXVA2 decoding and FFmpeg

James Almer jamrial at gmail.com
Thu May 28 23:02:34 CEST 2015


On 28/05/15 2:39 PM, Stefano Sabatini wrote:
> From f3b4e77dd9dd299aba8f4fa83625d2b61b243c3c Mon Sep 17 00:00:00 2001
> From: Stefano Sabatini <stefasab at gmail.com>
> Date: Fri, 15 May 2015 18:58:17 +0200
> Subject: [PATCH] lavu/imgutils: add av_image_copy_plane_from_uswc() function.
> 
> This function adds support for optimized GPU to CPU copies.
> 
> Based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar
> <fenrir at videolan.org>.
> 
> TODO: fix integration with the build system, bump micro
> 
> Signed-off-by: Stefano Sabatini <stefasab at gmail.com>
> ---
>  libavutil/imgutils.c          |  14 ++++++
>  libavutil/imgutils.h          |  18 +++++++
>  libavutil/imgutils_internal.h |  29 +++++++++++
>  libavutil/x86/Makefile        |   1 +
>  libavutil/x86/imgutils.c      | 109 ++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 171 insertions(+)
>  create mode 100644 libavutil/imgutils_internal.h
>  create mode 100644 libavutil/x86/imgutils.c
> 
> diff --git a/libavutil/imgutils.c b/libavutil/imgutils.c
> index ef0e671..e538c75 100644
> --- a/libavutil/imgutils.c
> +++ b/libavutil/imgutils.c
> @@ -30,6 +30,7 @@
>  #include "mathematics.h"
>  #include "pixdesc.h"
>  #include "rational.h"
> +#include "imgutils_internal.h"
>  
>  void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4],
>                                  const AVPixFmtDescriptor *pixdesc)
> @@ -405,3 +406,16 @@ int av_image_copy_to_buffer(uint8_t *dst, int dst_size,
>  
>      return size;
>  }
> +
> +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize,
> +				   const uint8_t *src, size_t src_linesize,
> +				   unsigned bytewidth, unsigned height,
> +				   unsigned cpu_flags)
> +{
> +#ifndef HAVE_SSSE3

All HAVE_* macros are always defined, to either 0 or 1, so "#ifndef HAVE_SSSE3" is
never true. Test the value instead, e.g. "#if !HAVE_SSSE3".

In any case, this kind of check does not belong outside of the arch folders. You should
check for ARCH_X86 before calling functions in the x86/ folder. See libavcodec and
libavfilter for examples.

> +    av_unused(cpu_flags);
> +    av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
> +#else
> +    ff_image_copy_plane_from_uswc_x86(dst, dst_linesize, src, src_linesize, bytewidth, height, cpu_flags);
> +#endif
> +}
> diff --git a/libavutil/imgutils.h b/libavutil/imgutils.h
> index 23282a3..82c3826 100644
> --- a/libavutil/imgutils.h
> +++ b/libavutil/imgutils.h
> @@ -111,6 +111,24 @@ void av_image_copy_plane(uint8_t       *dst, int dst_linesize,
>                           int bytewidth, int height);
>  
>  /**
> + * Copy image plane from src to dst, similar to av_image_copy_plane().
> + * src must be an USWC buffer.
> + * It performs optimized copy from "Uncacheable Speculative Write
> + * Combining" memory as used by some video surface.
> + * It is really efficient only when SSE4.1 is available.
> + *
> + * In case the target CPU does not support USWC caching this function
> + * will be equivalent to av_image_copy_plane().
> + *
> + * @param cpu_flags as returned by av_get_cpu_flags()
> + * @see av_image_copy_plane()
> + */
> +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize,
> +                                   const uint8_t *src, size_t src_linesize,
> +                                   unsigned bytewidth, unsigned height,
> +                                   unsigned cpu_flags);
> +
> +/**
>   * Copy image in src_data to dst_data.
>   *
>   * @param dst_linesizes linesizes for the image in dst_data
> diff --git a/libavutil/imgutils_internal.h b/libavutil/imgutils_internal.h
> new file mode 100644
> index 0000000..16ed977
> --- /dev/null
> +++ b/libavutil/imgutils_internal.h
> @@ -0,0 +1,29 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_IMGUTILS_INTERNAL_H
> +#define AVUTIL_IMGUTILS_INTERNAL_H
> +
> +#include "imgutils.h"
> +
> +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize,
> +				       const uint8_t *src, size_t src_linesize,
> +				       unsigned bytewidth, unsigned height,
> +				       unsigned cpu_flags);
> +
> +#endif /* AVUTIL_IMGUTILS_INTERNAL_H */
> diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
> index eb70a62..a719c00 100644
> --- a/libavutil/x86/Makefile
> +++ b/libavutil/x86/Makefile
> @@ -1,5 +1,6 @@
>  OBJS += x86/cpu.o                                                       \
>          x86/float_dsp_init.o                                            \
> +        x86/imgutils.o                                                  \
>          x86/lls_init.o                                                  \
>  
>  OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o                      \
> diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c
> new file mode 100644
> index 0000000..91c7a42
> --- /dev/null
> +++ b/libavutil/x86/imgutils.c
> @@ -0,0 +1,109 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <inttypes.h>
> +#include "config.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/avassert.h"
> +#include "libavutil/intreadwrite.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/pixdesc.h"
> +
> +#include "libavutil/avassert.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/imgutils.h"
> +#include "libavutil/imgutils_internal.h"
> +
> +#ifdef HAVE_SSE2
> +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
> + * load and storing data with the SSE>=2 instruction store.
> + */
> +#define COPY16(dstp, srcp, load, store) \
> +    __asm__ volatile (                      \
> +        load "  0(%[src]), %%xmm1\n"    \
> +        store " %%xmm1,    0(%[dst])\n" \
> +        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
> +
> +#define COPY64(dstp, srcp, load, store) \
> +    __asm__ volatile (                      \
> +        load "  0(%[src]), %%xmm1\n"    \
> +        load " 16(%[src]), %%xmm2\n"    \
> +        load " 32(%[src]), %%xmm3\n"    \
> +        load " 48(%[src]), %%xmm4\n"    \
> +        store " %%xmm1,    0(%[dst])\n" \
> +        store " %%xmm2,   16(%[dst])\n" \
> +        store " %%xmm3,   32(%[dst])\n" \
> +        store " %%xmm4,   48(%[dst])\n" \
> +        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
> +#endif

As already mentioned, this should be done in nasm/yasm syntax.
Also, is there any reason you're not using more xmm registers to reduce the number of
loop iterations? Or you could just unroll things a bit, even if you still use only four
registers (you skipped xmm0, for that matter).

> +
> +#define AV_CPU_SSE4()  ((cpu_flags & AV_CPU_FLAG_SSE4)  != 0)

"!= 0" is unnecessary. And you could use the helper macros from libavutil/x86/cpu.h.

> +#define AV_CPU_SSSE3() ((cpu_flags & AV_CPU_FLAG_SSSE3) != 0)
> +#define AV_CPU_SSE2()  ((cpu_flags & AV_CPU_FLAG_SSE2)  != 0)
> +
> +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize,
> +				       const uint8_t *src, size_t src_linesize,
> +				       unsigned bytewidth, unsigned height,
> +				       unsigned cpu_flags)
> +{
> +#ifndef HAVE_SSSE3
> +    av_unused(cpu_flags);
> +    return av_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height);
> +#endif
> +
> +    av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0);
> +
> +    __asm__ volatile ("mfence");
> +
> +    for (unsigned y = 0; y < height; y++) {
> +        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
> +        unsigned x = unaligned;
> +
> +#ifdef HAVE_SSE42
> +        if (AV_CPU_SSE4()) {
> +            if (!unaligned) {
> +                for (; x+63 < bytewidth; x += 64)
> +                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
> +            } else {
> +                COPY16(dst, src, "movdqu", "movdqa");
> +                for (; x+63 < bytewidth; x += 64)
> +                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
> +            }
> +        } else
> +#endif
> +        {
> +            if (!unaligned) {
> +                for (; x+63 < bytewidth; x += 64)
> +                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
> +            } else {
> +                COPY16(dst, src, "movdqu", "movdqa");
> +                for (; x+63 < bytewidth; x += 64)
> +                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
> +            }
> +        }
> +
> +        for (; x < bytewidth; x++)
> +            dst[x] = src[x];
> +
> +        src += src_linesize;
> +        dst += dst_linesize;
> +    }
> +    __asm__ volatile ("mfence");
> +}
> -- 1.9.1



More information about the ffmpeg-devel mailing list