[FFmpeg-devel] [PATCH] avcodec/vp9: avx2 implementation of ipred_dl_16x16_16

Ronald S. Bultje rsbultje at gmail.com
Mon Mar 20 15:53:37 EET 2017


Hi,

On Sun, Mar 12, 2017 at 6:06 PM, Ilia <zakne0ne at gmail.com> wrote:

> vp9_diag_downleft_16x16_10bpp_c: 263.0
> vp9_diag_downleft_16x16_10bpp_sse2: 44.7
> vp9_diag_downleft_16x16_10bpp_ssse3: 32.5
> vp9_diag_downleft_16x16_10bpp_avx: 31.9
> vp9_diag_downleft_16x16_10bpp_avx2: 25.7
> vp9_diag_downleft_16x16_12bpp_c: 264.7
> vp9_diag_downleft_16x16_12bpp_sse2: 44.4
> vp9_diag_downleft_16x16_12bpp_ssse3: 32.0
> vp9_diag_downleft_16x16_12bpp_avx: 32.4
> vp9_diag_downleft_16x16_12bpp_avx2: 25.5
>
> Benchmarked with 10000 runs
>
> Signed-off-by: Ilia <zakne0ne at gmail.com>
> ---
>  libavcodec/x86/vp9dsp_init_16bpp.c    |  2 ++
>  libavcodec/x86/vp9intrapred_16bpp.asm | 39 +++++++++++++++++++++++++++++++++++
>  2 files changed, 41 insertions(+)
>
> diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
> index eb67499..4576ff1 100644
> --- a/libavcodec/x86/vp9dsp_init_16bpp.c
> +++ b/libavcodec/x86/vp9dsp_init_16bpp.c
> @@ -51,6 +51,7 @@ decl_ipred_fns(h,       16, mmxext, sse2);
>  decl_ipred_fns(dc,      16, mmxext, sse2);
>  decl_ipred_fns(dc_top,  16, mmxext, sse2);
>  decl_ipred_fns(dc_left, 16, mmxext, sse2);
> +decl_ipred_fn(dl,       16,     16, avx2);
>
>  #define decl_ipred_dir_funcs(type) \
>  decl_ipred_fns(type, 16, sse2,  sse2); \
> @@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
>          init_fpel_func(2, 1,  32, avg, _16, avx2);
>          init_fpel_func(1, 1,  64, avg, _16, avx2);
>          init_fpel_func(0, 1, 128, avg, _16, avx2);
> +        init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
>      }
>
>  #endif /* HAVE_YASM */
> diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm
> index c0ac16d..212e413 100644
> --- a/libavcodec/x86/vp9intrapred_16bpp.asm
> +++ b/libavcodec/x86/vp9intrapred_16bpp.asm
> @@ -847,6 +847,45 @@ DL_FUNCS
>  INIT_XMM avx
>  DL_FUNCS
>
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
> +    movifnidn               aq, amp
> +    mova                    m0, [aq]                   ; abcdefghijklmnop
> +    vpbroadcastw           xm1, [aq+30]                ; pppppppp
> +    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
> +    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
> +    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
> +    LOWPASS                  0,  3,  4                 ; BCDEFGHIJKLMNOPp
> +    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
> +    DEFINE_ARGS dst, stride, stride3, cnt
> +    mov                   cntd, 2
> +    lea               stride3q, [strideq*3]
> +.loop:
> +    mova      [dstq+strideq*0], m0
> +    vpalignr                m3, m2, m0, 2
> +    vpalignr                m4, m2, m0, 4
> +    mova      [dstq+strideq*1], m3
> +    mova      [dstq+strideq*2], m4
> +    vpalignr                m3, m2, m0, 6
> +    vpalignr                m4, m2, m0, 8
> +    mova      [dstq+stride3q ], m3
> +    lea                   dstq, [dstq+strideq*4]
> +    mova      [dstq+strideq*0], m4
> +    vpalignr                m3, m2, m0, 10
> +    vpalignr                m4, m2, m0, 12
> +    mova      [dstq+strideq*1], m3
> +    mova      [dstq+strideq*2], m4
> +    vpalignr                m3, m2, m0, 14
> +    mova      [dstq+stride3q ], m3
> +    lea                   dstq, [dstq+strideq*4]
> +    mova                    m0, m2
> +    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
> +    dec                   cntd
> +    jg .loop
> +    RET
> +%endif
> +
>  %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
>  cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
>      movh                    m0, [lq]                ; wxyz....
> --
> 2.8.3


Pushed.

Ronald


More information about the ffmpeg-devel mailing list