[FFmpeg-devel] [PATCH v2 3/7] avcodec/aarch64/mpegvideoencdsp: add neon implementations for pix_sum and pix_norm1
Martin Storsjö
martin at martin.st
Wed Aug 21 19:24:35 EEST 2024
On Wed, 21 Aug 2024, Ramiro Polla wrote:
> A55 A76
> pix_norm1_c: 484.3 235.2
> pix_norm1_neon: 193.8 ( 2.50x) 44.7 ( 5.26x)
> pix_sum_c: 302.8 243.7
> pix_sum_neon: 81.6 ( 3.71x) 26.0 ( 9.37x)
> ---
> libavcodec/aarch64/Makefile | 2 +
> libavcodec/aarch64/mpegvideoencdsp_init.c | 39 +++++++++++++
> libavcodec/aarch64/mpegvideoencdsp_neon.S | 69 +++++++++++++++++++++++
> libavcodec/mpegvideoencdsp.c | 4 +-
> libavcodec/mpegvideoencdsp.h | 2 +
> 5 files changed, 115 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/mpegvideoencdsp_init.c
> create mode 100644 libavcodec/aarch64/mpegvideoencdsp_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index a3256bb1cc..de0653ebbc 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -10,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
> OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
> OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
> OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
> +OBJS-$(CONFIG_MPEGVIDEOENC) += aarch64/mpegvideoencdsp_init.o
> OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
> OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
> OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
> @@ -51,6 +52,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
> aarch64/simple_idct_neon.o
> NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
> NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
> +NEON-OBJS-$(CONFIG_MPEGVIDEOENC) += aarch64/mpegvideoencdsp_neon.o
> NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
> NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
> NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
> diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
> new file mode 100644
> index 0000000000..7eb632ed1b
> --- /dev/null
> +++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
> @@ -0,0 +1,39 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/aarch64/cpu.h"
> +#include "libavcodec/mpegvideoencdsp.h"
> +#include "config.h"
> +
> +int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
> +int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
> +
> +av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
> + AVCodecContext *avctx)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (have_neon(cpu_flags)) {
> + c->pix_sum = ff_pix_sum16_neon;
> + c->pix_norm1 = ff_pix_norm1_neon;
> + }
> +}
> diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> new file mode 100644
> index 0000000000..6e7a9319ba
> --- /dev/null
> +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> @@ -0,0 +1,69 @@
> +/*
> + * Copyright (c) 2024 Ramiro Polla
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +function ff_pix_sum16_neon, export=1
> +// x0 const uint8_t *pix
> +// x1 int line_size
> +
> + add x2, x0, w1, sxtw
> + sbfiz x1, x1, #1, #32
BTW, this instruction is fairly exotic and the documentation isn't very
clear, so it would be good to verify by hand that it really does what we
want, both for negative numbers and for numbers close to the ends of the
value range; I haven't done that check myself yet.
> + movi v0.16b, #0
> + mov w3, #16
> +
> +1:
> + ld1 {v1.16b}, [x0], x1
> + ld1 {v2.16b}, [x2], x1
> + subs w3, w3, #2
> + uadalp v0.8h, v1.16b
> + uadalp v0.8h, v2.16b
> + b.ne 1b
> +
> + uaddlv s0, v0.8h
> + fmov w0, s0
> +
> + ret
> +endfunc
> +
> +function ff_pix_norm1_neon, export=1
> +// x0 const uint8_t *pix
> +// x1 int line_size
> +
> + sxtw x1, w1
> + movi v4.16b, #0
> + movi v5.16b, #0
> + mov w2, #16
> +
> +1:
> + ld1 {v1.16b}, [x0], x1
> + subs w2, w2, #1
> + umull v2.8h, v1.8b, v1.8b
> + umull2 v3.8h, v1.16b, v1.16b
> + uadalp v4.4s, v2.8h
> + uadalp v5.4s, v3.8h
From my earlier testing on A53, it seemed (surprisingly) to be equally
fast to accumulate into the same register for both instructions - but I
only tested that on A53. So we could change that here, getting rid of the
add at the end (and one movi). Or, if using separate accumulators does help
on some other core, perhaps we should do the same for the function above too?
Anyway, LGTM overall.
// Martin
More information about the ffmpeg-devel
mailing list