[FFmpeg-devel] [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512()
Wu, Jianhua
jianhua.wu at intel.com
Mon Sep 6 05:10:54 EEST 2021
Ping.
> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu at intel.com>
> Sent: Friday, August 27, 2021 12:52 PM
> To: ffmpeg-devel at ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu at intel.com>
> Subject: [PATCH 1/4] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512()
>
> Performance (less is better):
> 8bit:
> ff_hflip_byte_ssse3 0.61
> ff_hflip_byte_avx2 0.37
> ff_hflip_byte_avx512 0.19
> 16bit:
> ff_hflip_short_ssse3 1.27
> ff_hflip_short_avx2 0.76
> ff_hflip_short_avx512 0.40
>
> Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
> ---
> libavfilter/x86/vf_hflip.asm | 23 ++++++++++++++++++-----
> libavfilter/x86/vf_hflip_init.c | 8 ++++++++
> 2 files changed, 26 insertions(+), 5 deletions(-)
>
> diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
> index 285618954f..c2237217f7 100644
> --- a/libavfilter/x86/vf_hflip.asm
> +++ b/libavfilter/x86/vf_hflip.asm
> @@ -26,12 +26,16 @@ SECTION_RODATA
>
> pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
> pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
>
> SECTION .text
>
> ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
> %macro HFLIP 3
> cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
> +%if mmsize == 64
> + movu m3, [pd_flip_indicies]
> +%endif
> VBROADCASTI128 m0, [pb_flip_%1]
> xor xq, xq
> %if %3 == 1
> @@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
>
> .loop0:
> neg xq
> -%if mmsize == 32
> - vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
> - vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
> +%if mmsize == 64
> + vpermd m1, m3, [srcq + xq - mmsize + %3]
> + vpermd m2, m3, [srcq + xq - 2 * mmsize + %3]
> +%elif mmsize == 32
> + vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
> + vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
> %else
> - movu m1, [srcq + xq - mmsize + %3]
> - movu m2, [srcq + xq - 2 * mmsize + %3]
> + movu m1, [srcq + xq - mmsize + %3]
> + movu m2, [srcq + xq - 2 * mmsize + %3]
> %endif
> pshufb m1, m0
> pshufb m2, m0
> @@ -88,3 +95,9 @@ INIT_YMM avx2
> HFLIP byte, b, 1
> HFLIP short, w, 2
> %endif
> +
> +%if HAVE_AVX512_EXTERNAL
> +INIT_ZMM avx512
> +HFLIP byte, b, 1
> +HFLIP short, w, 2
> +%endif
> diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
> index 0ac399b0d4..25fc40f7b0 100644
> --- a/libavfilter/x86/vf_hflip_init.c
> +++ b/libavfilter/x86/vf_hflip_init.c
> @@ -25,8 +25,10 @@
>
> void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
> void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
> +void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
> void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
> void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
> +void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
>
> av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
> {
> @@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
> if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> s->flip_line[i] = ff_hflip_byte_avx2;
> }
> + if (EXTERNAL_AVX512(cpu_flags)) {
> + s->flip_line[i] = ff_hflip_byte_avx512;
> + }
> } else if (step[i] == 2) {
> if (EXTERNAL_SSSE3(cpu_flags)) {
> s->flip_line[i] = ff_hflip_short_ssse3;
> @@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
> if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> s->flip_line[i] = ff_hflip_short_avx2;
> }
> + if (EXTERNAL_AVX512(cpu_flags)) {
> + s->flip_line[i] = ff_hflip_short_avx512;
> + }
> }
> }
> }
> --
> 2.17.1
More information about the ffmpeg-devel mailing list