[FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
James Almer
jamrial at gmail.com
Thu Dec 7 22:52:55 EET 2017
On 12/7/2017 5:10 PM, Martin Vignali wrote:
> 2017-12-03 21:28 GMT+01:00 Martin Vignali <martin.vignali at gmail.com>:
>
>>
>>
>> 2017-12-03 21:15 GMT+01:00 James Darnley <james.darnley at gmail.com>:
>>
>>> On 2017-12-03 19:30, Martin Vignali wrote:
>>>> libavfilter/x86/vf_threshold.asm | 19 ++++++++++++++-----
>>>> libavfilter/x86/vf_threshold_init.c | 34
>>> ++++++++++++++++++++--------------
>>>> 2 files changed, 34 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/libavfilter/x86/vf_threshold.asm
>>> b/libavfilter/x86/vf_threshold.asm
>>>> index fb008c376a..7b929c6bd2 100644
>>>> --- a/libavfilter/x86/vf_threshold.asm
>>>> +++ b/libavfilter/x86/vf_threshold.asm
>>>> @@ -27,14 +27,21 @@
>>>> SECTION_RODATA
>>>>
>>>> pb_128: times 16 db 128
>>>> +pb_128_0 : times 16 dw 32768
>>>
>>> No. Please use db and the values you want.
>>>
>>> I assume this is supposed to be "times 8 db 0, 128".
>>
>>
>>
> Hello,
>
> New patch attached (you're right, it's "times 8 db 0, 128").
>
>
> Martin
> From ac91cb26724b6e8fe294e0bf9ad2dd17fe0eada9 Mon Sep 17 00:00:00 2001
> From: Martin Vignali <martin.vignali at gmail.com>
> Date: Thu, 7 Dec 2017 21:06:43 +0100
> Subject: [PATCH 1/2] avfilter/x86/vf_threshold : add threshold16 SIMD (SSE4
> and AVX2)
>
> ---
> libavfilter/x86/vf_threshold.asm | 19 +++++++++++++------
> libavfilter/x86/vf_threshold_init.c | 34 ++++++++++++++++++++--------------
> 2 files changed, 33 insertions(+), 20 deletions(-)
>
> diff --git a/libavfilter/x86/vf_threshold.asm b/libavfilter/x86/vf_threshold.asm
> index 56a6c242d8..dc42cd4971 100644
> --- a/libavfilter/x86/vf_threshold.asm
> +++ b/libavfilter/x86/vf_threshold.asm
> @@ -25,12 +25,14 @@
> SECTION_RODATA
>
> pb_128: times 16 db 128
> +pb_128_0 : times 8 db 0, 128
>
> SECTION .text
>
> -%macro THRESHOLD_8 0
> +;%1 depth (8 or 16) ; %2 b or w ; %3 constant
> +%macro THRESHOLD 3
> %if ARCH_X86_64
> -cglobal threshold8, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x
> +cglobal threshold%1, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x
> mov wd, dword wm
> mov hd, dword hm
> %else
You should also change the cglobal line for x86_32, right below this %else: it still declares threshold8 and needs to become threshold%1 as well.
> @@ -43,7 +45,10 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
> %define olinesizeq r9mp
> %define hd r11mp
> %endif
> - VBROADCASTI128 m4, [pb_128]
> + VBROADCASTI128 m4, [%3]
> +%if %1 == 16
> + add wq, wq ; w *= 2 (16 bits instead of 8)
> +%endif
> add inq, wq
> add thresholdq, wq
> add minq, wq
> @@ -60,7 +65,7 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
> movu m3, [maxq + xq]
> pxor m0, m4
> pxor m1, m4
> - pcmpgtb m0, m1
> + pcmpgt%2 m0, m1
> PBLENDVB m3, m2, m0
> movu [outq + xq], m3
> add xq, mmsize
> @@ -77,9 +82,11 @@ RET
> %endmacro
>
> INIT_XMM sse4
> -THRESHOLD_8
> +THRESHOLD 8, b, pb_128
> +THRESHOLD 16, w, pb_128_0
>
> %if HAVE_AVX2_EXTERNAL
> INIT_YMM avx2
> -THRESHOLD_8
> +THRESHOLD 8, b, pb_128
> +THRESHOLD 16, w, pb_128_0
> %endif
> diff --git a/libavfilter/x86/vf_threshold_init.c b/libavfilter/x86/vf_threshold_init.c
> index db0559533d..8e42296791 100644
> --- a/libavfilter/x86/vf_threshold_init.c
> +++ b/libavfilter/x86/vf_threshold_init.c
> @@ -23,20 +23,19 @@
> #include "libavutil/x86/cpu.h"
> #include "libavfilter/threshold.h"
>
> -void ff_threshold8_sse4(const uint8_t *in, const uint8_t *threshold,
> - const uint8_t *min, const uint8_t *max,
> - uint8_t *out,
> - ptrdiff_t ilinesize, ptrdiff_t tlinesize,
> - ptrdiff_t flinesize, ptrdiff_t slinesize,
> - ptrdiff_t olinesize,
> - int w, int h);
> -void ff_threshold8_avx2(const uint8_t *in, const uint8_t *threshold,
> - const uint8_t *min, const uint8_t *max,
> - uint8_t *out,
> - ptrdiff_t ilinesize, ptrdiff_t tlinesize,
> - ptrdiff_t flinesize, ptrdiff_t slinesize,
> - ptrdiff_t olinesize,
> - int w, int h);
> +#define THRESHOLD_FUNC(depth, opt) \
> +void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\
> + const uint8_t *min, const uint8_t *max, \
> + uint8_t *out, \
> + ptrdiff_t ilinesize, ptrdiff_t tlinesize, \
> + ptrdiff_t flinesize, ptrdiff_t slinesize, \
> + ptrdiff_t olinesize, \
> + int w, int h);
> +
> +THRESHOLD_FUNC(8, sse4)
> +THRESHOLD_FUNC(8, avx2)
> +THRESHOLD_FUNC(16, sse4)
> +THRESHOLD_FUNC(16, avx2)
>
> av_cold void ff_threshold_init_x86(ThresholdContext *s)
> {
> @@ -49,5 +48,12 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s)
> if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> s->threshold = ff_threshold8_avx2;
> }
> + } else if (s->depth == 16) {
> + if (EXTERNAL_SSE4(cpu_flags)) {
> + s->threshold = ff_threshold16_sse4;
> + }
> + if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> + s->threshold = ff_threshold16_avx2;
> + }
> }
> }
> --
> 2.11.0 (Apple Git-81)
>
More information about the ffmpeg-devel mailing list